Pages: [1]
  Print  
Author Topic: Split data  (Read 438 times)
oli
Newbie
*
Posts: 6


« on: May 10, 2013, 07:09:12 PM »

Hi,

I have got another question, hopefully someone might be able to point me in the right direction.

I am using the knn process and looping through a lot of data based on the name.

I want to split my data but by different amounts depending on where I am through the data.

My decisions are made on a time basis: the top part of my data holds the earliest observations and the bottom the later observations. I will try to show a simple example below.

Example Name 1 is in the data set 10 times. The first time it appears in the data set it will have no previous results so a KNN can not be done, so I would discard this example.

The second time the name appears I want to base the KNN on the example that has happened before, so the top 10% of the data for Name 1 would go into creating the model. Then the current example would go into apply model and I would discard the other 80% (as from this example's point of view it has not happened yet, so it is information I would not have at the time).

The third time the name appears I would base the KNN on the two above examples, so the top 20% of data would go into creating the model. Then the current example would go into apply model and I would discard 70%.

I want to carry on doing this as per the below table.

   Make Model   Apply Model   Discard
4th   30%   10%   60%
5th   40%   10%   50%
6th   50%   10%   40%
7th   60%   10%   30%
8th   70%   10%   20%
9th   80%   10%   10%
10th   90%   10%   0%
I should also note that names might occur a different number of times: sometimes just once, other times over 20.

I was hoping to use the split data function with a macro to split the data. I have the percentages in my data, but I am struggling to get the figures into my split data operator.

This is my current operation, I have tried to use macros but have taken them out as it did not work and replaced them with some random ratio.

Any help would be much appreciated.

Thanks,

Oli

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process definition (version 5.3.008).
  Top-level flow, as wired by the <connect> elements below:
    Retrieve Names2 -> Loop Values (over attribute NAME) -> Append -> Write CSV
  Inside Loop Values, each iteration:
    Retrieve data aw (2) -> Filter Examples (2) -> Split Data
      partition 1 -> k-NN (2) (training set)
      partition 2 -> Apply Model (2) (unlabelled data)
    Apply Model (2) -> Performance (2) -> Materialize Data -> out 1
-->
<process version="5.3.008">
  <!-- No process-level inputs, outputs or macros are declared. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="">
    <process expanded="true">
      <!-- Free Memory: no <connect> in this process references this operator. -->
      <operator activated="true" class="free_memory" compatibility="5.3.008" expanded="true" height="60" name="Free Memory" width="90" x="179" y="75"/>
      <!-- Loads repository entry data/Names2; its output feeds Loop Values. -->
      <operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve Names2" width="90" x="45" y="345">
        <parameter key="repository_entry" value="data/Names2"/>
      </operator>
      <!-- Loops over the NAME attribute of the incoming example set.
           NOTE(review): presumably runs the inner process once per distinct NAME value
           and exposes that value as the loop_value macro used by the filter below;
           confirm against the operator documentation. -->
      <operator activated="true" class="loop_values" compatibility="5.3.008" expanded="true" height="76" name="Loop Values" width="90" x="179" y="210">
        <parameter key="attribute" value="NAME"/>
        <process expanded="true">
          <!-- Re-reads repository entry data/data aw inside the loop body. -->
          <operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve data aw (2)" width="90" x="45" y="435">
            <parameter key="repository_entry" value="data/data aw"/>
          </operator>
          <!-- Attribute-value filter: keeps examples matching NAME=%{loop_value}. -->
          <operator activated="true" class="filter_examples" compatibility="5.3.008" expanded="true" height="76" name="Filter Examples (2)" width="90" x="45" y="255">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="NAME=%{loop_value}"/>
          </operator>
          <!-- Free Memory (2): not referenced by any <connect> in this inner process. -->
          <operator activated="true" class="free_memory" compatibility="5.3.008" expanded="true" height="60" name="Free Memory (2)" width="90" x="112" y="75"/>
          <!-- Splits the filtered examples into two partitions with fixed ratios 0.2 and 0.8.
               Partition 1 is wired to k-NN training, partition 2 to Apply Model.
               The ratios are hard-coded here; they are not driven by a macro.
               NOTE(review): "linear sampling" presumably keeps the original row order
               (first 20% / remaining 80%); confirm against the operator documentation. -->
          <operator activated="true" class="split_data" compatibility="5.3.008" expanded="true" height="94" name="Split Data" width="90" x="179" y="165">
            <enumeration key="partitions">
              <parameter key="ratio" value="0.2"/>
              <parameter key="ratio" value="0.8"/>
            </enumeration>
            <parameter key="sampling_type" value="linear sampling"/>
          </operator>
          <!-- k-NN learner with no parameters set explicitly in this file. -->
          <operator activated="true" class="k_nn" compatibility="5.3.008" expanded="true" height="76" name="k-NN (2)" width="90" x="315" y="30"/>
          <!-- Applies the k-NN model to partition 2; the application_parameters list is empty. -->
          <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="450" y="30">
            <list key="application_parameters"/>
          </operator>
          <!-- Receives the labelled data from Apply Model (2); only its example set output is wired onward. -->
          <operator activated="true" class="performance" compatibility="5.3.008" expanded="true" height="76" name="Performance (2)" width="90" x="585" y="30"/>
          <!-- Last operator in the loop body; its output is delivered to the loop's out 1 port. -->
          <operator activated="true" class="materialize_data" compatibility="5.3.008" expanded="true" height="76" name="Materialize Data" width="90" x="514" y="210"/>
          <!-- Wiring of the loop body. -->
          <connect from_op="Retrieve data aw (2)" from_port="output" to_op="Filter Examples (2)" to_port="example set input"/>
          <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Split Data" to_port="example set"/>
          <connect from_op="Split Data" from_port="partition 1" to_op="k-NN (2)" to_port="training set"/>
          <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="k-NN (2)" from_port="model" to_op="Apply Model (2)" to_port="model"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
          <connect from_op="Performance (2)" from_port="example set" to_op="Materialize Data" to_port="example set input"/>
          <connect from_op="Materialize Data" from_port="example set output" to_port="out 1"/>
          <!-- Layout hints for the loop body's ports; only out 1 is connected above. -->
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the Loop Values out 1 output on port "example set 1" and emits a merged set. -->
      <operator activated="true" class="append" compatibility="5.3.008" expanded="true" height="76" name="Append" width="90" x="380" y="165"/>
      <!-- Writes the merged set to an absolute local Windows path, using "," as column separator. -->
      <operator activated="true" class="write_csv" compatibility="5.3.008" expanded="true" height="76" name="Write CSV" width="90" x="447" y="300">
        <parameter key="csv_file" value="C:\Users\Oliver\Documents\Gambling\Dump\write test.CSV"/>
        <parameter key="column_separator" value=","/>
      </operator>
      <!-- Wiring of the top-level process. -->
      <connect from_op="Retrieve Names2" from_port="output" to_op="Loop Values" to_port="example set"/>
      <connect from_op="Loop Values" from_port="out 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_op="Write CSV" to_port="input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>
Logged
oli
Newbie
*
Posts: 6


« Reply #1 on: May 16, 2013, 10:23:52 PM »

Hi,

Just wondered if anyone had any suggestions on this, all help very much appreciated.

Thanks,

Oli
Logged
Pages: [1]
  Print  
 
Jump to: