Pages: [1]
  Print  
Author Topic: Impute Missing Values Weird behaviour  (Read 799 times)
amerkel
Newbie
*
Posts: 3


« on: April 06, 2011, 03:05:05 PM »

Hello,

I wanted to compare clustering of a dataset with missing values vs. imputed values. Unfortunately, as soon as I insert a branch that imputes the values and run it, the imputation seems to be applied to all steps of the process. Even when I try to look at the data right after the import, the missing values are replaced.
(A second but less important question is why "impute values" component needs a label. It is easy to work around that by labeling an attribute before imputation and unlabeling it afterwards, but....)

Here is my process.
Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process posted with the question.
  Data flow, as wired by the connect elements below:
    Retrieve -> Multiply
    Multiply output 1 -> W-EM -> result 2 (clustered set)
    Multiply output 2 -> Set Role -> Impute Missing Values -> Set Role (2) -> W-EM (2)
    Multiply output 3 -> result 1 (the data as delivered by Multiply)
  The outputs of W-EM (2) are not connected to any result port.
  NOTE(review): per the reply in this thread, Multiply hands out references
  rather than copies, so the imputation on the output 2 branch also shows up
  in the data seen on the other outputs. Confirm against the RapidMiner docs.
-->
<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="672" width="949">
      <!-- Load the example set stored under the repository entry "Marketing". -->
      <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="165">
        <parameter key="repository_entry" value="Marketing"/>
      </operator>
      <!-- Fan the retrieved data out to the three branches listed in the header comment. -->
      <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Multiply" width="90" x="45" y="255"/>
      <!-- Temporarily mark SEX as the label; the post explains this is a workaround
           because the imputation operator asks for a label. -->
      <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role" width="90" x="112" y="390">
        <parameter key="name" value="SEX"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Impute missing values using the model learned by the nested subprocess.
           NOTE(review): filter "no_missing_values" combined with invert_selection="true"
           presumably restricts imputation to attributes that do contain missing
           values; TODO confirm. -->
      <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="246" y="435">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="iterate" value="false"/>
        <process expanded="true" height="690" width="911">
          <!-- Inner learner: Naive Bayes is trained on the incoming example set and
               its model is passed to the model sink. -->
          <operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="149" y="83"/>
          <connect from_port="example set source" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model sink"/>
          <portSpacing port="source_example set source" spacing="0"/>
          <portSpacing port="sink_model sink" spacing="0"/>
        </process>
      </operator>
      <!-- Second half of the label workaround: SEX is named again with no explicit
           target_role. NOTE(review): presumably this falls back to the default
           (regular) role, i.e. "unlabeling" as described in the post; TODO confirm. -->
      <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (2)" width="90" x="380" y="435">
        <parameter key="name" value="SEX"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Weka EM clustering on the imputed branch.
           NOTE(review): N presumably is the number of clusters; verify. -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="435">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>
      <!-- Weka EM clustering on the branch that is meant to keep the missing values;
           same parameters as W-EM (2). -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="246" y="300">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>
      <!-- Wiring between the operators above and the process result ports. -->
      <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_port="result 1"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
      <connect from_op="Impute Missing Values" from_port="example set out" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="W-EM (2)" to_port="example set"/>
      <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

thanks,

Alex
« Last Edit: April 06, 2011, 03:08:12 PM by amerkel » Logged
awchisholm
Sr. Member
****
Posts: 368


WWW
« Reply #1 on: April 06, 2011, 05:43:54 PM »

Hello

The Multiply operator doesn't take a copy; it actually provides a reference, so if the data on one output changes, the data on all of the outputs changes.

One way out of this is to use the "Materialize Data" operator.

I made some changes - see the enclosed.

regards

Andrew

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Revised RapidMiner process posted in the reply.
  Data flow, as wired by the connect elements below:
    Retrieve -> Multiply
    Multiply output 1 -> W-EM -> result 1 (cluster model), result 2 (clustered set)
    Multiply output 2 -> Materialize Data -> Impute Missing Values -> W-EM (2)
                      -> result 3 (cluster model), result 4 (clustered set)
  Materialize Data sits in front of the imputation; per the accompanying reply
  it is there so that the imputed branch no longer shares its data with the
  other Multiply output.
-->
<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="672" width="949">
      <!-- Load the example set stored at //Samples/data/Labor-Negotiations
           (a different data set from the "Marketing" entry used in the question). -->
      <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Labor-Negotiations"/>
      </operator>
      <!-- Fan the retrieved data out to the two clustering branches. -->
      <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="45" y="255"/>
      <!-- NOTE(review): presumably creates an independent in-memory copy of the
           example set so that later changes stay local to this branch; confirm
           against the operator documentation. -->
      <operator activated="true" class="materialize_data" compatibility="5.1.006" expanded="true" height="76" name="Materialize Data" width="90" x="179" y="345"/>
      <!-- Impute missing values using the model learned by the nested subprocess.
           No attribute filter is set here, and no Set Role operators surround it.
           NOTE(review): presumably the sample data already carries a label;
           TODO confirm. -->
      <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="380" y="345">
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="iterate" value="false"/>
        <process expanded="true" height="690" width="911">
          <!-- Inner learner: k-NN is trained on the incoming example set and its
               model is passed to the model sink. -->
          <operator activated="true" class="k_nn" compatibility="5.1.006" expanded="true" height="76" name="k-NN" width="90" x="495" y="30"/>
          <connect from_port="example set source" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model sink"/>
          <portSpacing port="source_example set source" spacing="0"/>
          <portSpacing port="sink_model sink" spacing="0"/>
        </process>
      </operator>
      <!-- Weka EM clustering on the imputed branch.
           NOTE(review): N presumably is the number of clusters; verify. -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="345">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>
      <!-- Weka EM clustering on the branch fed directly from Multiply output 1;
           same parameters as W-EM (2). -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="447" y="165">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>
      <!-- Wiring between the operators above and the process result ports. -->
      <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Materialize Data" to_port="example set input"/>
      <connect from_op="Materialize Data" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
      <connect from_op="Impute Missing Values" from_port="example set out" to_op="W-EM (2)" to_port="example set"/>
      <connect from_op="W-EM (2)" from_port="cluster model" to_port="result 3"/>
      <connect from_op="W-EM (2)" from_port="clustered set" to_port="result 4"/>
      <connect from_op="W-EM" from_port="cluster model" to_port="result 1"/>
      <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>
Logged

amerkel
Newbie
*
Posts: 3


« Reply #2 on: April 11, 2011, 12:02:21 PM »

Thanks a lot

Alex
Logged
Pages: [1]
  Print  
 
Jump to: