Pages: [1]
  Print  
Author Topic: Model selection using cross validation  (Read 70 times)
Smith
Newbie
*
Posts: 1


« on: July 07, 2014, 04:04:37 PM »

Hallo,

I am trying to do a simple model selection using cross validation in Rapidminer.
The goal is to evaluate various classification methods using the same folds of cross validation for each method and select the one with the best averaged performance over the folds. This should be done within one process automatically.

Below is the process I've created to accomplish this goal.

The potential problem I see with this approach is the fact that the X-Validation operator is placed within the Loop operator. I am not sure if every classification method will be evaluated on the same folds of X-Validation.. Is this guaranteed when the local random seed parameter of X-Validation is set to true?
Is there maybe an easier way to finding the best classifier for the given classification task automatically?

Many thanks in advance.


Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="5.3.013" expanded="true" height="60" name="Golf" width="90" x="112" y="75">
        <parameter key="repository_entry" value="//Samples/data/Golf"/>
      </operator>
      <operator activated="true" class="loop" compatibility="5.3.013" expanded="true" height="76" name="Loop" width="90" x="313" y="75">
        <parameter key="set_iteration_macro" value="true"/>
        <parameter key="iterations" value="3"/>
        <process expanded="true">
          <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
            <parameter key="number_of_validations" value="4"/>
            <parameter key="use_local_random_seed" value="true"/>
            <process expanded="true">
              <operator activated="true" class="select_subprocess" compatibility="5.3.013" expanded="true" height="76" name="Select Subprocess (2)" width="90" x="159" y="30">
                <parameter key="select_which" value="%{iteration}"/>
                <process expanded="true">
                  <operator activated="true" class="k_nn" compatibility="5.3.013" expanded="true" height="76" name="k-NN (2)" width="90" x="44" y="30"/>
                  <connect from_port="input 1" to_op="k-NN (2)" to_port="training set"/>
                  <connect from_op="k-NN (2)" from_port="model" to_port="output 1"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
                <process expanded="true">
                  <operator activated="true" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes (2)" width="90" x="46" y="30"/>
                  <connect from_port="input 1" to_op="Naive Bayes (2)" to_port="training set"/>
                  <connect from_op="Naive Bayes (2)" from_port="model" to_port="output 1"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
                <process expanded="true">
                  <operator activated="true" class="decision_tree" compatibility="5.3.013" expanded="true" height="76" name="Decision Tree (2)" width="90" x="48" y="30"/>
                  <connect from_port="input 1" to_op="Decision Tree (2)" to_port="training set"/>
                  <connect from_op="Decision Tree (2)" from_port="model" to_port="output 1"/>
                  <portSpacing port="source_input 1" spacing="0"/>
                  <portSpacing port="source_input 2" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="training" to_op="Select Subprocess (2)" to_port="input 1"/>
              <connect from_op="Select Subprocess (2)" from_port="output 1" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="259" y="30">
                <list key="class_weights"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="input 1" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Golf" from_port="output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[ /code]

Logged
Pages: [1]
  Print  
 
Jump to: