Pages: [1]
  Print  
Author Topic: error in calculating AUC  (Read 1867 times)
dan_
Full Member
***
Posts: 114


« on: June 19, 2010, 11:30:18 PM »

I am not sure how to insert an image - I got a classification model showing an ideal ROC curve so it should have an AUC (area under ROC curve) equal to 1; however RM displays an AUC of 0.5. This seems to be a bug.

Regards
Dan
Logged
Sebastian Land
Administrator
Hero Member
*****
Posts: 2426


« Reply #1 on: June 21, 2010, 08:12:41 AM »

Hi Dan,
could you post the process generating this data here?

Greetings,
  Sebastian
Logged
dan_
Full Member
***
Posts: 114


« Reply #2 on: June 21, 2010, 09:27:45 PM »

Sebastian, please find the process appended below. It contains generated data and data sampling for model evaluation - so randomness is involved, theoretically speaking. In practice, however, one expects to get an ideal confusion matrix (accuracy=1) and an ideal ROC curve but, surprisingly, an AUC=0.5. If you do not get this, let me know and I can email you the image files with the ROC curve and the confusion matrix.

Regards
Dan

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Reproduction process for the AUC report in this thread.
  Main chain, as wired by the connect elements near the end of the file:
  Generate Churn Data, then Nominal to Binominal, then Remap Binominals,
  then Validation. Validation delivers its model to "result 1" and the
  output of its "averagable 1" port to "result 2".
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="logverbosity" value="3"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="1"/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="422" width="547">
      <!-- Data source: 1000 generated examples. use_local_random_seed is false,
           so local_random_seed is presumably ignored in favour of the
           process-level random_seed above (TODO confirm). -->
      <operator activated="true" class="generate_churn_data" expanded="true" height="60" name="Generate Churn Data" width="90" x="45" y="75">
        <parameter key="number_examples" value="1000"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Runs with create_view set to false and include_special_attributes
           set to false. -->
      <operator activated="true" class="nominal_to_binominal" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="120">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="0"/>
        <parameter key="attribute" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="4"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="transform_binominal" value="false"/>
        <parameter key="use_underscore_in_name" value="false"/>
      </operator>
      <!-- Restricted to the single attribute "label" (special attributes are
           included): "ok" is declared the negative value and "terminate" the
           positive value. -->
      <operator activated="true" class="remap_binominals" expanded="true" height="76" name="Remap Binominals" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="0"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="ok"/>
        <parameter key="positive_value" value="terminate"/>
      </operator>
      <!-- Split validation with stratified sampling and split_ratio 0.7
           (presumably the training share; TODO confirm). The first nested
           process trains the model, the second applies and evaluates it. -->
      <operator activated="true" class="split_validation" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="create_complete_model" value="false"/>
        <parameter key="split" value="1"/>
        <parameter key="split_ratio" value="0.7"/>
        <parameter key="training_set_size" value="100"/>
        <parameter key="test_set_size" value="-1"/>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="parallelize_training" value="false"/>
        <parameter key="parallelize_testing" value="false"/>
        <!-- Training side: the "training" port feeds Decision Tree, whose model
             is passed on through the "model" port. -->
        <process expanded="true" height="443" width="207">
          <operator activated="true" class="decision_tree" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="30">
            <parameter key="criterion" value="gain_ratio"/>
            <parameter key="minimal_size_for_split" value="4"/>
            <parameter key="minimal_leaf_size" value="2"/>
            <parameter key="minimal_gain" value="0.04"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="confidence" value="0.25"/>
            <parameter key="number_of_prepruning_alternatives" value="3"/>
            <parameter key="no_pre_pruning" value="false"/>
            <parameter key="no_pruning" value="false"/>
          </operator>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing side: Apply Model receives the model and the test set, its
             labelled data goes to Performance, and the performance result
             leaves through "averagable 1". This Performance operator is the
             one producing the AUC values discussed in the thread. -->
        <process expanded="true" height="443" width="255">
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance" expanded="true" height="76" name="Performance" width="90" x="155" y="30">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring of the main chain and of the two delivered results. -->
      <connect from_op="Generate Churn Data" from_port="output" to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
      <connect from_op="Remap Binominals" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
      <portSpacing port="sink_result 4" spacing="54"/>
    </process>
  </operator>
</process>
Logged
Ingo Mierswa
Administrator
Hero Member
*****
Posts: 1226



WWW
« Reply #3 on: June 21, 2010, 11:13:14 PM »

Hi Dan,

I understand that this seems hard to believe but as far as I can see the calculation is indeed correct:

  • if you only have the reference points (0,0) and (1,1), the trapezoidal calculation of the AUC will deliver exactly half of the rectangle, which results in 0.5
  • the optimistic calculation is also easy to understand: here the upper bounds for each rectangle are used and this results in 1.0
  • the one thing which might be surprising is why the pessimistic calculation also results in 1 and hence is better than 0.5: but here the lower rectangles are used - which in this case is exactly the same rectangle as in the optimistic case
Cheers,
Ingo
Logged

Did you try our new Marketplace? Upload or download new Extensions, add comments, and organize your operators. Have a look at  http://marketplace.rapid-i.com
dan_
Full Member
***
Posts: 114


« Reply #4 on: June 22, 2010, 10:05:56 AM »

Hi Ingo,

Thanks for the explanation. Actually the ROC curve in this case contains the point (0,1), the so called "perfect classification" - see http://en.wikipedia.org/wiki/Receiver_operating_characteristic
So you have the points (0,1) and (1,1) in the curve graph.

You can see also the drawing of the ROC produced by RM in this case: indeed the area under this curve is 1. Therefore AUC indicator should be calculated to 1.

Moreover, please note that an AUC of 0.5 is achieved in general by the random classifiers (which provide for instance an equal number of good and bad answers - assuming we have the positive and negative classes of the same size). This is improper for the particular decision tree I provided - which happens to be a perfect classifier (accuracy=1).

Also, it is widely accepted that AUC is one of the indicators of the quality of a binary classifier. As said, the above decision tree is a perfect classifier, so it is natural for it to have the highest AUC rather than an AUC=0.5.

So everything indicates that AUC should be calculated to be 1 here. This would be consistent also with the optimistic and pessimistic calculations.

Best,
Dan
Logged
Ingo Mierswa
Administrator
Hero Member
*****
Posts: 1226



WWW
« Reply #5 on: June 22, 2010, 10:19:34 AM »

Hi,

Quote
You can see also the drawing of the ROC produced by RM in this case: indeed the area under this curve is 1. Therefore AUC indicator should be calculated to 1.

ok, that's weird, I didn't check this. If the point (0,1) is also part of the thresholds (are you sure it is or is it just the painting?) then indeed I would also expect the AUC to be 1. You could file a bug in our community bug tracker in this case.

Cheers,
Ingo
Logged

Did you try our new Marketplace? Upload or download new Extensions, add comments, and organize your operators. Have a look at  http://marketplace.rapid-i.com
haddock
Hero Member
*****
Posts: 853



WWW
« Reply #6 on: October 14, 2010, 09:25:31 AM »

Hi Folks,

Given that the author of the above code has recently posted thus..

Quote
However, perhaps this suggestion may be useful to consider after the ROC Analysis implemented in Rapid Miner would be revised as it is still unreliable in this package (i.e. AUC calculation needs corrections, as I have shown on the forum http://rapid-i.com/rapidforum/index.php?PHPSESSID=18d6261d2d63b2ca946477f03c2552bc&topic=2237.0
, and Find Threshold operator does not find the best threshold as expected but provides suboptimal solutions - I emailed a complete report to the RM development team, with relevant processes illustrating this).

here...

http://rapid-i.com/rapidforum/index.php/topic,2584.msg10537.html#msg10537

I took another look at the code and noticed that we have binominal mapping/remapping of the label, without view creation, which changes the underlying data, generates an error, and is not necessary for the learner, like this...

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process under discussion, with both preprocessing operators still active.
  Wiring (see the connect elements near the end): Generate Churn Data feeds
  Nominal to Binominal, which feeds Remap Binominals, which feeds Validation.
  Validation delivers its model to "result 1" and the output of its
  "averagable 1" port to "result 2".
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="logverbosity" value="3"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="1"/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="422" width="547">
      <!-- Generates 1000 examples; no local random seed is used. -->
      <operator activated="true" class="generate_churn_data" expanded="true" height="60" name="Generate Churn Data" width="90" x="45" y="75">
        <parameter key="number_examples" value="1000"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- First of the two preprocessing steps questioned in the surrounding
           post. Note create_view is false; include_special_attributes is also
           false for this operator. -->
      <operator activated="true" class="nominal_to_binominal" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="120">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="0"/>
        <parameter key="attribute" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="4"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="transform_binominal" value="false"/>
        <parameter key="use_underscore_in_name" value="false"/>
      </operator>
      <!-- Second preprocessing step: remaps the "label" attribute (special
           attributes included) so that "ok" is the negative value and
           "terminate" the positive value. -->
      <operator activated="true" class="remap_binominals" expanded="true" height="76" name="Remap Binominals" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="0"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="ok"/>
        <parameter key="positive_value" value="terminate"/>
      </operator>
      <!-- Split validation (stratified sampling, split_ratio 0.7) wrapping a
           training sub-process and a testing sub-process. -->
      <operator activated="true" class="split_validation" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="create_complete_model" value="false"/>
        <parameter key="split" value="1"/>
        <parameter key="split_ratio" value="0.7"/>
        <parameter key="training_set_size" value="100"/>
        <parameter key="test_set_size" value="-1"/>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="parallelize_training" value="false"/>
        <parameter key="parallelize_testing" value="false"/>
        <!-- Training sub-process: a single Decision Tree learner. -->
        <process expanded="true" height="443" width="207">
          <operator activated="true" class="decision_tree" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="30">
            <parameter key="criterion" value="gain_ratio"/>
            <parameter key="minimal_size_for_split" value="4"/>
            <parameter key="minimal_leaf_size" value="2"/>
            <parameter key="minimal_gain" value="0.04"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="confidence" value="0.25"/>
            <parameter key="number_of_prepruning_alternatives" value="3"/>
            <parameter key="no_pre_pruning" value="false"/>
            <parameter key="no_pruning" value="false"/>
          </operator>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing sub-process: Apply Model followed by Performance; the
             performance result is returned through "averagable 1". -->
        <process expanded="true" height="443" width="255">
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance" expanded="true" height="76" name="Performance" width="90" x="155" y="30">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- The data passes through both preprocessing operators before it
           reaches Validation. -->
      <connect from_op="Generate Churn Data" from_port="output" to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
      <connect from_op="Remap Binominals" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
      <portSpacing port="sink_result 4" spacing="54"/>
    </process>
  </operator>
</process>

Disable these operators, and the warnings disappear, and a rather different result emerges, like this...

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Variant of the process with both preprocessing operators deactivated.
  Generate Churn Data is wired straight into Validation (see the connect
  elements near the end); Validation delivers its model to "result 1" and
  the output of its "averagable 1" port to "result 2".
  Far fewer parameters are listed than in a full export; presumably the
  omitted ones fall back to their defaults (TODO confirm).
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Process">
    <process expanded="true" height="422" width="547">
      <!-- Data source: 1000 generated examples. -->
      <operator activated="true" class="generate_churn_data" compatibility="5.0.0" expanded="true" height="60" name="Generate Churn Data" width="90" x="45" y="75">
        <parameter key="number_examples" value="1000"/>
      </operator>
      <!-- Deactivated (activated is "false") and not referenced by any
           connect element in this process. -->
      <operator activated="false" class="nominal_to_binominal" compatibility="5.0.0" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="120"/>
      <!-- Also deactivated and unconnected; its label remapping parameters
           ("ok" negative, "terminate" positive) are kept but have no effect
           while the operator is off. -->
      <operator activated="false" class="remap_binominals" compatibility="5.0.0" expanded="true" height="76" name="Remap Binominals" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="ok"/>
        <parameter key="positive_value" value="terminate"/>
      </operator>
      <!-- Split validation with stratified sampling; the first nested process
           trains the model, the second applies and evaluates it. -->
      <operator activated="true" class="split_validation" compatibility="5.0.0" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="sampling_type" value="stratified sampling"/>
        <!-- Training side: Decision Tree with minimal_gain 0.04. -->
        <process expanded="true" height="443" width="207">
          <operator activated="true" class="decision_tree" compatibility="5.0.0" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="30">
            <parameter key="minimal_gain" value="0.04"/>
          </operator>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing side: Apply Model receives the model and the test set, its
             labelled data goes to Performance, and the performance result
             leaves through "averagable 1". -->
        <process expanded="true" height="443" width="255">
          <operator activated="true" class="apply_model" compatibility="5.0.0" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.0.0" expanded="true" height="76" name="Performance" width="90" x="155" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- The generated data goes directly to Validation, bypassing the two
           deactivated operators. -->
      <connect from_op="Generate Churn Data" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
    </process>
  </operator>
</process>


Or am I missing something?

Toodle Pip!

Logged

Where is the wisdom we have lost in knowledge?
Where is the knowledge we have lost in information?

T.S.Eliot ~ Choruses from the Rock 1934
dan_
Full Member
***
Posts: 114


« Reply #7 on: July 16, 2013, 08:12:13 AM »

Hi,

Just checked and this error of RM in calculating AUC has not been corrected since this was posted. 
Here is a recall. http://rapid-i.com/rapidforum/index.php/topic,6871.msg24166.html#msg24166

As one of the participants at this discussion asked
Quote
am I missing something?
- yes, perhaps understanding the essential thing. RM still makes this AUC calculation error more than 2 years later. Toodle Pip.

Dan

PS By the way AUC is the area under the ROC curve. As reported to RapidI team some time ago, RM produces some wrong results within the ROC analysis too.
« Last Edit: July 16, 2013, 08:47:57 AM by dan_ » Logged
Pages: [1]
  Print  
 
Jump to: