Pages: [1]
  Print  
Author Topic: Process Failed - out of memory  (Read 879 times)
mrcrowley
Newbie
*
Posts: 6


« on: January 14, 2014, 06:47:04 PM »

Hi all,

I have several hundred text files, some of them over 1 GB, which I have to process, but even for a few dozen files of at most 50 MB each I get an error after some time:

Quote
This process would need more than the maximum amount of available memory. ...

I have a notebook with 8 GB RAM and I can see that RapidMiner (RM) uses about 5 GB.
Is there any way to get this working?
Is Radoop able to do something like this?
I cannot find it in the extensions.

Here is my XML:

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process: reads *.txt files from a directory, extracts the
     textual content of each document, tokenizes and filters the tokens, then
     transposes the resulting word-vector example set and writes it to an
     .xlsx file. Quoted from the forum post for debugging an OOM error. -->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Reads every *.txt file (UTF-8) in the configured directory and builds
           term-occurrence vectors. keep_text="true" keeps the full document text
           in the example set, so all file contents stay in memory; the reply in
           this thread suggests this as a likely cause of the OOM error. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files (2)" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="testhtml" value="E:\#APPSDATA\RapidMiner\test5"/>
        </list>
        <parameter key="file_pattern" value="*.txt"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="keep_text" value="true"/>
        <!-- Pruning bounds: terms with absolute frequency below 0 or above 10
             are pruned (the meaning of the bounds depends on the operator's
             prune_method setting, which is not shown here; verify in the GUI). -->
        <parameter key="prune_below_absolute" value="0"/>
        <parameter key="prune_above_absolute" value="10"/>
        <parameter key="datamanagement" value="float_array"/>
        <!-- Per-document subprocess: extract HTML text content, tokenize,
             drop tokens shorter than 3 or longer than 30 characters, remove
             English/German/French stopwords, then apply case transformation. -->
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" height="60" name="Extract Content" width="90" x="45" y="30">
            <parameter key="minimum_text_block_length" value="3"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="313" y="30">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="30"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="380" y="165"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="514" y="165"/>
          <operator activated="true" class="text:filter_stopwords_french" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (French)" width="90" x="648" y="165"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="648" y="30"/>
          <!-- Linear document pipeline wiring: Extract Content to Tokenize to
               Filter Tokens to the three stopword filters to Transform Cases. -->
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Stopwords (French)" to_port="document"/>
          <connect from_op="Filter Stopwords (French)" from_port="document" to_port="document 1"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Loop Batches with an EMPTY inner subprocess: it iterates over batches
           of 100 examples but performs no work per batch, so it is effectively
           a pass-through here. -->
      <operator activated="true" class="loop_batches" compatibility="5.3.015" expanded="true" height="60" name="Loop Batches" width="90" x="179" y="30">
        <parameter key="batch_size" value="100"/>
        <process expanded="true">
          <portSpacing port="source_exampleSet" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="transpose" compatibility="5.3.015" expanded="true" height="76" name="Transpose" width="90" x="380" y="30"/>
      <!-- Writes the transposed set as .xlsx; the configured path carries no
           file extension, the file_format parameter determines the format. -->
      <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="581" y="30">
        <parameter key="excel_file" value="E:\#APPSDATA\RapidMiner\excel\20140114a"/>
        <parameter key="file_format" value="xlsx"/>
      </operator>
      <!-- Top-level wiring: Process Documents, Loop Batches, Transpose,
           Write Excel, then out to result port 1. -->
      <connect from_op="Process Documents from Files (2)" from_port="example set" to_op="Loop Batches" to_port="example set"/>
      <connect from_op="Loop Batches" from_port="example set" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Logged
Marius
Administrator
Hero Member
*****
Posts: 1794



WWW
« Reply #1 on: January 15, 2014, 10:11:55 AM »

Hi,

if you only need the TF/IDF statistics and don't need the text any more after extracting it in Process Documents, you can disable the keep_text option in that operator. That should prevent RapidMiner from keeping all the files in memory.

Apart from that, in which operator does the OutOfMemoryException occur?

Best regards,
Marius
Logged

Please add [SOLVED] to the topic title when your problem has been solved! (do so by editing the first post in the thread and modifying the title)
Please click here before posting.
mrcrowley
Newbie
*
Posts: 6


« Reply #2 on: January 15, 2014, 10:25:12 AM »

Hi,

thanks for your response.

I don't want to do TF/IDF statistics; I need the extracted words.

The keep_text option doesn't help.

The OutOfMemoryException certainly occurs in the Process Documents from Files operator (specifically in Extract Content) and possibly also in the Write Excel operator.

So, is there any way to deal with such big files in RM?

Best regards
« Last Edit: January 15, 2014, 02:59:14 PM by mrcrowley » Logged
Pages: [1]
  Print  
 
Jump to: