Pages: [1]
  Print  
Author Topic: Duplicate attribute name: Content-Type  (Read 1662 times)
rapidox
Newbie
*
Posts: 3


« on: June 19, 2013, 01:31:19 PM »

Hi all,
Rapid Miner is a fantastic tool I am using.

I am trying to reproduce the keyword clustering example (web mining plus text mining) from http://www.simafore.com/blog/bid/116340/ , but I get a "Duplicate attribute name: Content-Type" error.

I have to read a MySQL database table and use its LINK column as an attribute.

(mysql)
LINK attribute is:

http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html
http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html
http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html

I'd like to get keyword clusters that are based on those web pages content.

Do You know a way to get this process working ?

I attach the xml process here.

I thank You for good collaboration in advance !

Have a good day.
Alex

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3 process. Data flow as wired by the connect elements below:
    Read Database > Get Pages > Select Attributes > Process Documents from Data
    > Multiply > (result 1) and (Select Attributes (2) > Clustering > results 2 and 3).
  The word list of Process Documents from Data is delivered as result 4.

  NOTE(review): the post reports "Duplicate attribute name: Content-Type" for this process.
  The XML alone does not show which operator raises it. Presumably it comes from Get Pages
  adding attributes to the example set; verify whether the database table or the fetched
  pages supply a Content-Type field more than once.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads the whole table "textmine" from the local MySQL database "libero"; the password is masked in the post. -->
      <operator activated="true" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="45" y="75">
        <parameter key="define_connection" value="url"/>
        <parameter key="connection" value="libero"/>
        <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
        <parameter key="username" value="root"/>
        <parameter key="password" value="***********************"/>
        <parameter key="define_query" value="table name"/>
        <parameter key="table_name" value="textmine"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Uses the attribute "Link" as the URL source and "PAGE" as the page attribute name.
           NOTE(review): the attribute name is spelled "Link" here while the post calls the column "LINK"; confirm the case matches the table. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="PAGE"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="delay" value="random"/>
      </operator>
      <!-- Filter type is "no_missing_values".
           NOTE(review): the "attribute"/"attributes" values (PAGEOUTPUT) are presumably ignored with this filter type; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value="PAGEOUTPUT"/>
        <parameter key="attributes" value="PAGEOUTPUT"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- The inner process passes every document through a single Extract Content operator. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Output 1 goes straight to result 1; output 2 feeds the clustering branch. -->
      <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
      <!-- Filter type is "subset" with the attribute list "|text".
           NOTE(review): numeric_condition is presumably unused with the subset filter type; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|text"/>
        <parameter key="numeric_condition" value="&lt;5"/>
      </operator>
      <!-- k-medoids clustering with no parameters set in this XML. -->
      <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435"/>
      <connect from_op="Read Database" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>


Logged
Marius
Administrator
Hero Member
*****
Posts: 1793



WWW
« Reply #1 on: June 20, 2013, 12:34:56 PM »

Hi,

for me the process you supplied runs fine if I replace the Read Database operator with a data set that contains a Link attribute with the links you provided.

Can you send me a link for which the described error occurs?

Best regards,
Marius
Logged

Please add [SOLVED] to the topic title when your problem has been solved! (do so by editing the first post in the thread and modifying the title)
Please click here before posting.
rapidox
Newbie
*
Posts: 3


« Reply #2 on: June 20, 2013, 03:00:12 PM »

Hi Marius,
I am very happy to read your rapid reply.

I replaced the Read Database operator, following your suggestion.

We can't get any content from the linked web pages, I don't know why, maybe I have to change the "Get Pages" operator.

The CSV file now contains:

Link;
http://corrieredelveneto.corriere.it/notizie/politica/2013/28-maggio-2013/vincitori-vinti-disperati-2221364926711.shtml,
http://www.corriere.it/sette/13_maggio_22/2013-21-gramigna-aulla_12fb6dea-c2e8-11e2-b767-d844a9f1da92.shtml,
http://corrieredelveneto.corriere.it/notizie/cronaca/2013/23-maggio-2013/alluvione-stretta-controlli-ma-resta-nodo-bacini-2221283139532.shtml


Here the whole process:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3 process. Data flow as wired by the connect elements below:
    Read CSV > Get Pages > Select Attributes > Process Documents from Data > Multiply,
    whose outputs go to result 1, to Select Attributes (2) > Clustering (results 2 and 3)
    and to Write CSV. The word list of Process Documents from Data is delivered as result 4.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Reads url.csv with "," as column separator; row 0 is annotated as the name row and column 0 is declared as "Link".
           NOTE(review): "Link" is declared binominal, which presumably allows only two distinct values, while the
           file shown in the post lists three URLs. A polynominal or text type looks more appropriate; confirm
           against the Read CSV documentation.
           NOTE(review): the sample file has "Link;" as its first row and trailing commas after the URLs; check
           that this matches the "," separator configured here. -->
      <operator activated="true" class="read_csv" compatibility="5.3.008" expanded="true" height="60" name="Read CSV" width="90" x="45" y="75">
        <parameter key="csv_file" value="/home/alex/Scrivania/url.csv"/>
        <parameter key="column_separators" value=","/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="locale" value="Italian (Italy)"/>
        <parameter key="encoding" value="UTF-8"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Link.true.binominal.attribute"/>
        </list>
      </operator>
      <!-- Uses the attribute "Link" as the URL source, with a random delay between 1000 and 2000.
           NOTE(review): the user_agent value starts with the header name "User-Agent: "; presumably only the
           agent string belongs here. random_user_agent is also true, so this value may not be used at all; confirm. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="PAGE"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="user_agent" value="User-Agent: Mozilla/5.0 (Windows NT 6.1; rv:20.0) Gecko/20100101 Firefox/20.0"/>
        <parameter key="accept_cookies" value="all"/>
        <parameter key="delay" value="random"/>
        <parameter key="min_delay_amount" value="1000"/>
        <parameter key="max_delay_amount" value="2000"/>
      </operator>
      <!-- Filter type is "no_missing_values".
           NOTE(review): the "attribute"/"attributes" values (PAGEOUTPUT) are presumably ignored with this filter type; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="210">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value="PAGEOUTPUT"/>
        <parameter key="attributes" value="PAGEOUTPUT"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Vector creation is set to "Term Occurrences" and keep_text is true; the inner process passes
           every document through a single Extract Content operator with all neglect flags set to false. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="75">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="keep_text" value="true"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content" width="90" x="246" y="75">
            <parameter key="neglegt_span_tags" value="false"/>
            <parameter key="neglect_p_tags" value="false"/>
            <parameter key="neglect_b_tags" value="false"/>
            <parameter key="neglect_i_tags" value="false"/>
            <parameter key="neglect_br_tags" value="false"/>
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Output 1 goes to result 1, output 2 to the clustering branch, output 3 to Write CSV. -->
      <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="112" name="Multiply" width="90" x="380" y="345"/>
      <!-- Dumps the processed example set to out.csv. -->
      <operator activated="true" class="write_csv" compatibility="5.3.008" expanded="true" height="76" name="Write CSV" width="90" x="581" y="570">
        <parameter key="csv_file" value="/home/alex/Scrivania/out.csv"/>
      </operator>
      <!-- Filter type is "subset" with the attribute list "|Link".
           NOTE(review): only "Link" is named, so presumably the word vector columns are dropped before
           Clustering; confirm this is intended. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|Link"/>
        <parameter key="numeric_condition" value="&lt;5"/>
      </operator>
      <!-- k-medoids clustering with add_as_label set to true. -->
      <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="849" y="435">
        <parameter key="add_as_label" value="true"/>
      </operator>
      <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Write CSV" to_port="input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>

Can You suggest a solution and attach a full working process for text Keyword clustering ?

I thank You for Your good support Marius!

Have a good evening.
Alex
Logged
Marius
Administrator
Hero Member
*****
Posts: 1793



WWW
« Reply #3 on: June 21, 2013, 12:24:19 PM »

It's probably rather a problem with the import. Here I create the data manually with Generate Data by User Specification and Append, and the process works like a charm.

Best regards,
Marius

Code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3 process. The database import is deactivated and replaced by three
  single-row example sets that are appended into one. Data flow as wired by the connect
  elements below:
    Generate Data by User Specification (1 to 3) > Append > Get Pages > Select Attributes
    > Process Documents from Data > Multiply > (result 1) and
    (Select Attributes (2) > Clustering > results 2 and 3).
  The word list of Process Documents from Data is delivered as result 4.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- Deactivated (activated="false") and not connected; kept only for reference.
           The stored password value is masked, as in the other processes of this thread,
           so that no credential material is published. -->
      <operator activated="false" class="read_database" compatibility="5.3.008" expanded="true" height="60" name="Read Database" width="90" x="179" y="345">
        <parameter key="define_connection" value="url"/>
        <parameter key="connection" value="libero"/>
        <parameter key="database_url" value="jdbc:mysql://localhost:3306/libero"/>
        <parameter key="username" value="root"/>
        <parameter key="password" value="***********************"/>
        <parameter key="define_query" value="table name"/>
        <parameter key="table_name" value="textmine"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Each of the next three operators defines one attribute "Link" holding one quoted URL. -->
      <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
        <list key="attribute_values">
          <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1261117/Veneto--Zaia--necessario-assicurarsi-contro-eventi-catastrofici.html&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="120">
        <list key="attribute_values">
          <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/sostenibilita/1257087/L-Agenzia-europea-per-l-ambiente-lancia-l-allarme-clima--rischio-permanente----.html&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="generate_data_user_specification" compatibility="5.3.008" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="210">
        <list key="attribute_values">
          <parameter key="Link" value="&quot;http://www.liberoquotidiano.it/news/cronaca/1254046/Maltempo--Grosseto--sopralluogo-di-Marras-con-D-Angelis-in-zone-alluvione.html&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Merges the three generated example sets into one. -->
      <operator activated="true" class="append" compatibility="5.3.008" expanded="true" height="112" name="Append" width="90" x="179" y="30"/>
      <!-- Uses the attribute "Link" as the URL source and "PAGE" as the page attribute name. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.000" expanded="true" height="60" name="Get Pages" width="90" x="313" y="30">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="PAGE"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="delay" value="random"/>
      </operator>
      <!-- Filter type is "no_missing_values".
           NOTE(review): the "attribute"/"attributes" values (PAGEOUTPUT) are presumably ignored with this filter type; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value="PAGEOUTPUT"/>
        <parameter key="attributes" value="PAGEOUTPUT"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- The inner process passes every document through a single Extract Content operator. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="75">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.000" expanded="true" height="60" name="Extract Content (2)" width="90" x="447" y="210">
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Output 1 goes straight to result 1; output 2 feeds the clustering branch. -->
      <operator activated="true" class="multiply" compatibility="5.3.008" expanded="true" height="94" name="Multiply" width="90" x="380" y="345"/>
      <!-- Filter type is "subset" with the attribute list "|text". -->
      <operator activated="true" class="select_attributes" compatibility="5.3.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="648" y="390">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|text"/>
        <parameter key="numeric_condition" value="&lt;5"/>
      </operator>
      <!-- k-medoids clustering with no parameters set in this XML. -->
      <operator activated="true" class="k_medoids" compatibility="5.3.008" expanded="true" height="76" name="Clustering" width="90" x="782" y="390"/>
      <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
      <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
      <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_port="result 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <connect from_op="Clustering" from_port="clustered set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>
Logged

Please add [SOLVED] to the topic title when your problem has been solved! (do so by editing the first post in the thread and modifying the title)
Please click here before posting.
rapidox
Newbie
*
Posts: 3


« Reply #4 on: June 25, 2013, 03:18:59 PM »

Marius You are Great !!!

I succeeded using the Read CSV operator!

Now, for a scientific research project, I need to get earthquake-related (earthquake = "terremoto") Italian article data from a freely available newspaper article archive search engine:

http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto

Searching for " terremoto " You will find 11210 articles.

The pagination system uses a javascript script to assign value to the pageNumber input variable.

/**
 * Submits the pager form for the requested result page.
 *
 * Reads the current search text from the "queryString" element, builds the
 * form action "archivioStoricoEngine?q=<text>" (with an empty value when the
 * text is null or empty), stores the page in the "pageNumber" element and
 * submits the "pagerForm" element.
 *
 * @param page value written to the "pageNumber" element before submitting
 */
function submitform(page) {
   var searchText = document.getElementById("queryString").value;
   var isBlank = (searchText == null || searchText == "");

   // The "?q=" part is always present; only a non-empty text is appended to it.
   var target = "archivioStoricoEngine" + "?q=" + (isBlank ? "" : searchText);

   document.getElementById("pageNumber").value = page;

   var pager = document.getElementById("pagerForm");
   pager.action = target;
   pager.submit();
}

The form uses the POST method with hidden input variables, instead of the GET method.

Maybe this is a simple question for you, but I am a newbie in the data mining field, so please explain to me how I can proceed.

Which RapidMiner operators do I have to use?

How can I set the javascript pageNumber variable to loop the article extraction?

Is it possible to add a Referer ?

Here is my process. It works for ordinary search engine result pages, but I don't know how to extract data from search engine results that are requested through a form POST.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3 process. A Loop operator with 317 iterations contains two separate chains:
    input 1 > Generate Macro > Log > output 1
    Process Documents from Web > Write Database > output 2
  Only output 1 of the Loop is connected to a process result.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!-- 317 iterations with the iteration macro enabled, a timeout of 120 and parallelize_iteration set to true. -->
      <operator activated="true" class="loop" compatibility="5.3.008" expanded="true" height="94" name="Loop" width="90" x="179" y="210">
        <parameter key="set_iteration_macro" value="true"/>
        <parameter key="iterations" value="317"/>
        <parameter key="timeout" value="120"/>
        <parameter key="parallelize_iteration" value="true"/>
        <process expanded="true">
          <!-- Defines the macro Pagepos as the iteration macro plus one; it is used as pageNumber in the URL below.
               NOTE(review): if the iteration macro starts at 1, the first requested page is 2; confirm the start
               value and whether the first result page is meant to be skipped. -->
          <operator activated="true" class="generate_macro" compatibility="5.3.008" expanded="true" height="76" name="Generate Macro" width="90" x="45" y="30">
            <list key="function_descriptions">
              <parameter key="Pagepos" value="(%{iteration})+1"/>
            </list>
          </operator>
          <!-- Writes timing values to log-perfetto.txt.
               NOTE(review): the entries refer to operators named "Crawl Web" and "Tokenize", but this process
               contains no operator with either name; presumably left over from an earlier version. Verify that
               the references still resolve. -->
          <operator activated="true" class="log" compatibility="5.3.008" expanded="true" height="76" name="Log" width="90" x="512" y="30">
            <parameter key="filename" value="/home/alex/Documents/Logs/log-perfetto.txt"/>
            <list key="log">
              <parameter key="time" value="operator.Crawl Web.value.time"/>
              <parameter key="execution time" value="operator.Crawl Web.value.execution-time"/>
              <parameter key="looptime" value="operator.Crawl Web.value.looptime"/>
              <parameter key="cpu execution time" value="operator.Crawl Web.value.cpu-execution-time"/>
              <parameter key="Max Token Length" value="operator.Tokenize.parameter.max_token_length"/>
            </list>
          </operator>
          <!-- Starts from the archive search URL for "terremoto" (1992 to 2013) with pageNumber taken from the
               Pagepos macro, and follows links whose text matches "terremoto".
               NOTE(review): all search fields are passed in the URL query string, while the post states that the
               site submits its pager form via POST; presumably pageNumber is not honoured when sent this way. Confirm. -->
          <operator activated="true" class="web:process_web" compatibility="5.3.000" expanded="true" height="60" name="Process Documents from Web" width="90" x="112" y="300">
            <parameter key="url" value="http://sitesearch.corriere.it/archivioStoricoEngine?q=terremoto&amp;queryMode=simpleany&amp;autore=&amp;fromDay=01&amp;fromMonth=01&amp;fromYear=1992&amp;toDay=31&amp;toMonth=12&amp;toYear=2013&amp;orderBy=data&amp;sectionCorriere=true&amp;__checkbox_sectionCorriere=true&amp;__checkbox_sectionLavoro=true&amp;__checkbox_sectionEconomia=true&amp;__checkbox_sectionSalute=true&amp;__checkbox_sectionSoldi=true&amp;__checkbox_sectionViviMilano=true&amp;Ricerca=Cerca&amp;pageNumber=%{Pagepos}"/>
            <list key="crawling_rules">
              <parameter key="follow_link_with_matching_text" value="terremoto"/>
            </list>
            <parameter key="add_pages_as_attribute" value="true"/>
            <parameter key="max_page_size" value="10000"/>
            <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/21.0.1"/>
            <parameter key="really_ignore_exclusion" value="true"/>
            <parameter key="parallelize_process_webpage" value="true"/>
            <process expanded="true">
              <!-- Cuts each page into segments using the XPath query //h:div (query_type is XPath).
                   NOTE(review): a string matching query is also defined; presumably it is unused while query_type is XPath. -->
              <operator activated="true" class="text:cut_document" compatibility="5.3.000" expanded="true" height="60" name="Cut Document" width="90" x="205" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries">
                  <parameter key="Article" value="&lt;div&gt; &lt;h1&gt;.&lt;/p&gt; &lt;/div&gt;"/>
                </list>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="Article" value="//h:div"/>
                </list>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="false"/>
                <list key="index_queries"/>
                <parameter key="parallelize_segment_processing" value="true"/>
                <process expanded="true">
                  <!-- Defines three XPath queries per segment: Date, Article and article-link. -->
                  <operator activated="true" class="text:extract_information" compatibility="5.3.000" expanded="true" height="60" name="Extract Information" width="90" x="214" y="30">
                    <parameter key="query_type" value="XPath"/>
                    <list key="string_machting_queries"/>
                    <list key="regular_expression_queries"/>
                    <list key="regular_region_queries"/>
                    <list key="xpath_queries">
                      <parameter key="Date" value="//h:div/h:p/h:span[1]"/>
                      <parameter key="Article" value="//h:div"/>
                      <parameter key="article-link" value="//h:div/h:h1/h:a"/>
                    </list>
                    <list key="namespaces"/>
                    <parameter key="ignore_CDATA" value="false"/>
                    <list key="index_queries"/>
                  </operator>
                  <connect from_port="segment" to_op="Extract Information" to_port="document"/>
                  <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="document" to_op="Cut Document" to_port="document"/>
              <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Appends the example set to the table "textmine" of the MySQL database "corriere"; the password is masked in the post.
               NOTE(review): db_key_attribute_name is "Link", but no attribute named Link is defined by the queries
               above; confirm the attribute exists in the example set. -->
          <operator activated="true" class="write_database" compatibility="5.3.008" expanded="true" height="60" name="Write Database" width="90" x="514" y="300">
            <parameter key="define_connection" value="url"/>
            <parameter key="connection" value="italiaoggi"/>
            <parameter key="database_url" value="jdbc:mysql://localhost:3306/corriere"/>
            <parameter key="username" value="root"/>
            <parameter key="password" value="*****************"/>
            <parameter key="table_name" value="textmine"/>
            <parameter key="overwrite_mode" value="append"/>
            <parameter key="default_varchar_length" value="10000"/>
            <parameter key="db_key_attribute_name" value="Link"/>
          </operator>
          <connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
          <connect from_op="Generate Macro" from_port="through 1" to_op="Log" to_port="through 1"/>
          <connect from_op="Log" from_port="through 1" to_port="output 1"/>
          <connect from_op="Process Documents from Web" from_port="example set" to_op="Write Database" to_port="input"/>
          <connect from_op="Write Database" from_port="through" to_port="output 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <portSpacing port="sink_output 3" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Loop" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

I wait for Your kind and good suggestion.

Have a wonderful day Marius.

Alex
Logged
Marius
Administrator
Hero Member
*****
Posts: 1793



WWW
« Reply #5 on: June 26, 2013, 10:30:31 AM »

Ciao Alex,

the Get Page operator supports POST requests. Maybe you can play around a bit with that operator, and if you manage to retrieve one page successfully, you can probably use it in a loop to retrieve all pages.

Just as a side note, did you check that the site policy/copyright allows you to machine-crawl the archive of the Corriere della Sera?

Una buona giornata anche a te!
Marius
Logged

Please add [SOLVED] to the topic title when your problem has been solved! (do so by editing the first post in the thread and modifying the title)
Please click here before posting.
Pages: [1]
  Print  
 
Jump to: