Pages: [1]
  Print  
Author Topic: WordNet in RM 5  (Read 1112 times)
simon.knoll
Newbie
*
Posts: 40


« on: June 08, 2010, 10:40:32 AM »

Hello all,
short question: in RM 4.x there was this WordNetSynonymStemmer. is this operator gone in ver. 5 and one has to use groovy scripting instead?

thx
simon knoll
Logged
Wanttoknow
Newbie
*
Posts: 7


« Reply #1 on: June 09, 2010, 08:13:14 AM »

Hi,

I was asking myself the same thing: Where is the Wordnet stemmer in RM5?
Logged
Tobias Malbrecht
Global Moderator
Sr. Member
*****
Posts: 290



WWW
« Reply #2 on: June 09, 2010, 05:21:52 PM »

Hi,

I think the WordNet stemmer was removed because it did not work that well. We may try to revive it at some point, but that is only speculation.

Kind regards,
Tobias
Logged

Tobias Malbrecht
Rapid-I GmbH
simon.knoll
Newbie
*
Posts: 40


« Reply #3 on: September 02, 2010, 02:41:48 PM »

hi,
i coded myself a wordnet operator, if someone is interested i can share code snippets.
what i can say is that for my testing dataset i've got some good results by adding hyponyms  for kmeans clustering.

all the best,
simon
Logged
B.
Jr. Member
**
Posts: 71


« Reply #4 on: September 02, 2010, 05:18:56 PM »

Simon

would appreciate seeing how you set this up. 
thanks

b.
Logged
simon.knoll
Newbie
*
Posts: 40


« Reply #5 on: September 02, 2010, 08:26:39 PM »

hi,
1st, you'll have to install wordnet
2nd, you need a java wordnet api, i took this one http://projects.csail.mit.edu/jwi/ (not for commercial purposes, but the fastest i know)
3rd, you'll have to implement an Operator (i added a new Class in the "com.rapidminer.operator.text.io.wordfilter" package)
for this i just copied an operator of the text plugin, deleted all the things i do not need and added the code for wordnet (here i add hypernyms)

i hope this was more helpful  than confusing Wink

Code:
package com.rapidminer.operator.text.io.wordfilter;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.text.Document;
import com.rapidminer.operator.text.Token;
import com.rapidminer.operator.text.io.AbstractTokenProcessor;
import com.rapidminer.parameter.UndefinedParameterError;

import edu.mit.jwi.Dictionary;
import edu.mit.jwi.IDictionary;
import edu.mit.jwi.item.IIndexWord;
import edu.mit.jwi.item.ISynset;
import edu.mit.jwi.item.ISynsetID;
import edu.mit.jwi.item.IWord;
import edu.mit.jwi.item.IWordID;
import edu.mit.jwi.item.POS;
import edu.mit.jwi.item.Pointer;
import edu.mit.jwi.morph.WordnetStemmer;

/**
 * Token processor that enriches a document with WordNet hypernyms.
 *
 * <p>For every token, the first noun stem reported by the JWI
 * {@link WordnetStemmer} is looked up in WordNet; the lemmas of all synsets
 * linked to the first sense via {@link Pointer#HYPERNYM} are inserted as
 * additional tokens directly before the original token, carrying the original
 * token's weight.
 *
 * <p>NOTE: despite the class name, the relation followed is HYPERNYM, not
 * hyponym. The name is kept so existing operator registrations keep working.
 */
public class WordnetHyponymOperator extends AbstractTokenProcessor {
    private final WordnetStemmer stemmer;
    private final IDictionary dict;

    /**
     * Opens the WordNet dictionary found below the hard-coded installation
     * directory and prepares a stemmer on top of it.
     *
     * @param description operator description handed to the superclass
     * @throws IllegalStateException if the dictionary path cannot be turned
     *         into a URL (previously this was only printed and a null URL was
     *         passed on to the dictionary)
     */
    public WordnetHyponymOperator(OperatorDescription description) {
        super(description);
        // TODO(review): installation directory is hard-coded; adjust to the
        // local WordNet installation or turn it into an operator parameter.
        String wnhome = "/usr/local/WordNet-3.0/";
        String path = wnhome + File.separator + "dict";
        final URL url;
        try {
            url = new URL("file", null, path);
        } catch (MalformedURLException e) {
            throw new IllegalStateException("Invalid WordNet dictionary path: " + path, e);
        }

        // construct the dictionary object and open it
        IDictionary dictionary = new Dictionary(url);
        dictionary.open();
        this.dict = dictionary;
        this.stemmer = new WordnetStemmer(dictionary);
    }

    /**
     * Rebuilds the token sequence of the given document so that each token is
     * preceded by the hypernym lemmas found for it (if any).
     *
     * @param textObject the document to process; its token sequence is replaced
     * @return the same document instance with the extended token sequence
     */
    @Override
    protected Document doWork(Document textObject) throws OperatorException {
        List<Token> newSequence = new ArrayList<Token>(textObject.getTokenSequence().size());
        for (Token token : textObject.getTokenSequence()) {
            appendHypernyms(token, newSequence);
            newSequence.add(token);
        }
        textObject.setTokenSequence(newSequence);
        return textObject;
    }

    /**
     * Appends one new token per hypernym lemma of the given token's first noun
     * sense to {@code target}; appends nothing when any lookup step yields no
     * result.
     */
    private void appendHypernyms(Token token, List<Token> target) {
        List<String> stems = stemmer.findStems(token.getToken(), POS.NOUN);
        if (stems == null || stems.isEmpty()) {
            return;
        }

        IIndexWord idxWord = dict.getIndexWord(stems.get(0), POS.NOUN);
        if (idxWord == null || idxWord.getWordIDs().isEmpty()) {
            return;
        }

        // Only the first listed sense of the word is considered.
        IWordID wordID = idxWord.getWordIDs().get(0);
        IWord word = dict.getWord(wordID);
        if (word == null) {
            return;
        }

        // The related map has no entry for pointer types the synset does not
        // use, so the lookup may yield null (e.g. for a synset without hypernyms).
        List<ISynsetID> hypernymIds = word.getSynset().getRelatedMap().get(Pointer.HYPERNYM);
        if (hypernymIds == null) {
            return;
        }

        for (ISynsetID hypernymId : hypernymIds) {
            ISynset hypernym = dict.getSynset(hypernymId);
            if (hypernym == null) {
                continue;
            }
            for (IWord related : hypernym.getWords()) {
                target.add(new Token(related.getLemma(), token.getWeight()));
            }
        }
    }

}
Logged
Tobias Malbrecht
Global Moderator
Sr. Member
*****
Posts: 290



WWW
« Reply #6 on: September 03, 2010, 08:37:41 AM »

Hi Simon,

thank you very much for sharing your work. At the moment, our work on the text processing extension is almost idle because of other work. But maybe we will have a look at it sometime ...?!

Best regards,
Tobias
Logged

Tobias Malbrecht
Rapid-I GmbH
simon.knoll
Newbie
*
Posts: 40


« Reply #7 on: September 03, 2010, 08:45:17 AM »

Yes, it would be cool if this kind of feature were added to the text plugin again.
Logged
B.
Jr. Member
**
Posts: 71


« Reply #8 on: September 03, 2010, 06:01:34 PM »

thanks for the example Simon
Logged
dxhura
Newbie
*
Posts: 5


« Reply #9 on: January 31, 2012, 01:46:16 PM »

Hi Simon,

I am trying to use wordnet in rapidminer. I already did what you said before but I can't bring it to work.
I already have wordnet installed and the wordnet api as well. I created the class as you described. Could you please explain in more detail what I need to do next so that I can use it as an operator for stemming synonyms?

hi,
1st, you'll have to install wordnet
2nd, you need a java wordnet api, i took this one http://projects.csail.mit.edu/jwi/ (not for commercial purposes, but the fastest i know)
3rd, you'll have to implement an Operator (i added a new Class in the "com.rapidminer.operator.text.io.wordfilter" package)
for this i just copied an operator of the text plugin, deleted all the things i do not need and added the code for wordnet (here i add hypernyms)

i hope this was more helpful  than confusing Wink

Code:
package com.rapidminer.operator.text.io.wordfilter;

import java.io.File;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;

import com.rapidminer.operator.OperatorDescription;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.operator.text.Document;
import com.rapidminer.operator.text.Token;
import com.rapidminer.operator.text.io.AbstractTokenProcessor;
import com.rapidminer.parameter.UndefinedParameterError;

import edu.mit.jwi.Dictionary;
import edu.mit.jwi.IDictionary;
import edu.mit.jwi.item.IIndexWord;
import edu.mit.jwi.item.ISynset;
import edu.mit.jwi.item.ISynsetID;
import edu.mit.jwi.item.IWord;
import edu.mit.jwi.item.IWordID;
import edu.mit.jwi.item.POS;
import edu.mit.jwi.item.Pointer;
import edu.mit.jwi.morph.WordnetStemmer;

/**
 * Token processor that enriches a document with WordNet hypernyms.
 *
 * <p>For every token, the first noun stem reported by the JWI
 * {@link WordnetStemmer} is looked up in WordNet; the lemmas of all synsets
 * linked to the first sense via {@link Pointer#HYPERNYM} are inserted as
 * additional tokens directly before the original token, carrying the original
 * token's weight.
 *
 * <p>NOTE: despite the class name, the relation followed is HYPERNYM, not
 * hyponym. The name is kept so existing operator registrations keep working.
 */
public class WordnetHyponymOperator extends AbstractTokenProcessor {
    private final WordnetStemmer stemmer;
    private final IDictionary dict;

    /**
     * Opens the WordNet dictionary found below the hard-coded installation
     * directory and prepares a stemmer on top of it.
     *
     * @param description operator description handed to the superclass
     * @throws IllegalStateException if the dictionary path cannot be turned
     *         into a URL (previously this was only printed and a null URL was
     *         passed on to the dictionary)
     */
    public WordnetHyponymOperator(OperatorDescription description) {
        super(description);
        // TODO(review): installation directory is hard-coded; adjust to the
        // local WordNet installation or turn it into an operator parameter.
        String wnhome = "/usr/local/WordNet-3.0/";
        String path = wnhome + File.separator + "dict";
        final URL url;
        try {
            url = new URL("file", null, path);
        } catch (MalformedURLException e) {
            throw new IllegalStateException("Invalid WordNet dictionary path: " + path, e);
        }

        // construct the dictionary object and open it
        IDictionary dictionary = new Dictionary(url);
        dictionary.open();
        this.dict = dictionary;
        this.stemmer = new WordnetStemmer(dictionary);
    }

    /**
     * Rebuilds the token sequence of the given document so that each token is
     * preceded by the hypernym lemmas found for it (if any).
     *
     * @param textObject the document to process; its token sequence is replaced
     * @return the same document instance with the extended token sequence
     */
    @Override
    protected Document doWork(Document textObject) throws OperatorException {
        List<Token> newSequence = new ArrayList<Token>(textObject.getTokenSequence().size());
        for (Token token : textObject.getTokenSequence()) {
            appendHypernyms(token, newSequence);
            newSequence.add(token);
        }
        textObject.setTokenSequence(newSequence);
        return textObject;
    }

    /**
     * Appends one new token per hypernym lemma of the given token's first noun
     * sense to {@code target}; appends nothing when any lookup step yields no
     * result.
     */
    private void appendHypernyms(Token token, List<Token> target) {
        List<String> stems = stemmer.findStems(token.getToken(), POS.NOUN);
        if (stems == null || stems.isEmpty()) {
            return;
        }

        IIndexWord idxWord = dict.getIndexWord(stems.get(0), POS.NOUN);
        if (idxWord == null || idxWord.getWordIDs().isEmpty()) {
            return;
        }

        // Only the first listed sense of the word is considered.
        IWordID wordID = idxWord.getWordIDs().get(0);
        IWord word = dict.getWord(wordID);
        if (word == null) {
            return;
        }

        // The related map has no entry for pointer types the synset does not
        // use, so the lookup may yield null (e.g. for a synset without hypernyms).
        List<ISynsetID> hypernymIds = word.getSynset().getRelatedMap().get(Pointer.HYPERNYM);
        if (hypernymIds == null) {
            return;
        }

        for (ISynsetID hypernymId : hypernymIds) {
            ISynset hypernym = dict.getSynset(hypernymId);
            if (hypernym == null) {
                continue;
            }
            for (IWord related : hypernym.getWords()) {
                target.add(new Token(related.getLemma(), token.getWeight()));
            }
        }
    }

}
Logged
Pages: [1]
  Print  
 
Jump to: