incubator-accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From afu...@apache.org
Subject svn commit: r1241141 - /incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
Date Mon, 06 Feb 2012 20:07:59 GMT
Author: afuchs
Date: Mon Feb  6 20:07:59 2012
New Revision: 1241141

URL: http://svn.apache.org/viewvc?rev=1241141&view=rev
Log:
ACCUMULO-374 Removed stop list stuff

Modified:
    incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java

Modified: incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
URL: http://svn.apache.org/viewvc/incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java?rev=1241141&r1=1241140&r2=1241141&view=diff
==============================================================================
--- incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java	(original)
+++ incubator/accumulo/branches/1.4/src/examples/wikisearch/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java	Mon Feb  6 20:07:59 2012
@@ -32,7 +32,6 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-
 import org.apache.accumulo.core.data.Mutation;
 import org.apache.accumulo.core.data.Value;
 import org.apache.accumulo.core.security.ColumnVisibility;
@@ -48,20 +47,9 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.StopAnalyzer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.br.BrazilianAnalyzer;
-import org.apache.lucene.analysis.cjk.CJKAnalyzer;
-import org.apache.lucene.analysis.de.GermanAnalyzer;
-import org.apache.lucene.analysis.el.GreekAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.fr.FrenchAnalyzer;
-import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
 
-
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
 
@@ -82,7 +70,6 @@ public class WikipediaMapper extends Map
   private ArticleExtractor extractor;
   private String language;
   private int numPartitions = 0;
-  private Set<?> stopwords = null;
   private ColumnVisibility cv = null;
   
   private Text tablename = null;
@@ -103,25 +90,6 @@ public class WikipediaMapper extends Map
     Matcher matcher = languagePattern.matcher(fileName);
     if (matcher.matches()) {
       language = matcher.group(1).replace('_', '-').toLowerCase();
-      if (language.equals("arwiki"))
-        stopwords = ArabicAnalyzer.getDefaultStopSet();
-      else if (language.equals("brwiki"))
-        stopwords = BrazilianAnalyzer.getDefaultStopSet();
-      else if (language.startsWith("zh"))
-        stopwords = CJKAnalyzer.getDefaultStopSet();
-      else if (language.equals("dewiki"))
-        stopwords = GermanAnalyzer.getDefaultStopSet();
-      else if (language.equals("elwiki"))
-        stopwords = GreekAnalyzer.getDefaultStopSet();
-      else if (language.equals("fawiki"))
-        stopwords = PersianAnalyzer.getDefaultStopSet();
-      else if (language.equals("frwiki"))
-        stopwords = FrenchAnalyzer.getDefaultStopSet();
-      else if (language.equals("nlwiki"))
-        stopwords = DutchAnalyzer.getDefaultStopSet();
-      else
-        stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-      
     } else {
       throw new RuntimeException("Unknown ingest language! " + fileName);
     }
@@ -230,9 +198,8 @@ public class WikipediaMapper extends Map
     Set<String> tokenList = new HashSet<String>();
     WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
     TermAttribute term = tok.addAttribute(TermAttribute.class);
-    StopFilter filter = new StopFilter(false, tok, stopwords, true);
     try {
-      while (filter.incrementToken()) {
+      while (tok.incrementToken()) {
         String token = term.term();
         if (!StringUtils.isEmpty(token))
           tokenList.add(token);



Mime
View raw message