accumulo-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ujustgotbi...@apache.org
Subject [02/50] git commit: ACCUMULO-374 Removed stop list stuff
Date Thu, 06 Feb 2014 05:39:41 GMT
ACCUMULO-374 Removed stop list stuff

git-svn-id: https://svn.apache.org/repos/asf/incubator/accumulo/branches/1.4@1241141 13f79535-47bb-0310-9956-ffa450edef68


Project: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/commit/6a3b4190
Tree: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/tree/6a3b4190
Diff: http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/diff/6a3b4190

Branch: refs/heads/1.4.5-SNAPSHOT
Commit: 6a3b41909815ba1e44ce6ebf6de24ba756f1ccde
Parents: b9cf294
Author: Adam Fuchs <afuchs@apache.org>
Authored: Mon Feb 6 20:07:59 2012 +0000
Committer: Adam Fuchs <afuchs@apache.org>
Committed: Mon Feb 6 20:07:59 2012 +0000

----------------------------------------------------------------------
 .../wikisearch/ingest/WikipediaMapper.java      | 35 +-------------------
 1 file changed, 1 insertion(+), 34 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo-wikisearch/blob/6a3b4190/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
----------------------------------------------------------------------
diff --git a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
index b25c042..c343f52 100644
--- a/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
+++ b/ingest/src/main/java/org/apache/accumulo/examples/wikisearch/ingest/WikipediaMapper.java
@@ -32,7 +32,6 @@ import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
-
 import org.apache.accumulo.core.data.Mutation;
 import org.apache.accumulo.core.data.Value;
 import org.apache.accumulo.core.security.ColumnVisibility;
@@ -48,20 +47,9 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.hadoop.mapreduce.lib.input.FileSplit;
 import org.apache.log4j.Logger;
-import org.apache.lucene.analysis.StopAnalyzer;
-import org.apache.lucene.analysis.StopFilter;
-import org.apache.lucene.analysis.ar.ArabicAnalyzer;
-import org.apache.lucene.analysis.br.BrazilianAnalyzer;
-import org.apache.lucene.analysis.cjk.CJKAnalyzer;
-import org.apache.lucene.analysis.de.GermanAnalyzer;
-import org.apache.lucene.analysis.el.GreekAnalyzer;
-import org.apache.lucene.analysis.fa.PersianAnalyzer;
-import org.apache.lucene.analysis.fr.FrenchAnalyzer;
-import org.apache.lucene.analysis.nl.DutchAnalyzer;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;
 
-
 import com.google.common.collect.HashMultimap;
 import com.google.common.collect.Multimap;
 
@@ -82,7 +70,6 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
   private ArticleExtractor extractor;
   private String language;
   private int numPartitions = 0;
-  private Set<?> stopwords = null;
   private ColumnVisibility cv = null;
   
   private Text tablename = null;
@@ -103,25 +90,6 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
     Matcher matcher = languagePattern.matcher(fileName);
     if (matcher.matches()) {
       language = matcher.group(1).replace('_', '-').toLowerCase();
-      if (language.equals("arwiki"))
-        stopwords = ArabicAnalyzer.getDefaultStopSet();
-      else if (language.equals("brwiki"))
-        stopwords = BrazilianAnalyzer.getDefaultStopSet();
-      else if (language.startsWith("zh"))
-        stopwords = CJKAnalyzer.getDefaultStopSet();
-      else if (language.equals("dewiki"))
-        stopwords = GermanAnalyzer.getDefaultStopSet();
-      else if (language.equals("elwiki"))
-        stopwords = GreekAnalyzer.getDefaultStopSet();
-      else if (language.equals("fawiki"))
-        stopwords = PersianAnalyzer.getDefaultStopSet();
-      else if (language.equals("frwiki"))
-        stopwords = FrenchAnalyzer.getDefaultStopSet();
-      else if (language.equals("nlwiki"))
-        stopwords = DutchAnalyzer.getDefaultStopSet();
-      else
-        stopwords = StopAnalyzer.ENGLISH_STOP_WORDS_SET;
-      
     } else {
       throw new RuntimeException("Unknown ingest language! " + fileName);
     }
@@ -230,9 +198,8 @@ public class WikipediaMapper extends Mapper<LongWritable,Text,Text,Mutation> {
     Set<String> tokenList = new HashSet<String>();
     WikipediaTokenizer tok = new WikipediaTokenizer(new StringReader(article.getText()));
     TermAttribute term = tok.addAttribute(TermAttribute.class);
-    StopFilter filter = new StopFilter(false, tok, stopwords, true);
     try {
-      while (filter.incrementToken()) {
+      while (tok.incrementToken()) {
         String token = term.term();
         if (!StringUtils.isEmpty(token))
           tokenList.add(token);


Mime
View raw message