mahout-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sro...@apache.org
Subject svn commit: r1207060 - in /mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes: WikipediaDatasetCreatorDriver.java WikipediaDatasetCreatorMapper.java
Date Mon, 28 Nov 2011 10:45:09 GMT
Author: srowen
Date: Mon Nov 28 10:45:08 2011
New Revision: 1207060

URL: http://svn.apache.org/viewvc?rev=1207060&view=rev
Log:
MAHOUT-895 Match Wikipedia start/close tags as-is without preprocessing

Modified:
    mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java
    mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java?rev=1207060&r1=1207059&r2=1207060&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorDriver.java Mon Nov 28 10:45:08 2011
@@ -144,8 +144,8 @@ public final class WikipediaDatasetCreat
     throws IOException, InterruptedException, ClassNotFoundException {
     Configuration conf = new Configuration();
     conf.set("key.value.separator.in.input.line", " ");
-    conf.set("xmlinput.start", "<text xml:space=\"preserve\">");
-    conf.set("xmlinput.end", "</text>");
+    conf.set("xmlinput.start", "<page>");
+    conf.set("xmlinput.end", "</page>");
     conf.setBoolean("exact.match.only", exactMatchOnly);
     conf.set("analyzer.class", analyzerClass.getName());
     conf.set("io.serializations",

Modified: mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java
URL: http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java?rev=1207060&r1=1207059&r2=1207060&view=diff
==============================================================================
--- mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java (original)
+++ mahout/trunk/examples/src/main/java/org/apache/mahout/classifier/bayes/WikipediaDatasetCreatorMapper.java Mon Nov 28 10:45:08 2011
@@ -63,11 +63,11 @@ public class WikipediaDatasetCreatorMapp
   @Override
   protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
     String document = value.toString();
+    document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(
+        OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
     String catMatch = findMatchingCategory(document);
     if (!"Unknown".equals(catMatch)) {
       StringBuilder contents = new StringBuilder(1000);
-      document = StringEscapeUtils.unescapeHtml(CLOSE_TEXT_TAG_PATTERN.matcher(
-          OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst("")).replaceAll(""));
       TokenStream stream = analyzer.reusableTokenStream(catMatch, new StringReader(document));
       CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
       stream.reset();



Mime
View raw message