lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r564151 - in /lucene/java/trunk/contrib/benchmark: ./ conf/ src/java/org/apache/lucene/benchmark/byTask/feeds/ src/java/org/apache/lucene/benchmark/byTask/tasks/ src/java/org/apache/lucene/benchmark/utils/
Date Thu, 09 Aug 2007 08:57:27 GMT
Author: mikemccand
Date: Thu Aug  9 01:57:26 2007
New Revision: 564151

URL: http://svn.apache.org/viewvc?view=rev&rev=564151
Log:
LUCENE-971: extract wikipedia documents as a doc maker directly from XML file without using
intermediate one-file-per-document

Added:
    lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg   (with props)
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java   (with props)
Removed:
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/utils/ExtractWikipedia.java
Modified:
    lucene/java/trunk/contrib/benchmark/CHANGES.txt
    lucene/java/trunk/contrib/benchmark/build.xml
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
    lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java

Modified: lucene/java/trunk/contrib/benchmark/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/CHANGES.txt?view=diff&rev=564151&r1=564150&r2=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/benchmark/CHANGES.txt Thu Aug  9 01:57:26 2007
@@ -4,6 +4,12 @@
 
 $Id:$
 
+8/9/07
+  LUCENE-971: Change enwiki tasks to a doc maker (extending
+  LineDocMaker) that directly processes the Wikipedia XML and produces
+  documents.  Intermediate files (one per document) are no longer
+  created.
+
 8/1/07
   LUCENE-967: Add "ReadTokensTask" to allow for benchmarking just tokenization.
 

Modified: lucene/java/trunk/contrib/benchmark/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/build.xml?view=diff&rev=564151&r1=564150&r2=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/build.xml (original)
+++ lucene/java/trunk/contrib/benchmark/build.xml Thu Aug  9 01:57:26 2007
@@ -23,7 +23,7 @@
         
         <available file="temp/enwiki-20070527-pages-articles.xml.bz2" property="enwiki.exists"/>
         <available file="temp/enwiki-20070527-pages-articles.xml" property="enwiki.expanded"/>
-        <available file="${working.dir}/enwiki" property="enwiki.extracted"/>
+        <available file="${working.dir}/enwiki.txt" property="enwiki.extracted"/>
 
     </target>
 
@@ -31,7 +31,6 @@
         <mkdir dir="temp"/>
         <antcall target="get-enwiki"/>
         <antcall target="expand-enwiki"/>
-        <antcall target="extract-enwiki"/>
     </target>
 
     <target name="get-enwiki" unless="enwiki.exists">
@@ -43,14 +42,6 @@
         <bunzip2 src="temp/enwiki-20070527-pages-articles.xml.bz2" dest="temp"/>
     </target>
 
-    <target name="extract-enwiki" depends="check-files" unless="enwiki.extracted">
-        <mkdir dir="${working.dir}/enwiki"/>
-        <java classname="org.apache.lucene.benchmark.utils.ExtractWikipedia" maxmemory="1024M" fork="true">
-            <classpath refid="run.classpath"/>
-            <arg line="temp/enwiki-20070527-pages-articles.xml ${working.dir}/enwiki"/>
-        </java>
-    </target>
-
     <target name="get-news-20" unless="20news-18828.exists">
         <get src="http://www-2.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz"
              dest="temp/news20.tar.gz"/>
@@ -164,7 +155,7 @@
               <enable/>
             </assertions>
             <classpath refid="run.classpath"/>
-            <arg line="conf/wikipedia.alg"/>
+            <arg line="conf/extractWikipedia.alg"/>
         </java>
     </target>
 

Added: lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg?view=auto&rev=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg (added)
+++ lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg Thu Aug  9 01:57:26 2007
@@ -0,0 +1,44 @@
+#/**
+# * Licensed to the Apache Software Foundation (ASF) under one or more
+# * contributor license agreements.  See the NOTICE file distributed with
+# * this work for additional information regarding copyright ownership.
+# * The ASF licenses this file to You under the Apache License, Version 2.0
+# * (the "License"); you may not use this file except in compliance with
+# * the License.  You may obtain a copy of the License at
+# *
+# *     http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+# -------------------------------------------------------------------------------------
+
+#
+# This alg will process the Wikipedia documents feed to produce a
+# single file that contains all documents, one per line.
+#
+# To use this, first cd to contrib/benchmark and then run:
+#
+#   ant run-task -Dtask.alg=conf/extractWikipedia.alg
+#
+# Then, to index the documents in the line file, see
+# indexLineFile.alg.
+#
+
+# Where to get documents from:
+doc.maker=org.apache.lucene.benchmark.byTask.feeds.EnwikiDocMaker
+docs.file=temp/enwiki-20070527-pages-articles.xml
+
+# Where to write the line file output:
+line.file.out=work/enwiki.txt
+
+# Stop after processing the document feed once:
+doc.maker.forever=false
+
+# -------------------------------------------------------------------------------------
+
+# Process all documents, appending each one to the line file:
+{WriteLineDoc() > : *

Propchange: lucene/java/trunk/contrib/benchmark/conf/extractWikipedia.alg
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java?view=auto&rev=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java (added)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java Thu Aug  9 01:57:26 2007
@@ -0,0 +1,213 @@
+package org.apache.lucene.benchmark.byTask.feeds;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.xml.sax.XMLReader;
+import org.xml.sax.Attributes;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+import org.xml.sax.helpers.XMLReaderFactory;
+
+import java.io.IOException;
+
+import org.apache.lucene.document.Document;
+
+/**
+ * A LineDocMaker which reads the uncompressed english wikipedia dump.
+ */
+public class EnwikiDocMaker extends LineDocMaker {
+
+  static final int TITLE = 0;
+  static final int DATE = TITLE+1;
+  static final int BODY = DATE+1;
+  static final int LENGTH = BODY+1;
+
+  static final String[] months = {"JAN", "FEB", "MAR", "APR",
+                                  "MAY", "JUN", "JUL", "AUG",
+                                  "SEP", "OCT", "NOV", "DEC"};
+
+  class Parser extends DefaultHandler implements Runnable {
+
+    Thread t;
+
+    public void run() {
+
+      try {
+        XMLReader reader =
+          XMLReaderFactory.createXMLReader("org.apache.xerces.parsers.SAXParser");
+        reader.setContentHandler(this);
+        reader.setErrorHandler(this);
+        while(true){
+          InputSource is = new InputSource(fileIS);
+          reader.parse(is);
+          if (!forever) {
+            synchronized(this) {
+              nmde = new NoMoreDataException();
+              notify();
+            }
+            return;
+          } else {
+            synchronized(this){
+              openFile();
+            }
+          }
+        }
+      } catch (SAXException sae) {
+        throw new RuntimeException(sae);
+      } catch (IOException ioe) {
+        throw new RuntimeException(ioe);
+      }
+
+    }
+
+    Parser() {
+      t = new Thread(this);
+      t.setDaemon(true);
+      t.start();
+    }
+
+    String[] tuple;
+    NoMoreDataException nmde;
+
+    String[] next() throws NoMoreDataException {
+      String[] result;
+      synchronized(this){
+        while(tuple == null && nmde == null){
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+          }
+        }
+        if (nmde != null) {
+          throw nmde;
+        }
+        result = tuple;
+        tuple = null;
+        notify();
+      }
+      return result;
+    }
+
+    StringBuffer contents = new StringBuffer();
+
+    public void characters(char[] ch, int start, int length) {
+      contents.append(ch, start, length);
+    }
+
+    String title;
+    String body;
+    String time;
+
+    static final int BASE = 10;
+    
+    public void startElement(String namespace,
+                             String simple,
+                             String qualified,
+                             Attributes attributes) {
+      if (qualified.equals("page")) {
+        title = null;
+        body = null;
+        time = null;
+      } else if (qualified.equals("text")) {
+        contents.setLength(0);
+      } else if (qualified.equals("timestamp")) {
+        contents.setLength(0);
+      } else if (qualified.equals("title")) {
+        contents.setLength(0);
+      }
+    }
+
+    String time(String original) {
+      StringBuffer buffer = new StringBuffer();
+
+      buffer.append(original.substring(8, 10));
+      buffer.append('-');
+      buffer.append(months[Integer.valueOf(original.substring(5, 7)).intValue() - 1]);
+      buffer.append('-');
+      buffer.append(original.substring(0, 4));
+      buffer.append(' ');
+      buffer.append(original.substring(11, 19));
+      buffer.append(".000");
+
+      return buffer.toString();
+    }
+
+    public void create(String title, String time, String body) {
+      String[] t = new String[LENGTH];
+      t[TITLE] = title.replace('\t', ' ');
+      t[DATE] = time.replace('\t', ' ');
+      t[BODY] = body.replaceAll("[\t\n]", " ");
+      synchronized(this) {
+        while(tuple!=null) {
+          try {
+            wait();
+          } catch (InterruptedException ie) {
+          }
+        }
+        tuple = t;
+        notify();
+      }
+    }
+
+    public void endElement(String namespace, String simple, String qualified)
+      throws SAXException {
+      if (qualified.equals("title")) {
+        title = contents.toString();
+      } else if (qualified.equals("text")) {
+        body = contents.toString();
+        if (body.startsWith("#REDIRECT") ||
+             body.startsWith("#redirect")) {
+          body = null;
+        }
+      } else if (qualified.equals("timestamp")) {
+        time = time(contents.toString());
+      } else if (qualified.equals("page")) {
+        if (body != null) {
+          create(title, time, body);
+        }
+      }
+    }
+  }
+
+  Parser parser = new Parser();
+
+  class DocState extends LineDocMaker.DocState {
+    public Document setFields(String[] tuple) {
+      titleField.setValue(tuple[TITLE]);
+      dateField.setValue(tuple[DATE]);
+      bodyField.setValue(tuple[BODY]);
+      return doc;
+    }
+  }
+
+  private DocState getDocState() {
+    DocState ds = (DocState) docState.get();
+    if (ds == null) {
+      ds = new DocState();
+      docState.set(ds);
+    }
+    return ds;
+  }
+
+  public Document makeDocument() throws Exception {
+    String[] tuple = parser.next();
+    return getDocState().setFields(tuple);
+  }
+
+}
\ No newline at end of file

Propchange: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/EnwikiDocMaker.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java?view=diff&rev=564151&r1=564150&r2=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/feeds/LineDocMaker.java Thu Aug  9 01:57:26 2007
@@ -25,7 +25,8 @@
 
 import java.io.BufferedReader;
 import java.io.IOException;
-import java.io.FileReader;
+import java.io.FileInputStream;
+import java.io.InputStreamReader;
 
 /**
  * A DocMaker reading one line at a time as a Document from
@@ -39,13 +40,14 @@
  */
 public class LineDocMaker extends BasicDocMaker {
 
-  private BufferedReader fileIn;
-  private ThreadLocal docState = new ThreadLocal();
+  FileInputStream fileIS;
+  BufferedReader fileIn;
+  ThreadLocal docState = new ThreadLocal();
   private String fileName;
 
   private static int READER_BUFFER_BYTES = 64*1024;
-
-  private class DocState {
+  
+  class DocState {
     Document doc;
     Field bodyField;
     Field titleField;
@@ -63,7 +65,7 @@
                              storeVal,
                              Field.Index.TOKENIZED,
                              termVecVal);
-      dateField = new Field(BasicDocMaker.TITLE_FIELD,
+      dateField = new Field(BasicDocMaker.DATE_FIELD,
                             "",
                             storeVal,
                             Field.Index.TOKENIZED,
@@ -143,11 +145,12 @@
     openFile();
   }
 
-  private void openFile() {
+  void openFile() {
     try {
       if (fileIn != null)
         fileIn.close();
-      fileIn = new BufferedReader(new FileReader(fileName), READER_BUFFER_BYTES);
+      fileIS = new FileInputStream(fileName);
+      fileIn = new BufferedReader(new InputStreamReader(fileIS,"UTF-8"), READER_BUFFER_BYTES);
     } catch (IOException e) {
       throw new RuntimeException(e);
     }

Modified: lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java?view=diff&rev=564151&r1=564150&r2=564151
==============================================================================
--- lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java (original)
+++ lucene/java/trunk/contrib/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/WriteLineDocTask.java Thu Aug  9 01:57:26 2007
@@ -18,7 +18,8 @@
  */
 
 import java.io.BufferedWriter;
-import java.io.FileWriter;
+import java.io.FileOutputStream;
+import java.io.OutputStreamWriter;
 
 import org.apache.lucene.benchmark.byTask.PerfRunData;
 import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
@@ -59,7 +60,7 @@
       String fileName = config.get("line.file.out", null);
       if (fileName == null)
         throw new Exception("line.file.out must be set");
-      lineFileOut = new BufferedWriter(new FileWriter(fileName));
+      lineFileOut = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(fileName),"UTF-8"));
     }
     docMaker = getRunData().getDocMaker();
   }



Mime
View raw message