lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r608852 - in /lucene/java/trunk: ./ contrib/wikipedia/ contrib/wikipedia/src/ contrib/wikipedia/src/java/ contrib/wikipedia/src/java/org/ contrib/wikipedia/src/java/org/apache/ contrib/wikipedia/src/java/org/apache/lucene/ contrib/wikipedia...
Date Fri, 04 Jan 2008 14:29:18 GMT
Author: gsingers
Date: Fri Jan  4 06:29:15 2008
New Revision: 608852

URL: http://svn.apache.org/viewvc?rev=608852&view=rev
Log:
LUCENE-1103

Added:
    lucene/java/trunk/contrib/wikipedia/
    lucene/java/trunk/contrib/wikipedia/build.xml   (with props)
    lucene/java/trunk/contrib/wikipedia/pom.xml.template
    lucene/java/trunk/contrib/wikipedia/src/
    lucene/java/trunk/contrib/wikipedia/src/java/
    lucene/java/trunk/contrib/wikipedia/src/java/org/
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java   (with props)
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java   (with props)
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html   (with props)
    lucene/java/trunk/contrib/wikipedia/src/test/
    lucene/java/trunk/contrib/wikipedia/src/test/org/
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java   (with props)
Modified:
    lucene/java/trunk/build.xml
    lucene/java/trunk/docs/developer-resources.html
    lucene/java/trunk/docs/developer-resources.pdf
    lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml

Modified: lucene/java/trunk/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/build.xml?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/build.xml (original)
+++ lucene/java/trunk/build.xml Fri Jan  4 06:29:15 2008
@@ -250,6 +250,7 @@
           <packageset dir="contrib/spellchecker/src/java"/>
           <packageset dir="contrib/surround/src/java"/>
           <packageset dir="contrib/swing/src/java"/>
+          <packageset dir="contrib/wikipedia/src/java"/>
           <packageset dir="contrib/wordnet/src/java"/>
           <packageset dir="contrib/xml-query-parser/src/java"/>
           <!-- end alpha sort -->
@@ -279,6 +280,7 @@
           <group title="contrib: SpellChecker" packages="org.apache.lucene.search.spell*"/>
           <group title="contrib: Surround Parser" packages="org.apache.lucene.queryParser.surround*"/>
           <group title="contrib: Swing" packages="org.apache.lucene.swing*"/>
+          <group title="contrib: Wikipedia" packages="org.apache.lucene.wikipedia*"/>
           <group title="contrib: WordNet" packages="org.apache.lucene.wordnet*"/>
           <group title="contrib: XML Query Parser" packages="org.apache.lucene.xmlparser*"/>
 

Added: lucene/java/trunk/contrib/wikipedia/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/build.xml?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/build.xml (added)
+++ lucene/java/trunk/contrib/wikipedia/build.xml Fri Jan  4 06:29:15 2008
@@ -0,0 +1,49 @@
+<?xml version="1.0"?>
+
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+ 
+        http://www.apache.org/licenses/LICENSE-2.0
+ 
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+ -->
+
+<project name="wikipedia" default="default">
+
+  <description>
+    Tools for working with Wikipedia
+  </description>
+
+
+  <import file="../contrib-build.xml"/>
+
+
+  <target name="jflex" depends="clean-jflex,jflex-wiki-tokenizer"/>
+
+  <target name="jflex-wiki-tokenizer" depends="init,jflex-check" if="jflex.present">
+    <taskdef classname="JFlex.anttask.JFlexTask" name="jflex">
+      <classpath location="${jflex.home}/lib/JFlex.jar"/>
+    </taskdef>
+
+    <jflex file="src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex"
+           outdir="src/java/org/apache/lucene/wikipedia/analysis"
+           nobak="on"/>
+  </target>
+
+  <target name="clean-jflex"> <!-- removes stale JFlex output; the "jflex" target runs this before regenerating -->
+    <delete>
+      <fileset dir="src/java/org/apache/lucene/wikipedia/analysis" includes="*.java"> <!-- same dir as the jflex outdir -->
+        <containsregexp expression="generated.*by.*JFlex"/>
+      </fileset>
+    </delete>
+  </target>
+</project>

Propchange: lucene/java/trunk/contrib/wikipedia/build.xml
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/wikipedia/pom.xml.template
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/pom.xml.template?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/pom.xml.template (added)
+++ lucene/java/trunk/contrib/wikipedia/pom.xml.template Fri Jan  4 06:29:15 2008
@@ -0,0 +1,43 @@
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+  xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <!--
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+    
+    http://www.apache.org/licenses/LICENSE-2.0
+    
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+  -->
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.lucene</groupId>
+    <artifactId>lucene-contrib</artifactId>
+    <version>@version@</version>
+  </parent>
+  <groupId>org.apache.lucene</groupId>
+  <artifactId>lucene-wikipedia</artifactId>
+  <name>Lucene Wikipedia Tools</name>
+  <version>@version@</version>
+  <description>Lucene Wikipedia Contributions</description>
+  <packaging>jar</packaging>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-benchmark</artifactId>
+      <version>@version@</version>
+    </dependency>
+  </dependencies>
+</project>

Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Fri Jan  4 06:29:15 2008
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.wikipedia.analysis;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+import java.io.Reader;
+import java.io.IOException;
+
+
+/**
+ * A {@link Tokenizer} that is aware of Wikipedia syntax.  It is based off of the
+ * Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
+ * Scanning is delegated to the JFlex-generated WikipediaTokenizerImpl; each returned token's type is taken
+ * from WikipediaTokenizerImpl.TOKEN_TYPES, which includes the String constants declared below.
+ **/
+public class WikipediaTokenizer extends Tokenizer {
+  public static final String INTERNAL_LINK = "il";
+  public static final String EXTERNAL_LINK = "el";
+  //The URL part of the link, i.e. the first token
+  public static final String EXTERNAL_LINK_URL = "elu";
+  public static final String CITATION = "ci";
+  public static final String CATEGORY = "c";
+  public static final String BOLD = "b";
+  public static final String ITALICS = "i";
+  public static final String BOLD_ITALICS = "bi";
+  public static final String HEADING = "h";
+  public static final String SUB_HEADING = "sh";
+  /**
+   * A private instance of the JFlex-constructed scanner
+   */
+  private final WikipediaTokenizerImpl scanner;
+
+  void setInput(Reader reader) {  // package-private; only replaces the stored reader, the scanner is not touched here
+    this.input = reader;
+  }
+
+  /**
+   * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
+   * <code>input</code> to a newly created JFlex scanner.
+   * @param input The Input Reader
+   */
+  public WikipediaTokenizer(Reader input) {
+    this.input = input;
+    this.scanner = new WikipediaTokenizerImpl(input);
+  }
+
+  /*
+  * (non-Javadoc)
+  * Fills and returns <code>result</code> with the next token; returns null once the scanner reports YYEOF.
+  * @see org.apache.lucene.analysis.TokenStream#next(Token)
+  */
+  public Token next(Token result) throws IOException {
+    int tokenType = scanner.getNextToken();
+
+    if (tokenType == WikipediaTokenizerImpl.YYEOF) {
+      return null;  // end of input
+    }
+
+    scanner.getText(result, tokenType);  // copies the matched characters into result's term buffer
+    final int start = scanner.yychar();  // number of characters up to the start of the matched text
+    result.setStartOffset(start);
+    result.setEndOffset(start + result.termLength());
+    result.setPositionIncrement(scanner.getPositionIncrement());
+    result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);  // numeric token type -> String label
+    return result;
+  }
+
+  /*
+  * (non-Javadoc)
+  *
+  * @see org.apache.lucene.analysis.TokenStream#reset()
+  */
+  public void reset() throws IOException {
+    super.reset();
+    scanner.yyreset(input);  // hands the currently stored reader back to the scanner
+  }
+
+  public void reset(Reader reader) throws IOException {  // stores the new reader, then performs the no-arg reset
+    input = reader;
+    reset();
+  }
+
+}
\ No newline at end of file

Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java Fri Jan  4 06:29:15 2008
@@ -0,0 +1,949 @@
+/* The following code was generated by JFlex 1.4.1 on 1/3/08 10:05 PM */
+
+package org.apache.lucene.wikipedia.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+
+/**
+ * This class is a scanner generated by 
+ * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
+ * on 1/3/08 10:05 PM from the specification file
+ * <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
+ */
+class WikipediaTokenizerImpl {
+
+  /** This character denotes the end of file */
+  public static final int YYEOF = -1;
+
+  /** initial size of the lookahead buffer */
+  private static final int ZZ_BUFFERSIZE = 16384;
+
+  /** lexical states */
+  public static final int DOUBLE_BRACE_STATE = 7;
+  public static final int INTERNAL_LINK_STATE = 2;
+  public static final int TWO_SINGLE_QUOTES_STATE = 4;
+  public static final int CATEGORY_STATE = 1;
+  public static final int FIVE_SINGLE_QUOTES_STATE = 5;
+  public static final int STRING = 8;
+  public static final int YYINITIAL = 0;
+  public static final int DOUBLE_EQUALS_STATE = 6;
+  public static final int THREE_SINGLE_QUOTES_STATE = 5;
+  public static final int EXTERNAL_LINK_STATE = 3;
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final String ZZ_CMAP_PACKED = 
+    "\11\0\1\24\1\23\1\0\1\24\1\22\22\0\1\24\1\0\1\12"+
+    "\1\52\2\0\1\3\1\1\4\0\1\14\1\5\1\2\1\10\12\16"+
+    "\1\27\1\0\1\7\1\11\1\13\1\52\1\4\2\15\1\30\5\15"+
+    "\1\41\21\15\1\25\1\0\1\26\1\0\1\6\1\0\1\31\1\43"+
+    "\2\15\1\33\1\40\1\34\1\50\1\41\4\15\1\42\1\35\1\51"+
+    "\1\15\1\36\1\15\1\32\3\15\1\44\1\37\1\15\1\45\1\47"+
+    "\1\46\102\0\27\15\1\0\37\15\1\0\u0568\15\12\17\206\15\12\17"+
+    "\u026c\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17\166\15\12\17"+
+    "\167\15\11\17\166\15\12\17\166\15\12\17\166\15\12\17\340\15\12\17"+
+    "\166\15\12\17\u0166\15\12\17\266\15\u0100\15\u0e00\15\u1040\0\u0150\21\140\0"+
+    "\20\21\u0100\0\200\21\200\0\u19c0\21\100\0\u5200\21\u0c00\0\u2bb0\20\u2150\0"+
+    "\u0200\21\u0465\0\73\21\75\15\43\0";
+
+  /** 
+   * Translates characters to character classes
+   */
+  private static final char [] ZZ_CMAP = zzUnpackCMap(ZZ_CMAP_PACKED);
+
+  /** 
+   * Translates DFA states to action switch labels.
+   */
+  private static final int [] ZZ_ACTION = zzUnpackAction();
+
+  private static final String ZZ_ACTION_PACKED_0 =
+    "\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
+    "\1\1\1\6\2\7\1\10\1\11\1\10\1\12\1\13"+
+    "\1\7\1\14\1\15\1\16\1\17\1\7\1\20\1\7"+
+    "\4\21\1\22\1\21\1\23\1\24\1\25\3\0\1\26"+
+    "\14\0\1\27\1\30\1\10\1\0\1\31\1\0\1\32"+
+    "\1\0\1\33\3\0\1\34\1\35\2\36\1\35\2\37"+
+    "\2\0\1\36\1\0\14\36\1\35\3\0\1\10\1\40"+
+    "\3\0\1\41\1\42\5\0\1\43\4\0\1\43\2\0"+
+    "\2\43\2\0\1\10\5\0\1\30\1\35\1\36\1\44"+
+    "\5\0\1\45\30\0\1\46\2\0\1\47\1\50\1\51";
+
+  private static int [] zzUnpackAction() {
+    int [] result = new int[174];
+    int offset = 0;
+    offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAction(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /** 
+   * Translates a state to a row index in the transition table
+   */
+  private static final int [] ZZ_ROWMAP = zzUnpackRowMap();
+
+  private static final String ZZ_ROWMAP_PACKED_0 =
+    "\0\0\0\53\0\126\0\201\0\254\0\327\0\u0102\0\u012d"+
+    "\0\u0158\0\u0183\0\u01ae\0\u01d9\0\u0204\0\u022f\0\u025a\0\u0285"+
+    "\0\u02b0\0\u0183\0\u02db\0\u0306\0\u0331\0\u035c\0\u0387\0\u03b2"+
+    "\0\u03dd\0\u0183\0\u035c\0\u0408\0\u0183\0\u0433\0\u045e\0\u0489"+
+    "\0\u04b4\0\u04df\0\u050a\0\u0535\0\u0560\0\u058b\0\u05b6\0\u05e1"+
+    "\0\u0183\0\u060c\0\u035c\0\u0637\0\u0662\0\u068d\0\u06b8\0\u0183"+
+    "\0\u0183\0\u06e3\0\u070e\0\u0739\0\u0183\0\u0764\0\u078f\0\u07ba"+
+    "\0\u07e5\0\u0810\0\u083b\0\u0866\0\u0891\0\u08bc\0\u08e7\0\u0912"+
+    "\0\u093d\0\u0968\0\u0993\0\u09be\0\u09e9\0\u0a14\0\u0a3f\0\u0a6a"+
+    "\0\u0a95\0\u0ac0\0\u0aeb\0\u0b16\0\u0b41\0\u0b6c\0\u0b97\0\u0bc2"+
+    "\0\u0bed\0\u0c18\0\u07ba\0\u0c43\0\u0c6e\0\u0c99\0\u0cc4\0\u0cef"+
+    "\0\u0d1a\0\u0d45\0\u0d70\0\u0d9b\0\u0dc6\0\u0df1\0\u0e1c\0\u0e47"+
+    "\0\u0e72\0\u0e9d\0\u0ec8\0\u0ef3\0\u0f1e\0\u0f49\0\u0f74\0\u0f9f"+
+    "\0\u0fca\0\u0183\0\u0ff5\0\u1020\0\u104b\0\u1076\0\u0183\0\u10a1"+
+    "\0\u10cc\0\u10f7\0\u1122\0\u114d\0\u1178\0\u11a3\0\u11ce\0\u11f9"+
+    "\0\u1224\0\u124f\0\u127a\0\u12a5\0\u078f\0\u0912\0\u12d0\0\u12fb"+
+    "\0\u1326\0\u1351\0\u137c\0\u13a7\0\u13d2\0\u13fd\0\u0183\0\u1428"+
+    "\0\u1453\0\u147e\0\u14a9\0\u14d4\0\u14ff\0\u152a\0\u1555\0\u0183"+
+    "\0\u1580\0\u15ab\0\u15d6\0\u1601\0\u162c\0\u1657\0\u1682\0\u16ad"+
+    "\0\u16d8\0\u1703\0\u172e\0\u1759\0\u1784\0\u17af\0\u17da\0\u1805"+
+    "\0\u1830\0\u185b\0\u1886\0\u18b1\0\u18dc\0\u1907\0\u1932\0\u195d"+
+    "\0\u1988\0\u19b3\0\u19de\0\u0183\0\u0183\0\u0183";
+
+  private static int [] zzUnpackRowMap() {
+    int [] result = new int[174];
+    int offset = 0;
+    offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackRowMap(String packed, int offset, int [] result) {
+    int i = 0;  /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int high = packed.charAt(i++) << 16;
+      result[j++] = high | packed.charAt(i++);
+    }
+    return j;
+  }
+
+  /** 
+   * The transition table of the DFA
+   */
+  private static final int [] ZZ_TRANS = zzUnpackTrans();
+
+  private static final String ZZ_TRANS_PACKED_0 =
+    "\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
+    "\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
+    "\15\16\1\25\2\12\2\16\10\12\1\26\5\12\4\27"+
+    "\1\12\1\23\3\12\1\30\1\12\15\27\3\12\2\27"+
+    "\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\30"+
+    "\1\12\15\31\3\12\2\31\1\12\7\32\1\33\5\32"+
+    "\4\34\1\32\1\23\2\12\1\32\1\35\1\32\15\34"+
+    "\3\32\1\36\1\34\2\32\1\37\5\32\1\33\5\32"+
+    "\4\40\1\32\1\41\2\32\1\42\2\32\15\40\3\32"+
+    "\2\40\10\32\1\33\5\32\4\43\1\32\1\41\2\32"+
+    "\1\42\2\32\15\43\3\32\2\43\10\32\1\33\1\32"+
+    "\1\44\3\32\4\45\1\32\1\41\5\32\15\45\3\32"+
+    "\2\45\10\32\1\46\5\32\4\47\1\32\1\41\5\32"+
+    "\15\47\1\32\1\50\1\32\2\47\1\32\1\51\1\52"+
+    "\5\51\1\53\1\51\1\54\3\51\4\55\1\51\1\56"+
+    "\2\51\1\57\2\51\15\55\2\51\1\60\2\55\1\51"+
+    "\54\0\1\61\61\0\1\62\4\0\4\63\7\0\6\63"+
+    "\1\64\6\63\3\0\2\63\12\0\1\65\42\0\1\66"+
+    "\1\67\1\70\1\71\2\72\1\0\1\73\3\0\1\73"+
+    "\1\16\1\17\1\20\1\21\7\0\15\16\3\0\2\16"+
+    "\3\0\1\74\1\0\1\75\2\76\1\0\1\77\3\0"+
+    "\1\77\3\17\1\21\7\0\15\17\3\0\2\17\2\0"+
+    "\1\66\1\100\1\70\1\71\2\76\1\0\1\77\3\0"+
+    "\1\77\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
+    "\2\20\3\0\1\101\1\0\1\75\2\72\1\0\1\73"+
+    "\3\0\1\73\4\21\7\0\15\21\3\0\2\21\24\0"+
+    "\1\12\54\0\1\102\72\0\1\103\15\0\1\62\4\0"+
+    "\4\63\7\0\15\63\3\0\2\63\16\0\4\27\7\0"+
+    "\15\27\3\0\2\27\27\0\1\35\41\0\4\31\7\0"+
+    "\15\31\3\0\2\31\16\0\4\34\7\0\15\34\3\0"+
+    "\2\34\16\0\4\34\7\0\2\34\1\104\12\34\3\0"+
+    "\2\34\2\0\1\105\66\0\4\40\7\0\15\40\3\0"+
+    "\2\40\24\0\1\32\54\0\1\106\42\0\4\43\7\0"+
+    "\15\43\3\0\2\43\12\0\1\35\56\0\4\45\7\0"+
+    "\15\45\3\0\2\45\11\0\1\107\4\0\4\63\7\0"+
+    "\15\63\3\0\2\63\16\0\4\47\7\0\15\47\3\0"+
+    "\2\47\47\0\1\35\5\0\1\110\62\0\1\111\56\0"+
+    "\4\55\7\0\15\55\3\0\2\55\24\0\1\51\54\0"+
+    "\1\112\42\0\4\63\7\0\15\63\3\0\2\63\14\0"+
+    "\1\32\1\0\4\113\1\0\3\114\3\0\15\113\3\0"+
+    "\2\113\14\0\1\32\1\0\4\113\1\0\3\114\3\0"+
+    "\3\113\1\115\11\113\3\0\2\113\16\0\1\116\1\0"+
+    "\1\116\10\0\15\116\3\0\2\116\16\0\1\117\1\120"+
+    "\1\121\1\122\7\0\15\117\3\0\2\117\16\0\1\123"+
+    "\1\0\1\123\10\0\15\123\3\0\2\123\16\0\1\124"+
+    "\1\125\1\124\1\125\7\0\15\124\3\0\2\124\16\0"+
+    "\1\126\2\127\1\130\7\0\15\126\3\0\2\126\16\0"+
+    "\1\73\2\131\10\0\15\73\3\0\2\73\16\0\1\132"+
+    "\2\133\1\134\7\0\15\132\3\0\2\132\16\0\4\125"+
+    "\7\0\15\125\3\0\2\125\16\0\1\135\2\136\1\137"+
+    "\7\0\15\135\3\0\2\135\16\0\1\140\2\141\1\142"+
+    "\7\0\15\140\3\0\2\140\16\0\1\143\1\133\1\144"+
+    "\1\134\7\0\15\143\3\0\2\143\16\0\1\145\2\120"+
+    "\1\122\7\0\15\145\3\0\2\145\30\0\1\146\1\147"+
+    "\63\0\1\150\26\0\4\34\7\0\2\34\1\151\12\34"+
+    "\3\0\2\34\2\0\1\152\100\0\1\153\1\154\37\0"+
+    "\4\63\7\0\6\63\1\155\6\63\3\0\2\63\2\0"+
+    "\1\156\62\0\1\157\70\0\1\160\1\161\33\0\1\162"+
+    "\1\0\1\32\1\0\4\113\1\0\3\114\3\0\15\113"+
+    "\3\0\2\113\16\0\4\163\1\0\3\114\3\0\15\163"+
+    "\3\0\2\163\12\0\1\162\1\0\1\32\1\0\4\113"+
+    "\1\0\3\114\3\0\10\113\1\164\4\113\3\0\2\113"+
+    "\2\0\1\66\13\0\1\116\1\0\1\116\10\0\15\116"+
+    "\3\0\2\116\3\0\1\165\1\0\1\75\2\166\6\0"+
+    "\1\117\1\120\1\121\1\122\7\0\15\117\3\0\2\117"+
+    "\3\0\1\167\1\0\1\75\2\170\1\0\1\171\3\0"+
+    "\1\171\3\120\1\122\7\0\15\120\3\0\2\120\3\0"+
+    "\1\172\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
+    "\1\121\1\120\1\121\1\122\7\0\15\121\3\0\2\121"+
+    "\3\0\1\173\1\0\1\75\2\166\6\0\4\122\7\0"+
+    "\15\122\3\0\2\122\3\0\1\174\2\0\1\174\7\0"+
+    "\1\124\1\125\1\124\1\125\7\0\15\124\3\0\2\124"+
+    "\3\0\1\174\2\0\1\174\7\0\4\125\7\0\15\125"+
+    "\3\0\2\125\3\0\1\166\1\0\1\75\2\166\6\0"+
+    "\1\126\2\127\1\130\7\0\15\126\3\0\2\126\3\0"+
+    "\1\170\1\0\1\75\2\170\1\0\1\171\3\0\1\171"+
+    "\3\127\1\130\7\0\15\127\3\0\2\127\3\0\1\166"+
+    "\1\0\1\75\2\166\6\0\4\130\7\0\15\130\3\0"+
+    "\2\130\3\0\1\171\2\0\2\171\1\0\1\171\3\0"+
+    "\1\171\3\131\10\0\15\131\3\0\2\131\3\0\1\101"+
+    "\1\0\1\75\2\72\1\0\1\73\3\0\1\73\1\132"+
+    "\2\133\1\134\7\0\15\132\3\0\2\132\3\0\1\74"+
+    "\1\0\1\75\2\76\1\0\1\77\3\0\1\77\3\133"+
+    "\1\134\7\0\15\133\3\0\2\133\3\0\1\101\1\0"+
+    "\1\75\2\72\1\0\1\73\3\0\1\73\4\134\7\0"+
+    "\15\134\3\0\2\134\3\0\1\72\1\0\1\75\2\72"+
+    "\1\0\1\73\3\0\1\73\1\135\2\136\1\137\7\0"+
+    "\15\135\3\0\2\135\3\0\1\76\1\0\1\75\2\76"+
+    "\1\0\1\77\3\0\1\77\3\136\1\137\7\0\15\136"+
+    "\3\0\2\136\3\0\1\72\1\0\1\75\2\72\1\0"+
+    "\1\73\3\0\1\73\4\137\7\0\15\137\3\0\2\137"+
+    "\3\0\1\73\2\0\2\73\1\0\1\73\3\0\1\73"+
+    "\1\140\2\141\1\142\7\0\15\140\3\0\2\140\3\0"+
+    "\1\77\2\0\2\77\1\0\1\77\3\0\1\77\3\141"+
+    "\1\142\7\0\15\141\3\0\2\141\3\0\1\73\2\0"+
+    "\2\73\1\0\1\73\3\0\1\73\4\142\7\0\15\142"+
+    "\3\0\2\142\3\0\1\175\1\0\1\75\2\72\1\0"+
+    "\1\73\3\0\1\73\1\143\1\133\1\144\1\134\7\0"+
+    "\15\143\3\0\2\143\3\0\1\176\1\0\1\75\2\76"+
+    "\1\0\1\77\3\0\1\77\1\144\1\133\1\144\1\134"+
+    "\7\0\15\144\3\0\2\144\3\0\1\173\1\0\1\75"+
+    "\2\166\6\0\1\145\2\120\1\122\7\0\15\145\3\0"+
+    "\2\145\31\0\1\147\53\0\1\177\63\0\1\200\25\0"+
+    "\4\34\7\0\15\34\3\0\1\34\1\201\31\0\1\154"+
+    "\53\0\1\202\34\0\1\32\1\0\4\113\1\0\3\114"+
+    "\3\0\3\113\1\203\11\113\3\0\2\113\2\0\1\204"+
+    "\101\0\1\161\53\0\1\205\33\0\1\206\51\0\1\162"+
+    "\3\0\4\163\7\0\15\163\3\0\2\163\12\0\1\162"+
+    "\1\0\1\207\1\0\4\113\1\0\3\114\3\0\15\113"+
+    "\3\0\2\113\16\0\1\210\1\122\1\210\1\122\7\0"+
+    "\15\210\3\0\2\210\16\0\4\130\7\0\15\130\3\0"+
+    "\2\130\16\0\4\134\7\0\15\134\3\0\2\134\16\0"+
+    "\4\137\7\0\15\137\3\0\2\137\16\0\4\142\7\0"+
+    "\15\142\3\0\2\142\16\0\1\211\1\134\1\211\1\134"+
+    "\7\0\15\211\3\0\2\211\16\0\4\122\7\0\15\122"+
+    "\3\0\2\122\16\0\4\212\7\0\15\212\3\0\2\212"+
+    "\33\0\1\213\60\0\1\214\27\0\4\34\6\0\1\215"+
+    "\15\34\3\0\2\34\33\0\1\216\31\0\1\162\1\0"+
+    "\1\32\1\0\4\113\1\0\3\114\3\0\10\113\1\217"+
+    "\4\113\3\0\2\113\2\0\1\220\103\0\1\221\35\0"+
+    "\4\222\7\0\15\222\3\0\2\222\3\0\1\165\1\0"+
+    "\1\75\2\166\6\0\1\210\1\122\1\210\1\122\7\0"+
+    "\15\210\3\0\2\210\3\0\1\175\1\0\1\75\2\72"+
+    "\1\0\1\73\3\0\1\73\1\211\1\134\1\211\1\134"+
+    "\7\0\15\211\3\0\2\211\3\0\1\174\2\0\1\174"+
+    "\7\0\4\212\7\0\15\212\3\0\2\212\34\0\1\223"+
+    "\54\0\1\224\25\0\1\225\75\0\1\226\30\0\1\162"+
+    "\1\0\1\35\1\0\4\113\1\0\3\114\3\0\15\113"+
+    "\3\0\2\113\34\0\1\227\31\0\1\230\2\0\4\222"+
+    "\7\0\15\222\3\0\2\222\35\0\1\231\61\0\1\232"+
+    "\17\0\1\233\76\0\1\234\52\0\1\235\31\0\1\32"+
+    "\1\0\4\163\1\0\3\114\3\0\15\163\3\0\2\163"+
+    "\36\0\1\236\52\0\1\237\32\0\4\240\7\0\15\240"+
+    "\3\0\2\240\36\0\1\241\52\0\1\242\53\0\1\243"+
+    "\60\0\1\244\10\0\1\245\12\0\4\240\7\0\15\240"+
+    "\3\0\2\240\37\0\1\246\52\0\1\247\53\0\1\250"+
+    "\21\0\1\12\61\0\4\251\7\0\15\251\3\0\2\251"+
+    "\40\0\1\252\52\0\1\253\42\0\1\254\25\0\2\251"+
+    "\1\0\2\251\1\0\2\251\2\0\5\251\7\0\15\251"+
+    "\3\0\3\251\27\0\1\255\52\0\1\256\23\0";
+
+  private static int [] zzUnpackTrans() {
+    int [] result = new int[6665];
+    int offset = 0;
+    offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackTrans(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      value--;
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+
+  /* error codes */
+  private static final int ZZ_UNKNOWN_ERROR = 0;
+  private static final int ZZ_NO_MATCH = 1;
+  private static final int ZZ_PUSHBACK_2BIG = 2;
+
+  /* error messages for the codes above */
+  private static final String ZZ_ERROR_MSG[] = {
+    "Unkown internal scanner error",
+    "Error: could not match input",
+    "Error: pushback value was too large"
+  };
+
+  /**
+   * ZZ_ATTRIBUTE[aState] contains the attributes of state <code>aState</code>
+   */
+  private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
+
+  private static final String ZZ_ATTRIBUTE_PACKED_0 =
+    "\11\0\1\11\7\1\1\11\7\1\1\11\2\1\1\11"+
+    "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\3\1"+
+    "\1\0\1\1\1\0\1\1\1\0\1\1\3\0\7\1"+
+    "\2\0\1\1\1\0\15\1\3\0\1\1\1\11\3\0"+
+    "\1\1\1\11\5\0\1\1\4\0\1\1\2\0\2\1"+
+    "\2\0\1\1\5\0\1\11\3\1\5\0\1\11\30\0"+
+    "\1\1\2\0\3\11";
+
+  private static int [] zzUnpackAttribute() {
+    int [] result = new int[174];
+    int offset = 0;
+    offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
+    return result;
+  }
+
+  private static int zzUnpackAttribute(String packed, int offset, int [] result) {
+    int i = 0;       /* index in packed string  */
+    int j = offset;  /* index in unpacked array */
+    int l = packed.length();
+    while (i < l) {
+      int count = packed.charAt(i++);
+      int value = packed.charAt(i++);
+      do result[j++] = value; while (--count > 0);
+    }
+    return j;
+  }
+
+  /** the input device */
+  private java.io.Reader zzReader;
+
+  /** the current state of the DFA */
+  private int zzState;
+
+  /** the current lexical state */
+  private int zzLexicalState = YYINITIAL;
+
+  /** this buffer contains the current text to be matched and is
+      the source of the yytext() string */
+  private char zzBuffer[] = new char[ZZ_BUFFERSIZE];
+
+  /** the textposition at the last accepting state */
+  private int zzMarkedPos;
+
+  /** the textposition at the last state to be included in yytext */
+  private int zzPushbackPos;
+
+  /** the current text position in the buffer */
+  private int zzCurrentPos;
+
+  /** startRead marks the beginning of the yytext() string in the buffer */
+  private int zzStartRead;
+
+  /** endRead marks the last character in the buffer, that has been read
+      from input */
+  private int zzEndRead;
+
+  /** number of newlines encountered up to the start of the matched text */
+  private int yyline;
+
+  /** the number of characters up to the start of the matched text */
+  private int yychar;
+
+  /**
+   * the number of characters from the last newline up to the start of the 
+   * matched text
+   */
+  private int yycolumn;
+
+  /** 
+   * zzAtBOL == true <=> the scanner is currently at the beginning of a line
+   */
+  private boolean zzAtBOL = true;
+
+  /** zzAtEOF == true <=> the scanner is at the EOF */
+  private boolean zzAtEOF;
+
+  /* user code: */
+
+public static final int ALPHANUM          = 0;
+public static final int APOSTROPHE        = 1;
+public static final int ACRONYM           = 2;
+public static final int COMPANY           = 3;
+public static final int EMAIL             = 4;
+public static final int HOST              = 5;
+public static final int NUM               = 6;
+public static final int CJ                = 7;
+public static final int INTERNAL_LINK     = 8;
+public static final int EXTERNAL_LINK     = 9;
+public static final int CITATION          = 10;
+public static final int CATEGORY         = 11;
+public static final int BOLD     = 12;
+public static final int ITALICS     = 13;
+public static final int BOLD_ITALICS     = 14;
+public static final int HEADING     = 15;
+public static final int SUB_HEADING     = 16;
+public static final int EXTERNAL_LINK_URL = 17;
+
+
+private int currentTokType;
+private int numBalanced = 0;
+private int positionInc = 1;
+
+public static final String [] TOKEN_TYPES = new String [] {
+    "<ALPHANUM>",
+    "<APOSTROPHE>",
+    "<ACRONYM>",
+    "<COMPANY>",
+    "<EMAIL>",
+    "<HOST>",
+    "<NUM>",
+    "<CJ>",
+    WikipediaTokenizer.INTERNAL_LINK,
+    WikipediaTokenizer.EXTERNAL_LINK,
+    WikipediaTokenizer.CITATION,
+    WikipediaTokenizer.CATEGORY,
+    WikipediaTokenizer.BOLD,
+    WikipediaTokenizer.ITALICS,
+    WikipediaTokenizer.BOLD_ITALICS,
+    WikipediaTokenizer.HEADING,
+    WikipediaTokenizer.SUB_HEADING,
+    WikipediaTokenizer.EXTERNAL_LINK_URL
+};
+
+public final int yychar()
+{
+    return yychar;
+}
+
+public final int getPositionIncrement(){
+  return positionInc;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t, int tokType) {
+  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.InputStream version of this constructor.
+   *
+   * @param   in  the java.io.Reader to read input from.
+   */
+  WikipediaTokenizerImpl(java.io.Reader in) {
+    this.zzReader = in;
+  }
+
+  /**
+   * Creates a new scanner.
+   * There is also a java.io.Reader version of this constructor.
+   *
+   * @param   in  the java.io.InputStream to read input from.
+   */
+  WikipediaTokenizerImpl(java.io.InputStream in) {
+    this(new java.io.InputStreamReader(in));
+  }
+
+  /** 
+   * Unpacks the compressed character translation table.
+   *
+   * @param packed   the packed character translation table
+   * @return         the unpacked character translation table
+   */
+  private static char [] zzUnpackCMap(String packed) {
+    char [] map = new char[0x10000];
+    int i = 0;  /* index in packed string  */
+    int j = 0;  /* index in unpacked array */
+    while (i < 230) {
+      int  count = packed.charAt(i++);
+      char value = packed.charAt(i++);
+      do map[j++] = value; while (--count > 0);
+    }
+    return map;
+  }
+
+
+  /**
+   * Refills the input buffer.
+   *
+   * @return      <code>false</code>, iff there was new input.
+   * 
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  private boolean zzRefill() throws java.io.IOException {
+
+    /* first: make room (if you can) */
+    if (zzStartRead > 0) {
+      System.arraycopy(zzBuffer, zzStartRead,
+                       zzBuffer, 0,
+                       zzEndRead-zzStartRead);
+
+      /* translate stored positions */
+      zzEndRead-= zzStartRead;
+      zzCurrentPos-= zzStartRead;
+      zzMarkedPos-= zzStartRead;
+      zzPushbackPos-= zzStartRead;
+      zzStartRead = 0;
+    }
+
+    /* is the buffer big enough? */
+    if (zzCurrentPos >= zzBuffer.length) {
+      /* if not: blow it up */
+      char newBuffer[] = new char[zzCurrentPos*2];
+      System.arraycopy(zzBuffer, 0, newBuffer, 0, zzBuffer.length);
+      zzBuffer = newBuffer;
+    }
+
+    /* finally: fill the buffer with new input */
+    int numRead = zzReader.read(zzBuffer, zzEndRead,
+                                            zzBuffer.length-zzEndRead);
+
+    if (numRead < 0) {
+      return true;
+    }
+    else {
+      zzEndRead+= numRead;
+      return false;
+    }
+  }
+
+    
+  /**
+   * Closes the input stream.
+   */
+  public final void yyclose() throws java.io.IOException {
+    zzAtEOF = true;            /* indicate end of file */
+    zzEndRead = zzStartRead;  /* invalidate buffer    */
+
+    if (zzReader != null)
+      zzReader.close();
+  }
+
+
+  /**
+   * Resets the scanner to read from a new input stream.
+   * Does not close the old reader.
+   *
+   * All internal variables are reset, the old input stream 
+   * <b>cannot</b> be reused (internal buffer is discarded and lost).
+   * Lexical state is set to <tt>ZZ_INITIAL</tt>.
+   *
+   * @param reader   the new input stream 
+   */
+  public final void yyreset(java.io.Reader reader) {
+    zzReader = reader;
+    zzAtBOL  = true;
+    zzAtEOF  = false;
+    zzEndRead = zzStartRead = 0;
+    zzCurrentPos = zzMarkedPos = zzPushbackPos = 0;
+    yyline = yychar = yycolumn = 0;
+    zzLexicalState = YYINITIAL;
+  }
+
+
+  /**
+   * Returns the current lexical state.
+   */
+  public final int yystate() {
+    return zzLexicalState;
+  }
+
+
+  /**
+   * Enters a new lexical state
+   *
+   * @param newState the new lexical state
+   */
+  public final void yybegin(int newState) {
+    zzLexicalState = newState;
+  }
+
+
+  /**
+   * Returns the text matched by the current regular expression.
+   */
+  public final String yytext() {
+    return new String( zzBuffer, zzStartRead, zzMarkedPos-zzStartRead );
+  }
+
+
+  /**
+   * Returns the character at position <tt>pos</tt> from the 
+   * matched text. 
+   * 
+   * It is equivalent to yytext().charAt(pos), but faster
+   *
+   * @param pos the position of the character to fetch. 
+   *            A value from 0 to yylength()-1.
+   *
+   * @return the character at position pos
+   */
+  public final char yycharat(int pos) {
+    return zzBuffer[zzStartRead+pos];
+  }
+
+
+  /**
+   * Returns the length of the matched text region.
+   */
+  public final int yylength() {
+    return zzMarkedPos-zzStartRead;
+  }
+
+
+  /**
+   * Reports an error that occurred while scanning.
+   *
+   * In a well-formed scanner (no or only correct usage of 
+   * yypushback(int) and a match-all fallback rule) this method 
+   * will only be called with things that "Can't Possibly Happen".
+   * If this method is called, something is seriously wrong
+   * (e.g. a JFlex bug producing a faulty scanner etc.).
+   *
+   * Usual syntax/scanner level error handling should be done
+   * in error fallback rules.
+   *
+   * @param   errorCode  the code of the error message to display
+   */
+  private void zzScanError(int errorCode) {
+    String message;
+    try {
+      message = ZZ_ERROR_MSG[errorCode];
+    }
+    catch (ArrayIndexOutOfBoundsException e) {
+      message = ZZ_ERROR_MSG[ZZ_UNKNOWN_ERROR];
+    }
+
+    throw new Error(message);
+  } 
+
+
+  /**
+   * Pushes the specified amount of characters back into the input stream.
+   *
+   * They will be read again by the next call of the scanning method
+   *
+   * @param number  the number of characters to be read again.
+   *                This number must not be greater than yylength()!
+   */
+  public void yypushback(int number)  {
+    if ( number > yylength() )
+      zzScanError(ZZ_PUSHBACK_2BIG);
+
+    zzMarkedPos -= number;
+  }
+
+
+  /**
+   * Resumes scanning until the next regular expression is matched,
+   * the end of input is encountered or an I/O-Error occurs.
+   *
+   * @return      the next token
+   * @exception   java.io.IOException  if any I/O-Error occurs
+   */
+  public int getNextToken() throws java.io.IOException {
+    int zzInput;
+    int zzAction;
+
+    // cached fields:
+    int zzCurrentPosL;
+    int zzMarkedPosL;
+    int zzEndReadL = zzEndRead;
+    char [] zzBufferL = zzBuffer;
+    char [] zzCMapL = ZZ_CMAP;
+
+    int [] zzTransL = ZZ_TRANS;
+    int [] zzRowMapL = ZZ_ROWMAP;
+    int [] zzAttrL = ZZ_ATTRIBUTE;
+
+    while (true) {
+      zzMarkedPosL = zzMarkedPos;
+
+      yychar+= zzMarkedPosL-zzStartRead;
+
+      zzAction = -1;
+
+      zzCurrentPosL = zzCurrentPos = zzStartRead = zzMarkedPosL;
+  
+      zzState = zzLexicalState;
+
+
+      zzForAction: {
+        while (true) {
+    
+          if (zzCurrentPosL < zzEndReadL)
+            zzInput = zzBufferL[zzCurrentPosL++];
+          else if (zzAtEOF) {
+            zzInput = YYEOF;
+            break zzForAction;
+          }
+          else {
+            // store back cached positions
+            zzCurrentPos  = zzCurrentPosL;
+            zzMarkedPos   = zzMarkedPosL;
+            boolean eof = zzRefill();
+            // get translated positions and possibly new buffer
+            zzCurrentPosL  = zzCurrentPos;
+            zzMarkedPosL   = zzMarkedPos;
+            zzBufferL      = zzBuffer;
+            zzEndReadL     = zzEndRead;
+            if (eof) {
+              zzInput = YYEOF;
+              break zzForAction;
+            }
+            else {
+              zzInput = zzBufferL[zzCurrentPosL++];
+            }
+          }
+          int zzNext = zzTransL[ zzRowMapL[zzState] + zzCMapL[zzInput] ];
+          if (zzNext == -1) break zzForAction;
+          zzState = zzNext;
+
+          int zzAttributes = zzAttrL[zzState];
+          if ( (zzAttributes & 1) == 1 ) {
+            zzAction = zzState;
+            zzMarkedPosL = zzCurrentPosL;
+            if ( (zzAttributes & 8) == 8 ) break zzForAction;
+          }
+
+        }
+      }
+
+      // store back cached position
+      zzMarkedPos = zzMarkedPosL;
+
+      switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
+        case 7: 
+          { /* ignore */
+          }
+        case 42: break;
+        case 3: 
+          { positionInc = 1; return CJ;
+          }
+        case 43: break;
+        case 26: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
+          }
+        case 44: break;
+        case 37: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
+          }
+        case 45: break;
+        case 11: 
+          { currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
+          }
+        case 46: break;
+        case 5: 
+          { yybegin(CATEGORY_STATE); return currentTokType;
+          }
+        case 47: break;
+        case 34: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
+          }
+        case 48: break;
+        case 24: 
+          { positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
+          }
+        case 49: break;
+        case 22: 
+          { positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
+          }
+        case 50: break;
+        case 39: 
+          { positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
+          }
+        case 51: break;
+        case 18: 
+          { yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
+          }
+        case 52: break;
+        case 21: 
+          { positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
+          }
+        case 53: break;
+        case 1: 
+          { positionInc = 1;
+          }
+        case 54: break;
+        case 41: 
+          { numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+          }
+        case 55: break;
+        case 9: 
+          { yybegin(YYINITIAL);
+          }
+        case 56: break;
+        case 19: 
+          { numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 57: break;
+        case 13: 
+          { yybegin(STRING);return currentTokType;
+          }
+        case 58: break;
+        case 36: 
+          { positionInc = 1; return EMAIL;
+          }
+        case 59: break;
+        case 35: 
+          { positionInc = 1; return ACRONYM;
+          }
+        case 60: break;
+        case 4: 
+          { positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 61: break;
+        case 17: 
+          { /* ignore STRING */
+          }
+        case 62: break;
+        case 40: 
+          { currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+          }
+        case 63: break;
+        case 20: 
+          { yybegin(STRING); return currentTokType;/*pipe*/
+          }
+        case 64: break;
+        case 12: 
+          { currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 65: break;
+        case 27: 
+          { numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+          }
+        case 66: break;
+        case 33: 
+          { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
+          }
+        case 67: break;
+        case 16: 
+          { yybegin(DOUBLE_BRACE_STATE); return currentTokType;
+          }
+        case 68: break;
+        case 29: 
+          { positionInc = 1; return HOST;
+          }
+        case 69: break;
+        case 32: 
+          { currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);
+          }
+        case 70: break;
+        case 25: 
+          { currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+          }
+        case 71: break;
+        case 23: 
+          { positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
+          }
+        case 72: break;
+        case 14: 
+          { currentTokType = SUB_HEADING; yybegin(STRING);
+          }
+        case 73: break;
+        case 28: 
+          { positionInc = 1; return APOSTROPHE;
+          }
+        case 74: break;
+        case 30: 
+          { positionInc = 1; return NUM;
+          }
+        case 75: break;
+        case 15: 
+          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
+          }
+        case 76: break;
+        case 6: 
+          { yybegin(INTERNAL_LINK_STATE); return currentTokType;
+          }
+        case 77: break;
+        case 2: 
+          { positionInc = 1; return ALPHANUM;
+          }
+        case 78: break;
+        case 31: 
+          { positionInc = 1; return COMPANY;
+          }
+        case 79: break;
+        case 10: 
+          { currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);
+          }
+        case 80: break;
+        case 8: 
+          { positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;
+          }
+        case 81: break;
+        case 38: 
+          { positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+          }
+        case 82: break;
+        default: 
+          if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
+            zzAtEOF = true;
+            return YYEOF;
+          } 
+          else {
+            zzScanError(ZZ_NO_MATCH);
+          }
+      }
+    }
+  }
+
+
+}

Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex Fri Jan  4 06:29:15 2008
@@ -0,0 +1,324 @@
+package org.apache.lucene.wikipedia.analysis;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+
+%%
+
+%class WikipediaTokenizerImpl
+%unicode
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM          = 0;
+public static final int APOSTROPHE        = 1;
+public static final int ACRONYM           = 2;
+public static final int COMPANY           = 3;
+public static final int EMAIL             = 4;
+public static final int HOST              = 5;
+public static final int NUM               = 6;
+public static final int CJ                = 7;
+public static final int INTERNAL_LINK     = 8;
+public static final int EXTERNAL_LINK     = 9;
+public static final int CITATION          = 10;
+public static final int CATEGORY         = 11;
+public static final int BOLD     = 12;
+public static final int ITALICS     = 13;
+public static final int BOLD_ITALICS     = 14;
+public static final int HEADING     = 15;
+public static final int SUB_HEADING     = 16;
+public static final int EXTERNAL_LINK_URL = 17;
+
+
+private int currentTokType;
+private int numBalanced = 0;
+private int positionInc = 1;
+
+public static final String [] TOKEN_TYPES = new String [] {
+    "<ALPHANUM>",
+    "<APOSTROPHE>",
+    "<ACRONYM>",
+    "<COMPANY>",
+    "<EMAIL>",
+    "<HOST>",
+    "<NUM>",
+    "<CJ>",
+    WikipediaTokenizer.INTERNAL_LINK,
+    WikipediaTokenizer.EXTERNAL_LINK,
+    WikipediaTokenizer.CITATION,
+    WikipediaTokenizer.CATEGORY,
+    WikipediaTokenizer.BOLD,
+    WikipediaTokenizer.ITALICS,
+    WikipediaTokenizer.BOLD_ITALICS,
+    WikipediaTokenizer.HEADING,
+    WikipediaTokenizer.SUB_HEADING,
+    WikipediaTokenizer.EXTERNAL_LINK_URL
+};
+
+public final int yychar()
+{
+    return yychar;
+}
+
+public final int getPositionIncrement(){
+  return positionInc;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(Token t, int tokType) {
+  t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+%}
+
+// basic word: a sequence of digits & letters
+ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname
+HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
+           | {DIGIT}+ {P} {DIGIT}+
+           | {HAS_DIGIT} {P} {ALPHANUM}
+           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
+
+// punctuation
+P	         = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT  =
+    ({LETTER}|{DIGIT})*
+    {DIGIT}
+    ({LETTER}|{DIGIT})*
+
+ALPHA      = ({LETTER})+
+
+
+LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN     = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese
+CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+//Wikipedia
+DOUBLE_BRACKET = "["{2}
+DOUBLE_BRACKET_CLOSE = "]"{2}
+DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
+EXTERNAL_LINK = "["
+TWO_SINGLE_QUOTES = "'"{2}
+CITATION = "<ref>"
+CITATION_CLOSE = "</ref>"
+INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
+
+DOUBLE_BRACE = "{"{2}
+DOUBLE_BRACE_CLOSE = "}"{2}
+PIPE = "|"
+DOUBLE_EQUALS = "="{2}
+
+
+%state CATEGORY_STATE
+%state INTERNAL_LINK_STATE
+%state EXTERNAL_LINK_STATE
+
+%state TWO_SINGLE_QUOTES_STATE
+%state THREE_SINGLE_QUOTES_STATE
+%state FIVE_SINGLE_QUOTES_STATE
+%state DOUBLE_EQUALS_STATE
+%state DOUBLE_BRACE_STATE
+%state STRING
+
+%%
+
+<YYINITIAL>{ALPHANUM}                                                     {positionInc = 1; return ALPHANUM; }
+<YYINITIAL>{APOSTROPHE}                                                   {positionInc = 1; return APOSTROPHE; }
+<YYINITIAL>{ACRONYM}                                                      {positionInc = 1; return ACRONYM; }
+<YYINITIAL>{COMPANY}                                                      {positionInc = 1; return COMPANY; }
+<YYINITIAL>{EMAIL}                                                        {positionInc = 1; return EMAIL; }
+<YYINITIAL>{NUM}                                                          {positionInc = 1; return NUM; }
+<YYINITIAL>{HOST}                                                         {positionInc = 1; return HOST; }
+<YYINITIAL>{CJ}                                                           {positionInc = 1; return CJ; }
+
+//wikipedia
+<YYINITIAL>{
+  //First {ALPHANUM} is always the link, set position to 0 for double bracket
+  {DOUBLE_BRACKET} {positionInc = 0; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
+  {DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
+  {EXTERNAL_LINK} {positionInc = 1;currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
+  {TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
+  {DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
+  {DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+  {CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+//ignore
+  . | {WHITESPACE} |{INFOBOX}                                               { positionInc = 1; }
+}
+
+<INTERNAL_LINK_STATE>{
+//First {ALPHANUM} is always the link, set position to 0 for these
+  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
+  {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
+  //ignore
+  . | {WHITESPACE}                                               { positionInc = 1; }
+}
+
+<EXTERNAL_LINK_STATE>{
+  "http://"{HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 0; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
+  {ALPHANUM} {positionInc = 1; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE);return currentTokType;}
+  "]" {yybegin(YYINITIAL);}
+  {WHITESPACE}                                               { positionInc = 1; }
+}
+
+<CATEGORY_STATE>{
+  {ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
+  {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
+  //ignore
+  . | {WHITESPACE}                                               { positionInc = 1; }
+}
+//italics
+<TWO_SINGLE_QUOTES_STATE>{
+  "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);}
+   "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);}
+   {ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
+   //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* ignore */ }
+}
+//bold
+<THREE_SINGLE_QUOTES_STATE>{
+  {ALPHANUM} {yybegin(STRING);return currentTokType;}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* ignore */ }
+
+}
+//bold italics
+<FIVE_SINGLE_QUOTES_STATE>{
+  {ALPHANUM} {yybegin(STRING);return currentTokType;}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* ignore */ }
+}
+
+<DOUBLE_EQUALS_STATE>{
+ "=" {currentTokType = SUB_HEADING; yybegin(STRING);}
+ {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
+ {DOUBLE_EQUALS} {yybegin(YYINITIAL);}
+  //ignore
+  . | {WHITESPACE}                                               { /* ignore */ }
+}
+
+<DOUBLE_BRACE_STATE>{
+  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
+  {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
+  {CITATION_CLOSE} {yybegin(YYINITIAL);}
+   //ignore
+  . | {WHITESPACE}                                               { /* ignore */ }
+}
+
+<STRING> {
+  "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/}
+  "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
+  "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
+  "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
+  {ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+
+
+  {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
+
+  .|{WHITESPACE}                                              { /* ignore STRING */ }
+}
+
+
+
+
+/*
+{INTERNAL_LINK}                                                { return currentTokType; }
+
+{CITATION}                                                { return currentTokType; }
+{CATEGORY}                                                { return currentTokType; }
+
+{BOLD}                                                { return currentTokType; }
+{ITALICS}                                                { return currentTokType; }
+{BOLD_ITALICS}                                                { return currentTokType; }
+{HEADING}                                                { return currentTokType; }
+{SUB_HEADING}                                                { return currentTokType; }
+
+*/
+//end wikipedia
+
+/** Ignore the rest */
+. | {WHITESPACE}|{TAGS}                                                { /* ignore */ }
+
+
+//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
+//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
+//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
+//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
+//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
+//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
+//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}
\ No newline at end of file

Added: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html (added)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html Fri Jan  4 06:29:15 2008
@@ -0,0 +1,35 @@
+<!--
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+  -->
+
+<HTML>
+ <!--
+ * 
+ --><HEAD>
+    <TITLE>org.apache.lucene.wikipedia</TITLE>
+</HEAD>
+<BODY>
+<DIV>Tools for working with <a href="http://www.wikipedia.org">Wikipedia</a> content.
+</DIV>
+<DIV>&nbsp;</DIV>
+<DIV align="center">
+Copyright &copy; 2007 <A HREF="http://www.apache.org">Apache Software Foundation</A>
+</DIV>
+</BODY>
+</HTML>
\ No newline at end of file

Propchange: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/package.html
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java?rev=608852&view=auto
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (added)
+++ lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java Fri Jan  4 06:29:15 2008
@@ -0,0 +1,213 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.lucene.wikipedia.analysis;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+import java.io.File;
+import java.io.StringReader;
+import java.util.Map;
+import java.util.HashMap;
+
+
+/**
+ *
+ *
+ **/
+public class WikipediaTokenizerTest extends TestCase {
+
+
+  public WikipediaTokenizerTest(String s) {
+    super(s);
+  }
+
+  protected void setUp() {
+  }
+
+  protected void tearDown() {
+
+  }
+
+
+  public void testHandwritten() throws Exception {
+    //make sure all tokens are in only one type
+    String test = "[[link]] This is a [[Category:foo]] Category  This is a linked [[:Category:bar none withstanding]] " +
+            "Category This is (parens) This is a [[link]]  This is an external URL [http://lucene.apache.org] " +
+            "Here is ''italics'' and ''more italics'', '''bold''' and '''''five quotes''''' " +
+            " This is a [[link|display info]]  This is a period.  Here is $3.25 and here is 3.50.  Here's Johnny.  " +
+            "==heading== ===sub head=== followed by some text  [[Category:blah| ]] " +
+            "''[[Category:ital_cat]]''  here is some that is ''italics [[Category:foo]] but is never closed." +
+            "'''same [[Category:foo]] goes for this '''''and2 [[Category:foo]] and this" +
+            " [http://foo.boo.com/test/test/ Test Test] [http://foo.boo.com/test/test/test.html Test Test]" +
+            " [http://foo.boo.com/test/test/test.html?g=b&c=d Test Test] <ref>Citation</ref> <sup>martian</sup> <span class=\"glue\">code</span>";
+    Map tcm = new HashMap();//map tokens to types
+    tcm.put("link", WikipediaTokenizer.INTERNAL_LINK);
+    tcm.put("display", WikipediaTokenizer.INTERNAL_LINK);
+    tcm.put("info", WikipediaTokenizer.INTERNAL_LINK);
+
+    tcm.put("http://lucene.apache.org", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/test.html", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("http://foo.boo.com/test/test/test.html?g=b&c=d", WikipediaTokenizer.EXTERNAL_LINK_URL);
+    tcm.put("Test", WikipediaTokenizer.EXTERNAL_LINK);
+    
+    //alphanums
+    tcm.put("This", "<ALPHANUM>");
+    tcm.put("is", "<ALPHANUM>");
+    tcm.put("a", "<ALPHANUM>");
+    tcm.put("Category", "<ALPHANUM>");
+    tcm.put("linked", "<ALPHANUM>");
+    tcm.put("parens", "<ALPHANUM>");
+    tcm.put("external", "<ALPHANUM>");
+    tcm.put("URL", "<ALPHANUM>");
+    tcm.put("and", "<ALPHANUM>");
+    tcm.put("period", "<ALPHANUM>");
+    tcm.put("Here", "<ALPHANUM>");
+    tcm.put("Here's", "<APOSTROPHE>");
+    tcm.put("here", "<ALPHANUM>");
+    tcm.put("Johnny", "<ALPHANUM>");
+    tcm.put("followed", "<ALPHANUM>");
+    tcm.put("by", "<ALPHANUM>");
+    tcm.put("text", "<ALPHANUM>");
+    tcm.put("that", "<ALPHANUM>");
+    tcm.put("but", "<ALPHANUM>");
+    tcm.put("never", "<ALPHANUM>");
+    tcm.put("closed", "<ALPHANUM>");
+    tcm.put("goes", "<ALPHANUM>");
+    tcm.put("for", "<ALPHANUM>");
+    tcm.put("this", "<ALPHANUM>");
+    tcm.put("an", "<ALPHANUM>");
+    tcm.put("some", "<ALPHANUM>");
+    tcm.put("martian", "<ALPHANUM>");
+    tcm.put("code", "<ALPHANUM>");
+
+    tcm.put("foo", WikipediaTokenizer.CATEGORY);
+    tcm.put("bar", WikipediaTokenizer.CATEGORY);
+    tcm.put("none", WikipediaTokenizer.CATEGORY);
+    tcm.put("withstanding", WikipediaTokenizer.CATEGORY);
+    tcm.put("blah", WikipediaTokenizer.CATEGORY);
+    tcm.put("ital", WikipediaTokenizer.CATEGORY);
+    tcm.put("cat", WikipediaTokenizer.CATEGORY);
+
+    tcm.put("italics", WikipediaTokenizer.ITALICS);
+    tcm.put("more", WikipediaTokenizer.ITALICS);
+    tcm.put("bold", WikipediaTokenizer.BOLD);
+    tcm.put("same", WikipediaTokenizer.BOLD);
+    tcm.put("five", WikipediaTokenizer.BOLD_ITALICS);
+    tcm.put("and2", WikipediaTokenizer.BOLD_ITALICS);
+    tcm.put("quotes", WikipediaTokenizer.BOLD_ITALICS);
+
+    tcm.put("heading", WikipediaTokenizer.HEADING);
+    tcm.put("sub", WikipediaTokenizer.SUB_HEADING);
+    tcm.put("head", WikipediaTokenizer.SUB_HEADING);
+    
+    tcm.put("Citation", WikipediaTokenizer.CITATION);
+
+    tcm.put("3.25", "<NUM>");
+    tcm.put("3.50", "<NUM>");
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    int count = 0;
+    int numItalics = 0;
+    int numBoldItalics = 0;
+    int numCategory = 0;
+    int numCitation = 0;
+    while ((token = tf.next(token)) != null) {
+      String tokText = token.termText();
+      //System.out.println("Text: " + tokText + " Type: " + token.type());
+      assertTrue("token is null and it shouldn't be", token != null);
+      String expectedType = (String) tcm.get(tokText);
+      assertTrue("expectedType is null and it shouldn't be for: " + token, expectedType != null);
+      assertTrue(token.type() + " is not equal to " + expectedType + " for " + token, token.type().equals(expectedType) == true);
+      count++;
+      if (token.type().equals(WikipediaTokenizer.ITALICS)  == true){
+        numItalics++;
+      } else if (token.type().equals(WikipediaTokenizer.BOLD_ITALICS)  == true){
+        numBoldItalics++;
+      } else if (token.type().equals(WikipediaTokenizer.CATEGORY)  == true){
+        numCategory++;
+      }
+      else if (token.type().equals(WikipediaTokenizer.CITATION)  == true){
+        numCitation++;
+      }
+    }
+    assertTrue("We have not seen enough tokens: " + count + " is not >= " + tcm.size(), count >= tcm.size());
+    assertTrue(numItalics + " does not equal: " + 4 + " for numItalics", numItalics == 4);
+    assertTrue(numBoldItalics + " does not equal: " + 3 + " for numBoldItalics", numBoldItalics == 3);
+    assertTrue(numCategory + " does not equal: " + 10 + " for numCategory", numCategory == 10);
+    assertTrue(numCitation + " does not equal: " + 1 + " for numCitation", numCitation == 1);
+  }
+
+  public void testLinkPhrases() throws Exception {
+    String test = "click [[link here]] click [http://lucene.apache.org here]";
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click", new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link", new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "click",
+            new String(token.termBuffer(), 0, token.termLength()).equals("click") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org",
+            new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    
+  }
+
+  public void testLinks() throws Exception {
+    String test = "[http://lucene.apache.org/java/docs/index.html#news here] [http://lucene.apache.org/java/docs/index.html?b=c here]";
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+    Token token = new Token();
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html#news",
+            new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html#news") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+    tf.next(token);//skip here
+    token = tf.next(token);
+
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "http://lucene.apache.org/java/docs/index.html?b=c",
+            new String(token.termBuffer(), 0, token.termLength()).equals("http://lucene.apache.org/java/docs/index.html?b=c") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+  }
+}

Propchange: lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: lucene/java/trunk/docs/developer-resources.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/developer-resources.html?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/docs/developer-resources.html (original)
+++ lucene/java/trunk/docs/developer-resources.html Fri Jan  4 06:29:15 2008
@@ -367,6 +367,10 @@
 <li>
 <a href="api/contrib-swing/index.html">Swing</a>
 </li>
+          
+<li>
+<a href="api/contrib-wikipedia/index.html">Wikipedia</a>
+</li>
         
 <li>
 <a href="api/contrib-wordnet/index.html">Wordnet</a>
@@ -383,11 +387,11 @@
 </p>
 </div>
 
-<a name="N10097"></a><a name="Downloads"></a>
+<a name="N1009C"></a><a name="Downloads"></a>
 <h2 class="boxed">Downloads</h2>
 <div class="section">
 <p>System Requirements are detailed <a href="systemrequirements.html">here</a>.</p>
-<a name="N100A3"></a><a name="Clover"></a>
+<a name="N100A8"></a><a name="Clover"></a>
 <h3 class="boxed">Clover Test Coverage Reports</h3>
 <p>
             
@@ -396,7 +400,7 @@
             <a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/lastSuccessfulBuild/artifact/trunk/build/test/clover/reports/index.html">here</a>
             for the nightly build.
         </p>
-<a name="N100B4"></a><a name="Hudson"></a>
+<a name="N100B9"></a><a name="Hudson"></a>
 <h3 class="boxed">Hudson</h3>
 <p>
       
@@ -404,13 +408,13 @@
       project.   It is responsible for running nightly builds, code coverage reports as well as building the nightly version
       of the website.
       </p>
-<a name="N100C1"></a><a name="Nightly"></a>
+<a name="N100C6"></a><a name="Nightly"></a>
 <h3 class="boxed">Nightly Build Download</h3>
 <p>Nightly builds are based on the trunk version of the code checked into
             <a href="https://svn.apache.org/repos/asf/lucene/java/trunk">SVN</a>
         
 </p>
-<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D3"></a><a name="source"></a>
+<a href="http://lucene.zones.apache.org:8080/hudson/job/Lucene-Nightly/">Download via Hudson</a><a name="N100D8"></a><a name="source"></a>
 <h3 class="boxed">Source Code</h3>
 <p>The source files are now stored using Subversion (see http://subversion.tigris.org/ and http://svnbook.red-bean.com/)
         </p>

Modified: lucene/java/trunk/docs/developer-resources.pdf
URL: http://svn.apache.org/viewvc/lucene/java/trunk/docs/developer-resources.pdf?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/docs/developer-resources.pdf (original)
+++ lucene/java/trunk/docs/developer-resources.pdf Fri Jan  4 06:29:15 2008
@@ -5,10 +5,10 @@
 /Producer (FOP 0.20.5) >>
 endobj
 5 0 obj
-<< /Length 677 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 680 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gaua=966RV&:j6I$6AM$<'>LHA#F%1Bfjds>FL0G11<e;/&7SNq]Z`9S4S?[.)cF,`Nm,5C0/h1%GkAg5Sum3h$(mtAH;j$J-"qb(aN]GA-gU;#3ep&]/ni-lp7ej)2[APh<K'JakDs)h!dWOA>Sut];]T!W(q!2=bAet?M3_@^r3fKo#YAMEQkN1%pIJb1\.LtUi'.39h$=WfkX&D$ptq:,H82VL,c_h[8K(Fc;`WB\og;YJC1VbN(fJ+7PK\#4mZt!5et;#^@iB5UT=jd;Rk\opTD^:Xt<Ye#/_-[k36A0O](e$'q*+_$&kI,a?727rO"0tg?QdGDn<6sSTlFj,$.pR.)3TrnR(3!%/D[jS]CK30@XK;.?Q:,pGt&Qj>:DrCFl?ENl^_7Os^t7,Q2JU%^u-!)Q0lTrMo<nMSe]#CUmNXrPp?BCX+O3BaG&*`)?`o/:*cc:]Fpk2bUXP+.*qSbiC$](/Fa-^RXo>T)MuiO.ELK<fCB"NTEpua[nU3=VS/eE_l^I38"-1cfP12HW0*DWq2ANI2`#%cM-:B`-lBp*b'pA6,J2=?l+1u2We=2VAp`rPDF5I>(AVg8]X1aV_)eep.sBn^,r7-'98VO&.4tmqKIa^GD[?ICEJ9(AhI(bkJEaNd/2&B!(LDdh0?5AUcuj>3qV;OUY?7~>
+Gaua=c#T:-&:j43KoYjMWr@QZm5/Z9D.f;V97'K#-;5k/"ZV^T07n`8N#UP>.6^iaQJML?]95'DnE,T0_B8p%/jT+`%1pS_$rH]9+LZ?C+\5$)P+UZMol`";#IOaP,`K<YX&!T4.Smct_b5,KV;<`_VJ]A8>Ec(5%DIJU\oWo5-A21?T00t%7r7\jhT2^:7Qsk`1:8aXa[c[k4(Uff.&[uA,kA\9j5fpa9(1OpeQ&BE`Wa<^7`Y3VZ;GW\_DiP%@:/m8!q&;I[6MhV7?os$dLE[VG>m;+9qeUFkbm>o:D\YCPhHADa?2YH@JC7!k4(V%01GlePYVd8dhZNW`+sX[R83':q(4dirW40MaIe%I4MI_AW?80=bA:=qe&[3(o8mgmZ%t_-,/[JTSl!LhoXCY\%)"m,EMDa<YPR[`be9%L;1!/KJ".^pk'Z)O0*-Yl4=/q9U)t$+l$mBQHKgap"#Igs1BGO]H^^@ZC#A3AF.V(6T54)(':-atoo$\lgs7)jdBmI^aQCT-rS.<Zo*[ZZs)7:AS[5<@8+Ljqn>ee2qr894W<kYB5$e?u#=a8Mn28kNHNBOTKO@27`k?-gjrDBmK:#D<gfEm9N*UgT:4FK.4,G(*S<WemLmN=C>J4enV=A0#FB`>)Srh=rbh`=hU%_i>ZDs0MW!k`Zh]eWG%f~>
 endstream
 endobj
 6 0 obj
@@ -102,10 +102,10 @@
 >>
 endobj
 22 0 obj
-<< /Length 1268 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1260 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gatm;?'!Gq&:O#Nn0:K*nOK)[hWQR">-S@A]`RH86^kT'Zu0-=UqOIVr_AoXEJf1PLfL90PNI,8S/HZ#e+r(\I.D>FE1Fk"UZOM.[.tF^M%MLYjT"i14dmW!',6<(^&ubf?VuA'&!2oe1&3Smc6of6,=J"+k,SeSlNsWm+,X68^u=epc*up*(DOVP=7!<<Mu3-$0@kcH%[YeYW(aq(X'?God18(KCSRQ#`SYm0G_"g.lrelsjs1LL/Z1<F7WB!IF?\9D/EGa`0&9__o$S_[%dDBHQ.BI=.r>J8ZhTljLAS:3aXg5Q3a,jIiO[`FA5XXNbG/AQf1V[b>?Op>irF91NM@aRAmLWW.6T;tk)[&#KVbcu>hT)a=+%t*gBm_o1)"YdK-@)k>A0dN724^BYVog"("oL4]r>hYVff/.j,AVZV74T8Qse4m\]Q=G>+#[?E71*O?\Mggng<r\cY*39hKff7>_]L#LB4el)10G0\kr+&lA_i:7)3ckPPi^n!!K,MjK2<T#YVDk5d#5s_h"L)B^!IkaK9VpLimN=?nRi_WB1#/9mNuRKfoEk<h'ST\3te7].m1HeRa%_F^mc+'a??nGOO<@Gf$B'<#4/!S!li+P"/i45!Zf[VIlWD@i>#u!f*Mmh'a\#T.K_u7rrdhGS5:e\^/Icnc7uPESqSo9.`jjh>-1FouchJfCC"B8jluOB)*59PK>=NGX:7fAQJ$4,5l(#F[/^8]$,=qWGEfc7o'=LNP4o,C1D?k/DWQ8]_]h^Vk3$9Z%RGrq5V]FOHgb$`EmPuh1$D>W$\@lq=Zg+jZ2npBItb@+TV65XbH9VP;$l!i)XX(d>('QFQr67N(#]-(;3M\YTVn'\d&<e6/U"#a0Lk/B6]M&leKQn!%9u^T^CUal4=nKjE%bERKQ_pZ%7"K?g>`RTi'IoK(a6="6$Z9HYJH^F'2Q;9F--FCME+=e#)]Z:Z3!(J`nQ"grH"\4b\NY#"d#-P#F6
 tPl8J@aKek'ZoC*fP8DONp4Nb1=YgD;3ZmGZNJqn<R>C^(g>ge?H<Hh0pt)t`_0fnUH,=Pd"c?\rau[f)maA0G*N$Cn$5p,Rp%,;dO`_oC;X%JVWq5$O=mF^OE`=:.)bS;C_"R-T,nNBVm&CS^??-drOiJG-iQ"ecolHJtZU\WH8kCNK8Bi=iG'Ga9c#=L,"C$G_DD,8\h9e&N5-5X1F,loG`<ZbTCYgL^1+^C>p.VZi6W\R#S_L"`oWJX8Qr8QVnm,G-KEHHPm1T:,5i+<s~>
+Gatm;9on$e&A@sBn<a2Y7O1ENoG6n-'.gFTZ?D?3D%Ep7[Y:PHLQ_&9^V1pcZ7(JZ[V0'af#W0-2fFG@d<%Qt5G*=q">i]65`k+eA?>Tl6(/?of#7UOdg%X"!ud7lLA2e3k5'g(b!\c/ON$;HA3-?q]oNJ8*gj,X!Ei3;"oL(&6:6<b7qB=[^V=7M27_]/AR7N3cUGAcltj2EeR`TT`P*sHq3q4Qjpu;Cm(E+KNfs`_&k+r6[)SU]@D(\Yk//&Gj-IugI1@P3INPkJ`M0:^EpB$_/6cgOB"uc+0QN)$ok(]EeIM-o;sIn=.>R58A!+kAA<oa!)QoDfhf#3iZc3%5'H0IZ,a<LMc2]sZ^(ntJ-\Neq=eT[3V(99#:$i)^cSC\U6GjtCs&s*2E+L$<.,\sTga]VU`HkJ;E_lF`>)kD!C0'1@P>Or+i0PqRJjqCB1FXVEqm-70e>lTsADZGDU;,)[:SQ"t0G`\&YZ<=8J=4Rrb.^mUii3V/EZ4Zts57-+K!?*^:+7QDVc7sb\%89l^p")%FCX`HX.>Wn3]=,pf1a@]Sc/gl?*2"`&)a.HUdH?/Oe:fS9H8>#RGLZoQq2IU6-RTkHT)A*n7a\41>cn(oB\r(fW3@hX;BF:$lCo=bLe0L*f?t:UKHH>jUq)t:/70IA$7YK7?q4%?l^^']u#0QK.4U,D\4e//5NL)^YC4THNffP>MN#OW'0*nTi[L^WK2u(LRloq"nV"7gOW8u#4cD&"pIZ4o$$gIk^C!`!s@f<B6HE31^Ao9bc=Og7<Ymd1,gs`]^gI!Am%If`^S=Vc>Sc9L9n!2#4#gtP*tU0g!=>(>R1ags7`D/3QD.HBd40i;nCGu;X&#^mlDF)PfTTjM-D-fi[&Gs#E85f&^F-O%9?,IK(&TR5VkE"T&.l'IBZnu"q:FR*mYHg`=Q4Jgf8#9Vf7=/E+9:,X(b@m[n7aXRU,&Peb1\RIqEK#FkCD,qpf>(P`&B3'QIO
 A0@?AFfP/40AU)J!*c]>Pi+/M7.rST2/\T8aB6JX<kg3u7m6)gVF4mgS<7$^u&n;[on%s@ZbOTgVS@$^(ikub,0=Pggef>_ql!:@,jmEd]H>dg@aks`gZl0-'NI0ahb%2gqWY_74F-XgD,_Die.M-f'^N@%n4E;rQ(:8q9]&YTCG^*A"c+k)dJpWLIY0:eVmUR77cKceDllk%inK<VlCYgL^2Cuhmp.V[TA!b^=c@'b'pETS>VUT"DI92^DGdJK!C$irj!3?kVBE~>
 endstream
 endobj
 23 0 obj
@@ -117,10 +117,10 @@
 >>
 endobj
 24 0 obj
-<< /Length 1000 /Filter [ /ASCII85Decode /FlateDecode ]
+<< /Length 1033 /Filter [ /ASCII85Decode /FlateDecode ]
  >>
 stream
-Gatm:997OU&AIm?pp_iZ'0EJji5RB#%j;'l`J*E8Zu4K3Ll:Z-g4,0Q7m/<'!!uYB11P[nS!e1W]q3PVc"B7RN3*!PorOfO8XcOUQ](h><Ql;u1aPVSCA)@brPo"&"2$_,'njPT@D82XmL]%57f%`]L6Q^6s$=A*]P3^52j`B4BBV\8n@gW'X-Tbg#Lu.":,T.kd5KN7\Mg9_fZp5HPa/86;\Eqi)SJAV+M;V/ZB5gPhrV^@d(FL"OPe.Wh,.EK!ITjL8A!9/<E5$[Y-\LeBq&]I>GKa[Y'p_8W`A`?5/Yc$-'l!fP\['LEheU]nWrBe-gUX$9'9ggAWiMbXqMu!WTVA7Z:ACP2:/B[(3d'ib"3FDG2/gR(HG42rlH&456WWZ6Hkmg?i9hj$uD^gn<&]h!u8q2Nu;@tcc8TWU.?G9E?@FI#qG_*j)SDndhK`eHIpZX>iHkQ%3k?_&tY)U&e<T_(7@M33#$u;OF@;ZErsn>TY+=%PmAP9`?bK/F71i>Ip]]1!^=]E,Bi94&N/4sQOcH$U0Ddf<fo\2%'bP??H$((!E3P@`gbu&;+>L/BrK?BO="BY6nsjZ,LYB3egsXTDRtLl,=n>T-k<=QF3+B=+H-BnO`^$S\30K+qb#E0**4&l6O0/?\<*h`>R,Iioc$Eh0<@S(*h:tIO!8r'#u7U^ZCoH'64fUIWI'<q2Cuk07T8'=deXP<YcCB^%4q.4kl1-<WT&#CGU=OP9mI]7G=hsCFmFW+GQL)^r-F-AIuE$Z[&^.nVF'pggm6@71FKE)Z*30FnD&la2C>uLTT7E3SmX)jW:Zh%cnR_:,"lE.QO^$rM=a\`AZ[o>9XLF':B`go(XKe?.AIbk@\=`q=R92")Hk*U!\HCmfmr\S0Q[fm&elXe_q9`,cF?-WT*YNe.S.j5a@";2kfo=Ehf5@9mpj1c'N:G)Bjc9@TQK:Dn#bA_i-;H9eSVYZrYPVrerRXRk^6S[9.S7f0%"A
 /"j0OmBE~>
+GatU29lo&3&A@sBE(jj^";9pn[jJ*'Ful0Pnat9%+PGBo;1o=uT)AJ[0=-Q':-'8RD&*9CmI&^tQa=fSg#UJ4U6YW49JV2+MW<@f1>[sC6:Pt91Sb1(-9iiDr<@l&/0tmCrY_QW5:9TkOC"X0MmkXDLZ$BD:cIMK5R=4nU6/ZK/LR?*7bicnDC^Wa7RK"Z6GLei*9)$0Z9@F<;nXlb3_-s4m)q,u,*"IF%'V;!nHb@()Kqc`^bE]sdgg>I(a,[Fj,I)D\WG@a:X)mNAQ>cL4=2R/<][]&Shn\OGp,gpV#ZQ["ib?:NN`"[<+6"/+QaHXij0YhCh#6oMe"dcr6iGu<+TT;JlA".HUHCb2=dl4X0jOOrBsN+EadfjmhZ%/>G:X'R*fB<Zj$Hk@([.aH$lI-),V9jB$]eiAY-l.%PBh$As<D*4E9@U'Ll8WTsENW!'H*XjL+d/&fA7=MR0;&?N(O-q6u[s\SM?/c"j_f8sM&)('s(P%Pp#)MAT)p;]qU8`nug)!JsPd+Hm^3[`FF.$t2%`)6ckaO5>84l<[##/`r**d+'<gZ+HXcqUbpLHG\8f_-tdhl2u@kk#u4LpVpW'HE1^-`SBY>5=@lo7M66+@6cYnWn)5&U8p1[-Nq;l#]iH?!ca+Hf*I&+"B3<%p_cbdH%E\oV)Y<:9'sD`X#*L\D+4unXtC2@Edq[(3,WU#k'[f.rKGmVQ'MV)]a=anC2p>=0NX"^i6lSo[ORMf`/`J=+n>udePgF:^6_fB,!(\CA2q`&iNQhOjf3(]B06fM(PZ.4ff"9."M%Y>AU?'5&,A*oqO0F]4`V@m!-i6K0F^m:f`BjG$&jUes7t#^BGI/t/;Z`=1[V(m7b5!"MGe;E*mNQ!:'tT4/l$aF-m]0pGV;/<qHp#/je+Y\P\B3J;UY17hHr?k\-R=N8cPZ=e$-rHJ)?pS]eIAEV9HL*h\oR2o;nE&kNi%0jO@BW+gJM&[<Df\=5)3]ddDF
 @G;E/t=,Ze&!/#.E7G&J0[K3fj6>sgE>Y]bL-t,tW~>
 endstream
 endobj
 25 0 obj
@@ -260,31 +260,31 @@
 13 0 obj
 <<
 /S /GoTo
-/D [23 0 R /XYZ 85.0 251.932 null]
+/D [23 0 R /XYZ 85.0 238.732 null]
 >>
 endobj
 15 0 obj
 <<
 /S /GoTo
-/D [23 0 R /XYZ 85.0 199.598 null]
+/D [23 0 R /XYZ 85.0 186.398 null]
 >>
 endobj
 17 0 obj
 <<
 /S /GoTo
-/D [23 0 R /XYZ 85.0 148.345 null]
+/D [25 0 R /XYZ 85.0 659.0 null]
 >>
 endobj
 19 0 obj
 <<
 /S /GoTo
-/D [25 0 R /XYZ 85.0 611.4 null]
+/D [25 0 R /XYZ 85.0 581.347 null]
 >>
 endobj
 21 0 obj
 <<
 /S /GoTo
-/D [25 0 R /XYZ 85.0 546.647 null]
+/D [25 0 R /XYZ 85.0 516.594 null]
 >>
 endobj
 26 0 obj
@@ -295,45 +295,45 @@
 xref
 0 40
 0000000000 65535 f 
-0000006782 00000 n 
-0000006854 00000 n 
-0000006946 00000 n 
+0000006810 00000 n 
+0000006882 00000 n 
+0000006974 00000 n 
 0000000015 00000 n 
 0000000071 00000 n 
-0000000839 00000 n 
-0000000959 00000 n 
-0000001026 00000 n 
-0000007080 00000 n 
-0000001161 00000 n 
-0000007143 00000 n 
-0000001298 00000 n 
-0000007209 00000 n 
-0000001434 00000 n 
-0000007275 00000 n 
-0000001571 00000 n 
-0000007341 00000 n 
-0000001708 00000 n 
-0000007407 00000 n 
-0000001844 00000 n 
-0000007471 00000 n 
-0000001981 00000 n 
-0000003342 00000 n 
-0000003450 00000 n 
-0000004543 00000 n 
-0000007537 00000 n 
-0000004651 00000 n 
-0000004866 00000 n 
-0000005102 00000 n 
-0000005288 00000 n 
-0000005555 00000 n 
-0000005707 00000 n 
-0000005953 00000 n 
-0000006120 00000 n 
-0000006233 00000 n 
-0000006343 00000 n 
-0000006451 00000 n 
-0000006557 00000 n 
-0000006673 00000 n 
+0000000842 00000 n 
+0000000962 00000 n 
+0000001029 00000 n 
+0000007108 00000 n 
+0000001164 00000 n 
+0000007171 00000 n 
+0000001301 00000 n 
+0000007237 00000 n 
+0000001437 00000 n 
+0000007303 00000 n 
+0000001574 00000 n 
+0000007369 00000 n 
+0000001711 00000 n 
+0000007433 00000 n 
+0000001847 00000 n 
+0000007499 00000 n 
+0000001984 00000 n 
+0000003337 00000 n 
+0000003445 00000 n 
+0000004571 00000 n 
+0000007565 00000 n 
+0000004679 00000 n 
+0000004894 00000 n 
+0000005130 00000 n 
+0000005316 00000 n 
+0000005583 00000 n 
+0000005735 00000 n 
+0000005981 00000 n 
+0000006148 00000 n 
+0000006261 00000 n 
+0000006371 00000 n 
+0000006479 00000 n 
+0000006585 00000 n 
+0000006701 00000 n 
 trailer
 <<
 /Size 40
@@ -341,5 +341,5 @@
 /Info 4 0 R
 >>
 startxref
-7588
+7616
 %%EOF

Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml?rev=608852&r1=608851&r2=608852&view=diff
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/developer-resources.xml Fri Jan  4 06:29:15 2008
@@ -35,6 +35,7 @@
         <li><a href="api/contrib-spellchecker/index.html">Spellchecker</a></li>
         <li><a href="api/contrib-surround/index.html">Surround</a></li>
         <li><a href="api/contrib-swing/index.html">Swing</a></li>
+          <li><a href="api/contrib-wikipedia/index.html">Wikipedia</a></li>
         <li><a href="api/contrib-wordnet/index.html">Wordnet</a></li>
         <li><a href="api/contrib-xml-query-parser/index.html">XML Query Parser</a></li></ul></li>
       </ul>



Mime
View raw message