lucene-java-commits mailing list archives

From: gsing...@apache.org
Subject: svn commit: r614895 - in /lucene/java/trunk/contrib/wikipedia/src: java/org/apache/lucene/wikipedia/analysis/ test/org/apache/lucene/wikipedia/analysis/
Date: Thu, 24 Jan 2008 15:05:59 GMT
Author: gsingers
Date: Thu Jan 24 07:05:53 2008
New Revision: 614895

URL: http://svn.apache.org/viewvc?rev=614895&view=rev
Log:
LUCENE-1133: Adds ability to keep certain strings as single tokens
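
In short, the new three-argument constructor lets a caller choose whether wiki constructs (categories, internal/external links, italics, and so on) come back as individual tokens (TOKENS_ONLY), as one untokenized string per construct (UNTOKENIZED_ONLY), or as both. A minimal usage sketch, assuming the API added in this commit; the class name and input text are illustrative only:

import java.io.StringReader;
import java.util.HashSet;
import java.util.Set;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

public class UntokenizedCategoriesDemo {
  public static void main(String[] args) throws Exception {
    // Keep category and italics regions whole instead of splitting them.
    Set untokenizedTypes = new HashSet();
    untokenizedTypes.add(WikipediaTokenizer.CATEGORY);
    untokenizedTypes.add(WikipediaTokenizer.ITALICS);

    WikipediaTokenizer tf = new WikipediaTokenizer(
        new StringReader("[[Category:a b c d]] plain ''italic text'' here"),
        WikipediaTokenizer.UNTOKENIZED_ONLY, untokenizedTypes);

    for (Token t = tf.next(); t != null; t = tf.next()) {
      // "a b c d" comes back as a single CATEGORY token; tokens whose
      // type is not in untokenizedTypes are emitted normally.
      System.out.println(t.type() + ": "
          + new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}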

Modified:
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
    lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
    lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java

Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java?rev=614895&r1=614894&r2=614895&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizer.java Thu Jan 24 07:05:53 2008
@@ -22,17 +22,17 @@
 
 import java.io.IOException;
 import java.io.Reader;
+import java.util.*;
 
 
 /**
  * Extension of StandardTokenizer that is aware of Wikipedia syntax.  It is based off of the
  * Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
- *
+ * <p/>
  * <p/>
  * EXPERIMENTAL !!!!!!!!!
  * NOTE: This Tokenizer is considered experimental and the grammar is subject to change in the trunk and in follow up releases.
- *
- **/
+ */
 public class WikipediaTokenizer extends Tokenizer {
   public static final String INTERNAL_LINK = "il";
   public static final String EXTERNAL_LINK = "el";
@@ -45,11 +45,21 @@
   public static final String BOLD_ITALICS = "bi";
   public static final String HEADING = "h";
   public static final String SUB_HEADING = "sh";
+
+  public static final int TOKENS_ONLY = 0;
+  public static final int UNTOKENIZED_ONLY = 1;
+  public static final int BOTH = 2;
+
+  public static final int UNTOKENIZED_TOKEN_FLAG = 1;
   /**
    * A private instance of the JFlex-constructed scanner
    */
   private final WikipediaTokenizerImpl scanner;
 
+  private int tokenOutput = TOKENS_ONLY;
+  private Set untokenizedTypes = Collections.EMPTY_SET;
+  private Iterator tokens = null;
+
   void setInput(Reader reader) {
     this.input = reader;
   }
@@ -57,11 +67,19 @@
   /**
    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
    * <code>input</code> to a newly created JFlex scanner.
+   *
    * @param input The Input Reader
    */
   public WikipediaTokenizer(Reader input) {
-    this.input = input;
+    this(input, TOKENS_ONLY, Collections.EMPTY_SET);
+  }
+
+
+  public WikipediaTokenizer(Reader input, int tokenOutput, Set untokenizedTypes) {
+    super(input);
+    this.tokenOutput = tokenOutput;
     this.scanner = new WikipediaTokenizerImpl(input);
+    this.untokenizedTypes = untokenizedTypes;
   }
 
   /*
@@ -70,19 +88,116 @@
   * @see org.apache.lucene.analysis.TokenStream#next()
   */
   public Token next(Token result) throws IOException {
+    if (tokens != null && tokens.hasNext()){
+      return (Token)tokens.next();
+    }
     int tokenType = scanner.getNextToken();
 
     if (tokenType == WikipediaTokenizerImpl.YYEOF) {
       return null;
     }
+    String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
+    if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
+      setupToken(result);
+    } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
+      collapseTokens(result, tokenType);
+
+    }
+    else if (tokenOutput == BOTH){
+      //collapse into a single token, add it to tokens AND output the individual tokens
+      //output the untokenized Token first
+      collapseAndSaveTokens(result, tokenType, type);
+    }
+    result.setPositionIncrement(scanner.getPositionIncrement());
+    result.setType(type);
+    return result;
+  }
+
+  private void collapseAndSaveTokens(Token result, int tokenType, String type) throws IOException {
+    //collapse
+    StringBuffer buffer = new StringBuffer(32);
+    int numAdded = scanner.setText(buffer);
+    //TODO: how to know how much whitespace to add
+    int theStart = scanner.yychar();
+    int lastPos = theStart + numAdded;
+    int tmpTokType;
+    int numSeen = 0;
+    List tmp = new ArrayList();
+    Token saved = new Token();
+    setupSavedToken(saved, 0, type);
+    tmp.add(saved);
+    //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
+    while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
+      int currPos = scanner.yychar();
+      //append whitespace
+      for (int i = 0; i < (currPos - lastPos); i++){
+        buffer.append(' ');
+      }
+      numAdded = scanner.setText(buffer);
+      saved = new Token();
+      setupSavedToken(saved, scanner.getPositionIncrement(), type);
+      tmp.add(saved);
+      numSeen++;
+      lastPos = currPos + numAdded;
+    }
+    //trim the buffer
+    String s = buffer.toString().trim();
+    result.setTermBuffer(s.toCharArray(), 0, s.length());
+    result.setStartOffset(theStart);
+    result.setEndOffset(theStart + s.length());
+    result.setFlags(UNTOKENIZED_TOKEN_FLAG);
+    //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
+    if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
+      scanner.yypushback(scanner.yylength());
+    }
+    tokens = tmp.iterator();
+  }
+
+  private void setupSavedToken(Token saved, int positionInc, String type){
+    setupToken(saved);
+    saved.setPositionIncrement(positionInc);
+    saved.setType(type);
+  }
 
-    scanner.getText(result, tokenType);
+  private void collapseTokens(Token result, int tokenType) throws IOException {
+    //collapse
+    StringBuffer buffer = new StringBuffer(32);
+    int numAdded = scanner.setText(buffer);
+    //TODO: how to know how much whitespace to add
+    int theStart = scanner.yychar();
+    int lastPos = theStart + numAdded;
+    int tmpTokType;
+    int numSeen = 0;
+    //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
+    while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
+      int currPos = scanner.yychar();
+      //append whitespace
+      for (int i = 0; i < (currPos - lastPos); i++){
+        buffer.append(' ');
+      }
+      numAdded = scanner.setText(buffer);
+      numSeen++;
+      lastPos = currPos + numAdded;
+    }
+    //trim the buffer
+    String s = buffer.toString().trim();
+    result.setTermBuffer(s.toCharArray(), 0, s.length());
+    result.setStartOffset(theStart);
+    result.setEndOffset(theStart + s.length());
+    result.setFlags(UNTOKENIZED_TOKEN_FLAG);
+    //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
+    if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
+      scanner.yypushback(scanner.yylength());
+    } else {
+      tokens = null;
+    }
+  }
+
+  private void setupToken(Token result) {
+    scanner.getText(result);
     final int start = scanner.yychar();
     result.setStartOffset(start);
     result.setEndOffset(start + result.termLength());
-    result.setPositionIncrement(scanner.getPositionIncrement());
-    result.setType(WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]);
-    return result;
   }
 
   /*

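A note on collapseTokens()/collapseAndSaveTokens() above: the collapsed term keeps valid offsets because each gap between adjacent tokens is padded with exactly (currPos - lastPos) spaces before the next token text is appended, and the buffer is trimmed at the end. A standalone sketch of that arithmetic with illustrative names; the real code pulls text and positions from the JFlex scanner rather than from arrays:

public class CollapseSketch {
  // starts[i] is each piece's character offset in the original text.
  static String collapse(String[] pieces, int[] starts) {
    StringBuffer buffer = new StringBuffer(32);
    int lastPos = starts[0];
    for (int i = 0; i < pieces.length; i++) {
      for (int j = 0; j < starts[i] - lastPos; j++) {
        buffer.append(' ');            // one space per character of gap
      }
      buffer.append(pieces[i]);
      lastPos = starts[i] + pieces[i].length();
    }
    return buffer.toString().trim();   // the tokenizer trims the same way
  }

  public static void main(String[] args) {
    // Matches the first assertion in the updated test: "[[Category:a b c d]]"
    // collapses to "a b c d" with startOffset 11 and endOffset 18.
    System.out.println(collapse(new String[]{"a", "b", "c", "d"},
                                new int[]{11, 13, 15, 17}));
  }
}
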
Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java?rev=614895&r1=614894&r2=614895&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.java Thu Jan 24 07:05:53 2008
@@ -1,4 +1,4 @@
-/* The following code was generated by JFlex 1.4.1 on 1/4/08 3:30 PM */
+/* The following code was generated by JFlex 1.4.1 on 1/16/08 10:31 AM */
 
 package org.apache.lucene.wikipedia.analysis;
 
@@ -25,7 +25,7 @@
 /**
  * This class is a scanner generated by 
  * <a href="http://www.jflex.de/">JFlex</a> 1.4.1
- * on 1/4/08 3:30 PM from the specification file
+ * on 1/16/08 10:31 AM from the specification file
  * <tt>/Volumes/User/grantingersoll/projects/lucene/Lucene-Trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex</tt>
  */
 class WikipediaTokenizerImpl {
@@ -37,14 +37,14 @@
   private static final int ZZ_BUFFERSIZE = 16384;
 
   /** lexical states */
-  public static final int DOUBLE_BRACE_STATE = 7;
+  public static final int DOUBLE_BRACE_STATE = 8;
   public static final int INTERNAL_LINK_STATE = 2;
   public static final int TWO_SINGLE_QUOTES_STATE = 4;
   public static final int CATEGORY_STATE = 1;
-  public static final int FIVE_SINGLE_QUOTES_STATE = 5;
-  public static final int STRING = 8;
+  public static final int FIVE_SINGLE_QUOTES_STATE = 6;
+  public static final int STRING = 9;
   public static final int YYINITIAL = 0;
-  public static final int DOUBLE_EQUALS_STATE = 6;
+  public static final int DOUBLE_EQUALS_STATE = 7;
   public static final int THREE_SINGLE_QUOTES_STATE = 5;
   public static final int EXTERNAL_LINK_STATE = 3;
 
@@ -76,20 +76,20 @@
   private static final int [] ZZ_ACTION = zzUnpackAction();
 
   private static final String ZZ_ACTION_PACKED_0 =
-    "\11\0\4\1\4\2\1\3\1\1\1\4\2\1\1\5"+
-    "\1\1\1\6\1\1\2\7\1\10\1\11\1\10\1\12"+
-    "\1\13\1\7\1\14\1\15\1\16\1\17\1\7\1\20"+
-    "\1\7\4\21\1\22\1\21\1\23\1\24\1\25\3\0"+
-    "\1\26\14\0\1\27\1\30\1\31\1\32\1\10\1\0"+
-    "\1\33\1\0\1\34\1\0\1\35\3\0\1\36\1\37"+
-    "\2\40\1\37\2\41\2\0\1\40\1\0\14\40\1\37"+
-    "\3\0\1\10\1\42\3\0\1\43\1\44\5\0\1\45"+
-    "\4\0\1\45\2\0\2\45\2\0\1\10\5\0\1\30"+
-    "\1\37\1\40\1\46\3\0\1\10\2\0\1\47\30\0"+
-    "\1\50\2\0\1\51\1\52\1\53";
+    "\12\0\4\1\4\2\1\3\1\1\1\4\1\1\2\5"+
+    "\1\6\2\5\1\7\1\5\2\10\1\11\1\12\1\11"+
+    "\1\13\1\14\1\10\1\15\1\16\1\15\1\17\1\20"+
+    "\1\10\1\21\1\10\4\22\1\23\1\22\1\24\1\25"+
+    "\1\26\3\0\1\27\14\0\1\30\1\31\1\32\1\33"+
+    "\1\11\1\0\1\34\1\35\1\0\1\36\1\0\1\37"+
+    "\3\0\1\40\1\41\2\42\1\41\2\43\2\0\1\42"+
+    "\1\0\14\42\1\41\3\0\1\11\1\44\3\0\1\45"+
+    "\1\46\5\0\1\47\4\0\1\47\2\0\2\47\2\0"+
+    "\1\11\5\0\1\31\1\41\1\42\1\50\3\0\1\11"+
+    "\2\0\1\51\30\0\1\52\2\0\1\53\1\54\1\55";
 
   private static int [] zzUnpackAction() {
-    int [] result = new int[178];
+    int [] result = new int[183];
     int offset = 0;
     offset = zzUnpackAction(ZZ_ACTION_PACKED_0, offset, result);
     return result;
@@ -116,30 +116,30 @@
   private static final String ZZ_ROWMAP_PACKED_0 =
     "\0\0\0\54\0\130\0\204\0\260\0\334\0\u0108\0\u0134"+
     "\0\u0160\0\u018c\0\u01b8\0\u01e4\0\u0210\0\u023c\0\u0268\0\u0294"+
-    "\0\u02c0\0\u018c\0\u02ec\0\u0318\0\u0344\0\u0370\0\u039c\0\u03c8"+
-    "\0\u03f4\0\u0420\0\u018c\0\u0370\0\u044c\0\u018c\0\u0478\0\u04a4"+
-    "\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8\0\u0604"+
-    "\0\u0630\0\u018c\0\u065c\0\u0370\0\u0688\0\u06b4\0\u06e0\0\u070c"+
-    "\0\u018c\0\u018c\0\u0738\0\u0764\0\u0790\0\u018c\0\u07bc\0\u07e8"+
-    "\0\u0814\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c\0\u0948"+
-    "\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u018c\0\u018c\0\u0a24\0\u0a50"+
-    "\0\u0a7c\0\u0aa8\0\u0ad4\0\u0b00\0\u0b2c\0\u0b58\0\u0b84\0\u0bb0"+
-    "\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0814\0\u0cb8\0\u0ce4"+
-    "\0\u0d10\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec\0\u0e18\0\u0e44"+
-    "\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c\0\u0f78\0\u0fa4"+
-    "\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u018c\0\u1080\0\u10ac\0\u10d8"+
-    "\0\u1104\0\u018c\0\u1130\0\u115c\0\u1188\0\u11b4\0\u11e0\0\u120c"+
-    "\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314\0\u1340\0\u07e8"+
-    "\0\u0974\0\u136c\0\u1398\0\u13c4\0\u13f0\0\u141c\0\u1448\0\u1474"+
-    "\0\u14a0\0\u018c\0\u14cc\0\u14f8\0\u1524\0\u1550\0\u157c\0\u15a8"+
-    "\0\u15d4\0\u1600\0\u162c\0\u018c\0\u1658\0\u1684\0\u16b0\0\u16dc"+
-    "\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4\0\u1810\0\u183c"+
-    "\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944\0\u1970\0\u199c"+
-    "\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4\0\u1ad0\0\u018c"+
-    "\0\u018c\0\u018c";
+    "\0\u02c0\0\u02ec\0\u01b8\0\u0318\0\u0344\0\u0370\0\u01b8\0\u039c"+
+    "\0\u03c8\0\u03f4\0\u0420\0\u044c\0\u0478\0\u01b8\0\u039c\0\u04a4"+
+    "\0\u01b8\0\u04d0\0\u04fc\0\u0528\0\u0554\0\u0580\0\u05ac\0\u05d8"+
+    "\0\u0604\0\u0630\0\u065c\0\u0688\0\u06b4\0\u01b8\0\u06e0\0\u039c"+
+    "\0\u070c\0\u0738\0\u0764\0\u0790\0\u01b8\0\u01b8\0\u07bc\0\u07e8"+
+    "\0\u0814\0\u01b8\0\u0840\0\u086c\0\u0898\0\u08c4\0\u08f0\0\u091c"+
+    "\0\u0948\0\u0974\0\u09a0\0\u09cc\0\u09f8\0\u0a24\0\u0a50\0\u0a7c"+
+    "\0\u01b8\0\u01b8\0\u0aa8\0\u0ad4\0\u0b00\0\u0b00\0\u0b2c\0\u0b58"+
+    "\0\u0b84\0\u0bb0\0\u0bdc\0\u0c08\0\u0c34\0\u0c60\0\u0c8c\0\u0cb8"+
+    "\0\u0ce4\0\u0d10\0\u0898\0\u0d3c\0\u0d68\0\u0d94\0\u0dc0\0\u0dec"+
+    "\0\u0e18\0\u0e44\0\u0e70\0\u0e9c\0\u0ec8\0\u0ef4\0\u0f20\0\u0f4c"+
+    "\0\u0f78\0\u0fa4\0\u0fd0\0\u0ffc\0\u1028\0\u1054\0\u1080\0\u10ac"+
+    "\0\u10d8\0\u01b8\0\u1104\0\u1130\0\u115c\0\u1188\0\u01b8\0\u11b4"+
+    "\0\u11e0\0\u120c\0\u1238\0\u1264\0\u1290\0\u12bc\0\u12e8\0\u1314"+
+    "\0\u1340\0\u136c\0\u1398\0\u13c4\0\u086c\0\u09f8\0\u13f0\0\u141c"+
+    "\0\u1448\0\u1474\0\u14a0\0\u14cc\0\u14f8\0\u1524\0\u01b8\0\u1550"+
+    "\0\u157c\0\u15a8\0\u15d4\0\u1600\0\u162c\0\u1658\0\u1684\0\u16b0"+
+    "\0\u01b8\0\u16dc\0\u1708\0\u1734\0\u1760\0\u178c\0\u17b8\0\u17e4"+
+    "\0\u1810\0\u183c\0\u1868\0\u1894\0\u18c0\0\u18ec\0\u1918\0\u1944"+
+    "\0\u1970\0\u199c\0\u19c8\0\u19f4\0\u1a20\0\u1a4c\0\u1a78\0\u1aa4"+
+    "\0\u1ad0\0\u1afc\0\u1b28\0\u1b54\0\u01b8\0\u01b8\0\u01b8";
 
   private static int [] zzUnpackRowMap() {
-    int [] result = new int[178];
+    int [] result = new int[183];
     int offset = 0;
     offset = zzUnpackRowMap(ZZ_ROWMAP_PACKED_0, offset, result);
     return result;
@@ -162,151 +162,153 @@
   private static final int [] ZZ_TRANS = zzUnpackTrans();
 
   private static final String ZZ_TRANS_PACKED_0 =
-    "\1\12\1\13\5\12\1\14\1\12\1\15\3\12\1\16"+
-    "\1\17\1\20\1\21\1\22\1\23\2\12\1\24\2\12"+
-    "\15\16\1\25\2\12\3\16\10\12\1\26\5\12\4\27"+
-    "\1\12\1\23\3\12\1\30\1\12\15\27\3\12\3\27"+
-    "\10\12\1\26\5\12\4\31\1\12\1\23\3\12\1\32"+
-    "\1\12\15\31\3\12\3\31\1\12\7\33\1\34\5\33"+
-    "\4\35\1\33\1\23\2\12\1\33\1\36\1\33\15\35"+
-    "\3\33\1\37\2\35\2\33\1\40\5\33\1\34\5\33"+
-    "\4\41\1\33\1\42\2\33\1\43\2\33\15\41\3\33"+
-    "\3\41\10\33\1\34\5\33\4\44\1\33\1\42\2\33"+
-    "\1\43\2\33\15\44\3\33\3\44\10\33\1\34\1\33"+
-    "\1\45\3\33\4\46\1\33\1\42\5\33\15\46\3\33"+
-    "\3\46\10\33\1\47\5\33\4\50\1\33\1\42\5\33"+
-    "\15\50\1\33\1\51\1\33\3\50\1\33\1\52\1\53"+
-    "\5\52\1\54\1\52\1\55\3\52\4\56\1\52\1\57"+
-    "\2\52\1\60\2\52\15\56\2\52\1\61\3\56\1\52"+
-    "\55\0\1\62\62\0\1\63\4\0\4\64\7\0\6\64"+
-    "\1\65\6\64\3\0\3\64\12\0\1\66\43\0\1\67"+
-    "\1\70\1\71\1\72\2\73\1\0\1\74\3\0\1\74"+
-    "\1\16\1\17\1\20\1\21\7\0\15\16\3\0\3\16"+
-    "\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
-    "\1\100\3\17\1\21\7\0\15\17\3\0\3\17\2\0"+
-    "\1\67\1\101\1\71\1\72\2\77\1\0\1\100\3\0"+
-    "\1\100\1\20\1\17\1\20\1\21\7\0\15\20\3\0"+
-    "\3\20\3\0\1\102\1\0\1\76\2\73\1\0\1\74"+
-    "\3\0\1\74\4\21\7\0\15\21\3\0\3\21\24\0"+
-    "\1\12\55\0\1\103\73\0\1\104\16\0\1\63\4\0"+
-    "\4\64\7\0\15\64\3\0\3\64\16\0\4\27\7\0"+
-    "\15\27\3\0\3\27\27\0\1\105\42\0\4\31\7\0"+
-    "\15\31\3\0\3\31\27\0\1\106\42\0\4\35\7\0"+
-    "\15\35\3\0\3\35\16\0\4\35\7\0\2\35\1\107"+
-    "\12\35\3\0\3\35\2\0\1\110\67\0\4\41\7\0"+
-    "\15\41\3\0\3\41\24\0\1\33\55\0\1\111\43\0"+
-    "\4\44\7\0\15\44\3\0\3\44\12\0\1\105\57\0"+
-    "\4\46\7\0\15\46\3\0\3\46\11\0\1\112\4\0"+
-    "\4\64\7\0\15\64\3\0\3\64\16\0\4\50\7\0"+
-    "\15\50\3\0\3\50\47\0\1\105\6\0\1\113\63\0"+
-    "\1\114\57\0\4\56\7\0\15\56\3\0\3\56\24\0"+
-    "\1\52\55\0\1\115\43\0\4\64\7\0\15\64\3\0"+
-    "\3\64\14\0\1\33\1\0\4\116\1\0\3\117\3\0"+
-    "\15\116\3\0\3\116\14\0\1\33\1\0\4\116\1\0"+
-    "\3\117\3\0\3\116\1\120\11\116\3\0\3\116\16\0"+
-    "\1\121\1\0\1\121\10\0\15\121\3\0\3\121\16\0"+
-    "\1\122\1\123\1\124\1\125\7\0\15\122\3\0\3\122"+
+    "\1\13\1\14\5\13\1\15\1\13\1\16\3\13\1\17"+
+    "\1\20\1\21\1\22\1\23\1\24\2\13\1\25\2\13"+
+    "\15\17\1\26\2\13\3\17\1\13\7\27\1\30\5\27"+
+    "\4\31\1\27\1\32\3\27\1\33\1\27\15\31\3\27"+
+    "\3\31\10\27\1\30\5\27\4\34\1\27\1\32\3\27"+
+    "\1\35\1\27\15\34\3\27\3\34\1\27\7\36\1\37"+
+    "\5\36\4\40\1\36\1\32\2\27\1\36\1\41\1\36"+
+    "\15\40\3\36\1\42\2\40\2\36\1\43\5\36\1\37"+
+    "\5\36\4\44\1\36\1\45\2\36\1\46\2\36\15\44"+
+    "\3\36\3\44\10\36\1\37\5\36\4\47\1\36\1\45"+
+    "\2\36\1\46\2\36\15\47\3\36\3\47\10\36\1\37"+
+    "\5\36\4\47\1\36\1\45\2\36\1\50\2\36\15\47"+
+    "\3\36\3\47\10\36\1\37\1\36\1\51\3\36\4\52"+
+    "\1\36\1\45\5\36\15\52\3\36\3\52\10\36\1\53"+
+    "\5\36\4\54\1\36\1\45\5\36\15\54\1\36\1\55"+
+    "\1\36\3\54\1\36\1\56\1\57\5\56\1\60\1\56"+
+    "\1\61\3\56\4\62\1\56\1\63\2\56\1\64\2\56"+
+    "\15\62\2\56\1\65\3\62\1\56\55\0\1\66\62\0"+
+    "\1\67\4\0\4\70\7\0\6\70\1\71\6\70\3\0"+
+    "\3\70\12\0\1\72\43\0\1\73\1\74\1\75\1\76"+
+    "\2\77\1\0\1\100\3\0\1\100\1\17\1\20\1\21"+
+    "\1\22\7\0\15\17\3\0\3\17\3\0\1\101\1\0"+
+    "\1\102\2\103\1\0\1\104\3\0\1\104\3\20\1\22"+
+    "\7\0\15\20\3\0\3\20\2\0\1\73\1\105\1\75"+
+    "\1\76\2\103\1\0\1\104\3\0\1\104\1\21\1\20"+
+    "\1\21\1\22\7\0\15\21\3\0\3\21\3\0\1\106"+
+    "\1\0\1\102\2\77\1\0\1\100\3\0\1\100\4\22"+
+    "\7\0\15\22\3\0\3\22\24\0\1\13\55\0\1\107"+
+    "\73\0\1\110\16\0\1\67\4\0\4\70\7\0\15\70"+
+    "\3\0\3\70\16\0\4\31\7\0\15\31\3\0\3\31"+
+    "\24\0\1\27\56\0\1\111\42\0\4\34\7\0\15\34"+
+    "\3\0\3\34\27\0\1\112\42\0\4\40\7\0\15\40"+
+    "\3\0\3\40\16\0\4\40\7\0\2\40\1\113\12\40"+
+    "\3\0\3\40\2\0\1\114\67\0\4\44\7\0\15\44"+
+    "\3\0\3\44\24\0\1\36\55\0\1\115\43\0\4\47"+
+    "\7\0\15\47\3\0\3\47\26\0\1\116\37\0\1\111"+
+    "\57\0\4\52\7\0\15\52\3\0\3\52\11\0\1\117"+
+    "\4\0\4\70\7\0\15\70\3\0\3\70\16\0\4\54"+
+    "\7\0\15\54\3\0\3\54\47\0\1\111\6\0\1\120"+
+    "\63\0\1\121\57\0\4\62\7\0\15\62\3\0\3\62"+
+    "\24\0\1\56\55\0\1\122\43\0\4\70\7\0\15\70"+
+    "\3\0\3\70\14\0\1\36\1\0\4\123\1\0\3\124"+
+    "\3\0\15\123\3\0\3\123\14\0\1\36\1\0\4\123"+
+    "\1\0\3\124\3\0\3\123\1\125\11\123\3\0\3\123"+
     "\16\0\1\126\1\0\1\126\10\0\15\126\3\0\3\126"+
-    "\16\0\1\127\1\130\1\127\1\130\7\0\15\127\3\0"+
-    "\3\127\16\0\1\131\2\132\1\133\7\0\15\131\3\0"+
-    "\3\131\16\0\1\74\2\134\10\0\15\74\3\0\3\74"+
-    "\16\0\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
-    "\16\0\4\130\7\0\15\130\3\0\3\130\16\0\1\140"+
-    "\2\141\1\142\7\0\15\140\3\0\3\140\16\0\1\143"+
-    "\2\144\1\145\7\0\15\143\3\0\3\143\16\0\1\146"+
-    "\1\136\1\147\1\137\7\0\15\146\3\0\3\146\16\0"+
-    "\1\150\2\123\1\125\7\0\15\150\3\0\3\150\30\0"+
-    "\1\151\1\152\64\0\1\153\27\0\4\35\7\0\2\35"+
-    "\1\154\12\35\3\0\3\35\2\0\1\155\101\0\1\156"+
-    "\1\157\40\0\4\64\7\0\6\64\1\160\6\64\3\0"+
-    "\3\64\2\0\1\161\63\0\1\162\71\0\1\163\1\164"+
-    "\34\0\1\165\1\0\1\33\1\0\4\116\1\0\3\117"+
-    "\3\0\15\116\3\0\3\116\16\0\4\166\1\0\3\117"+
-    "\3\0\15\166\3\0\3\166\12\0\1\165\1\0\1\33"+
-    "\1\0\4\116\1\0\3\117\3\0\10\116\1\167\4\116"+
-    "\3\0\3\116\2\0\1\67\13\0\1\121\1\0\1\121"+
-    "\10\0\15\121\3\0\3\121\3\0\1\170\1\0\1\76"+
-    "\2\171\6\0\1\122\1\123\1\124\1\125\7\0\15\122"+
-    "\3\0\3\122\3\0\1\172\1\0\1\76\2\173\1\0"+
-    "\1\174\3\0\1\174\3\123\1\125\7\0\15\123\3\0"+
-    "\3\123\3\0\1\175\1\0\1\76\2\173\1\0\1\174"+
-    "\3\0\1\174\1\124\1\123\1\124\1\125\7\0\15\124"+
-    "\3\0\3\124\3\0\1\176\1\0\1\76\2\171\6\0"+
-    "\4\125\7\0\15\125\3\0\3\125\3\0\1\177\2\0"+
-    "\1\177\7\0\1\127\1\130\1\127\1\130\7\0\15\127"+
-    "\3\0\3\127\3\0\1\177\2\0\1\177\7\0\4\130"+
-    "\7\0\15\130\3\0\3\130\3\0\1\171\1\0\1\76"+
-    "\2\171\6\0\1\131\2\132\1\133\7\0\15\131\3\0"+
-    "\3\131\3\0\1\173\1\0\1\76\2\173\1\0\1\174"+
-    "\3\0\1\174\3\132\1\133\7\0\15\132\3\0\3\132"+
-    "\3\0\1\171\1\0\1\76\2\171\6\0\4\133\7\0"+
-    "\15\133\3\0\3\133\3\0\1\174\2\0\2\174\1\0"+
-    "\1\174\3\0\1\174\3\134\10\0\15\134\3\0\3\134"+
-    "\3\0\1\102\1\0\1\76\2\73\1\0\1\74\3\0"+
-    "\1\74\1\135\2\136\1\137\7\0\15\135\3\0\3\135"+
-    "\3\0\1\75\1\0\1\76\2\77\1\0\1\100\3\0"+
-    "\1\100\3\136\1\137\7\0\15\136\3\0\3\136\3\0"+
-    "\1\102\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
-    "\4\137\7\0\15\137\3\0\3\137\3\0\1\73\1\0"+
-    "\1\76\2\73\1\0\1\74\3\0\1\74\1\140\2\141"+
-    "\1\142\7\0\15\140\3\0\3\140\3\0\1\77\1\0"+
-    "\1\76\2\77\1\0\1\100\3\0\1\100\3\141\1\142"+
-    "\7\0\15\141\3\0\3\141\3\0\1\73\1\0\1\76"+
-    "\2\73\1\0\1\74\3\0\1\74\4\142\7\0\15\142"+
-    "\3\0\3\142\3\0\1\74\2\0\2\74\1\0\1\74"+
-    "\3\0\1\74\1\143\2\144\1\145\7\0\15\143\3\0"+
-    "\3\143\3\0\1\100\2\0\2\100\1\0\1\100\3\0"+
-    "\1\100\3\144\1\145\7\0\15\144\3\0\3\144\3\0"+
-    "\1\74\2\0\2\74\1\0\1\74\3\0\1\74\4\145"+
-    "\7\0\15\145\3\0\3\145\3\0\1\200\1\0\1\76"+
-    "\2\73\1\0\1\74\3\0\1\74\1\146\1\136\1\147"+
-    "\1\137\7\0\15\146\3\0\3\146\3\0\1\201\1\0"+
-    "\1\76\2\77\1\0\1\100\3\0\1\100\1\147\1\136"+
-    "\1\147\1\137\7\0\15\147\3\0\3\147\3\0\1\176"+
-    "\1\0\1\76\2\171\6\0\1\150\2\123\1\125\7\0"+
-    "\15\150\3\0\3\150\31\0\1\152\54\0\1\202\64\0"+
-    "\1\203\26\0\4\35\7\0\15\35\3\0\1\35\1\204"+
-    "\1\35\31\0\1\157\54\0\1\205\35\0\1\33\1\0"+
-    "\4\116\1\0\3\117\3\0\3\116\1\206\11\116\3\0"+
-    "\3\116\2\0\1\207\102\0\1\164\54\0\1\210\34\0"+
-    "\1\211\52\0\1\165\3\0\4\166\7\0\15\166\3\0"+
-    "\3\166\12\0\1\165\1\0\1\212\1\0\4\116\1\0"+
-    "\3\117\3\0\15\116\3\0\3\116\16\0\1\213\1\125"+
-    "\1\213\1\125\7\0\15\213\3\0\3\213\16\0\4\133"+
-    "\7\0\15\133\3\0\3\133\16\0\4\137\7\0\15\137"+
-    "\3\0\3\137\16\0\4\142\7\0\15\142\3\0\3\142"+
-    "\16\0\4\145\7\0\15\145\3\0\3\145\16\0\1\214"+
-    "\1\137\1\214\1\137\7\0\15\214\3\0\3\214\16\0"+
-    "\4\125\7\0\15\125\3\0\3\125\16\0\4\215\7\0"+
-    "\15\215\3\0\3\215\33\0\1\216\61\0\1\217\30\0"+
-    "\4\35\6\0\1\220\15\35\3\0\2\35\1\221\33\0"+
-    "\1\222\32\0\1\165\1\0\1\33\1\0\4\116\1\0"+
-    "\3\117\3\0\10\116\1\223\4\116\3\0\3\116\2\0"+
-    "\1\224\104\0\1\225\36\0\4\226\7\0\15\226\3\0"+
-    "\3\226\3\0\1\170\1\0\1\76\2\171\6\0\1\213"+
-    "\1\125\1\213\1\125\7\0\15\213\3\0\3\213\3\0"+
-    "\1\200\1\0\1\76\2\73\1\0\1\74\3\0\1\74"+
-    "\1\214\1\137\1\214\1\137\7\0\15\214\3\0\3\214"+
-    "\3\0\1\177\2\0\1\177\7\0\4\215\7\0\15\215"+
-    "\3\0\3\215\34\0\1\227\55\0\1\230\26\0\1\231"+
-    "\60\0\4\35\6\0\1\220\15\35\3\0\3\35\34\0"+
-    "\1\232\31\0\1\165\1\0\1\105\1\0\4\116\1\0"+
-    "\3\117\3\0\15\116\3\0\3\116\34\0\1\233\32\0"+
-    "\1\234\2\0\4\226\7\0\15\226\3\0\3\226\35\0"+
-    "\1\235\62\0\1\236\20\0\1\237\77\0\1\240\53\0"+
-    "\1\241\32\0\1\33\1\0\4\166\1\0\3\117\3\0"+
-    "\15\166\3\0\3\166\36\0\1\242\53\0\1\243\33\0"+
-    "\4\244\7\0\15\244\3\0\3\244\36\0\1\245\53\0"+
-    "\1\246\54\0\1\247\61\0\1\250\11\0\1\251\12\0"+
-    "\4\244\7\0\15\244\3\0\3\244\37\0\1\252\53\0"+
-    "\1\253\54\0\1\254\22\0\1\12\62\0\4\255\7\0"+
-    "\15\255\3\0\3\255\40\0\1\256\53\0\1\257\43\0"+
-    "\1\260\26\0\2\255\1\0\2\255\1\0\2\255\2\0"+
-    "\5\255\7\0\15\255\3\0\4\255\27\0\1\261\53\0"+
-    "\1\262\24\0";
+    "\16\0\1\127\1\130\1\131\1\132\7\0\15\127\3\0"+
+    "\3\127\16\0\1\133\1\0\1\133\10\0\15\133\3\0"+
+    "\3\133\16\0\1\134\1\135\1\134\1\135\7\0\15\134"+
+    "\3\0\3\134\16\0\1\136\2\137\1\140\7\0\15\136"+
+    "\3\0\3\136\16\0\1\100\2\141\10\0\15\100\3\0"+
+    "\3\100\16\0\1\142\2\143\1\144\7\0\15\142\3\0"+
+    "\3\142\16\0\4\135\7\0\15\135\3\0\3\135\16\0"+
+    "\1\145\2\146\1\147\7\0\15\145\3\0\3\145\16\0"+
+    "\1\150\2\151\1\152\7\0\15\150\3\0\3\150\16\0"+
+    "\1\153\1\143\1\154\1\144\7\0\15\153\3\0\3\153"+
+    "\16\0\1\155\2\130\1\132\7\0\15\155\3\0\3\155"+
+    "\30\0\1\156\1\157\64\0\1\160\27\0\4\40\7\0"+
+    "\2\40\1\161\12\40\3\0\3\40\2\0\1\162\101\0"+
+    "\1\163\1\164\40\0\4\70\7\0\6\70\1\165\6\70"+
+    "\3\0\3\70\2\0\1\166\63\0\1\167\71\0\1\170"+
+    "\1\171\34\0\1\172\1\0\1\36\1\0\4\123\1\0"+
+    "\3\124\3\0\15\123\3\0\3\123\16\0\4\173\1\0"+
+    "\3\124\3\0\15\173\3\0\3\173\12\0\1\172\1\0"+
+    "\1\36\1\0\4\123\1\0\3\124\3\0\10\123\1\174"+
+    "\4\123\3\0\3\123\2\0\1\73\13\0\1\126\1\0"+
+    "\1\126\10\0\15\126\3\0\3\126\3\0\1\175\1\0"+
+    "\1\102\2\176\6\0\1\127\1\130\1\131\1\132\7\0"+
+    "\15\127\3\0\3\127\3\0\1\177\1\0\1\102\2\200"+
+    "\1\0\1\201\3\0\1\201\3\130\1\132\7\0\15\130"+
+    "\3\0\3\130\3\0\1\202\1\0\1\102\2\200\1\0"+
+    "\1\201\3\0\1\201\1\131\1\130\1\131\1\132\7\0"+
+    "\15\131\3\0\3\131\3\0\1\203\1\0\1\102\2\176"+
+    "\6\0\4\132\7\0\15\132\3\0\3\132\3\0\1\204"+
+    "\2\0\1\204\7\0\1\134\1\135\1\134\1\135\7\0"+
+    "\15\134\3\0\3\134\3\0\1\204\2\0\1\204\7\0"+
+    "\4\135\7\0\15\135\3\0\3\135\3\0\1\176\1\0"+
+    "\1\102\2\176\6\0\1\136\2\137\1\140\7\0\15\136"+
+    "\3\0\3\136\3\0\1\200\1\0\1\102\2\200\1\0"+
+    "\1\201\3\0\1\201\3\137\1\140\7\0\15\137\3\0"+
+    "\3\137\3\0\1\176\1\0\1\102\2\176\6\0\4\140"+
+    "\7\0\15\140\3\0\3\140\3\0\1\201\2\0\2\201"+
+    "\1\0\1\201\3\0\1\201\3\141\10\0\15\141\3\0"+
+    "\3\141\3\0\1\106\1\0\1\102\2\77\1\0\1\100"+
+    "\3\0\1\100\1\142\2\143\1\144\7\0\15\142\3\0"+
+    "\3\142\3\0\1\101\1\0\1\102\2\103\1\0\1\104"+
+    "\3\0\1\104\3\143\1\144\7\0\15\143\3\0\3\143"+
+    "\3\0\1\106\1\0\1\102\2\77\1\0\1\100\3\0"+
+    "\1\100\4\144\7\0\15\144\3\0\3\144\3\0\1\77"+
+    "\1\0\1\102\2\77\1\0\1\100\3\0\1\100\1\145"+
+    "\2\146\1\147\7\0\15\145\3\0\3\145\3\0\1\103"+
+    "\1\0\1\102\2\103\1\0\1\104\3\0\1\104\3\146"+
+    "\1\147\7\0\15\146\3\0\3\146\3\0\1\77\1\0"+
+    "\1\102\2\77\1\0\1\100\3\0\1\100\4\147\7\0"+
+    "\15\147\3\0\3\147\3\0\1\100\2\0\2\100\1\0"+
+    "\1\100\3\0\1\100\1\150\2\151\1\152\7\0\15\150"+
+    "\3\0\3\150\3\0\1\104\2\0\2\104\1\0\1\104"+
+    "\3\0\1\104\3\151\1\152\7\0\15\151\3\0\3\151"+
+    "\3\0\1\100\2\0\2\100\1\0\1\100\3\0\1\100"+
+    "\4\152\7\0\15\152\3\0\3\152\3\0\1\205\1\0"+
+    "\1\102\2\77\1\0\1\100\3\0\1\100\1\153\1\143"+
+    "\1\154\1\144\7\0\15\153\3\0\3\153\3\0\1\206"+
+    "\1\0\1\102\2\103\1\0\1\104\3\0\1\104\1\154"+
+    "\1\143\1\154\1\144\7\0\15\154\3\0\3\154\3\0"+
+    "\1\203\1\0\1\102\2\176\6\0\1\155\2\130\1\132"+
+    "\7\0\15\155\3\0\3\155\31\0\1\157\54\0\1\207"+
+    "\64\0\1\210\26\0\4\40\7\0\15\40\3\0\1\40"+
+    "\1\211\1\40\31\0\1\164\54\0\1\212\35\0\1\36"+
+    "\1\0\4\123\1\0\3\124\3\0\3\123\1\213\11\123"+
+    "\3\0\3\123\2\0\1\214\102\0\1\171\54\0\1\215"+
+    "\34\0\1\216\52\0\1\172\3\0\4\173\7\0\15\173"+
+    "\3\0\3\173\12\0\1\172\1\0\1\217\1\0\4\123"+
+    "\1\0\3\124\3\0\15\123\3\0\3\123\16\0\1\220"+
+    "\1\132\1\220\1\132\7\0\15\220\3\0\3\220\16\0"+
+    "\4\140\7\0\15\140\3\0\3\140\16\0\4\144\7\0"+
+    "\15\144\3\0\3\144\16\0\4\147\7\0\15\147\3\0"+
+    "\3\147\16\0\4\152\7\0\15\152\3\0\3\152\16\0"+
+    "\1\221\1\144\1\221\1\144\7\0\15\221\3\0\3\221"+
+    "\16\0\4\132\7\0\15\132\3\0\3\132\16\0\4\222"+
+    "\7\0\15\222\3\0\3\222\33\0\1\223\61\0\1\224"+
+    "\30\0\4\40\6\0\1\225\15\40\3\0\2\40\1\226"+
+    "\33\0\1\227\32\0\1\172\1\0\1\36\1\0\4\123"+
+    "\1\0\3\124\3\0\10\123\1\230\4\123\3\0\3\123"+
+    "\2\0\1\231\104\0\1\232\36\0\4\233\7\0\15\233"+
+    "\3\0\3\233\3\0\1\175\1\0\1\102\2\176\6\0"+
+    "\1\220\1\132\1\220\1\132\7\0\15\220\3\0\3\220"+
+    "\3\0\1\205\1\0\1\102\2\77\1\0\1\100\3\0"+
+    "\1\100\1\221\1\144\1\221\1\144\7\0\15\221\3\0"+
+    "\3\221\3\0\1\204\2\0\1\204\7\0\4\222\7\0"+
+    "\15\222\3\0\3\222\34\0\1\234\55\0\1\235\26\0"+
+    "\1\236\60\0\4\40\6\0\1\225\15\40\3\0\3\40"+
+    "\34\0\1\237\31\0\1\172\1\0\1\111\1\0\4\123"+
+    "\1\0\3\124\3\0\15\123\3\0\3\123\34\0\1\240"+
+    "\32\0\1\241\2\0\4\233\7\0\15\233\3\0\3\233"+
+    "\35\0\1\242\62\0\1\243\20\0\1\244\77\0\1\245"+
+    "\53\0\1\246\32\0\1\36\1\0\4\173\1\0\3\124"+
+    "\3\0\15\173\3\0\3\173\36\0\1\247\53\0\1\250"+
+    "\33\0\4\251\7\0\15\251\3\0\3\251\36\0\1\252"+
+    "\53\0\1\253\54\0\1\254\61\0\1\255\11\0\1\256"+
+    "\12\0\4\251\7\0\15\251\3\0\3\251\37\0\1\257"+
+    "\53\0\1\260\54\0\1\261\22\0\1\13\62\0\4\262"+
+    "\7\0\15\262\3\0\3\262\40\0\1\263\53\0\1\264"+
+    "\43\0\1\265\26\0\2\262\1\0\2\262\1\0\2\262"+
+    "\2\0\5\262\7\0\15\262\3\0\4\262\27\0\1\266"+
+    "\53\0\1\267\24\0";
 
   private static int [] zzUnpackTrans() {
-    int [] result = new int[6908];
+    int [] result = new int[7040];
     int offset = 0;
     offset = zzUnpackTrans(ZZ_TRANS_PACKED_0, offset, result);
     return result;
@@ -344,16 +346,17 @@
   private static final int [] ZZ_ATTRIBUTE = zzUnpackAttribute();
 
   private static final String ZZ_ATTRIBUTE_PACKED_0 =
-    "\11\0\1\11\7\1\1\11\10\1\1\11\2\1\1\11"+
-    "\13\1\1\11\6\1\2\11\3\0\1\11\14\0\2\1"+
-    "\2\11\1\1\1\0\1\1\1\0\1\1\1\0\1\1"+
-    "\3\0\7\1\2\0\1\1\1\0\15\1\3\0\1\1"+
-    "\1\11\3\0\1\1\1\11\5\0\1\1\4\0\1\1"+
-    "\2\0\2\1\2\0\1\1\5\0\1\11\3\1\3\0"+
-    "\1\1\2\0\1\11\30\0\1\1\2\0\3\11";
+    "\12\0\1\11\7\1\1\11\3\1\1\11\6\1\1\11"+
+    "\2\1\1\11\14\1\1\11\6\1\2\11\3\0\1\11"+
+    "\14\0\2\1\2\11\1\1\1\0\2\1\1\0\1\1"+
+    "\1\0\1\1\3\0\7\1\2\0\1\1\1\0\15\1"+
+    "\3\0\1\1\1\11\3\0\1\1\1\11\5\0\1\1"+
+    "\4\0\1\1\2\0\2\1\2\0\1\1\5\0\1\11"+
+    "\3\1\3\0\1\1\2\0\1\11\30\0\1\1\2\0"+
+    "\3\11";
 
   private static int [] zzUnpackAttribute() {
-    int [] result = new int[178];
+    int [] result = new int[183];
     int offset = 0;
     offset = zzUnpackAttribute(ZZ_ATTRIBUTE_PACKED_0, offset, result);
     return result;
@@ -446,6 +449,10 @@
 private int numBalanced = 0;
 private int positionInc = 1;
 private int numLinkToks = 0;
+//Anytime we start on a new Wiki reserved token (category, link, etc.) this value will be 0, otherwise it will be the number of tokens seen
+//this can be useful for detecting when a new reserved token is encountered
+//see https://issues.apache.org/jira/browse/LUCENE-1133
+private int numWikiTokensSeen = 0;
 
 public static final String [] TOKEN_TYPES = new String [] {
     "<ALPHANUM>",
@@ -468,6 +475,14 @@
     WikipediaTokenizer.EXTERNAL_LINK_URL
 };
 
+/**
+Returns the number of tokens seen inside a category or link, etc.
+@return the number of tokens seen inside the context of wiki syntax.
+**/
+public final int getNumWikiTokensSeen(){
+  return numWikiTokensSeen;
+}
+
 public final int yychar()
 {
     return yychar;
@@ -480,10 +495,18 @@
 /**
  * Fills Lucene token with the current token text.
  */
-final void getText(Token t, int tokType) {
+final void getText(Token t) {
   t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
 
+final int setText(StringBuffer buffer){
+  int length = zzMarkedPos - zzStartRead;
+  buffer.append(zzBuffer, zzStartRead, length);
+  return length;
+}
+
+
+
 
   /**
    * Creates a new scanner
@@ -774,178 +797,186 @@
       zzMarkedPos = zzMarkedPosL;
 
       switch (zzAction < 0 ? zzAction : ZZ_ACTION[zzAction]) {
-        case 7: 
+        case 8: 
           { /* ignore */
           }
-        case 44: break;
-        case 3: 
-          { positionInc = 1; return CJ;
-          }
-        case 45: break;
-        case 28: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
-          }
         case 46: break;
-        case 9: 
-          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
+        case 28: 
+          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);
           }
         case 47: break;
-        case 4: 
-          { positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
+        case 3: 
+          { positionInc = 1; return CJ;
           }
         case 48: break;
-        case 39: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
+        case 30: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/
           }
         case 49: break;
-        case 11: 
-          { currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/
+        case 10: 
+          { numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);
           }
         case 50: break;
-        case 23: 
-          { positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
+        case 41: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/
           }
         case 51: break;
-        case 5: 
-          { yybegin(CATEGORY_STATE); return currentTokType;
+        case 7: 
+          { yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;
           }
         case 52: break;
-        case 36: 
-          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
+        case 23: 
+          { numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
           }
         case 53: break;
-        case 8: 
-          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
+        case 38: 
+          { numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/
           }
         case 54: break;
-        case 24: 
-          { positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
+        case 17: 
+          { yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;
           }
         case 55: break;
-        case 22: 
-          { positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);
+        case 24: 
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);
           }
         case 56: break;
-        case 41: 
-          { positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
+        case 14: 
+          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;
           }
         case 57: break;
-        case 18: 
-          { yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/
+        case 5: 
+          { positionInc = 1;
           }
         case 58: break;
-        case 21: 
-          { positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
+        case 43: 
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);
           }
         case 59: break;
-        case 1: 
-          { positionInc = 1;
+        case 26: 
+          { yybegin(YYINITIAL);
           }
         case 60: break;
-        case 43: 
-          { numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+        case 20: 
+          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
           }
         case 61: break;
-        case 25: 
-          { yybegin(YYINITIAL);
+        case 1: 
+          { numWikiTokensSeen = 0;  positionInc = 1;
           }
         case 62: break;
         case 40: 
-          { positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
+          { positionInc = 1; return EMAIL;
           }
         case 63: break;
-        case 19: 
-          { numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+        case 25: 
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);
           }
         case 64: break;
-        case 13: 
-          { yybegin(STRING);return currentTokType;
+        case 39: 
+          { positionInc = 1; return ACRONYM;
           }
         case 65: break;
-        case 38: 
-          { positionInc = 1; return EMAIL;
+        case 9: 
+          { if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;
           }
         case 66: break;
-        case 37: 
-          { positionInc = 1; return ACRONYM;
+        case 22: 
+          { numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}
           }
         case 67: break;
-        case 17: 
-          { /* ignore STRING */
+        case 31: 
+          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
           }
         case 68: break;
-        case 42: 
-          { currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
+        case 15: 
+          { currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);
           }
         case 69: break;
-        case 20: 
-          { yybegin(STRING); return currentTokType;/*pipe*/
+        case 18: 
+          { /* ignore STRING */
           }
         case 70: break;
-        case 12: 
-          { currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);
+        case 42: 
+          { positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;
           }
         case 71: break;
-        case 29: 
-          { numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+        case 21: 
+          { yybegin(STRING); return currentTokType;/*pipe*/
           }
         case 72: break;
-        case 35: 
+        case 37: 
           { numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/
           }
         case 73: break;
-        case 16: 
-          { yybegin(DOUBLE_BRACE_STATE); return currentTokType;
+        case 33: 
+          { positionInc = 1; return HOST;
           }
         case 74: break;
-        case 31: 
-          { positionInc = 1; return HOST;
+        case 45: 
+          { numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);
           }
         case 75: break;
-        case 34: 
+        case 36: 
           { currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);
           }
         case 76: break;
-        case 27: 
-          { currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);
+        case 13: 
+          { currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);
           }
         case 77: break;
-        case 14: 
-          { currentTokType = SUB_HEADING; yybegin(STRING);
+        case 16: 
+          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;
           }
         case 78: break;
-        case 30: 
-          { positionInc = 1; return APOSTROPHE;
+        case 12: 
+          { currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/
           }
         case 79: break;
-        case 32: 
-          { positionInc = 1; return NUM;
+        case 6: 
+          { yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;
           }
         case 80: break;
-        case 15: 
-          { currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;
+        case 32: 
+          { positionInc = 1; return APOSTROPHE;
           }
         case 81: break;
-        case 6: 
-          { yybegin(INTERNAL_LINK_STATE); return currentTokType;
+        case 19: 
+          { yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/
           }
         case 82: break;
+        case 34: 
+          { positionInc = 1; return NUM;
+          }
+        case 83: break;
+        case 44: 
+          { currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);
+          }
+        case 84: break;
         case 2: 
           { positionInc = 1; return ALPHANUM;
           }
-        case 83: break;
-        case 33: 
+        case 85: break;
+        case 35: 
           { positionInc = 1; return COMPANY;
           }
-        case 84: break;
-        case 10: 
+        case 86: break;
+        case 11: 
           { currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);
           }
-        case 85: break;
-        case 26: 
+        case 87: break;
+        case 29: 
+          { currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE);
+          }
+        case 88: break;
+        case 4: 
+          { numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);
+          }
+        case 89: break;
+        case 27: 
           { numLinkToks = 0; yybegin(YYINITIAL);
           }
-        case 86: break;
+        case 90: break;
         default: 
           if (zzInput == YYEOF && zzStartRead == zzCurrentPos) {
             zzAtEOF = true;

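The other moving part is the new numWikiTokensSeen counter: every rule that enters a wiki state resets it to 0, every token matched inside the state increments it, and the collapse loop's guard scanner.getNumWikiTokensSeen() > numSeen turns false exactly when the scanner rolls over into a fresh construct of the same type. A simplified, self-contained trace of that guard; the values are hypothetical, not actual scanner output:

public class BoundaryGuardTrace {
  public static void main(String[] args) {
    // Counts the scanner might report for tokens 2..5 of a collapse;
    // the count drops back to 1 when a second [[...]] construct begins.
    int[] seen = {2, 3, 4, 1};
    int numSeen = 0;               // tokens already folded into the collapse
    for (int i = 0; i < seen.length; i++) {
      if (seen[i] > numSeen) {
        numSeen++;                 // same construct: keep collapsing
      } else {
        // prints once, for the token that opened the new construct
        System.out.println("boundary; yypushback() re-scans this token");
        break;
      }
    }
  }
}
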
Modified: lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex?rev=614895&r1=614894&r2=614895&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex (original)
+++ lucene/java/trunk/contrib/wikipedia/src/java/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerImpl.jflex Thu Jan 24 07:05:53 2008
@@ -54,6 +54,10 @@
 private int numBalanced = 0;
 private int positionInc = 1;
 private int numLinkToks = 0;
+//Anytime we start on a new Wiki reserved token (category, link, etc.) this value will be 0, otherwise it will be the number of tokens seen
+//this can be useful for detecting when a new reserved token is encountered
+//see https://issues.apache.org/jira/browse/LUCENE-1133
+private int numWikiTokensSeen = 0;
 
 public static final String [] TOKEN_TYPES = new String [] {
     "<ALPHANUM>",
@@ -76,6 +80,14 @@
     WikipediaTokenizer.EXTERNAL_LINK_URL
 };
 
+/**
+Returns the number of tokens seen inside a category or link, etc.
+@return the number of tokens seen inside the context of wiki syntax.
+**/
+public final int getNumWikiTokensSeen(){
+  return numWikiTokensSeen;
+}
+
 public final int yychar()
 {
     return yychar;
@@ -88,9 +100,17 @@
 /**
  * Fills Lucene token with the current token text.
  */
-final void getText(Token t, int tokType) {
+final void getText(Token t) {
   t.setTermBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 }
+
+final int setText(StringBuffer buffer){
+  int length = zzMarkedPos - zzStartRead;
+  buffer.append(zzBuffer, zzStartRead, length);
+  return length;
+}
+
+
 %}
 
 // basic word: a sequence of digits & letters
@@ -191,21 +211,21 @@
   //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
   //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
   //tokens within the link are incremented
-  {DOUBLE_BRACKET} {positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
-  {DOUBLE_BRACKET_CAT} {positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
-  {EXTERNAL_LINK} {positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
-  {TWO_SINGLE_QUOTES} {positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
-  {DOUBLE_EQUALS} {positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
-  {DOUBLE_BRACE} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
-  {CITATION} {positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+  {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
+  {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
+  {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
+  {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
+  {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
+  {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
+  {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
 //ignore
-  . | {WHITESPACE} |{INFOBOX}                                               { positionInc = 1; }
+  . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; }
 }
 
 <INTERNAL_LINK_STATE>{
 //First {ALPHANUM} is always the link, set position to 0 for these
 //This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
-  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); return currentTokType;}
+  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
   {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
   //ignore
   . | {WHITESPACE}                                               { positionInc = 1; }
@@ -213,14 +233,14 @@
 
 <EXTERNAL_LINK_STATE>{
 //increment the link token, but then don't increment the tokens after that which are still in the link
-  ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
-  {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
+  ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
+  {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
   "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
   {WHITESPACE}                                               { positionInc = 1; }
 }
 
 <CATEGORY_STATE>{
-  {ALPHANUM} {yybegin(CATEGORY_STATE); return currentTokType;}
+  {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
   {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
   //ignore
   . | {WHITESPACE}                                               { positionInc = 1; }
@@ -229,22 +249,22 @@
 <TWO_SINGLE_QUOTES_STATE>{
   "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);}
    "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);}
-   {ALPHANUM} {currentTokType = ITALICS; yybegin(STRING); return currentTokType;/*italics*/}
+   {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/}
    //we can have links inside, let those override
-   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
-   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
-   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
 
    //ignore
   . | {WHITESPACE}                                               { /* ignore */ }
 }
 //bold
 <THREE_SINGLE_QUOTES_STATE>{
-  {ALPHANUM} {yybegin(STRING);return currentTokType;}
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
   //we can have links inside, let those override
-   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
-   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
-   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
 
    //ignore
   . | {WHITESPACE}                                               { /* ignore */ }
@@ -252,26 +272,26 @@
 }
 //bold italics
 <FIVE_SINGLE_QUOTES_STATE>{
-  {ALPHANUM} {yybegin(STRING);return currentTokType;}
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
   //we can have links inside, let those override
-   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
-   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
-   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
 
    //ignore
   . | {WHITESPACE}                                               { /* ignore */ }
 }
 
 <DOUBLE_EQUALS_STATE>{
- "=" {currentTokType = SUB_HEADING; yybegin(STRING);}
- {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); return currentTokType;}
+ "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);}
+ {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
  {DOUBLE_EQUALS} {yybegin(YYINITIAL);}
   //ignore
   . | {WHITESPACE}                                               { /* ignore */ }
 }
 
 <DOUBLE_BRACE_STATE>{
-  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); return currentTokType;}
+  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
   {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
   {CITATION_CLOSE} {yybegin(YYINITIAL);}
    //ignore
@@ -283,11 +303,11 @@
   "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
   "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
   "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
-  {ALPHANUM} {yybegin(STRING); return currentTokType;/* STRING ALPHANUM*/}
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
   //we can have links inside, let those override
-   {DOUBLE_BRACKET} {numBalanced = 0;currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
-   {DOUBLE_BRACKET_CAT} {numBalanced = 0;currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
-   {EXTERNAL_LINK} {numBalanced = 0;currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
+   {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
+   {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
 
 
   {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}

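One more consumer-facing detail before the test changes: in BOTH mode the collapsed token is returned first, carrying UNTOKENIZED_TOKEN_FLAG, and the individual tokens follow from the saved iterator, so downstream code can tell the two apart by flag. A hedged sketch, not taken from the test below; the class name is illustrative:

import java.io.StringReader;
import java.util.Collections;

import org.apache.lucene.analysis.Token;
import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

public class BothModeDemo {
  public static void main(String[] args) throws Exception {
    WikipediaTokenizer both = new WikipediaTokenizer(
        new StringReader("[[Category:a b c d]]"),
        WikipediaTokenizer.BOTH,
        Collections.singleton(WikipediaTokenizer.CATEGORY));
    for (Token t = both.next(); t != null; t = both.next()) {
      // Only the collapsed token carries UNTOKENIZED_TOKEN_FLAG.
      boolean collapsed =
          t.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG;
      System.out.println((collapsed ? "[whole] " : "[part]  ")
          + new String(t.termBuffer(), 0, t.termLength()));
    }
  }
}
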
Modified: lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java?rev=614895&r1=614894&r2=614895&view=diff
==============================================================================
--- lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java (original)
+++ lucene/java/trunk/contrib/wikipedia/src/test/org/apache/lucene/wikipedia/analysis/WikipediaTokenizerTest.java Thu Jan 24 07:05:53 2008
@@ -22,8 +22,11 @@
 import org.apache.lucene.analysis.Token;
 
 import java.io.StringReader;
+import java.io.IOException;
 import java.util.HashMap;
 import java.util.Map;
+import java.util.Set;
+import java.util.HashSet;
 
 
 /**
@@ -31,6 +34,7 @@
  *
  **/
 public class WikipediaTokenizerTest extends TestCase {
+  protected static final String LINK_PHRASES = "click [[link here again]] click [http://lucene.apache.org here again] [[Category:a b c d]]";
 
 
   public WikipediaTokenizerTest(String s) {
@@ -155,8 +159,13 @@
   }
 
   public void testLinkPhrases() throws Exception {
-    String test = "click [[link here again]] click [http://lucene.apache.org here again]";
-    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test));
+
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES));
+    checkLinkPhrases(tf);
+    
+  }
+
+  private void checkLinkPhrases(WikipediaTokenizer tf) throws IOException {
     Token token = new Token();
     token = tf.next(token);
     assertTrue("token is null and it shouldn't be", token != null);
@@ -201,7 +210,33 @@
     assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "again",
             new String(token.termBuffer(), 0, token.termLength()).equals("again") == true);
     assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
-    
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
+            new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
+            new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
+            new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next(token);
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
+            new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+
+    token = tf.next();
+    assertTrue("token is not null and it should be", token == null);
   }
 
   public void testLinks() throws Exception {
@@ -225,5 +260,317 @@
     assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "https://lucene.apache.org/java/docs/index.html?b=c",
             new String(token.termBuffer(), 0, token.termLength()).equals("https://lucene.apache.org/java/docs/index.html?b=c") == true);
     assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.EXTERNAL_LINK_URL, token.type().equals(WikipediaTokenizer.EXTERNAL_LINK_URL) == true);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+
+    token = tf.next();
+    assertTrue("token is not null and it should be", token == null);
+
+  }
+
+  public void testLucene1133() throws Exception {
+    Set untoks = new HashSet();
+    untoks.add(WikipediaTokenizer.CATEGORY);
+    untoks.add(WikipediaTokenizer.ITALICS);
+    //should be exactly the same, regardless of untoks
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(LINK_PHRASES), WikipediaTokenizer.TOKENS_ONLY, untoks);
+    checkLinkPhrases(tf);
+    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
+    tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.UNTOKENIZED_ONLY, untoks);
+    Token token;
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
+            new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
+    assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
+            new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
+    assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
+            new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
+    assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
+    assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
+            new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
+    assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
+            new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
+    assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
+    assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
+            new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
+    assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
+            new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
+    assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h   i   j",
+            new String(token.termBuffer(), 0, token.termLength()).equals("h   i   j") == true);
+    assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
+    assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
+
+    token = tf.next();
+    assertTrue("token is not null and it should be", token == null);
+  }
+
+  public void testBoth() throws Exception {
+    Set untoks = new HashSet();
+    untoks.add(WikipediaTokenizer.CATEGORY);
+    untoks.add(WikipediaTokenizer.ITALICS);
+    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
+    //should output all the individual tokens plus the untokenized tokens as well: each untokenized token comes first, with its constituent tokens following (the first at position increment 0)
+    WikipediaTokenizer tf = new WikipediaTokenizer(new StringReader(test), WikipediaTokenizer.BOTH, untoks);
+    Token token;
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a b c d",
+            new String(token.termBuffer(), 0, token.termLength()).equals("a b c d") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
+    assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "a",
+            new String(token.termBuffer(), 0, token.termLength()).equals("a") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getFlags() + " equals: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG + " and it shouldn't", token.getFlags() != WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 11, token.startOffset() == 11);
+    assertTrue(token.endOffset() + " does not equal: " + 12, token.endOffset() == 12);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "b",
+            new String(token.termBuffer(), 0, token.termLength()).equals("b") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 13, token.startOffset() == 13);
+    assertTrue(token.endOffset() + " does not equal: " + 14, token.endOffset() == 14);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "c",
+            new String(token.termBuffer(), 0, token.termLength()).equals("c") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 15, token.startOffset() == 15);
+    assertTrue(token.endOffset() + " does not equal: " + 16, token.endOffset() == 16);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "d",
+            new String(token.termBuffer(), 0, token.termLength()).equals("d") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 17, token.startOffset() == 17);
+    assertTrue(token.endOffset() + " does not equal: " + 18, token.endOffset() == 18);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e f g",
+            new String(token.termBuffer(), 0, token.termLength()).equals("e f g") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
+    assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "e",
+            new String(token.termBuffer(), 0, token.termLength()).equals("e") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    assertTrue(token.startOffset() + " does not equal: " + 32, token.startOffset() == 32);
+    assertTrue(token.endOffset() + " does not equal: " + 33, token.endOffset() == 33);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "f",
+            new String(token.termBuffer(), 0, token.termLength()).equals("f") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.startOffset() + " does not equal: " + 34, token.startOffset() == 34);
+    assertTrue(token.endOffset() + " does not equal: " + 35, token.endOffset() == 35);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "g",
+            new String(token.termBuffer(), 0, token.termLength()).equals("g") == true);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.startOffset() + " does not equal: " + 36, token.startOffset() == 36);
+    assertTrue(token.endOffset() + " does not equal: " + 37, token.endOffset() == 37);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
+            new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 42, token.startOffset() == 42);
+    assertTrue(token.endOffset() + " does not equal: " + 46, token.endOffset() == 46);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 47, token.startOffset() == 47);
+    assertTrue(token.endOffset() + " does not equal: " + 51, token.endOffset() == 51);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "link",
+            new String(token.termBuffer(), 0, token.termLength()).equals("link") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.startOffset() + " does not equal: " + 56, token.startOffset() == 56);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+    assertTrue(token.endOffset() + " does not equal: " + 60, token.endOffset() == 60);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "there",
+            new String(token.termBuffer(), 0, token.termLength()).equals("there") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.INTERNAL_LINK, token.type().equals(WikipediaTokenizer.INTERNAL_LINK) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 61, token.startOffset() == 61);
+    assertTrue(token.endOffset() + " does not equal: " + 66, token.endOffset() == 66);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("italics here") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
+    assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
+            new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 71, token.startOffset() == 71);
+    assertTrue(token.endOffset() + " does not equal: " + 78, token.endOffset() == 78);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "here",
+            new String(token.termBuffer(), 0, token.termLength()).equals("here") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 79, token.startOffset() == 79);
+    assertTrue(token.endOffset() + " does not equal: " + 83, token.endOffset() == 83);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "something",
+            new String(token.termBuffer(), 0, token.termLength()).equals("something") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.startOffset() + " does not equal: " + 86, token.startOffset() == 86);
+    assertTrue(token.endOffset() + " does not equal: " + 95, token.endOffset() == 95);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more italics",
+            new String(token.termBuffer(), 0, token.termLength()).equals("more italics") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
+    assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "more",
+            new String(token.termBuffer(), 0, token.termLength()).equals("more") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 98, token.startOffset() == 98);
+    assertTrue(token.endOffset() + " does not equal: " + 102, token.endOffset() == 102);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "italics",
+            new String(token.termBuffer(), 0, token.termLength()).equals("italics") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.ITALICS, token.type().equals(WikipediaTokenizer.ITALICS) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 103, token.startOffset() == 103);
+    assertTrue(token.endOffset() + " does not equal: " + 110, token.endOffset() == 110);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h   i   j",
+            new String(token.termBuffer(), 0, token.termLength()).equals("h   i   j") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.getFlags() + " does not equal: " + WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG, token.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG);
+    assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
+    assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "h",
+            new String(token.termBuffer(), 0, token.termLength()).equals("h") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 0, token.getPositionIncrement() == 0);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 124, token.startOffset() == 124);
+    assertTrue(token.endOffset() + " does not equal: " + 125, token.endOffset() == 125);
+
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "i",
+            new String(token.termBuffer(), 0, token.termLength()).equals("i") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 128, token.startOffset() == 128);
+    assertTrue(token.endOffset() + " does not equal: " + 129, token.endOffset() == 129);
+    token = tf.next();
+    assertTrue("token is null and it shouldn't be", token != null);
+    assertTrue(new String(token.termBuffer(), 0, token.termLength()) + " is not equal to " + "j",
+            new String(token.termBuffer(), 0, token.termLength()).equals("j") == true);
+    assertTrue(token.getPositionIncrement() + " does not equal: " + 1, token.getPositionIncrement() == 1);
+    assertTrue(token.type() + " is not equal to " + WikipediaTokenizer.CATEGORY, token.type().equals(WikipediaTokenizer.CATEGORY) == true);
+    assertTrue(token.startOffset() + " does not equal: " + 132, token.startOffset() == 132);
+    assertTrue(token.endOffset() + " does not equal: " + 133, token.endOffset() == 133);
+
+    token = tf.next();
+    assertTrue("token is not null and it should be", token == null);
+
   }
 }
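
Taken together, the new tests pin down the three output modes this commit adds. Below is a minimal usage sketch (again not part of the commit; the class name and input string are hypothetical, while the constants and Token calls are the ones used in the tests) showing how a caller can separate the single-token spans from their constituents in BOTH mode via the flags bit:

    import java.io.StringReader;
    import java.util.HashSet;
    import java.util.Set;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.wikipedia.analysis.WikipediaTokenizer;

    public class BothModeSketch {
      public static void main(String[] args) throws Exception {
        Set untoks = new HashSet();
        untoks.add(WikipediaTokenizer.CATEGORY);
        untoks.add(WikipediaTokenizer.ITALICS);
        // hypothetical input; mirrors the category/italics cases in testBoth
        WikipediaTokenizer tf = new WikipediaTokenizer(
            new StringReader("[[Category:a b c d]] ''italics here''"),
            WikipediaTokenizer.BOTH, untoks);
        for (Token t = tf.next(); t != null; t = tf.next()) {
          // in BOTH mode the whole span is emitted first, flagged as untokenized;
          // its constituents follow, the first at position increment 0
          boolean untokenized =
              t.getFlags() == WikipediaTokenizer.UNTOKENIZED_TOKEN_FLAG;
          System.out.println(new String(t.termBuffer(), 0, t.termLength())
              + (untokenized ? "   <-- kept as a single token" : ""));
        }
      }
    }

With TOKENS_ONLY the output is unchanged regardless of untoks (checkLinkPhrases is run against both constructors to prove it), and with UNTOKENIZED_ONLY the types listed in untoks come through only as whole spans while everything else is tokenized normally, which is what testLucene1133 asserts.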


