lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r609330 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/analysis/standard/ src/test/org/apache/lucene/analysis/ src/test/org/apache/lucene/index/
Date Sun, 06 Jan 2008 15:37:45 GMT
Author: mikemccand
Date: Sun Jan  6 07:37:44 2008
New Revision: 609330

URL: http://svn.apache.org/viewvc?rev=609330&view=rev
Log:
LUCENE-1118: skip terms > 255 (by default) characters in length in StandardAnalyzer

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
    lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
    lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
    lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=609330&r1=609329&r2=609330&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sun Jan  6 07:37:44 2008
@@ -90,6 +90,10 @@
     modified so it is token producer's responsibility
     to call Token.clear(). (Doron Cohen)   
 
+14. LUCENE-1118: Changed StandardAnalyzer to skip too-long (default >
+    255 characters) tokens.  You can increase this limit by calling
+    StandardAnalyzer.setMaxTokenLength(...).  (Michael McCandless)
+
 
 Bug fixes
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java?rev=609330&r1=609329&r2=609330&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardAnalyzer.java Sun Jan  6 07:37:44 2008
@@ -144,8 +144,9 @@
   /** Constructs a {@link StandardTokenizer} filtered by a {@link
   StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
   public TokenStream tokenStream(String fieldName, Reader reader) {
-    TokenStream result = new StandardTokenizer(reader, replaceInvalidAcronym);
-    result = new StandardFilter(result);
+    StandardTokenizer tokenStream = new StandardTokenizer(reader, replaceInvalidAcronym);
+    tokenStream.setMaxTokenLength(maxTokenLength);
+    TokenStream result = new StandardFilter(tokenStream);
     result = new LowerCaseFilter(result);
     result = new StopFilter(result, stopSet);
     return result;
@@ -155,6 +156,28 @@
     StandardTokenizer tokenStream;
     TokenStream filteredTokenStream;
   }
+
+  /** Default maximum allowed token length */
+  public static final int DEFAULT_MAX_TOKEN_LENGTH = 255;
+
+  private int maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
+
+  /**
+   * Set maximum allowed token length.  If a token is seen
+   * that exceeds this length then it is discarded.  This
+   * setting only takes effect the next time tokenStream or
+   * reusableTokenStream is called.
+   */
+  public void setMaxTokenLength(int length) {
+    maxTokenLength = length;
+  }
+    
+  /**
+   * @see #setMaxTokenLength
+   */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
   
  public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
     SavedStreams streams = (SavedStreams) getPreviousTokenStream();
@@ -168,6 +191,7 @@
     } else {
       streams.tokenStream.reset(reader);
     }
+    streams.tokenStream.setMaxTokenLength(maxTokenLength);
     
     streams.tokenStream.setReplaceInvalidAcronym(replaceInvalidAcronym);
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java?rev=609330&r1=609329&r2=609330&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/analysis/standard/StandardTokenizer.java Sun Jan  6 07:37:44 2008
@@ -58,6 +58,19 @@
     this.input = reader;
   }
 
+  private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
+
+  /** Set the max allowed token length.  Any token longer
+   *  than this is skipped. */
+  public void setMaxTokenLength(int length) {
+    this.maxTokenLength = length;
+  }
+
+  /** @see #setMaxTokenLength */
+  public int getMaxTokenLength() {
+    return maxTokenLength;
+  }
+
     /**
      * Creates a new instance of the {@link StandardTokenizer}. Attaches the
      * <code>input</code> to a newly created JFlex scanner.
@@ -80,37 +93,49 @@
     this.replaceInvalidAcronym = replaceInvalidAcronym;
     this.input = input;
     this.scanner = new StandardTokenizerImpl(input);
-  }/*
-     * (non-Javadoc)
-     *
-     * @see org.apache.lucene.analysis.TokenStream#next()
-     */
-    public Token next(Token result) throws IOException {
+  }
+
+  /*
+   * (non-Javadoc)
+   *
+   * @see org.apache.lucene.analysis.TokenStream#next()
+   */
+  public Token next(Token result) throws IOException {
+      int posIncr = 1;
+
+      while(true) {
 	int tokenType = scanner.getNextToken();
 
 	if (tokenType == StandardTokenizerImpl.YYEOF) {
 	    return null;
 	}
 
-        result.clear();
-        scanner.getText(result);
-        final int start = scanner.yychar();
-        result.setStartOffset(start);
-        result.setEndOffset(start+result.termLength());
-        // This 'if' should be removed in the next release. For now, it converts
-        // invalid acronyms to HOST. When removed, only the 'else' part should
-        // remain.
-        if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
-          if (replaceInvalidAcronym) {
-            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
-            result.setTermLength(result.termLength() - 1); // remove extra '.'
+        if (scanner.yylength() <= maxTokenLength) {
+          result.clear();
+          result.setPositionIncrement(posIncr);
+          scanner.getText(result);
+          final int start = scanner.yychar();
+          result.setStartOffset(start);
+          result.setEndOffset(start+result.termLength());
+          // This 'if' should be removed in the next release. For now, it converts
+          // invalid acronyms to HOST. When removed, only the 'else' part should
+          // remain.
+          if (tokenType == StandardTokenizerImpl.ACRONYM_DEP) {
+            if (replaceInvalidAcronym) {
+              result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.HOST]);
+              result.setTermLength(result.termLength() - 1); // remove extra '.'
+            } else {
+              result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+            }
           } else {
-            result.setType(StandardTokenizerImpl.TOKEN_TYPES[StandardTokenizerImpl.ACRONYM]);
+            result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
           }
-        } else {
-          result.setType(StandardTokenizerImpl.TOKEN_TYPES[tokenType]);
-        }
-        return result;
+          return result;
+        } else
+          // When we skip a too-long term, we still increment the
+          // position increment
+          posIncr++;
+      }
     }
 
     /*

Modified: lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java?rev=609330&r1=609329&r2=609330&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/analysis/TestStandardAnalyzer.java Sun Jan  6 07:37:44 2008
@@ -30,6 +30,10 @@
   }
 
   public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes) throws Exception {
+    assertAnalyzesTo(a, input, expectedImages, expectedTypes, null);
+  }
+
+  public void assertAnalyzesTo(Analyzer a, String input, String[] expectedImages, String[] expectedTypes, int[] expectedPosIncrs) throws Exception {
     TokenStream ts = a.tokenStream("dummy", new StringReader(input));
     for (int i = 0; i < expectedImages.length; i++) {
       Token t = ts.next();
@@ -38,11 +42,38 @@
       if (expectedTypes != null) {
         assertEquals(expectedTypes[i], t.type());
       }
+      if (expectedPosIncrs != null) {
+        assertEquals(expectedPosIncrs[i], t.getPositionIncrement());
+      }
     }
     assertNull(ts.next());
     ts.close();
   }
 
+
+  public void testMaxTermLength() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    sa.setMaxTokenLength(5);
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
+
+  public void testMaxTermLength2() throws Exception {
+    StandardAnalyzer sa = new StandardAnalyzer();
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "toolong", "xy", "z"});
+    sa.setMaxTokenLength(5);
+    
+    assertAnalyzesTo(sa, "ab cd toolong xy z", new String[]{"ab", "cd", "xy", "z"}, null, new int[]{1, 1, 2, 1});
+  }
+
+  public void testMaxTermLength3() throws Exception {
+    char[] chars = new char[255];
+    for(int i=0;i<255;i++)
+      chars[i] = 'a';
+    String longTerm = new String(chars, 0, 255);
+    
+    assertAnalyzesTo(a, "ab cd " + longTerm + " xy z", new String[]{"ab", "cd", longTerm, "xy", "z"});
+    assertAnalyzesTo(a, "ab cd " + longTerm + "a xy z", new String[]{"ab", "cd", "xy", "z"});
+  }
 
   public void testAlphanumeric() throws Exception {
     // alphanumeric tokens

Modified: lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=609330&r1=609329&r2=609330&view=diff
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestIndexWriter.java Sun Jan  6 07:37:44 2008
@@ -586,7 +586,9 @@
       // maximum length term, and search on that term:
       doc = new Document();
       doc.add(new Field("content", bigTerm, Field.Store.NO, Field.Index.TOKENIZED));
-      writer  = new IndexWriter(dir, new StandardAnalyzer());
+      StandardAnalyzer sa = new StandardAnalyzer();
+      sa.setMaxTokenLength(100000);
+      writer  = new IndexWriter(dir, sa);
       writer.addDocument(doc);
       writer.close();
       reader = IndexReader.open(dir);



Mime
View raw message