lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ka...@apache.org
Subject svn commit: r733064 - in /lucene/java/trunk/contrib: CHANGES.txt analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
Date Fri, 09 Jan 2009 15:34:53 GMT
Author: kalle
Date: Fri Jan  9 07:34:52 2009
New Revision: 733064

URL: http://svn.apache.org/viewvc?rev=733064&view=rev
Log:
LUCENE-1514
ShingleMatrixFilter#next(Token) easily throws a StackOverflowException due to recursive invocation.
(Karl Wettin)

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=733064&r1=733063&r2=733064&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Fri Jan  9 07:34:52 2009
@@ -22,6 +22,9 @@
  3. LUCENE-1510: InstantiatedIndexReader#norms methods throws NullPointerException on empty
index.
     (Karl Wettin, Robert Newson)
 
+ 4. LUCENE-1514: ShingleMatrixFilter#next(Token) easily throws a StackOverflowException
+    due to recursive invocation. (Karl Wettin)
+
 New features
 
  1. LUCENE-1470: Added TrieRangeQuery, a much faster implementation of

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java?rev=733064&r1=733063&r2=733064&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
(original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
Fri Jan  9 07:34:52 2009
@@ -19,7 +19,6 @@
 
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.LinkedList;
@@ -52,7 +51,7 @@
  * in several languages, notebly the northern Germanic branch.
  *
  * <p>Shingles are amongst many things also known to solve problems
- * in spell checking, language detection and document clustering.  
+ * in spell checking, language detection and document clustering.
  *
  * <p>This filter is backed by a three dimensional column oriented matrix
  * used to create permutations of the second dimension, the rows,
@@ -90,7 +89,7 @@
  * "and_salutations_tellus"
  * "salutations_tellus"
  *  </pre>
- * 
+ *
  * <p>This implementation can be rather heap demanding
  * if (maximum shingle size - minimum shingle size) is a great number and the stream contains
many columns,
  * or if each column contains a great number of rows.
@@ -304,6 +303,7 @@
 
   private Matrix matrix;
 
+
   public Token next(final Token reusableToken) throws IOException {
     assert reusableToken != null;
     if (matrix == null) {
@@ -314,6 +314,30 @@
       }
     }
 
+    // this loop exists in order to avoid recursive calls to the next method
+    // as the complexity of a large matrix
+    // then would require a multi gigabyte sized stack.
+    Token token;
+    do {
+      token = produceNextToken(reusableToken);
+    } while (token == request_next_token);
+    return token;
+  }
+
+  
+  private static final Token request_next_token = new Token();
+
+  /**
+   * This method exists in order to avoid reursive calls to the method
+   * as the complexity of a fairlt small matrix then easily would require
+   * a gigabyte sized stack per thread.
+   *
+   * @param reusableToken
+   * @return null if exhausted, instance request_next_token if one more call is required
for an answer, or instance parameter resuableToken.
+   * @throws IOException
+   */
+  private Token produceNextToken(final Token reusableToken) throws IOException {
+
     if (currentPermuationTokens != null) {
       currentShingleLength++;
 
@@ -343,7 +367,7 @@
 
         // only produce shingles that not already has been created
         if (!shinglesSeen.add(shingle)) {
-          return next(reusableToken);
+          return request_next_token;
         }
 
         // shingle token factory
@@ -368,7 +392,7 @@
           // reset shingle size and move one step to the right in the current tokens permutation
           currentPermutationTokensStartOffset++;
           currentShingleLength = minimumShingleSize - 1;
-          return next(reusableToken);
+          return request_next_token;
         }
 
 
@@ -423,7 +447,7 @@
         }
 
         nextTokensPermutation();
-        return next(reusableToken);
+        return request_next_token;
 
       }
     }
@@ -438,7 +462,7 @@
 
     nextTokensPermutation();
 
-    return next(reusableToken);
+    return request_next_token;
   }
 
   /**
@@ -494,7 +518,7 @@
    * weight +=  shingle part token weight * (1 / sqrt(all shingle part token weights summed))
    *
    * This algorithm gives a slightly greater score for longer shingles
-   * and is rather penalising to great shingle token part weights.  
+   * and is rather penalising to great shingle token part weights.
    *
    * @param shingleToken token returned to consumer
    * @param shingle tokens the tokens used to produce the shingle token.



Mime
View raw message