lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r905043 - in /lucene/java/trunk/contrib: ./ analyzers/common/src/java/org/apache/lucene/analysis/shingle/ analyzers/common/src/test/org/apache/lucene/analysis/shingle/
Date Sun, 31 Jan 2010 14:04:01 GMT
Author: rmuir
Date: Sun Jan 31 14:04:01 2010
New Revision: 905043

URL: http://svn.apache.org/viewvc?rev=905043&view=rev
Log:
LUCENE-2218: Improvements to ShingleFilter (performance, configurable sep. char and min shingle size)

Modified:
    lucene/java/trunk/contrib/CHANGES.txt
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
    lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
    lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java

Modified: lucene/java/trunk/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/CHANGES.txt?rev=905043&r1=905042&r2=905043&view=diff
==============================================================================
--- lucene/java/trunk/contrib/CHANGES.txt (original)
+++ lucene/java/trunk/contrib/CHANGES.txt Sun Jan 31 14:04:01 2010
@@ -99,6 +99,10 @@
  * LUCENE-2243: Add DisjunctionMaxQuery support for FastVectorHighlighter.
    (Koji Sekiguchi)
 
+ * LUCENE-2218: ShingleFilter supports minimum shingle size, and the separator
+   character is now configurable. It's also up to 20% faster. 
+   (Steven Rowe via Robert Muir)
+
 Build
 
  * LUCENE-2124: Moved the JDK-based collation support from contrib/collation 

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java?rev=905043&r1=905042&r2=905043&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapper.java Sun Jan 31 14:04:01 2010
@@ -34,7 +34,9 @@
 public final class ShingleAnalyzerWrapper extends Analyzer {
 
   private final Analyzer defaultAnalyzer;
-  private int maxShingleSize = 2;
+  private int maxShingleSize = ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
+  private int minShingleSize = ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE;
+  private String tokenSeparator = ShingleFilter.TOKEN_SEPARATOR;
   private boolean outputUnigrams = true;
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer) {
@@ -44,7 +46,13 @@
 
   public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int maxShingleSize) {
     this(defaultAnalyzer);
-    this.maxShingleSize = maxShingleSize;
+    setMaxShingleSize(maxShingleSize);
+  }
+
+  public ShingleAnalyzerWrapper(Analyzer defaultAnalyzer, int minShingleSize, int maxShingleSize) {
+    this(defaultAnalyzer);
+    setMaxShingleSize(maxShingleSize);
+    setMinShingleSize(minShingleSize);
   }
 
   /**
@@ -58,29 +66,73 @@
   /**
    * Wraps {@link StandardAnalyzer}. 
    */
-  public ShingleAnalyzerWrapper(Version matchVersion, int nGramSize) {
+  public ShingleAnalyzerWrapper(Version matchVersion, int minShingleSize, int maxShingleSize) {
     this(matchVersion);
-    this.maxShingleSize = nGramSize;
+    setMaxShingleSize(maxShingleSize);
+    setMinShingleSize(minShingleSize);
   }
 
   /**
-   * The max shingle (ngram) size
+   * The max shingle (token ngram) size
    * 
-   * @return The max shingle (ngram) size
+   * @return The max shingle (token ngram) size
    */
   public int getMaxShingleSize() {
     return maxShingleSize;
   }
 
   /**
-   * Set the maximum size of output shingles
-   * 
+   * Set the maximum size of output shingles (default: 2)
+   *
    * @param maxShingleSize max shingle size
    */
   public void setMaxShingleSize(int maxShingleSize) {
+    if (maxShingleSize < 2) {
+      throw new IllegalArgumentException("Max shingle size must be >= 2");
+    }
     this.maxShingleSize = maxShingleSize;
   }
 
+  /**
+   * The min shingle (token ngram) size
+   * 
+   * @return The min shingle (token ngram) size
+   */
+  public int getMinShingleSize() {
+    return minShingleSize;
+  }
+
+  /**
+   * <p>Set the min shingle size (default: 2).
+   * <p>This method requires that the passed in minShingleSize is not greater
+   * than maxShingleSize, so make sure that maxShingleSize is set before
+   * calling this method.
+   *
+   * @param minShingleSize min size of output shingles
+   */
+  public void setMinShingleSize(int minShingleSize) {
+    if (minShingleSize < 2) {
+      throw new IllegalArgumentException("Min shingle size must be >= 2");
+    }
+    if (minShingleSize > maxShingleSize) {
+      throw new IllegalArgumentException
+        ("Min shingle size must be <= max shingle size");
+    }
+    this.minShingleSize = minShingleSize;
+  }
+
+  public String getTokenSeparator() {
+    return tokenSeparator;
+  }
+
+  /**
+   * Sets the string to use when joining adjacent tokens to form a shingle
+   * @param tokenSeparator used to separate input stream tokens in output shingles
+   */
+  public void setTokenSeparator(String tokenSeparator) {
+    this.tokenSeparator = (tokenSeparator == null ? "" : tokenSeparator);
+  }
+  
   public boolean isOutputUnigrams() {
     return outputUnigrams;
   }
@@ -104,8 +156,10 @@
     } catch (IOException e) {
       wrapped = defaultAnalyzer.tokenStream(fieldName, reader);
     }
-    ShingleFilter filter = new ShingleFilter(wrapped);
+    ShingleFilter filter = new ShingleFilter(wrapped, minShingleSize, maxShingleSize);
+    filter.setMinShingleSize(minShingleSize);
     filter.setMaxShingleSize(maxShingleSize);
+    filter.setTokenSeparator(tokenSeparator);
     filter.setOutputUnigrams(outputUnigrams);
     return filter;
   }
@@ -113,7 +167,7 @@
   private class SavedStreams {
     TokenStream wrapped;
     ShingleFilter shingle;
-  };
+  }
   
   @Override
   public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
@@ -135,6 +189,8 @@
       }
     }
     streams.shingle.setMaxShingleSize(maxShingleSize);
+    streams.shingle.setMinShingleSize(minShingleSize);
+    streams.shingle.setTokenSeparator(tokenSeparator);
     streams.shingle.setOutputUnigrams(outputUnigrams);
     return streams.shingle;
   }

Modified: lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java?rev=905043&r1=905042&r2=905043&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/java/org/apache/lucene/analysis/shingle/ShingleFilter.java Sun Jan 31 14:04:01 2010
@@ -18,18 +18,15 @@
  */
 
 import java.io.IOException;
-import java.util.Iterator;
 import java.util.LinkedList;
 
-import org.apache.lucene.analysis.Token;
 import org.apache.lucene.analysis.TokenFilter;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.analysis.tokenattributes.TermAttribute;
 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.AttributeSource.State;
+
 
 /**
  * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
@@ -44,26 +41,59 @@
  */
 public final class ShingleFilter extends TokenFilter {
 
-  private LinkedList<State> shingleBuf = new LinkedList<State>();
-  private StringBuilder[] shingles;
-  private String tokenType = "shingle";
-
   /**
    * filler token for when positionIncrement is more than 1
    */
   public static final char[] FILLER_TOKEN = { '_' };
 
-
   /**
    * default maximum shingle size is 2.
    */
   public static final int DEFAULT_MAX_SHINGLE_SIZE = 2;
 
   /**
-   * The string to use when joining adjacent tokens to form a shingle
+   * default minimum shingle size is 2.
+   */
+  public static final int DEFAULT_MIN_SHINGLE_SIZE = 2;
+
+  /**
+   * default token type attribute value is "shingle" 
+   */
+  public static final String DEFAULT_TOKEN_TYPE = "shingle";
+  
+  /**
+   * The default string to use when joining adjacent tokens to form a shingle
    */
   public static final String TOKEN_SEPARATOR = " ";
 
+
+  /**
+   * The sequence of input stream tokens (or filler tokens, if necessary)
+   * that will be composed to form output shingles.
+   */
+  private LinkedList<State> inputWindow = new LinkedList<State>();
+  
+  /**
+   * The number of input tokens in the next output token.  This is the "n" in
+   * "token n-grams".
+   */
+  private CircularSequence gramSize;
+
+  /**
+   * Shingle text is composed here.
+   */
+  private StringBuilder shingleBuilder = new StringBuilder();
+
+  /**
+   * The token type attribute value to use - default is "shingle"
+   */
+  private String tokenType = DEFAULT_TOKEN_TYPE;
+
+  /**
+   * The string to use when joining adjacent tokens to form a shingle
+   */
+  private String tokenSeparator = TOKEN_SEPARATOR;
+
   /**
    * By default, we output unigrams (individual tokens) as well as shingles
    * (token n-grams).
@@ -76,15 +106,40 @@
   private int maxShingleSize;
 
   /**
-   * Constructs a ShingleFilter with the specified single size from the
+   * minimum shingle size (number of tokens)
+   */
+  private int minShingleSize;
+
+  /**
+   * The remaining number of filler tokens inserted into the input stream
+   * from which shingles are composed, to handle position increments greater
+   * than one.
+   */
+  private int numFillerTokensToInsert;
+
+  /**
+   * The next input stream token.
+   */
+  private State nextInputStreamToken;
+  
+  private final TermAttribute termAtt;
+  private final OffsetAttribute offsetAtt;
+  private final PositionIncrementAttribute posIncrAtt;
+  private final TypeAttribute typeAtt;
+
+
+  /**
+   * Constructs a ShingleFilter with the specified shingle size from the
    * {@link TokenStream} <code>input</code>
    *
    * @param input input stream
+   * @param minShingleSize minimum shingle size produced by the filter.
    * @param maxShingleSize maximum shingle size produced by the filter.
    */
-  public ShingleFilter(TokenStream input, int maxShingleSize) {
+  public ShingleFilter(TokenStream input, int minShingleSize, int maxShingleSize) {
     super(input);
     setMaxShingleSize(maxShingleSize);
+    setMinShingleSize(minShingleSize);
     this.termAtt = addAttribute(TermAttribute.class);
     this.offsetAtt = addAttribute(OffsetAttribute.class);
     this.posIncrAtt = addAttribute(PositionIncrementAttribute.class);
@@ -92,22 +147,34 @@
   }
 
   /**
-   * Construct a ShingleFilter with default shingle size.
+   * Constructs a ShingleFilter with the specified shingle size from the
+   * {@link TokenStream} <code>input</code>
+   *
+   * @param input input stream
+   * @param maxShingleSize maximum shingle size produced by the filter.
+   */
+  public ShingleFilter(TokenStream input, int maxShingleSize) {
+    this(input, DEFAULT_MIN_SHINGLE_SIZE, maxShingleSize);
+  }
+  
+  /**
+   * Construct a ShingleFilter with default shingle size: 2.
    *
    * @param input input stream
    */
   public ShingleFilter(TokenStream input) {
-    this(input, DEFAULT_MAX_SHINGLE_SIZE);
+    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
   }
 
   /**
-   * Construct a ShingleFilter with the specified token type for shingle tokens.
+   * Construct a ShingleFilter with the specified token type for shingle tokens
+   * and the default shingle size: 2
    *
    * @param input input stream
    * @param tokenType token type for shingle tokens
    */
   public ShingleFilter(TokenStream input, String tokenType) {
-    this(input, DEFAULT_MAX_SHINGLE_SIZE);
+    this(input, DEFAULT_MIN_SHINGLE_SIZE, DEFAULT_MAX_SHINGLE_SIZE);
     setTokenType(tokenType);
   }
 
@@ -130,6 +197,7 @@
    */
   public void setOutputUnigrams(boolean outputUnigrams) {
     this.outputUnigrams = outputUnigrams;
+    gramSize = new CircularSequence();
   }
 
   /**
@@ -141,203 +209,239 @@
     if (maxShingleSize < 2) {
       throw new IllegalArgumentException("Max shingle size must be >= 2");
     }
-    shingles = new StringBuilder[maxShingleSize];
-    for (int i = 0; i < shingles.length; i++) {
-      shingles[i] = new StringBuilder();
-    }
     this.maxShingleSize = maxShingleSize;
   }
 
   /**
-   * Clear the StringBuilders that are used for storing the output shingles.
+   * <p>Set the min shingle size (default: 2).
+   * <p>This method requires that the passed in minShingleSize is not greater
+   * than maxShingleSize, so make sure that maxShingleSize is set before
+   * calling this method.
+   * <p>The unigram output option is independent of the min shingle size.
+   *
+   * @param minShingleSize min size of output shingles
    */
-  private void clearShingles() {
-    for (int i = 0; i < shingles.length; i++) {
-      shingles[i].setLength(0);
+  public void setMinShingleSize(int minShingleSize) {
+    if (minShingleSize < 2) {
+      throw new IllegalArgumentException("Min shingle size must be >= 2");
     }
+    if (minShingleSize > maxShingleSize) {
+      throw new IllegalArgumentException
+        ("Min shingle size must be <= max shingle size");
+    }
+    this.minShingleSize = minShingleSize;
+    gramSize = new CircularSequence();
+  }
+
+  /**
+   * Sets the string to use when joining adjacent tokens to form a shingle
+   * @param tokenSeparator used to separate input stream tokens in output shingles
+   */
+  public void setTokenSeparator(String tokenSeparator) {
+    this.tokenSeparator = null == tokenSeparator ? "" : tokenSeparator;
   }
-  
-  private AttributeSource.State nextToken;
-  private int shingleBufferPosition;
-  private int[] endOffsets;
 
   /* (non-Javadoc)
    * @see org.apache.lucene.analysis.TokenStream#next()
    */
   @Override
   public final boolean incrementToken() throws IOException {
-    while (true) {
-      if (nextToken == null) {
-        if (!fillShingleBuffer()) {
-          return false;
-        }
-      }
-      
-      nextToken = shingleBuf.getFirst();
-      
-      if (outputUnigrams) {
-        if (shingleBufferPosition == 0) {
-          restoreState(nextToken);
-          posIncrAtt.setPositionIncrement(1);
-          shingleBufferPosition++;
-          return true;
-        }
-      } else if (shingleBufferPosition % this.maxShingleSize == 0){
-        shingleBufferPosition++;
+    boolean tokenAvailable = false; 
+    if (gramSize.atMinValue() || inputWindow.size() < gramSize.getValue()) {
+      shiftInputWindow();
+    }
+    if ( ! inputWindow.isEmpty()) {
+      restoreState(inputWindow.getFirst());
+      if (1 == gramSize.getValue()) {
+        posIncrAtt.setPositionIncrement(1);
+        gramSize.advance();
+        tokenAvailable = true;
+      } else if (inputWindow.size() >= gramSize.getValue()) {
+        getNextShingle();
+        gramSize.advance();
+        tokenAvailable = true;
       }
-  
-      if (shingleBufferPosition < shingleBuf.size()) {
-        restoreState(nextToken);
-        typeAtt.setType(tokenType);
-        offsetAtt.setOffset(offsetAtt.startOffset(), endOffsets[shingleBufferPosition]);
-        StringBuilder buf = shingles[shingleBufferPosition];
-        int termLength = buf.length();
-        char[] termBuffer = termAtt.termBuffer();
-        if (termBuffer.length < termLength)
-          termBuffer = termAtt.resizeTermBuffer(termLength);
-        buf.getChars(0, termLength, termBuffer, 0);
-        termAtt.setTermLength(termLength);
-        if ((! outputUnigrams) && shingleBufferPosition % this.maxShingleSize == 1) {
-          posIncrAtt.setPositionIncrement(1);
-        } else {
-          posIncrAtt.setPositionIncrement(0);
-        }
-        shingleBufferPosition++;
-        if (shingleBufferPosition == shingleBuf.size()) {
-          nextToken = null;
-          shingleBufferPosition = 0;
-        }
-        return true;
-      } else {
-        nextToken = null;
-        shingleBufferPosition = 0;
+    }
+    return tokenAvailable;
+  }
+
+  /**
+   * <p>Makes the next token a shingle of length {@link #gramSize}, 
+   * composed of tokens taken from {@link #inputWindow}.
+   * <p>Callers of this method must first ensure that there are at least 
+   * <code>gramSize</code> tokens available in <code>inputWindow</code>.
+   */
+  private void getNextShingle() {
+    int startOffset = offsetAtt.startOffset();
+
+    int minTokNum = gramSize.getValue() - 1; // zero-based inputWindow position
+    if (gramSize.getValue() == minShingleSize) {
+      // Clear the shingle text buffer if this is the first shingle
+      // at the current position in the input stream.
+      shingleBuilder.setLength(0);
+      minTokNum = 0;
+    }
+    for (int tokNum = minTokNum ; tokNum < gramSize.getValue() ; ++tokNum) {
+      if (tokNum > 0) {
+        shingleBuilder.append(tokenSeparator);
       }
+      restoreState(inputWindow.get(tokNum));
+      shingleBuilder.append(termAtt.termBuffer(), 0, termAtt.termLength());
+    }
+    char[] termBuffer = termAtt.termBuffer();
+    int termLength = shingleBuilder.length();
+    if (termBuffer.length < termLength) {
+      termBuffer = termAtt.resizeTermBuffer(termLength);
     }
+    shingleBuilder.getChars(0, termLength, termBuffer, 0);
+    termAtt.setTermLength(termLength);
+    posIncrAtt.setPositionIncrement(gramSize.atMinValue() ? 1 : 0);
+    typeAtt.setType(tokenType);
+    offsetAtt.setOffset(startOffset, offsetAtt.endOffset());
   }
   
-  private int numFillerTokensToInsert;
-  private AttributeSource.State currentToken;
-  private boolean hasCurrentToken;
-   
-  private TermAttribute termAtt;
-  private OffsetAttribute offsetAtt;
-  private PositionIncrementAttribute posIncrAtt;
-  private TypeAttribute typeAtt;
-  
   /**
-   * Get the next token from the input stream and push it on the token buffer.
-   * If we encounter a token with position increment > 1, we put filler tokens
-   * on the token buffer.
-   * <p/>
-   * Returns null when the end of the input stream is reached.
-   * @return the next token, or null if at end of input stream
+   * <p>Get the next token from the input stream.
+   * <p>If the next token has <code>positionIncrement > 1</code>,
+   * <code>positionIncrement - 1</code> {@link #FILLER_TOKEN}s are
+   * inserted first.
+   * @return false for end of stream; true otherwise
    * @throws IOException if the input stream has a problem
    */
   private boolean getNextToken() throws IOException {
-    
-    while (true) {
-  	  if (numFillerTokensToInsert > 0) {
-  	    if (currentToken == null) {
-  	      currentToken = captureState();
-  	    } else {
-  	      restoreState(currentToken);
-  	    }
-  	    numFillerTokensToInsert--;
-        // A filler token occupies no space
-  	    offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
-  	    termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
-        return true;
-  	  } 
-  	  
-  	  if (hasCurrentToken) {
-  	    if (currentToken != null) {
-  	      restoreState(currentToken);
-  	      currentToken = null;
-  	    }
-  	    hasCurrentToken = false;
-  	    return true;
-  	  }
-  	  
-  	  if (!input.incrementToken()) return false;
-  	  hasCurrentToken = true;
-  	  
-  	  if (posIncrAtt.getPositionIncrement() > 1) {
-  	    numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
-  	  }
+    boolean success = false;
+    if (numFillerTokensToInsert > 0) {
+      insertFillerToken();
+      success = true;
+    } else if (null != nextInputStreamToken) {
+      restoreState(nextInputStreamToken);
+      nextInputStreamToken = null;
+      success = true;
+    } else if (input.incrementToken()) {
+      if (posIncrAtt.getPositionIncrement() > 1) {
+        numFillerTokensToInsert = posIncrAtt.getPositionIncrement() - 1;
+        insertFillerToken();
+      }
+      success = true;
     }
+    return success;
 	}
 
   /**
-   * Fill the output buffer with new shingles.
+   * Inserts a {@link #FILLER_TOKEN} and decrements
+   * {@link #numFillerTokensToInsert}.
+   */
+  private void insertFillerToken() {
+    if (null == nextInputStreamToken) {
+      nextInputStreamToken = captureState();
+    } else {
+      restoreState(nextInputStreamToken);
+    }
+    --numFillerTokensToInsert;
+    // A filler token occupies no space
+    offsetAtt.setOffset(offsetAtt.startOffset(), offsetAtt.startOffset());
+    termAtt.setTermBuffer(FILLER_TOKEN, 0, FILLER_TOKEN.length);
+  }
+
+  /**
+   * <p>Fills {@link #inputWindow} with input stream tokens, if available, 
+   * shifting to the right if the window was previously full.
+   * <p>Resets {@link #gramSize} to its minimum value.
    *
    * @throws IOException if there's a problem getting the next token
    */
-  private boolean fillShingleBuffer() throws IOException {
-    boolean addedToken = false;
-    /*
-     * Try to fill the shingle buffer.
-     */
-    do {
-      if (getNextToken()) {
-        shingleBuf.add(captureState());
-        if (shingleBuf.size() > maxShingleSize)
-        {
-          shingleBuf.removeFirst();
-        }
-        addedToken = true;
-      } else {
+  private void shiftInputWindow() throws IOException {
+    if (inputWindow.size() > 0) {
+      inputWindow.removeFirst();
+    }
+    while (getNextToken()) {
+      inputWindow.add(captureState());
+      if (inputWindow.size() == maxShingleSize) {
         break;
       }
-    } while (shingleBuf.size() < maxShingleSize);
-
-    if (shingleBuf.isEmpty()) {
-      return false;
     }
+    gramSize.reset();
+  }
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    gramSize.reset();
+    inputWindow.clear();
+    numFillerTokensToInsert = 0;
+  }
+
+
+  /**
+   * <p>An instance of this class is used to maintain the number of input
+   * stream tokens that will be used to compose the next unigram or shingle:
+   * {@link #gramSize}.
+   * <p><code>gramSize</code> will take on values from the circular sequence
+   * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
+   * <p>1 is included in the circular sequence only if 
+   * {@link #outputUnigrams} = true.
+   */
+  private class CircularSequence {
+    private int value;
+    private int minValue;
     
-    /*
-     * If no new token could be added to the shingle buffer, we have reached
-     * the end of the input stream and have to discard the least recent token.
+    public CircularSequence() {
+      minValue = outputUnigrams ? 1 : minShingleSize;
+      reset();
+    }
+
+    /**
+     * @see #advance()
+     * @return the current value.  
      */
-    if (! addedToken) {
-      shingleBuf.removeFirst();
+    public int getValue() {
+      return value;
     }
     
-    if (shingleBuf.isEmpty()) {
-      return false;
+    /**
+     * <p>Increments this circular number's value to the next member in the
+     * circular sequence
+     * <code>gramSize</code> will take on values from the circular sequence
+     * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
+     * <p>1 is included in the circular sequence only if 
+     * {@link #outputUnigrams} = true.
+     * 
+     * @return the next member in the circular sequence
+     */
+    public int advance() {
+      if (value == 1) {
+        value = minShingleSize;
+      } else if (value == maxShingleSize) {
+        reset();
+      } else {
+        ++value;
+      }
+      return value;
     }
 
-    clearShingles();
-
-    endOffsets = new int[shingleBuf.size()];
-    for (int i = 0; i < endOffsets.length; i++) {
-      endOffsets[i] = 0;
+    /**
+     * <p>Sets this circular number's value to the first member of the 
+     * circular sequence
+     * <p><code>gramSize</code> will take on values from the circular sequence
+     * <b>{ [ 1, ] {@link #minShingleSize} [ , ... , {@link #maxShingleSize} ] }</b>.
+     * <p>1 is included in the circular sequence only if 
+     * {@link #outputUnigrams} = true.
+     */
+    public void reset() {
+      value = minValue;
     }
 
-    int i = 0;
-    for (Iterator<State> it = shingleBuf.iterator(); it.hasNext(); ) {
-      restoreState(it.next());
-      for (int j = i; j < shingles.length; j++) {
-        if (shingles[j].length() != 0) {
-          shingles[j].append(TOKEN_SEPARATOR);
-        }
-        shingles[j].append(termAtt.termBuffer(), 0, termAtt.termLength());
-      }
-
-      endOffsets[i] = offsetAtt.endOffset();
-      i++;
+    /**
+     * <p>Returns true if the current value is the first member of the circular
+     * sequence.
+     * <p>If {@link #outputUnigrams} = true, the first member of the circular
+     * sequence will be 1; otherwise, it will be {@link #minShingleSize}.
+     * 
+     * @return true if the current value is the first member of the circular
+     *  sequence; false otherwise
+     */
+    public boolean atMinValue() {
+      return value == minValue;
     }
-    
-    return true;
-  }
-
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    nextToken = null;
-    shingleBufferPosition = 0;
-    shingleBuf.clear();
-    numFillerTokensToInsert = 0;
-    currentToken = null;
-    hasCurrentToken = false;
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java?rev=905043&r1=905042&r2=905043&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleAnalyzerWrapperTest.java Sun Jan 31 14:04:01 2010
@@ -246,4 +246,117 @@
         new int[] { 6, 13, 13, 18, 18, 27, 27 },
         new int[] { 1, 0, 1, 0, 1, 0, 1 });
   }
+
+  public void testNonDefaultMinShingleSize() throws Exception {
+    ShingleAnalyzerWrapper analyzer 
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 4);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please",   "please divide this",   "please divide this sentence", 
+                                         "divide",   "divide this sentence", "divide this sentence into", 
+                                         "this",     "this sentence into",   "this sentence into shingles",
+                                         "sentence", "sentence into shingles",
+                                         "into",
+                                         "shingles" },
+                          new int[] { 0,  0,  0,  7,  7,  7, 14, 14, 14, 19, 19, 28, 33 },
+                          new int[] { 6, 18, 27, 13, 27, 32, 18, 32, 41, 27, 41, 32, 41 },
+                          new int[] { 1,  0,  0,  1,  0,  0,  1,  0,  0,  1,  0,  1,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please divide this",   "please divide this sentence", 
+                                         "divide this sentence", "divide this sentence into", 
+                                         "this sentence into",   "this sentence into shingles",
+                                         "sentence into shingles" },
+                          new int[] {  0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 18, 27, 27, 32, 32, 41, 41 },
+                          new int[] {  1,  0,  1,  0,  1,  0,  1 });
+  }
+  
+  public void testNonDefaultMinAndSameMaxShingleSize() throws Exception {
+    ShingleAnalyzerWrapper analyzer
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer(), 3, 3);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please",   "please divide this", 
+                                         "divide",   "divide this sentence", 
+                                         "this",     "this sentence into",
+                                         "sentence", "sentence into shingles",
+                                         "into",
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19, 19, 28, 33 },
+                          new int[] { 6, 18, 13, 27, 18, 32, 27, 41, 32, 41 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1,  0,  1,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide this sentence into shingles",
+                          new String[] { "please divide this", 
+                                         "divide this sentence", 
+                                         "this sentence into",
+                                         "sentence into shingles" },
+                          new int[] {  0,  7, 14, 19 },
+                          new int[] { 18, 27, 32, 41 },
+                          new int[] {  1,  1,  1,  1 });
+  }
+
+  public void testNoTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer 
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+    analyzer.setTokenSeparator("");
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "pleasedivide", 
+                                         "divide", "divideinto", 
+                                         "into", "intoshingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "pleasedivide", 
+                                         "divideinto", 
+                                         "intoshingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
+
+  public void testNullTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer 
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+    analyzer.setTokenSeparator(null);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "pleasedivide", 
+                                         "divide", "divideinto", 
+                                         "into", "intoshingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "pleasedivide", 
+                                         "divideinto", 
+                                         "intoshingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
+  public void testAltTokenSeparator() throws Exception {
+    ShingleAnalyzerWrapper analyzer 
+      = new ShingleAnalyzerWrapper(new WhitespaceAnalyzer());
+    analyzer.setTokenSeparator("<SEP>");
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please", "please<SEP>divide", 
+                                         "divide", "divide<SEP>into", 
+                                         "into", "into<SEP>shingles", 
+                                         "shingles" },
+                          new int[] { 0,  0,  7,  7, 14, 14, 19 },
+                          new int[] { 6, 13, 13, 18, 18, 27, 27 },
+                          new int[] { 1,  0,  1,  0,  1,  0,  1 });
+    analyzer.setOutputUnigrams(false);
+    assertAnalyzesToReuse(analyzer, "please divide into shingles",
+                          new String[] { "please<SEP>divide", 
+                                         "divide<SEP>into", 
+                                         "into<SEP>shingles" },
+                          new int[] {  0,  7, 14 },
+                          new int[] { 13, 18, 27 },
+                          new int[] {  1,  1,  1 });
+  }
 }

Modified: lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java?rev=905043&r1=905042&r2=905043&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java (original)
+++ lucene/java/trunk/contrib/analyzers/common/src/test/org/apache/lucene/analysis/shingle/ShingleFilterTest.java Sun Jan 31 14:04:01 2010
@@ -288,7 +288,360 @@
 
   };
 
+  public static final Token[] TRI_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+    createToken("please divide this", 0, 18),
+    createToken("divide this sentence", 7, 27),
+    createToken("this sentence into", 14, 32),
+    createToken("sentence into shingles", 19, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+    1, 1, 1, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+    "shingle",
+    "shingle",
+    "shingle",
+    "shingle"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_MIN_TRI_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("sentence into shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES_MIN_TRI_GRAM = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new Token[] {
+    createToken("please divide this", 0, 18),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this sentence", 7, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence into", 14, 32),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence into shingles", 19, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_MIN_FOUR_GRAM = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide", 7, 13),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this", 14, 18),
+    createToken("this sentence into shingles", 14, 39),
+    createToken("sentence", 19, 27),
+    createToken("into", 28, 32),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 1, 1
+  };
+
+  public static final String[] FOUR_GRAM_TYPES_MIN_FOUR_GRAM = new String[] {
+    "word", "shingle",
+    "word", "shingle",
+    "word", "shingle",
+    "word",
+    "word",
+    "word"
+  };
+  
+  public static final Token[] FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new Token[] {
+    createToken("please divide this sentence", 0, 27),
+    createToken("divide this sentence into", 7, 32),
+    createToken("this sentence into shingles", 14, 39),
+  };
+
+  public static final int[] FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new int[] {
+    1, 1, 1
+  };
+  
+  public static final String[] FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM = new String[] {
+    "shingle",
+    "shingle",
+    "shingle"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+    "word", "shingle", "word", "shingle", "word", "shingle", "word",
+    "shingle", "word", "shingle", "word"
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+    createToken("pleasedivide", 0, 13),
+    createToken("dividethis", 7, 18),
+    createToken("thissentence", 14, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("intoshingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_NO_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_NO_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new Token[] {
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("intoshingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+
+  public static final Token[] BI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please<SEP>divide", 0, 13),
+    createToken("divide", 7, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("this", 14, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("sentence", 19, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("into", 28, 32),
+    createToken("into<SEP>shingles", 28, 39),
+    createToken("shingles", 33, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+    "word", "shingle", "word", "shingle", "word", "shingle", "word",
+    "shingle", "word", "shingle", "word"
+  };
 
+  public static final Token[] BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+    createToken("please<SEP>divide", 0, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("into<SEP>shingles", 28, 39),
+  };
+
+  public static final int[] BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+    1, 1, 1, 1, 1
+  };
+
+  public static final String[] BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+    "shingle", "shingle", "shingle", "shingle", "shingle"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_ALT_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("please<SEP>divide", 0, 13),
+    createToken("please<SEP>divide<SEP>this", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("divide<SEP>this<SEP>sentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("this<SEP>sentence<SEP>into", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("into<SEP>shingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_ALT_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
+  public static final Token[] TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new Token[] {
+    createToken("please<SEP>divide", 0, 13),
+    createToken("please<SEP>divide<SEP>this", 0, 18),
+    createToken("divide<SEP>this", 7, 18),
+    createToken("divide<SEP>this<SEP>sentence", 7, 27),
+    createToken("this<SEP>sentence", 14, 27),
+    createToken("this<SEP>sentence<SEP>into", 14, 32),
+    createToken("sentence<SEP>into", 19, 32),
+    createToken("sentence<SEP>into<SEP>shingles", 19, 39),
+    createToken("into<SEP>shingles", 28, 39),
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new int[] {
+    1, 0, 1, 0, 1, 0, 1, 0, 1
+  };
+  
+  public static final String[] TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR = new String[] {
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle", "shingle",
+    "shingle",
+  };
+
+  public static final Token[] TRI_GRAM_TOKENS_NULL_SEPARATOR = new Token[] {
+    createToken("please", 0, 6),
+    createToken("pleasedivide", 0, 13),
+    createToken("pleasedividethis", 0, 18),
+    createToken("divide", 7, 13),
+    createToken("dividethis", 7, 18),
+    createToken("dividethissentence", 7, 27),
+    createToken("this", 14, 18),
+    createToken("thissentence", 14, 27),
+    createToken("thissentenceinto", 14, 32),
+    createToken("sentence", 19, 27),
+    createToken("sentenceinto", 19, 32),
+    createToken("sentenceintoshingles", 19, 39),
+    createToken("into", 28, 32),
+    createToken("intoshingles", 28, 39),
+    createToken("shingles", 33, 39)
+  };
+
+  public static final int[] TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR = new int[] {
+    1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1
+  };
+
+  public static final String[] TRI_GRAM_TYPES_NULL_SEPARATOR = new String[] {
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle", "shingle",
+    "word", "shingle",
+    "word"
+  };
+  
   @Override
   protected void setUp() throws Exception {
     super.setUp();
@@ -379,6 +732,108 @@
   }
   
   
+  public void testTriGramFilterMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 3, TEST_TOKEN, TRI_GRAM_TOKENS_MIN_TRI_GRAM,
+                           TRI_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM,
+                           TRI_GRAM_TYPES_MIN_TRI_GRAM,
+                           true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, 
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           false);
+  }
+  
+  public void testFourGramFilterMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_TRI_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_MIN_TRI_GRAM, 
+                           FOUR_GRAM_TYPES_MIN_TRI_GRAM,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigramsMinTriGram() throws IOException {
+    this.shingleFilterTest(3, 4, TEST_TOKEN, 
+                           FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_TRI_GRAM,
+                           FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_TRI_GRAM, false);
+  }
+
+  public void testFourGramFilterMinFourGram() throws IOException {
+    this.shingleFilterTest(4, 4, TEST_TOKEN, FOUR_GRAM_TOKENS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_MIN_FOUR_GRAM, 
+                           FOUR_GRAM_TYPES_MIN_FOUR_GRAM,
+                           true);
+  }
+  
+  public void testFourGramFilterWithoutUnigramsMinFourGram() throws IOException {
+    this.shingleFilterTest(4, 4, TEST_TOKEN, 
+                           FOUR_GRAM_TOKENS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM,
+                           FOUR_GRAM_TYPES_WITHOUT_UNIGRAMS_MIN_FOUR_GRAM, false);
+  }
+ 
+  public void testBiGramFilterNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_NO_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, 
+                           BI_GRAM_TYPES_NO_SEPARATOR, true);
+  }
+
+  public void testBiGramFilterWithoutUnigramsNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 2, TEST_TOKEN, 
+                           BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR, 
+                           BI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           false);
+  }
+  public void testTriGramFilterNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NO_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_NO_SEPARATOR, 
+                           TRI_GRAM_TYPES_NO_SEPARATOR, true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsNoSeparator() throws IOException {
+    this.shingleFilterTest("", 2, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_NO_SEPARATOR,
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_NO_SEPARATOR, false);
+  }
+  
+  public void testBiGramFilterAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, BI_GRAM_TOKENS_ALT_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, 
+                           BI_GRAM_TYPES_ALT_SEPARATOR, true);
+  }
+
+  public void testBiGramFilterWithoutUnigramsAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 2, TEST_TOKEN, 
+                           BI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           BI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR, 
+                           BI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           false);
+  }
+  public void testTriGramFilterAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_ALT_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_ALT_SEPARATOR, 
+                           TRI_GRAM_TYPES_ALT_SEPARATOR, true);
+  }
+  
+  public void testTriGramFilterWithoutUnigramsAltSeparator() throws IOException {
+    this.shingleFilterTest("<SEP>", 2, 3, TEST_TOKEN, 
+                           TRI_GRAM_TOKENS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_WITHOUT_UNIGRAMS_ALT_SEPARATOR,
+                           TRI_GRAM_TYPES_WITHOUT_UNIGRAMS_ALT_SEPARATOR, false);
+  }
+
+  public void testTriGramFilterNullSeparator() throws IOException {
+    this.shingleFilterTest(null, 2, 3, TEST_TOKEN, TRI_GRAM_TOKENS_NULL_SEPARATOR,
+                           TRI_GRAM_POSITION_INCREMENTS_NULL_SEPARATOR, 
+                           TRI_GRAM_TYPES_NULL_SEPARATOR, true);
+  }
+  
+  
   public void testReset() throws Exception {
     Tokenizer wsTokenizer = new WhitespaceTokenizer(new StringReader("please divide this sentence"));
     TokenStream filter = new ShingleFilter(wsTokenizer, 2);
@@ -403,30 +858,50 @@
     throws IOException {
 
     ShingleFilter filter = new ShingleFilter(new TestTokenStream(tokensToShingle), maxSize);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTest(int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize, Token[] tokensToShingle, 
+                                   Token[] tokensToCompare, int[] positionIncrements,
+                                   String[] types, boolean outputUnigrams)
+    throws IOException {
+    ShingleFilter filter 
+      = new ShingleFilter(new TestTokenStream(tokensToShingle), minSize, maxSize);
+    filter.setTokenSeparator(tokenSeparator);
+    shingleFilterTestCommon
+      (filter, tokensToCompare, positionIncrements, types, outputUnigrams);
+  }
+
+  protected void shingleFilterTestCommon(ShingleFilter filter,
+                                         Token[] tokensToCompare,
+                                         int[] positionIncrements,
+                                         String[] types, boolean outputUnigrams)
+    throws IOException {
+
     filter.setOutputUnigrams(outputUnigrams);
 
-    TermAttribute termAtt = filter.addAttribute(TermAttribute.class);
-    OffsetAttribute offsetAtt = filter.addAttribute(OffsetAttribute.class);
-    PositionIncrementAttribute posIncrAtt = filter.addAttribute(PositionIncrementAttribute.class);
-    TypeAttribute typeAtt = filter.addAttribute(TypeAttribute.class);
-
-    int i = 0;
-    while (filter.incrementToken()) {
-      assertTrue("ShingleFilter outputted more tokens than expected", i < tokensToCompare.length);
-      String termText = termAtt.term();
-      String goldText = tokensToCompare[i].term();
-      assertEquals("Wrong termText", goldText, termText);
-      assertEquals("Wrong startOffset for token \"" + termText + "\"",
-          tokensToCompare[i].startOffset(), offsetAtt.startOffset());
-      assertEquals("Wrong endOffset for token \"" + termText + "\"",
-          tokensToCompare[i].endOffset(), offsetAtt.endOffset());
-      assertEquals("Wrong positionIncrement for token \"" + termText + "\"",
-          positionIncrements[i], posIncrAtt.getPositionIncrement());
-      assertEquals("Wrong type for token \"" + termText + "\"", types[i], typeAtt.type());
-      i++;
+    String text[] = new String[tokensToCompare.length];
+    int startOffsets[] = new int[tokensToCompare.length];
+    int endOffsets[] = new int[tokensToCompare.length];
+    
+    for (int i = 0; i < tokensToCompare.length; i++) {
+      text[i] = tokensToCompare[i].term();
+      startOffsets[i] = tokensToCompare[i].startOffset();
+      endOffsets[i] = tokensToCompare[i].endOffset();
     }
-    assertEquals("ShingleFilter outputted wrong # of tokens. (# output = " + i + "; # expected =" + tokensToCompare.length + ")",
-                 tokensToCompare.length, i);
+    
+    assertTokenStreamContents(filter, text, startOffsets, endOffsets, types, positionIncrements);
   }
 
   private static Token createToken(String term, int start, int offset)



Mime
View raw message