lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ka...@apache.org
Subject svn commit: r673549 - in /lucene/java/trunk/contrib: analyzers/ analyzers/src/java/org/apache/lucene/analysis/miscellaneous/ analyzers/src/java/org/apache/lucene/analysis/shingle/ analyzers/src/test/org/apache/lucene/analysis/miscellaneous/ analyzers/s...
Date Wed, 02 Jul 2008 23:53:52 GMT
Author: kalle
Date: Wed Jul  2 16:53:51 2008
New Revision: 673549

URL: http://svn.apache.org/viewvc?rev=673549&view=rev
Log:
LUCENE-1320
ShingleMatrixFilter, a multidimensional shingle token filter.

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestSingleTokenTokenFilter.java
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
Removed:
    lucene/java/trunk/contrib/instantiated/CHANGES.txt
Modified:
    lucene/java/trunk/contrib/analyzers/build.xml

Modified: lucene/java/trunk/contrib/analyzers/build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/build.xml?rev=673549&r1=673548&r2=673549&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/build.xml (original)
+++ lucene/java/trunk/contrib/analyzers/build.xml Wed Jul  2 16:53:51 2008
@@ -23,5 +23,8 @@
     Additional Analyzers
   </description>
 
+  <property name="javac.source" value="1.5" />
+  <property name="javac.target" value="1.5" />
+  
   <import file="../contrib-build.xml"/>
 </project>

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/EmptyTokenStream.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,44 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+import java.io.IOException;
+
+/**
+ * An always exhausted token stream.
+ */
+public class EmptyTokenStream extends TokenStream {
+
+  public Token next() throws IOException {
+    return null;
+  }
+
+  public Token next(Token result) throws IOException {
+    return null;
+  }
+
+  public void reset() throws IOException {
+  }
+
+  public void close() throws IOException {
+  }
+
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAndSuffixAwareTokenFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,71 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * Links two PrefixAwareTokenFilters: the first joins prefix and input, the second joins that result with the suffix.
+ */
+public class PrefixAndSuffixAwareTokenFilter extends TokenStream {
+
+  private PrefixAwareTokenFilter suffix;
+
+  public PrefixAndSuffixAwareTokenFilter(TokenStream prefix, TokenStream input, TokenStream suffix) {
+    prefix = new PrefixAwareTokenFilter(prefix, input) {
+      public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
+        return PrefixAndSuffixAwareTokenFilter.this.updateInputToken(suffixToken, lastInputToken);
+      }
+    };
+    this.suffix = new PrefixAwareTokenFilter(prefix, suffix) {
+      public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
+        return PrefixAndSuffixAwareTokenFilter.this.updateSuffixToken(suffixToken, lastInputToken);
+      }
+    };
+  }
+
+  public Token updateInputToken(Token inputToken, Token lastPrefixToken) {
+    inputToken.setStartOffset(lastPrefixToken.endOffset() + inputToken.startOffset());
+    inputToken.setEndOffset(lastPrefixToken.endOffset() + inputToken.endOffset());
+    return inputToken;
+  }
+
+  public Token updateSuffixToken(Token suffixToken, Token lastInputToken) {
+    suffixToken.setStartOffset(lastInputToken.endOffset() + suffixToken.startOffset());
+    suffixToken.setEndOffset(lastInputToken.endOffset() + suffixToken.endOffset());
+    return suffixToken;
+  }
+
+
+  public Token next(Token result) throws IOException {
+    return suffix.next(result);
+  }
+
+
+  public void reset() throws IOException {
+    suffix.reset();
+  }
+
+
+  public void close() throws IOException {
+    suffix.close();
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/PrefixAwareTokenFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,148 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.Payload;
+
+import java.io.IOException;
+
+
+/**
+ * Joins two token streams and leaves the last token of the first stream available
+ * to be used when updating the token values in the second stream based on that token.
+ *
+ * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
+ */
+public class PrefixAwareTokenFilter extends TokenStream {
+
+  private TokenStream prefix;
+  private TokenStream suffix;
+
+  public PrefixAwareTokenFilter(TokenStream prefix, TokenStream suffix) {
+    this.suffix = suffix;
+    this.prefix = prefix;
+    prefixExhausted = false;
+  }
+
+  private CopyableToken previousPrefixToken = new CopyableToken();
+
+  private boolean prefixExhausted;
+
+  public Token next(Token result) throws IOException {
+
+    Token buf = result;
+
+    if (!prefixExhausted) {
+      result = prefix.next(result);
+      if (result == null) {
+        prefixExhausted = true;
+      } else {
+        previousPrefixToken.copyFrom(result);        
+        return result;
+      }
+    }
+
+    result = suffix.next(buf);
+    if (result == null) {
+      return null;
+    }
+
+    return updateSuffixToken(result, previousPrefixToken);
+  }
+
+  /**
+   * The default implementation adds last prefix token end offset to the suffix token start and end offsets.
+   *
+   * @param suffixToken a token from the suffix stream
+   * @param lastPrefixToken the last token from the prefix stream
+   * @return consumer token
+   */
+  public Token updateSuffixToken(Token suffixToken, Token lastPrefixToken) {
+    suffixToken.setStartOffset(lastPrefixToken.endOffset() + suffixToken.startOffset());
+    suffixToken.setEndOffset(lastPrefixToken.endOffset() + suffixToken.endOffset());
+    return suffixToken;
+  }
+
+  public void close() throws IOException {
+    prefix.close();
+    suffix.close();
+  }
+
+  public void reset() throws IOException {
+    super.reset();
+    if (prefix != null) {
+      prefixExhausted = false;
+      prefix.reset();
+    }
+    if (suffix != null) {
+      suffix.reset();
+    }
+
+
+  }
+
+
+  public TokenStream getPrefix() {
+    return prefix;
+  }
+
+  public void setPrefix(TokenStream prefix) {
+    this.prefix = prefix;
+  }
+
+  public TokenStream getSuffix() {
+    return suffix;
+  }
+
+  public void setSuffix(TokenStream suffix) {
+    this.suffix = suffix;
+  }
+
+
+  public static class CopyableToken extends Token {
+
+    private Payload buf = new Payload();
+
+    public void copyFrom(Token source) {
+      if (source.termBuffer() != null) {
+        setTermBuffer(source.termBuffer(), 0, source.termLength());
+      } else {
+        setTermText(null);
+        setTermLength(0);
+      }
+
+      setPositionIncrement(source.getPositionIncrement());
+      setFlags(source.getFlags());
+      setStartOffset(source.startOffset());
+      setEndOffset(source.endOffset());
+      setType(source.type());
+      if (source.getPayload() == null) {
+        setPayload(null);
+      } else {
+        setPayload(buf);        
+        if (buf.getData() == null || buf.getData().length < source.getPayload().length()) {
+          buf.setData(new byte[source.getPayload().length()]);
+        }
+        source.getPayload().copyTo(buf.getData(), 0);
+        buf.setData(buf.getData(), 0, source.getPayload().length());
+      }
+    }
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/miscellaneous/SingleTokenTokenStream.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,59 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+
+import java.io.IOException;
+
+/**
+ * A token stream containing a single token.
+ */
+public class SingleTokenTokenStream extends TokenStream {
+
+  private boolean exhausted = false;
+  private Token token;
+
+
+  public SingleTokenTokenStream(Token token) {
+    this.token = token;
+  }
+
+
+  public Token next(Token result) throws IOException {
+    if (exhausted) {
+      return null;
+    }
+    exhausted = true;
+    return token;
+  }
+
+
+  public void reset() throws IOException {
+    exhausted = false;
+  }
+
+  public Token getToken() {
+    return token;
+  }
+
+  public void setToken(Token token) {
+    this.token = token;
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/shingle/ShingleMatrixFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,924 @@
+package org.apache.lucene.analysis.shingle;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.EmptyTokenStream;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.index.Payload;
+
+import java.io.IOException;
+import java.util.*;
+
+
+/**
+ * <p>A ShingleFilter constructs shingles (token n-grams) from a token stream.
+ * In other words, it creates combinations of tokens as a single token.
+ *
+ * <p>For example, the sentence "please divide this sentence into shingles"
+ * might be tokenized into shingles "please divide", "divide this",
+ * "this sentence", "sentence into", and "into shingles".
+ *
+ * <p>Using a shingle filter at index and query time can in some instances
+ * be used to replace phrase queries, especially those with 0 slop.
+ *
+ * <p>Without a spacer character
+ * it can be used to handle composition and decomposition of words
+ * such as searching for "multi dimensional" instead of "multidimensional".
+ * It is a rather common human problem at query time
+ * in several languages, notably the northern Germanic branch.
+ *
+ * <p>Shingles are amongst many things also known to solve problems
+ * in spell checking, language detection and document clustering.  
+ *
+ * <p>This filter is backed by a three dimensional column oriented matrix
+ * used to create permutations of the second dimension, the rows,
+ * and leaves the third, the z-axis, for multi token synonyms.
+ *
+ * <p>In order to use this filter you need to define a way of positioning
+ * the input stream tokens in the matrix. This is done using a
+ * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec}.
+ * There are three simple implementations for demonstrational purposes,
+ * see {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec},
+ * {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec}
+ * and {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec}.
+ *
+ * <p>Consider this token matrix:
+ * <pre>
+ *  Token[column][row][z-axis]{
+ *    {{hello}, {greetings, and, salutations}},
+ *    {{world}, {earth}, {tellus}}
+ *  };
+ * </pre>
+ *
+ * It would produce the following 2-3 gram sized shingles:
+ *
+ * <pre>
+ * "hello_world"
+ * "greetings_and"
+ * "greetings_and_salutations"
+ * "and_salutations"
+ * "and_salutations_world"
+ * "salutations_world"
+ * "hello_earth"
+ * "and_salutations_earth"
+ * "salutations_earth"
+ * "hello_tellus"
+ * "and_salutations_tellus"
+ * "salutations_tellus"
+ *  </pre>
+ * 
+ * <p>This implementation can be rather heap demanding
+ * if (maximum shingle size - minimum shingle size) is a great number and the stream contains many columns,
+ * or if each column contains a great number of rows.
+ *
+ * <p>The problem is that in order to avoid producing duplicates
+ * the filter needs to keep track of any shingle already produced and returned to the consumer.
+ *
+ * There is a bit of resource management to handle this
+ * but it would of course be much better if the filter was written
+ * so it never created the same shingle more than once in the first place.
+ *
+ * <p>The filter also has basic support for calculating weights for the shingles
+ * based on the weights of the tokens from the input stream, output shingle size, etc.
+ * See {@link #calculateShingleWeight(org.apache.lucene.analysis.Token, java.util.List, int, java.util.List, java.util.List)}.
+ *
+ */
+public class ShingleMatrixFilter extends TokenStream {
+
+  public static Character defaultSpacerCharacter = '_';
+  public static TokenSettingsCodec defaultSettingsCodec = new OneDimensionalNonWeightedTokenSettingsCodec();
+  public static boolean ignoringSinglePrefixOrSuffixShingleByDefault = false;
+
+  /**
+   * Strategy used to code and decode meta data of the tokens from the input stream
+   * regarding how to position the tokens in the matrix, set and retrieve weight, etc.
+   */
+  public static abstract class TokenSettingsCodec {
+
+    /**
+     * Retrieves information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+     * @param token
+     * @return
+     * @throws IOException
+     */
+    public abstract TokenPositioner getTokenPositioner(Token token) throws IOException;
+
+    /**
+     * Sets information on how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+     *
+     * @param token
+     * @param tokenPositioner
+     */
+    public abstract void setTokenPositioner(Token token, ShingleMatrixFilter.TokenPositioner tokenPositioner);
+
+    /**
+     * Have this method return 1f in order to 'disable' weights.
+     * @param token
+     * @return the weight of parameter token
+     */
+    public abstract float getWeight(Token token);
+
+    /**
+     * Have this method do nothing in order to 'disable' weights.
+     * @param token
+     * @param weight
+     */
+    public abstract void setWeight(Token token, float weight);
+  }
+
+
+  /**
+   * Used to describe how a {@link org.apache.lucene.analysis.Token} is to be inserted to a {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix}.
+   * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#getTokenPositioner(org.apache.lucene.analysis.Token)
+   * @see org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenSettingsCodec#setTokenPositioner(org.apache.lucene.analysis.Token,org.apache.lucene.analysis.shingle.ShingleMatrixFilter.TokenPositioner)
+   */
+  public static enum TokenPositioner {
+    newColumn(0),
+    newRow(1),
+    sameRow(2);
+
+    private final int index;
+
+    private TokenPositioner(int index) {
+      this.index = index;
+    }
+
+    public int getIndex() {
+      return index;
+    }
+  }
+
+  // filter instance settings variables
+
+  private TokenSettingsCodec settingsCodec;
+
+  private int minimumShingleSize;
+  private int maximumShingleSize;
+
+  private boolean ignoringSinglePrefixOrSuffixShingle = false;
+
+  private Character spacerCharacter = '_';
+
+  private TokenStream input;
+
+
+  /**
+   * Creates a shingle filter based on a user defined matrix.
+   *
+   * The filter /will/ delete columns from the input matrix! You will not be able to reset the filter if you used this constructor.
+   * todo: don't touch the matrix! use a boolean, set the input stream to null or something, and keep track of where in the matrix we are at.
+   *
+   * @param matrix the input basis for creating shingles. Does not need to contain any information until {@link org.apache.lucene.analysis.shingle.ShingleMatrixFilter#next(org.apache.lucene.analysis.Token)} is called the first time.
+   * @param minimumShingleSize minimum number of tokens in any shingle.
+   * @param maximumShingleSize maximum number of tokens in any shingle.
+   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
+   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain a permutation of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
+   * @param settingsCodec codec used to read input token weight and matrix positioning.
+   */
+  public ShingleMatrixFilter(Matrix matrix, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
+    this.matrix = matrix;
+    this.minimumShingleSize = minimumShingleSize;
+    this.maximumShingleSize = maximumShingleSize;
+    this.spacerCharacter = spacerCharacter;
+    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
+    this.settingsCodec = settingsCodec;
+
+    // set the input to be an empty token stream, we already have the data.
+    this.input = new EmptyTokenStream();
+  }
+
+  /**
+   * Creates a shingle filter using default settings.
+   *
+   * @see #defaultSpacerCharacter
+   * @see #ignoringSinglePrefixOrSuffixShingleByDefault
+   * @see #defaultSettingsCodec
+   *
+   * @param input stream from which to construct the matrix
+   * @param minimumShingleSize minimum number of tokens in any shingle.
+   * @param maximumShingleSize maximum number of tokens in any shingle.
+   */
+  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize) {
+    this(input, minimumShingleSize, maximumShingleSize, defaultSpacerCharacter);
+  }
+
+
+  /**
+   * Creates a shingle filter using default settings.
+   *
+   * @see #ignoringSinglePrefixOrSuffixShingleByDefault
+   * @see #defaultSettingsCodec
+   *
+   * @param input stream from which to construct the matrix
+   * @param minimumShingleSize minimum number of tokens in any shingle.
+   * @param maximumShingleSize maximum number of tokens in any shingle.
+   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
+   */
+  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter) {
+    this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingleByDefault);
+  }
+
+  /**
+   * Creates a shingle filter using the default {@link TokenSettingsCodec}.
+   *
+   * @see #defaultSettingsCodec
+   *
+   * @param input stream from which to construct the matrix
+   * @param minimumShingleSize minimum number of tokens in any shingle.
+   * @param maximumShingleSize maximum number of tokens in any shingle.
+   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
+   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain a permutation of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
+   */
+  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle) {
+    this(input, minimumShingleSize, maximumShingleSize, spacerCharacter, ignoringSinglePrefixOrSuffixShingle, defaultSettingsCodec);
+  }
+
+
+  /**
+   * Creates a shingle filter with ad hoc parameter settings.
+   *
+   * @param input stream from which to construct the matrix
+   * @param minimumShingleSize minimum number of tokens in any shingle.
+   * @param maximumShingleSize maximum number of tokens in any shingle.
+   * @param spacerCharacter character to use between texts of the token parts in a shingle. null for none.
+   * @param ignoringSinglePrefixOrSuffixShingle if true, shingles that only contain a permutation of the first or the last column will not be produced as shingles. Useful when adding boundary marker tokens such as '^' and '$'.
+   * @param settingsCodec codec used to read input token weight and matrix positioning.
+   */
+  public ShingleMatrixFilter(TokenStream input, int minimumShingleSize, int maximumShingleSize, Character spacerCharacter, boolean ignoringSinglePrefixOrSuffixShingle, TokenSettingsCodec settingsCodec) {
+    this.input = input;
+    this.minimumShingleSize = minimumShingleSize;
+    this.maximumShingleSize = maximumShingleSize;
+    this.spacerCharacter = spacerCharacter;
+    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
+    this.settingsCodec = settingsCodec;
+  }
+
+  // internal filter instance variables
+
+  /** iterator over the current matrix row permutations */
+  private Iterator<Matrix.Column.Row[]> permutations;
+
+  /** the current permutation of tokens used to produce shingles */
+  private List<Token> currentPermuationTokens;
+  /** the row that each token in currentPermuationTokens belongs to, by the same index */
+  private List<Matrix.Column.Row> currentPermutationRows;
+
+  private int currentPermutationTokensStartOffset;
+  private int currentShingleLength;
+
+  /**
+   * a set containing shingles that have been the result of a call to next(Token),
+   * used to avoid producing the same shingle more than once.
+   */
+  private Set<List<Token>> shinglesSeen = new HashSet<List<Token>>();
+
+
+  public void reset() throws IOException {
+    permutations = null;
+    shinglesSeen.clear();
+    input.reset();
+  }
+
+  private Matrix matrix;
+
+  public Token next(Token token) throws IOException {
+    if (matrix == null) {
+      matrix = new Matrix();
+      // fill matrix with maximumShingleSize columns
+      while (matrix.columns.size() < maximumShingleSize && readColumn()) {
+        // this loop looks ugly
+      }
+    }
+
+    if (currentPermuationTokens != null) {
+      currentShingleLength++;
+
+      if (currentShingleLength + currentPermutationTokensStartOffset <= currentPermuationTokens.size()
+          && currentShingleLength <= maximumShingleSize) {
+
+        // it is possible to create at least one more shingle of the current matrix permutation
+
+        if (ignoringSinglePrefixOrSuffixShingle
+            && currentShingleLength == 1
+            && (currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isFirst() || currentPermutationRows.get(currentPermutationTokensStartOffset).getColumn().isLast())) {
+          return next(token);
+        }
+
+        int termLength = 0;
+
+        List<Token> shingle = new ArrayList<Token>();
+
+        for (int i = 0; i < currentShingleLength; i++) {
+          Token shingleToken = currentPermuationTokens.get(i + currentPermutationTokensStartOffset);
+          termLength += shingleToken.termLength();
+          shingle.add(shingleToken);
+        }
+        if (spacerCharacter != null) {
+          termLength += currentShingleLength - 1;
+        }
+
+        // only produce shingles that have not already been created
+        if (!shinglesSeen.add(shingle)) {
+          return next(token);
+        }
+
+        // shingle token factory
+        StringBuilder sb = new StringBuilder(termLength + 10); // paranormal abillity to forsay the future.
+        for (Token shingleToken : shingle) {
+          if (spacerCharacter != null && sb.length() > 0) {
+            sb.append(spacerCharacter);
+          }
+          sb.append(shingleToken.termBuffer(), 0, shingleToken.termLength());
+        }
+        token.setTermText(sb.toString());
+        updateToken(token, shingle, currentPermutationTokensStartOffset, currentPermutationRows, currentPermuationTokens);
+
+        return token;
+
+      } else {
+
+        // it is NOT possible to create any more shingles of the current matrix permutation
+
+        if (currentPermutationTokensStartOffset < currentPermuationTokens.size() - 1) {
+          // reset shingle size and move one step to the right in the current tokens permutation
+          currentPermutationTokensStartOffset++;
+          currentShingleLength = minimumShingleSize - 1;
+          return next(token);
+        }
+
+
+        if (permutations == null) {
+          // todo does this ever occur?
+          return null;
+        }
+
+
+        if (!permutations.hasNext()) {
+
+          // load more data (if available) to the matrix
+
+          if (input != null && readColumn()) {
+            // don't really care, we just read it.
+          }
+
+          // get rid of resources
+
+          // delete the first column in the matrix
+          Matrix.Column deletedColumn = matrix.columns.remove(0);
+
+          // remove all shingles seen that include any of the tokens from the deleted column.
+          List<Token> deletedColumnTokens = new ArrayList<Token>();
+          for (Matrix.Column.Row row : deletedColumn.getRows()) {
+            for (Token shingleToken : row.getTokens()) {
+              deletedColumnTokens.add(shingleToken);
+            }
+          }
+          for (Iterator<List<Token>> shinglesSeenIterator = shinglesSeen.iterator(); shinglesSeenIterator.hasNext();) {
+            List<Token> shingle = shinglesSeenIterator.next();
+            for (Token deletedColumnToken : deletedColumnTokens) {
+              if (shingle.contains(deletedColumnToken)) {
+                shinglesSeenIterator.remove();
+                break;
+              }
+            }
+          }
+
+
+          if (matrix.columns.size() < minimumShingleSize) {
+            // exhausted
+            return null;
+          }
+
+          // create permutations of the matrix as it now looks
+          permutations = matrix.permutationIterator();
+        }
+
+        nextTokensPermutation();
+        return next(token);
+
+      }
+    }
+
+    if (permutations == null) {
+      permutations = matrix.permutationIterator();
+    }
+
+    if (!permutations.hasNext()) {
+      return null;
+    }
+
+    nextTokensPermutation();
+
+    return next(token);
+  }
+
+  /**
+   * Moves on to the next permutation of matrix rows.
+   *
+   * Flattens the rows of that permutation into one list of tokens and builds
+   * a parallel list that, for each token position, holds the row the token
+   * came from. Afterwards the shingle start offset is reset to 0 and the
+   * current shingle length to one less than the minimum shingle size.
+   */
+  private void nextTokensPermutation() {
+    List<Token> flattenedTokens = new ArrayList<Token>();
+    List<Matrix.Column.Row> owningRows = new ArrayList<Matrix.Column.Row>();
+
+    for (Matrix.Column.Row permutationRow : permutations.next()) {
+      for (Token rowToken : permutationRow.getTokens()) {
+        // keep both lists index-aligned: position i of one describes position i of the other
+        flattenedTokens.add(rowToken);
+        owningRows.add(permutationRow);
+      }
+    }
+
+    this.currentPermutationRows = owningRows;
+    this.currentPermuationTokens = flattenedTokens;
+
+    // start over from the left of the new permutation
+    this.currentShingleLength = minimumShingleSize - 1;
+    this.currentPermutationTokensStartOffset = 0;
+  }
+
+  /**
+   * Finishes a shingle token just before {@link #next(org.apache.lucene.analysis.Token)} hands it to the consumer.
+   *
+   * Sets the type to the simple class name of this filter, clears the flags, sets the position
+   * increment to 1, takes the start offset from the first and the end offset from the last
+   * token of the shingle, and finally stores the calculated weight through the settings codec.
+   *
+   * @param token Shingle token
+   * @param shingle Tokens used to produce the shingle token.
+   * @param currentPermutationStartOffset Start offset in parameter currentPermutationTokens
+   * @param currentPermutationRows index to Matrix.Column.Row from the position of tokens in parameter currentPermutationTokens
+   * @param currentPermuationTokens tokens of the current permutation of rows in the matrix.
+   */
+  public void updateToken(Token token, List<Token> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<Token> currentPermuationTokens) {
+    token.setType(ShingleMatrixFilter.class.getSimpleName());
+    token.setFlags(0);
+    token.setPositionIncrement(1);
+
+    Token firstPart = shingle.get(0);
+    token.setStartOffset(firstPart.startOffset());
+
+    Token lastPart = shingle.get(shingle.size() - 1);
+    token.setEndOffset(lastPart.endOffset());
+
+    // the weight is calculated last so that the calculation sees the token with all other settings applied
+    float shingleWeight = calculateShingleWeight(token, shingle, currentPermutationStartOffset, currentPermutationRows, currentPermuationTokens);
+    settingsCodec.setWeight(token, shingleWeight);
+  }
+
+  /**
+   * Evaluates the new shingle token weight.
+   *
+   * <pre>
+   * for (shingle part token in shingle)
+   *   weight += shingle part token weight * (1 / sqrt(all shingle part token weights summed))
+   * </pre>
+   *
+   * This algorithm gives a slightly greater score for longer shingles
+   * and is rather penalising to great shingle token part weights.
+   *
+   * When the part weights sum to zero the weight is 0: the factor 1 / sqrt(0) would be
+   * infinite and could turn the result into NaN.
+   *
+   * @param shingleToken token returned to consumer
+   * @param shingle tokens the tokens used to produce the shingle token.
+   * @param currentPermutationStartOffset start offset in parameter currentPermutationRows and currentPermutationTokens.
+   * @param currentPermutationRows an index to what matrix row a token in parameter currentPermutationTokens exist.
+   * @param currentPermuationTokens all tokens in the current row permutation of the matrix. A sub list (parameter offset, parameter shingle.size) equals parameter shingle.
+   * @return weight to be set for parameter shingleToken
+   */
+  public float calculateShingleWeight(Token shingleToken, List<Token> shingle, int currentPermutationStartOffset, List<Matrix.Column.Row> currentPermutationRows, List<Token> currentPermuationTokens) {
+    double[] weights = new double[shingle.size()];
+
+    // read every part weight once from the codec and sum them up
+    double total = 0d;
+    for (int i = 0; i < weights.length; i++) {
+      weights[i] = settingsCodec.getWeight(shingle.get(i));
+      total += weights[i];
+    }
+
+    if (total == 0d) {
+      // dividing by sqrt(0) yields infinity, and the weighted sum below could then become NaN
+      return 0f;
+    }
+
+    double factor = 1d / Math.sqrt(total);
+
+    double weight = 0d;
+    for (double partWeight : weights) {
+      weight += partWeight * factor;
+    }
+
+    return (float) weight;
+  }
+
+
+  private Token readColumnBuf;
+
+  /**
+   * Loads one column from the token stream.
+   *
+   * The first token of the column is the token left in readColumnBuf by the previous call or,
+   * if there is none, the next token from the input. Further tokens are added to the column
+   * until the input returns null or the settings codec positions a token as newColumn; such a
+   * token stays in readColumnBuf and starts the column read by the next call.
+   *
+   * When the last token is read from the token stream it will column.setLast(true);
+   *
+   * @return true if it managed to read one more column from the input token stream
+   * @throws IOException if the matrix source input stream throws an exception
+   */
+  private boolean readColumn() throws IOException {
+
+    // start with the look-ahead token left over from the previous call, if any
+    Token token;
+    if (readColumnBuf != null) {
+      token = readColumnBuf;
+      readColumnBuf = null;
+    } else {
+      token = input.next(new Token());
+    }
+
+    // nothing buffered and nothing more in the input: no column to read
+    if (token == null) {
+      return false;
+    }
+
+    // constructing a Column appends it to the matrix, constructing a Row appends it to its column
+    Matrix.Column currentReaderColumn = matrix.new Column();
+    Matrix.Column.Row currentReaderRow = currentReaderColumn.new Row();
+
+    currentReaderRow.getTokens().add(token);
+    TokenPositioner tokenPositioner;
+    // consume tokens until the input ends or a token is positioned as the start of a new column
+    while ((readColumnBuf = input.next(new Token())) != null
+        && (tokenPositioner = settingsCodec.getTokenPositioner(readColumnBuf)) != TokenPositioner.newColumn) {
+
+      if (tokenPositioner == TokenPositioner.sameRow) {
+        currentReaderRow.getTokens().add(readColumnBuf);
+      } else /*if (tokenPositioner == TokenPositioner.newRow)*/ {
+        currentReaderRow = currentReaderColumn.new Row();
+        currentReaderRow.getTokens().add(readColumnBuf);
+      }
+      // the token now belongs to this column, so it is no longer a look-ahead
+      readColumnBuf = null;
+
+    }
+
+    // readColumnBuf is null here only when the loop stopped because the input returned null;
+    // the input is asked once more and, if it still has nothing, this column is flagged as the last one
+    if (readColumnBuf == null) {
+      readColumnBuf = input.next(new Token());
+      if (readColumnBuf == null) {
+        currentReaderColumn.setLast(true);
+      }
+    }
+
+
+    return true;
+
+  }
+
+
+  /**
+   * A column focused matrix in three dimensions:
+   *
+   * <pre>
+   * Token[column][row][z-axis] {
+   *     {{hello}, {greetings, and, salutations}},
+   *     {{world}, {earth}, {tellus}}
+   * };
+   * </pre>
+   *
+   * Columns append themselves to the matrix, and rows to their column, when they are constructed.
+   *
+   * todo consider row groups
+   * to indicate that shingles are only to contain permutations with texts in that same row group.
+   */
+  public static class Matrix {
+
+    /** Set once the first column has been created; only that column gets flagged as first. */
+    private boolean columnsHasBeenCreated = false;
+
+    private List<Column> columns = new ArrayList<Column>();
+
+    /** @return the list of columns backing this matrix (not a copy) */
+    public List<Column> getColumns() {
+      return columns;
+    }
+
+    /** A column of the matrix, holding a list of rows. */
+    public class Column {
+
+      private boolean last;
+      private boolean first;
+
+      public Matrix getMatrix() {
+        return Matrix.this;
+      }
+
+      /**
+       * Creates a column holding a single row with the given token.
+       *
+       * @param token the only token of the only row
+       */
+      public Column(Token token) {
+        this();
+        // the Row constructor appends the new row to this column
+        Row row = new Row();
+        row.getTokens().add(token);
+      }
+
+      /**
+       * Creates a column without rows and appends it to the enclosing matrix.
+       * The first column ever created for a matrix is flagged as first.
+       */
+      public Column() {
+        synchronized (Matrix.this) {
+          if (!columnsHasBeenCreated) {
+            this.setFirst(true);
+            columnsHasBeenCreated = true;
+          }
+        }
+        Matrix.this.columns.add(this);
+      }
+
+      private List<Row> rows = new ArrayList<Row>();
+
+      /** @return the list of rows backing this column (not a copy) */
+      public List<Row> getRows() {
+        return rows;
+      }
+
+
+      /** @return the position of this column in the matrix, or -1 if the matrix does not contain it */
+      public int getIndex() {
+        return Matrix.this.columns.indexOf(this);
+      }
+
+      public String toString() {
+        return "Column{" +
+            "first=" + first +
+            ", last=" + last +
+            ", rows=" + rows +
+            '}';
+      }
+
+      public boolean isFirst() {
+        return first;
+      }
+
+      public void setFirst(boolean first) {
+        this.first = first;
+      }
+
+      public void setLast(boolean last) {
+        this.last = last;
+      }
+
+      public boolean isLast() {
+        return last;
+      }
+
+      /** A row of a column: a list of tokens. A new row appends itself to its column. */
+      public class Row {
+
+        public Column getColumn() {
+          return Column.this;
+        }
+
+        private List<Token> tokens = new LinkedList<Token>();
+
+        public Row() {
+          Column.this.rows.add(this);
+        }
+
+        /** @return the position of this row in its column, or -1 if the column does not contain it */
+        public int getIndex() {
+          return Column.this.rows.indexOf(this);
+        }
+
+        public List<Token> getTokens() {
+          return tokens;
+        }
+
+        public void setTokens(List<Token> tokens) {
+          this.tokens = tokens;
+        }
+
+        public String toString() {
+          // tokens is already a List; wrapping it in Arrays.asList would print a nested list
+          return "Row{" +
+              "index=" + getIndex() +
+              ", tokens=" + tokens +
+              '}';
+        }
+      }
+
+    }
+
+
+    /**
+     * Creates an iterator over all row permutations of the matrix. Each element is an array
+     * holding one row from every column, in column order; the row of the first column changes
+     * fastest. The number of columns is fixed when the iterator is created.
+     * A matrix without columns yields an iterator without elements.
+     *
+     * @return a new permutation iterator; its remove() is not implemented
+     */
+    public Iterator<Column.Row[]> permutationIterator() {
+
+      return new Iterator<Column.Row[]>() {
+
+        /** For each column, the index of the row to use in the next permutation. */
+        private int[] columnRowCounters = new int[columns.size()];
+
+        public void remove() {
+          throw new IllegalStateException("not implemented");
+        }
+
+        public boolean hasNext() {
+          int s = columnRowCounters.length;
+          // the counter of the last column never wraps, so it running out marks the end;
+          // with no columns at all there is nothing to index and nothing to iterate
+          return s != 0 && columnRowCounters[s - 1] < columns.get(s - 1).getRows().size();
+        }
+
+        public Column.Row[] next() {
+          if (!hasNext()) {
+            throw new NoSuchElementException("no more elements");
+          }
+
+          Column.Row[] rows = new Column.Row[columnRowCounters.length];
+
+          for (int i = 0; i < columnRowCounters.length; i++) {
+            rows[i] = columns.get(i).rows.get(columnRowCounters[i]);
+          }
+          incrementColumnRowCounters();
+
+          return rows;
+        }
+
+        /**
+         * Steps the counters like an odometer: the first column is incremented, and a column
+         * that runs out of rows wraps to 0 and carries over to the next one. The last column
+         * is never wrapped.
+         */
+        private void incrementColumnRowCounters() {
+          for (int i = 0; i < columnRowCounters.length; i++) {
+            columnRowCounters[i]++;
+            if (columnRowCounters[i] == columns.get(i).rows.size() &&
+                i < columnRowCounters.length - 1) {
+              columnRowCounters[i] = 0;
+            } else {
+              break;
+            }
+          }
+        }
+      };
+    }
+
+    public String toString() {
+      return "Matrix{" +
+          "columns=" + columns +
+          '}';
+    }
+  }
+
+
+  /** @return the minimum shingle size */
+  public int getMinimumShingleSize() {
+    return minimumShingleSize;
+  }
+
+  /** @param minimumShingleSize the new minimum shingle size */
+  public void setMinimumShingleSize(int minimumShingleSize) {
+    this.minimumShingleSize = minimumShingleSize;
+  }
+
+  /** @return the maximum shingle size */
+  public int getMaximumShingleSize() {
+    return maximumShingleSize;
+  }
+
+  /** @param maximumShingleSize the new maximum shingle size */
+  public void setMaximumShingleSize(int maximumShingleSize) {
+    this.maximumShingleSize = maximumShingleSize;
+  }
+
+
+  /** @return the matrix this filter produces shingles from */
+  public Matrix getMatrix() {
+    return matrix;
+  }
+
+  /** @param matrix the matrix this filter is to produce shingles from */
+  public void setMatrix(Matrix matrix) {
+    this.matrix = matrix;
+  }
+
+  /** @return the character placed between the terms of a shingle, or null if none is inserted */
+  public Character getSpacerCharacter() {
+    return spacerCharacter;
+  }
+
+  /** @param spacerCharacter the character to place between the terms of a shingle, or null for none */
+  public void setSpacerCharacter(Character spacerCharacter) {
+    this.spacerCharacter = spacerCharacter;
+  }
+
+  /** @return true if shingles of length one taken from the first or last matrix column are skipped */
+  public boolean isIgnoringSinglePrefixOrSuffixShingle() {
+    return ignoringSinglePrefixOrSuffixShingle;
+  }
+
+  /** @param ignoringSinglePrefixOrSuffixShingle true to skip shingles of length one taken from the first or last matrix column */
+  public void setIgnoringSinglePrefixOrSuffixShingle(boolean ignoringSinglePrefixOrSuffixShingle) {
+    this.ignoringSinglePrefixOrSuffixShingle = ignoringSinglePrefixOrSuffixShingle;
+  }
+
+  /**
+   * Using this codec makes a {@link ShingleMatrixFilter} act like {@link org.apache.lucene.analysis.shingle.ShingleFilter}.
+   * It produces the most simple sort of shingles, ignoring token position increments, etc.
+   *
+   * It adds each token as a new column.
+   */
+  public static class OneDimensionalNonWeightedTokenSettingsCodec extends TokenSettingsCodec {
+
+    /** Positions every token as a new column, whatever the token contains. */
+    public TokenPositioner getTokenPositioner(Token token) throws IOException {
+      return TokenPositioner.newColumn;
+    }
+
+    /** Does nothing: this codec keeps no positioning information in the token. */
+    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
+    }
+
+    /** Gives every token the same weight, 1. */
+    public float getWeight(Token token) {
+      return 1f;
+    }
+
+    /** Does nothing: the given weight is discarded. */
+    public void setWeight(Token token, float weight) {
+    }
+
+  }
+
+
+  /**
+   * A codec that builds a two dimensional matrix: a token from the input stream
+   * with a position increment of 0 becomes a new row in the current column,
+   * any other token starts a new column. Weights are not supported.
+   */
+  public static class TwoDimensionalNonWeightedSynonymTokenSettingsCodec extends TokenSettingsCodec {
+
+    /** @return newRow for a token with position increment 0, otherwise newColumn */
+    public TokenPositioner getTokenPositioner(Token token) throws IOException {
+      boolean stackedOnPrevious = token.getPositionIncrement() == 0;
+      return stackedOnPrevious ? TokenPositioner.newRow : TokenPositioner.newColumn;
+    }
+
+    /** Not supported by this codec. */
+    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
+      throw new UnsupportedOperationException();
+    }
+
+    /** @return 1 for every token */
+    public float getWeight(Token token) {
+      return 1f;
+    }
+
+    /** Does nothing: the given weight is discarded. */
+    public void setWeight(Token token, float weight) {
+    }
+
+  }
+
+  /**
+   * A full featured codec not to be used for something serious.
+   *
+   * It takes complete control of
+   * the payload, which carries the weight,
+   * and of the flags, which carry the positioning in the matrix.
+   *
+   * Mainly exists for demonstrational purposes.
+   */
+  public static class SimpleThreeDimensionalTokenSettingsCodec extends TokenSettingsCodec {
+
+    /**
+     * Reads the positioning from the token flags: 0 is a new column, 1 a new row, 2 the same row.
+     *
+     * @param token token to inspect
+     * @return the token flags int value as TokenPosition
+     * @throws IOException if the flags hold any other value
+     */
+    public TokenPositioner getTokenPositioner(Token token) throws IOException {
+      int flags = token.getFlags();
+      if (flags == 0) {
+        return TokenPositioner.newColumn;
+      }
+      if (flags == 1) {
+        return TokenPositioner.newRow;
+      }
+      if (flags == 2) {
+        return TokenPositioner.sameRow;
+      }
+      throw new IOException("Unknown matrix positioning of token " + token);
+    }
+
+    /**
+     * Sets the TokenPositioner as token flags int value.
+     *
+     * @param token token to update
+     * @param tokenPositioner positioning whose index is stored in the flags
+     */
+    public void setTokenPositioner(Token token, TokenPositioner tokenPositioner) {
+      int flags = tokenPositioner.getIndex();
+      token.setFlags(flags);
+    }
+
+    /**
+     * Returns a 32 bit float from the payload, or 1f if the payload or its data is null.
+     *
+     * @param token token to read the weight from
+     * @return the decoded weight, or 1f when there is nothing to decode
+     */
+    public float getWeight(Token token) {
+      Payload payload = token.getPayload();
+      byte[] encodedWeight = (payload == null) ? null : payload.getData();
+      if (encodedWeight == null) {
+        return 1f;
+      }
+      return PayloadHelper.decodeFloat(encodedWeight);
+    }
+
+    /**
+     * Stores a 32 bit float in the payload, or sets the payload to null if the weight is 1f.
+     *
+     * @param token token to store the weight in
+     * @param weight weight to store
+     */
+    public void setWeight(Token token, float weight) {
+      Payload payload = (weight == 1f) ? null : new Payload(PayloadHelper.encodeFloat(weight));
+      token.setPayload(payload);
+    }
+
+  }
+
+
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestEmptyTokenStream.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,36 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Token;
+
+public class TestEmptyTokenStream extends TestCase {
+  /** An EmptyTokenStream returns null from both next() variants, before and after reset(). */
+  public void test() throws IOException {
+    TokenStream emptyStream = new EmptyTokenStream();
+    assertNull(emptyStream.next());
+    emptyStream.reset();
+    assertNull(emptyStream.next(new Token()));
+  }
+
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAndSuffixAwareTokenFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,54 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class TestPrefixAndSuffixAwareTokenFilter extends TestCase {
+
+  /** Surrounds "hello world" with a "^" prefix and a "$" suffix token and checks terms and offsets. */
+  public void test() throws IOException {
+    SingleTokenTokenStream prefix = new SingleTokenTokenStream(new Token("^", 0, 0));
+    WhitespaceTokenizer input = new WhitespaceTokenizer(new StringReader("hello world"));
+    SingleTokenTokenStream suffix = new SingleTokenTokenStream(new Token("$", 0, 0));
+    PrefixAndSuffixAwareTokenFilter ts = new PrefixAndSuffixAwareTokenFilter(prefix, input, suffix);
+
+    assertNext(ts, "^", 0, 0);
+    assertNext(ts, "hello", 0, 5);
+    assertNext(ts, "world", 6, 11);
+    assertNext(ts, "$", 11, 11);
+    assertNull(ts.next());
+  }
+
+  /** Pulls the next token, which must exist, and compares its term text and offsets. */
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    Token actual = ts.next();
+    assertNotNull(actual);
+    String actualText = new String(actual.termBuffer(), 0, actual.termLength());
+    assertEquals(text, actualText);
+    assertEquals(startOffset, actual.startOffset());
+    assertEquals(endOffset, actual.endOffset());
+    return actual;
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestPrefixAwareTokenFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,64 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.WhitespaceTokenizer;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+public class TestPrefixAwareTokenFilter extends TestCase {
+
+  /** Checks terms and offsets for a single prefix, and for a prefix plus a suffix built from two filters. */
+  public void test() throws IOException {
+    // a one-token prefix in front of a one-token input; "b" is expected at offsets 1-2
+    PrefixAwareTokenFilter simple = new PrefixAwareTokenFilter(
+        new SingleTokenTokenStream(new Token("a", 0, 1)),
+        new SingleTokenTokenStream(new Token("b", 0, 1)));
+    assertNext(simple, "a", 0, 1);
+    assertNext(simple, "b", 1, 2);
+    assertNull(simple.next());
+
+    // prefix and suffix using 2x prefix
+    PrefixAwareTokenFilter prefixed = new PrefixAwareTokenFilter(
+        new SingleTokenTokenStream(new Token("^", 0, 0)),
+        new WhitespaceTokenizer(new StringReader("hello world")));
+    PrefixAwareTokenFilter wrapped = new PrefixAwareTokenFilter(
+        prefixed, new SingleTokenTokenStream(new Token("$", 0, 0)));
+
+    assertNext(wrapped, "^", 0, 0);
+    assertNext(wrapped, "hello", 0, 5);
+    assertNext(wrapped, "world", 6, 11);
+    assertNext(wrapped, "$", 11, 11);
+    assertNull(wrapped.next());
+  }
+
+  /** Pulls the next token, which must exist, and compares its term text and offsets. */
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    Token actual = ts.next();
+    assertNotNull(actual);
+    String actualText = new String(actual.termBuffer(), 0, actual.termLength());
+    assertEquals(text, actualText);
+    assertEquals(startOffset, actual.startOffset());
+    assertEquals(endOffset, actual.endOffset());
+    return actual;
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestSingleTokenTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestSingleTokenTokenFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestSingleTokenTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/miscellaneous/TestSingleTokenTokenFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,39 @@
+package org.apache.lucene.analysis.miscellaneous;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.Token;
+
+public class TestSingleTokenTokenFilter extends TestCase {
+
+  /** A SingleTokenTokenStream hands out its token once and then returns null. */
+  public void test() throws IOException {
+    Token expected = new Token();
+    SingleTokenTokenStream stream = new SingleTokenTokenStream(expected);
+
+    // the first call returns a token equal to the one the stream was built with
+    Token first = stream.next();
+    assertEquals(expected, first);
+    // after that the stream has nothing more to give
+    assertNull(stream.next());
+  }
+}

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java?rev=673549&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/shingle/TestShingleMatrixFilter.java Wed Jul  2 16:53:51 2008
@@ -0,0 +1,523 @@
+package org.apache.lucene.analysis.shingle;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.PrefixAndSuffixAwareTokenFilter;
+import org.apache.lucene.analysis.miscellaneous.SingleTokenTokenStream;
+import org.apache.lucene.analysis.payloads.PayloadHelper;
+import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix;
+import org.apache.lucene.analysis.shingle.ShingleMatrixFilter.Matrix.Column;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.LinkedList;
+
+public class TestShingleMatrixFilter extends TestCase {
+
+
+  /**
+   * Runs a flat token stream (one token per position, no synonyms) through
+   * the filter with shingle sizes 1 to 2 and a space as spacer, and checks
+   * the text and the start and end offsets of every emitted shingle.
+   *
+   * @throws IOException if the token stream fails while being read
+   */
+  public void testBehavingAsShingleFilter() throws IOException {
+
+    // NOTE: this clears shared static state and does not restore it; the
+    // filter below is handed its codec explicitly and does not need it.
+    ShingleMatrixFilter.defaultSettingsCodec = null;
+
+    // a plain old token stream without synonyms.
+
+    LinkedList<Token> tokens = new LinkedList<Token>();
+    tokens.add(new Token("please", 0, 6));
+    tokens.add(new Token("divide", 7, 13));
+    tokens.add(new Token("this", 14, 18));
+    tokens.add(new Token("sentence", 19, 27));
+    tokens.add(new Token("into", 28, 32));
+    tokens.add(new Token("shingles", 33, 39));
+
+    TokenListStream tls = new TokenListStream(tokens);
+
+    // uni-grams and bi-grams
+
+    TokenStream ts = new ShingleMatrixFilter(tls, 1, 2, ' ', false, new ShingleMatrixFilter.OneDimensionalNonWeightedTokenSettingsCodec());
+
+    // shingle, start offset, end offset
+
+    assertNext(ts, "please", 0, 6);
+    assertNext(ts, "please divide", 0, 13);
+    assertNext(ts, "divide", 7, 13);
+    assertNext(ts, "divide this", 7, 18);
+    assertNext(ts, "this", 14, 18);
+    assertNext(ts, "this sentence", 14, 27);
+    assertNext(ts, "sentence", 19, 27);
+    assertNext(ts, "sentence into", 19, 32);
+    assertNext(ts, "into", 28, 32);
+    assertNext(ts, "into shingles", 28, 39);
+    assertNext(ts, "shingles", 33, 39);
+
+    assertNull(ts.next());
+
+  }
+
+  /**
+   * Builds shingles from token streams whose tokens are arranged into a
+   * matrix by the settings codec in use.
+   *
+   * The test runs in sections that depend on each other's state, in order:
+   * synonym bi-grams with a spacer, the same input without a spacer, the
+   * same words with weights and "^" / "$" boundary tokens (bi-grams, then
+   * unlimited size with and without single boundary tokens), and finally
+   * multi-token synonyms.
+   *
+   * @throws IOException
+   */
+  public void testTokenStream() throws IOException {
+
+    // No default codec to begin with: the first two sections pass their codec
+    // to the filter constructor and build tokens without touching the default.
+    ShingleMatrixFilter.defaultSettingsCodec = null;//new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
+
+    // only referenced by the commented-out dump loops further down
+    Token token = new Token(); // for debug use only
+
+
+    TokenStream ts;
+    TokenListStream tls;
+    LinkedList<Token> tokens;
+
+    // test a plain old token stream with synonyms translated to rows.
+    // A position increment of 0 marks a token as an alternative to the
+    // token before it, as the expected output below shows.
+
+    tokens = new LinkedList<Token>();
+    tokens.add(tokenFactory("hello", 1, 0, 4));
+    tokens.add(tokenFactory("greetings", 0, 0, 4));
+    tokens.add(tokenFactory("world", 1, 5, 10));
+    tokens.add(tokenFactory("earth", 0, 5, 10));
+    tokens.add(tokenFactory("tellus", 0, 5, 10));
+
+    tls = new TokenListStream(tokens);
+
+    // bi-grams only (minimum and maximum shingle size are both 2)
+
+    ts = new ShingleMatrixFilter(tls, 2, 2, '_', false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
+
+    assertNext(ts, "hello_world");
+    assertNext(ts, "greetings_world");
+    assertNext(ts, "hello_earth");
+    assertNext(ts, "greetings_earth");
+    assertNext(ts, "hello_tellus");
+    assertNext(ts, "greetings_tellus");
+    assertNull(ts.next());
+
+    // bi-grams with no spacer character, start offset, end offset
+
+    tls.reset();
+    ts = new ShingleMatrixFilter(tls, 2, 2, null, false, new ShingleMatrixFilter.TwoDimensionalNonWeightedSynonymTokenSettingsCodec());
+    assertNext(ts, "helloworld", 0, 10);
+    assertNext(ts, "greetingsworld", 0, 10);
+    assertNext(ts, "helloearth", 0, 10);
+    assertNext(ts, "greetingsearth", 0, 10);
+    assertNext(ts, "hellotellus", 0, 10);
+    assertNext(ts, "greetingstellus", 0, 10);
+    assertNull(ts.next());
+
+
+    // add ^_prefix_and_suffix_$
+    //
+    // using 3d codec as it supports weights
+    //
+    // From here on the default codec is set, so the weight-aware
+    // tokenFactory overloads can be used and the filters below are built
+    // without an explicit codec argument.
+
+    ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
+
+    tokens = new LinkedList<Token>();
+    tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
+    tokens.add(tokenFactory("greetings", 0, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
+    tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
+    tokens.add(tokenFactory("earth", 0, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
+    tokens.add(tokenFactory("tellus", 0, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
+
+    tls = new TokenListStream(tokens);
+
+    // surround the input with "^" (weight 100) and "$" (weight 50), then
+    // buffer the result in a TokenListStream so it can be reset and replayed.
+    ts = new PrefixAndSuffixAwareTokenFilter(new SingleTokenTokenStream(tokenFactory("^", 1, 100f, 0, 0)), tls, new SingleTokenTokenStream(tokenFactory("$", 1, 50f, 0, 0)));
+    tls = new TokenListStream(ts);
+
+    // bi-grams, position increment, weight, start offset, end offset
+    //
+    // NOTE(review): the expected weights in this test are consistent with
+    // sqrt(sum of the member token weights), e.g. sqrt(100 + 1) = 10.049875
+    // and sqrt(1 + 1) = 1.4142135 - confirm against ShingleMatrixFilter.
+
+    ts = new ShingleMatrixFilter(tls, 2, 2, '_', false);
+//
+// debug helper: uncomment to print the current output as assertNext(...) lines
+//    while ((token = ts.next(token)) != null) {
+//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//      token.clear();
+//    }
+
+    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNull(ts.next());
+
+    // test unlimited size and allow single boundary token as shingle
+    tls.reset();
+    ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', false);
+
+//
+// debug helper: uncomment to print the current output as assertNext(...) lines
+//    while ((token = ts.next(token)) != null) {
+//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//      token.clear();
+//    }
+
+    assertNext(ts, "^", 1, 10.0f, 0, 0);
+    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "$", 1, 7.071068f, 10, 10);
+    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+
+    assertNull(ts.next());
+
+    // test unlimited size but don't allow single boundary token as shingle
+    // (same expectations as above, minus the lone "^" and "$" shingles)
+
+    tls.reset();
+    ts = new ShingleMatrixFilter(tls, 1, Integer.MAX_VALUE, '_', true);
+// debug helper: uncomment to print the current output as assertNext(...) lines
+//    while ((token = ts.next(token)) != null) {
+//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//      token.clear();
+//    }
+
+    assertNext(ts, "^_hello", 1, 10.049875f, 0, 4);
+    assertNext(ts, "^_hello_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello", 1, 1.0f, 0, 4);
+    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "world", 1, 1.0f, 5, 10);
+    assertNext(ts, "world_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "^_greetings", 1, 10.049875f, 0, 4);
+    assertNext(ts, "^_greetings_world", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_world_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings", 1, 1.0f, 0, 4);
+    assertNext(ts, "greetings_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_world_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "^_hello_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "earth", 1, 1.0f, 5, 10);
+    assertNext(ts, "earth_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "^_greetings_earth", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_earth_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_earth_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "^_hello_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_hello_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_tellus_$", 1, 7.2111025f, 0, 10);
+    assertNext(ts, "tellus", 1, 1.0f, 5, 10);
+    assertNext(ts, "tellus_$", 1, 7.1414285f, 5, 10);
+    assertNext(ts, "^_greetings_tellus", 1, 10.099504f, 0, 10);
+    assertNext(ts, "^_greetings_tellus_$", 1, 12.328828f, 0, 10);
+    assertNext(ts, "greetings_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_tellus_$", 1, 7.2111025f, 0, 10);
+
+
+    assertNull(ts.next());
+
+    // no-op: the result is discarded (presumably a debugger breakpoint anchor)
+    System.currentTimeMillis();
+
+    // multi-token synonyms
+    //
+    // Token[][][] {
+    //    {{hello}, {greetings, and, salutations}},
+    //    {{world}, {earth}, {tellus}}
+    // }
+    //
+    // Every token has position increment 1 here; its place in the matrix is
+    // given by the TokenPositioner passed to tokenFactory.
+
+
+    tokens = new LinkedList<Token>();
+    tokens.add(tokenFactory("hello", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newColumn));
+    tokens.add(tokenFactory("greetings", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.newRow));
+    tokens.add(tokenFactory("and", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
+    tokens.add(tokenFactory("salutations", 1, 1f, 0, 4, ShingleMatrixFilter.TokenPositioner.sameRow));
+    tokens.add(tokenFactory("world", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newColumn));
+    tokens.add(tokenFactory("earth", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
+    tokens.add(tokenFactory("tellus", 1, 1f, 5, 10, ShingleMatrixFilter.TokenPositioner.newRow));
+
+    tls = new TokenListStream(tokens);
+
+    // 2-3 grams
+
+    ts = new ShingleMatrixFilter(tls, 2, 3, '_', false);
+
+// debug helper: uncomment to print the current output as assertNext(...) lines
+//    while ((token = ts.next(token)) != null) {
+//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//      token.clear();
+//    }
+
+    // shingle, position increment, weight, start offset, end offset
+
+    assertNext(ts, "hello_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "greetings_and", 1, 1.4142135f, 0, 4);
+    assertNext(ts, "greetings_and_salutations", 1, 1.7320508f, 0, 4);
+    assertNext(ts, "and_salutations", 1, 1.4142135f, 0, 4);
+    assertNext(ts, "and_salutations_world", 1, 1.7320508f, 0, 10);
+    assertNext(ts, "salutations_world", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "and_salutations_earth", 1, 1.7320508f, 0, 10);
+    assertNext(ts, "salutations_earth", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "hello_tellus", 1, 1.4142135f, 0, 10);
+    assertNext(ts, "and_salutations_tellus", 1, 1.7320508f, 0, 10);
+    assertNext(ts, "salutations_tellus", 1, 1.4142135f, 0, 10);
+
+    assertNull(ts.next());
+
+    // no-op: the result is discarded (presumably a debugger breakpoint anchor)
+    System.currentTimeMillis();
+
+
+  }
+
+  /**
+   * Tests creating shingles from a pre-assembled matrix.
+   *
+   * Tests the row token z-axis, multi token synonyms: one column holds two
+   * rows, the single token "svennis" and the three tokens "sven", "göran",
+   * "eriksson".
+   *
+   * @throws IOException if the token stream fails while being read
+   */
+  public void testMatrix() throws IOException {
+
+    // Other tests in this class set the default codec to null, and the
+    // tokenFactory overloads used below dereference it. Set it here so this
+    // test does not depend on the order in which the tests are run.
+    ShingleMatrixFilter.defaultSettingsCodec = new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec();
+
+    Matrix matrix = new Matrix();
+
+    matrix.new Column(tokenFactory("no", 1));
+    matrix.new Column(tokenFactory("surprise", 1));
+    matrix.new Column(tokenFactory("to", 1));
+    matrix.new Column(tokenFactory("see", 1));
+    matrix.new Column(tokenFactory("england", 1));
+    matrix.new Column(tokenFactory("manager", 1));
+
+    Column col = matrix.new Column();
+
+    // sven göran eriksson is a multi token synonym to svennis
+    col.new Row().getTokens().add(tokenFactory("svennis", 1));
+
+    Column.Row row = col.new Row();
+    row.getTokens().add(tokenFactory("sven", 1));
+    row.getTokens().add(tokenFactory("göran", 1));
+    row.getTokens().add(tokenFactory("eriksson", 1));
+
+    matrix.new Column(tokenFactory("in", 1));
+    matrix.new Column(tokenFactory("the", 1));
+    matrix.new Column(tokenFactory("croud", 1));
+
+    // 2-4 grams
+    TokenStream ts = new ShingleMatrixFilter(matrix, 2, 4, '_', true, new ShingleMatrixFilter.SimpleThreeDimensionalTokenSettingsCodec());
+
+// debug helper: uncomment to print the current output as assertNext(...) lines
+//    Token token = new Token();
+//    while ((token = ts.next(token)) != null) {
+//      System.out.println("assertNext(ts, \"" + token.termText() + "\", " + token.getPositionIncrement() + ", " + (token.getPayload() == null ? "1.0" : PayloadHelper.decodeFloat(token.getPayload().getData())) + "f, " + token.startOffset() + ", " + token.endOffset() + ");");
+//      token.clear();
+//    }
+
+    // shingle, position increment, weight, start offset, end offset
+
+    assertNext(ts, "no_surprise", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "no_surprise_to", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "no_surprise_to_see", 1, 2.0f, 0, 0);
+    assertNext(ts, "surprise_to", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "surprise_to_see", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "surprise_to_see_england", 1, 2.0f, 0, 0);
+    assertNext(ts, "to_see", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "to_see_england", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "to_see_england_manager", 1, 2.0f, 0, 0);
+    assertNext(ts, "see_england", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "see_england_manager", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "see_england_manager_svennis", 1, 2.0f, 0, 0);
+    assertNext(ts, "england_manager", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "england_manager_svennis", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "england_manager_svennis_in", 1, 2.0f, 0, 0);
+    assertNext(ts, "manager_svennis", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "manager_svennis_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "manager_svennis_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, "svennis_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "svennis_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "svennis_in_the_croud", 1, 2.0f, 0, 0);
+    assertNext(ts, "in_the", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "in_the_croud", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "the_croud", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "see_england_manager_sven", 1, 2.0f, 0, 0);
+    assertNext(ts, "england_manager_sven", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "england_manager_sven_göran", 1, 2.0f, 0, 0);
+    assertNext(ts, "manager_sven", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "manager_sven_göran", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "manager_sven_göran_eriksson", 1, 2.0f, 0, 0);
+    assertNext(ts, "sven_göran", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "sven_göran_eriksson", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "sven_göran_eriksson_in", 1, 2.0f, 0, 0);
+    assertNext(ts, "göran_eriksson", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "göran_eriksson_in", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "göran_eriksson_in_the", 1, 2.0f, 0, 0);
+    assertNext(ts, "eriksson_in", 1, 1.4142135f, 0, 0);
+    assertNext(ts, "eriksson_in_the", 1, 1.7320508f, 0, 0);
+    assertNext(ts, "eriksson_in_the_croud", 1, 2.0f, 0, 0);
+
+    assertNull(ts.next());
+
+  }
+
+  /**
+   * Creates a token with position increment 1, weight 1 and the given offsets.
+   * Requires a non-null default settings codec.
+   */
+  private Token tokenFactory(String text, int startOffset, int endOffset) {
+    return tokenFactory(text, 1, 1f, startOffset, endOffset);
+  }
+
+
+  /**
+   * Creates a token from text, position increment and offsets only. This
+   * overload never touches the default settings codec, so it can be used
+   * while that codec is null.
+   */
+  private Token tokenFactory(String text, int posIncr, int startOffset, int endOffset) {
+    Token plain = new Token();
+    plain.setTermText(text);
+    plain.setPositionIncrement(posIncr);
+    plain.setStartOffset(startOffset);
+    plain.setEndOffset(endOffset);
+    return plain;
+  }
+
+
+  /**
+   * Creates a token with weight 1 and both offsets 0.
+   * Requires a non-null default settings codec.
+   */
+  private Token tokenFactory(String text, int posIncr) {
+    return tokenFactory(text, posIncr, 1f, 0, 0);
+  }
+
+  /**
+   * Creates a weighted token with both offsets 0.
+   * Requires a non-null default settings codec.
+   */
+  private Token tokenFactory(String text, int posIncr, float weight) {
+    return tokenFactory(text, posIncr, weight, 0, 0);
+  }
+
+  /**
+   * Creates a token and stores its weight through the default settings
+   * codec. Requires a non-null default settings codec.
+   */
+  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset) {
+    Token weighted = new Token();
+    weighted.setTermText(text);
+    weighted.setPositionIncrement(posIncr);
+    ShingleMatrixFilter.defaultSettingsCodec.setWeight(weighted, weight);
+    weighted.setStartOffset(startOffset);
+    weighted.setEndOffset(endOffset);
+    return weighted;
+  }
+
+  /**
+   * Creates a weighted token as the five-argument overload does and then
+   * stores its matrix positioner through the default settings codec.
+   * Requires a non-null default settings codec.
+   */
+  private Token tokenFactory(String text, int posIncr, float weight, int startOffset, int endOffset, ShingleMatrixFilter.TokenPositioner positioner) {
+    Token positioned = tokenFactory(text, posIncr, weight, startOffset, endOffset);
+    // the positioner is applied last, after weight and offsets
+    ShingleMatrixFilter.defaultSettingsCodec.setTokenPositioner(positioned, positioner);
+    return positioned;
+  }
+
+  // assert-methods start here
+
+  /**
+   * Reads the next token from the stream and asserts that it exists and
+   * that its term text equals the expected text.
+   *
+   * @return the token that was read, for further checks
+   * @throws IOException if the stream fails while being read
+   */
+  private Token assertNext(TokenStream ts, String text) throws IOException {
+    Token token = ts.next(new Token());
+    assertNotNull(token);
+    assertEquals(text, new String(token.termBuffer(), 0, token.termLength()));
+    return token;
+  }
+
+  /**
+   * As {@link #assertNext(TokenStream, String)}, and also asserts the
+   * position increment and the boost. The boost is decoded from the token
+   * payload; a token without payload counts as boost 1.
+   *
+   * @return the token that was read, for further checks
+   * @throws IOException if the stream fails while being read
+   */
+  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost) throws IOException {
+    Token token = assertNext(ts, text);
+    assertEquals(positionIncrement, token.getPositionIncrement());
+    float actualBoost = token.getPayload() == null ? 1f : PayloadHelper.decodeFloat(token.getPayload().getData());
+    // Use the float overload with delta 0 (exact match) instead of relying
+    // on autoboxing into assertEquals(Object, Object).
+    assertEquals(boost, actualBoost, 0f);
+    return token;
+  }
+
+  /**
+   * As {@link #assertNext(TokenStream, String, int, float)}, and also
+   * asserts the start and end offsets.
+   *
+   * @return the token that was read, for further checks
+   * @throws IOException if the stream fails while being read
+   */
+  private Token assertNext(TokenStream ts, String text, int positionIncrement, float boost, int startOffset, int endOffset) throws IOException {
+    Token token = assertNext(ts, text, positionIncrement, boost);
+    assertEquals(startOffset, token.startOffset());
+    assertEquals(endOffset, token.endOffset());
+    return token;
+  }
+
+  /**
+   * As {@link #assertNext(TokenStream, String)}, and also asserts the start
+   * and end offsets. Position increment and boost are not checked.
+   *
+   * @return the token that was read, for further checks
+   * @throws IOException if the stream fails while being read
+   */
+  private Token assertNext(TokenStream ts, String text, int startOffset, int endOffset) throws IOException {
+    Token token = assertNext(ts, text);
+    assertEquals(startOffset, token.startOffset());
+    assertEquals(endOffset, token.endOffset());
+    return token;
+  }
+
+
+  /**
+   * A token stream that replays an in-memory collection of tokens and can
+   * be rewound with {@link #reset()}.
+   */
+  public static class TokenListStream extends TokenStream {
+
+    /** The tokens to replay, in the iteration order of the collection. */
+    private Collection<Token> tokens;
+
+    /** Current read position; null until the first read after a reset. */
+    private Iterator<Token> iterator;
+
+    /**
+     * Reads the given stream to its end and keeps every token it returned.
+     *
+     * @throws IOException if the source stream fails while being read
+     */
+    public TokenListStream(TokenStream ts) throws IOException {
+      tokens = new ArrayList<Token>();
+      for (Token read = ts.next(new Token()); read != null; read = ts.next(new Token())) {
+        tokens.add(read);
+      }
+    }
+
+    /**
+     * Replays the given collection. The collection is used as is, not copied.
+     */
+    public TokenListStream(Collection<Token> tokens) {
+      this.tokens = tokens;
+    }
+
+    /**
+     * Returns the next stored token, or null once all have been returned.
+     */
+    public Token next() throws IOException {
+      if (iterator == null) {
+        // first read since construction or reset: start from the beginning
+        iterator = tokens.iterator();
+      }
+      return iterator.hasNext() ? iterator.next() : null;
+    }
+
+
+    /**
+     * Rewinds the stream so that the next read starts over from the first token.
+     */
+    public void reset() throws IOException {
+      iterator = null;
+    }
+  }
+
+}



Mime
View raw message