lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gsing...@apache.org
Subject svn commit: r565994 - in /lucene/java/trunk: CHANGES.txt src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
Date Wed, 15 Aug 2007 01:14:55 GMT
Author: gsingers
Date: Tue Aug 14 18:14:54 2007
New Revision: 565994

URL: http://svn.apache.org/viewvc?view=rev&rev=565994
Log:
LUCENE-975: New PositionBasedTermVectorMapper for getting term vector information on a position
by position basis.

Added:
    lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java
  (with props)
    lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
  (with props)
Modified:
    lucene/java/trunk/CHANGES.txt

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=565994&r1=565993&r2=565994
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Tue Aug 14 18:14:54 2007
@@ -83,6 +83,8 @@
     This implementation contains several extensions of the new abstract TermVectorMapper
class.  The new API should be back-compatible.  No changes in the
      actual storage of Term Vectors has taken place.
 
+ 4. LUCENE-975: Added PositionBasedTermVectorMapper that allows for position based lookup
of term vector information.  See item #3 above (LUCENE-868).
+
 Optimizations
 
  1. LUCENE-937: CachingTokenFilter now uses an iterator to access the 

Added: lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java?view=auto&rev=565994
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java
(added)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java
Tue Aug 14 18:14:54 2007
@@ -0,0 +1,167 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2007 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * For each Field, store position-by-position information.  It ignores frequency information.
+ * <p/>
+ * This is not thread-safe.
+ */
+public class PositionBasedTermVectorMapper extends TermVectorMapper{
+  private Map/*<String, Map<Integer, TVPositionInfo>>*/ fieldToTerms;
+
+  private String currentField;
+  /**
+   * A Map of Integer and TVPositionInfo
+   */
+  private Map/*<Integer, TVPositionInfo>*/ currentPositions;
+  private boolean storeOffsets;
+
+  
+
+
+  /**
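+   * Creates a mapper that does not ignore offsets; equivalent to
+   * <code>PositionBasedTermVectorMapper(false)</code>.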
+   */
+  public PositionBasedTermVectorMapper() {
+    super(false, false);
+  }
+
+  public PositionBasedTermVectorMapper(boolean ignoringOffsets)
+  {
+    super(false, ignoringOffsets);
+  }
+
+  /**
+   * Never ignores positions.  This mapper doesn't make much sense unless there are positions.
+   * @return always <code>false</code>
+   */
+  public boolean isIgnoringPositions() {
+    return false;
+  }
+
+  /**
+   * Callback for the TermVectorReader. 
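+   * @param term the term to record at each of its positions
+   * @param frequency the term frequency; not used by this mapper
+   * @param offsets the offsets parallel to <code>positions</code>, or <code>null</code> if none are available
+   * @param positions the positions at which the term occurs; must not be <code>null</code>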
+   */
+  public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions)
{
+    for (int i = 0; i < positions.length; i++) {
+      Integer posVal = new Integer(positions[i]);
+      TVPositionInfo pos = (TVPositionInfo) currentPositions.get(posVal);
+      if (pos == null) {
+        pos = new TVPositionInfo(positions[i], storeOffsets);
+        currentPositions.put(posVal, pos);
+      }
+      pos.addTerm(term, offsets != null ? offsets[i] : null);
+    }
+  }
+
+  /**
+   * Callback mechanism used by the TermVectorReader
+   * @param field  The field being read
+   * @param numTerms The number of terms in the vector
+   * @param storeOffsets Whether offsets are available
+   * @param storePositions Whether positions are available
+   */
+  public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions)
{
+    if (storePositions == false)
+    {
+      throw new RuntimeException("You must store positions in order to use this Mapper");
+    }
+    if (storeOffsets == true)
+    {
+      //ignoring offsets
+    }
+    fieldToTerms = new HashMap(numTerms);
+    this.storeOffsets = storeOffsets;
+    currentField = field;
+    currentPositions = new HashMap();
+    fieldToTerms.put(currentField, currentPositions);
+  }
+
+  /**
+   * Get the mapping between fields and their position information.
+   *
+   * @return A map between field names and a Map.  The sub-Map key is the position as an
Integer, the value is {@link org.apache.lucene.index.PositionBasedTermVectorMapper.TVPositionInfo}.
+   */
+  public Map getFieldToTerms() {
+    return fieldToTerms;
+  }
+
+  /**
+   * Container for a term at a position
+   */
+  public static class TVPositionInfo{
+    private int position;
+    //a list of Strings
+    private List terms;
+    //A list of TermVectorOffsetInfo
+    private List offsets;
+
+
+    public TVPositionInfo(int position, boolean storeOffsets) {
+      this.position = position;
+      terms = new ArrayList();
+      if (storeOffsets) {
+        offsets = new ArrayList();
+      }
+    }
+
+    void addTerm(String term, TermVectorOffsetInfo info)
+    {
+      terms.add(term);
+      if (offsets != null) {
+        offsets.add(info);
+      }
+    }
+
+    /**
+     *
+     * @return The position of the term
+     */
+    public int getPosition() {
+      return position;
+    }
+
+    /**
+     * Note, there may be multiple terms at the same position
+     * @return A List of Strings
+     */
+    public List getTerms() {
+      return terms;
+    }
+
+    /**
+     * Parallel list (to {@link #getTerms()}) of TermVectorOffsetInfo objects.  There may
be multiple entries since there may be multiple terms at a position.
+     * @return A List of TermVectorOffsetInfo objects if offsets are stored, <code>null</code> otherwise.
+     */
+    public List getOffsets() {
+      return offsets;
+    }
+  }
+
+
+}

Propchange: lucene/java/trunk/src/java/org/apache/lucene/index/PositionBasedTermVectorMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java?view=auto&rev=565994
==============================================================================
--- lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
(added)
+++ lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
Tue Aug 14 18:14:54 2007
@@ -0,0 +1,107 @@
+package org.apache.lucene.index;
+/**
+ * Copyright 2005 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import junit.framework.TestCase;
+
+import java.io.IOException;
+import java.util.BitSet;
+import java.util.Iterator;
+import java.util.Map;
+
+public class TestPositionBasedTermVectorMapper extends TestCase {
+  protected String[] tokens;
+  protected int[][] thePositions;
+  protected TermVectorOffsetInfo[][] offsets;
+  protected int numPositions;
+
+
+  public TestPositionBasedTermVectorMapper(String s) {
+    super(s);
+  }
+
+  protected void setUp() {
+    tokens = new String[]{"here", "is", "some", "text", "to", "test", "extra"};
+    thePositions = new int[tokens.length][];
+    offsets = new TermVectorOffsetInfo[tokens.length][];
+    numPositions = 0;
+    //save off the last one so we can add it with the same positions as some of the others,
but in a predictable way
+    for (int i = 0; i < tokens.length - 1; i++)
+    {
+      thePositions[i] = new int[2 * i + 1];//give 'em all some positions
+      for (int j = 0; j < thePositions[i].length; j++)
+      {
+        thePositions[i][j] = numPositions++;
+      }
+      offsets[i] = new TermVectorOffsetInfo[thePositions[i].length];
+      for (int j = 0; j < offsets[i].length; j++) {
+        offsets[i][j] = new TermVectorOffsetInfo(j, j + 1);//the actual value here doesn't
much matter
+      }
+    }
+    thePositions[tokens.length - 1] = new int[1];
+    thePositions[tokens.length - 1][0] = 0;//put this at the same position as "here"
+    offsets[tokens.length - 1] = new TermVectorOffsetInfo[1];
+    offsets[tokens.length - 1][0] = new TermVectorOffsetInfo(0, 1);
+  }
+
+  protected void tearDown() {
+
+  }
+
+  public void test() throws IOException {
+    PositionBasedTermVectorMapper mapper = new PositionBasedTermVectorMapper();
+    
+    mapper.setExpectations("test", tokens.length, true, true);
+    //Test single position
+    for (int i = 0; i < tokens.length; i++) {
+      String token = tokens[i];
+      mapper.map(token, 1, null, thePositions[i]);
+
+    }
+    Map map = mapper.getFieldToTerms();
+    assertTrue("map is null and it shouldn't be", map != null);
+    assertTrue("map Size: " + map.size() + " is not: " + 1, map.size() == 1);
+    Map positions = (Map) map.get("test");
+    assertTrue("thePositions is null and it shouldn't be", positions != null);
+    
+    assertTrue("thePositions Size: " + positions.size() + " is not: " + numPositions, positions.size()
== numPositions);
+    BitSet bits = new BitSet(numPositions);
+    for (Iterator iterator = positions.entrySet().iterator(); iterator.hasNext();) {
+      Map.Entry entry = (Map.Entry) iterator.next();
+      PositionBasedTermVectorMapper.TVPositionInfo info = (PositionBasedTermVectorMapper.TVPositionInfo)
entry.getValue();
+      assertTrue("info is null and it shouldn't be", info != null);
+      int pos = ((Integer) entry.getKey()).intValue();
+      bits.set(pos);
+      assertTrue(info.getPosition() + " does not equal: " + pos, info.getPosition() == pos);
+      assertTrue("info.getOffsets() is null and it shouldn't be", info.getOffsets() != null);
+      if (pos == 0)
+      {
+        assertTrue("info.getTerms() Size: " + info.getTerms().size() + " is not: " + 2, info.getTerms().size()
== 2);//need a test for multiple terms at one pos
+        assertTrue("info.getOffsets() Size: " + info.getOffsets().size() + " is not: " +
2, info.getOffsets().size() == 2);
+      }
+      else
+      {
+        assertTrue("info.getTerms() Size: " + info.getTerms().size() + " is not: " + 1, info.getTerms().size()
== 1);//need a test for multiple terms at one pos
+        assertTrue("info.getOffsets() Size: " + info.getOffsets().size() + " is not: " +
1, info.getOffsets().size() == 1);
+      }
+    }
+    assertTrue("Bits are not all on", bits.cardinality() == numPositions);
+  }
+
+
+
+  
+}
\ No newline at end of file

Propchange: lucene/java/trunk/src/test/org/apache/lucene/index/TestPositionBasedTermVectorMapper.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message