lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1201328 - in /lucene/dev/branches/lucene2621: lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/ lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/ lucene/contrib/highlighter/src/test/org/apa...
Date Sat, 12 Nov 2011 21:08:53 GMT
Author: mikemccand
Date: Sat Nov 12 21:08:53 2011
New Revision: 1201328

URL: http://svn.apache.org/viewvc?rev=1201328&view=rev
Log:
LUCENE-2621: modules tests pass

Modified:
    lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
    lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
    lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
    lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
    lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
    lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
    lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
    lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java

Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
Sat Nov 12 21:08:53 2011
@@ -32,7 +32,11 @@ import org.apache.lucene.analysis.tokena
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.BytesRef;
 
@@ -66,12 +70,14 @@ public class TokenSources {
       String field, Document doc, Analyzer analyzer) throws IOException {
     TokenStream ts = null;
 
-    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
-    if (tfv != null) {
-      if (tfv instanceof TermPositionVector) {
-        ts = getTokenStream((TermPositionVector) tfv);
+    Fields vectors = reader.getTermVectors(docId);
+    if (vectors != null) {
+      Terms vector = vectors.terms(field);
+      if (vector != null) {
+        ts = getTokenStream(vector);
       }
     }
+
     // No token info stored so fall back to analyzing raw content
     if (ts == null) {
       ts = getTokenStream(doc, field, analyzer);
@@ -96,12 +102,14 @@ public class TokenSources {
       String field, Analyzer analyzer) throws IOException {
     TokenStream ts = null;
 
-    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
-    if (tfv != null) {
-      if (tfv instanceof TermPositionVector) {
-        ts = getTokenStream((TermPositionVector) tfv);
+    Fields vectors = reader.getTermVectors(docId);
+    if (vectors != null) {
+      Terms vector = vectors.terms(field);
+      if (vector != null) {
+        ts = getTokenStream(vector);
       }
     }
+
     // No token info stored so fall back to analyzing raw content
     if (ts == null) {
       ts = getTokenStream(reader, docId, field, analyzer);
@@ -109,10 +117,25 @@ public class TokenSources {
     return ts;
   }
 
-  public static TokenStream getTokenStream(TermPositionVector tpv) {
+  public static TokenStream getTokenStream(Terms vector) throws IOException {
     // assumes the worst and makes no assumptions about token position
     // sequences.
-    return getTokenStream(tpv, false);
+    return getTokenStream(vector, false);
+  }
+
+  private static boolean hasPositions(Terms vector) throws IOException {
+    final TermsEnum termsEnum = vector.iterator();
+    if (termsEnum.next() != null) {
+      DocsAndPositionsEnum dpEnum = termsEnum.docsAndPositions(null, null);
+      if (dpEnum != null) {
+        int pos = dpEnum.nextPosition();
+        if (pos >= 0) {
+          return true;
+        }
+      }
+    }
+
+    return false;
   }
 
   /**
@@ -141,9 +164,10 @@ public class TokenSources {
    *        numbers have no overlaps or gaps. If looking to eek out the last
    *        drops of performance, set to true. If in doubt, set to false.
    */
-  public static TokenStream getTokenStream(TermPositionVector tpv,
-      boolean tokenPositionsGuaranteedContiguous) {
-    if (!tokenPositionsGuaranteedContiguous && tpv.getTermPositions(0) != null) {
+  public static TokenStream getTokenStream(Terms tpv,
+      boolean tokenPositionsGuaranteedContiguous) 
+  throws IOException {
+    if (!tokenPositionsGuaranteedContiguous && hasPositions(tpv)) {
       return new TokenStreamFromTermPositionVector(tpv);
     }
 
@@ -183,56 +207,57 @@ public class TokenSources {
       }
     }
     // code to reconstruct the original sequence of Tokens
-    BytesRef[] terms = tpv.getTerms();
-    int[] freq = tpv.getTermFrequencies();
+    TermsEnum termsEnum = tpv.iterator();
     int totalTokens = 0;
-    for (int t = 0; t < freq.length; t++) {
-      totalTokens += freq[t];
+    while(termsEnum.next() != null) {
+      totalTokens += (int) termsEnum.totalTermFreq();
     }
     Token tokensInOriginalOrder[] = new Token[totalTokens];
     ArrayList<Token> unsortedTokens = null;
-    for (int t = 0; t < freq.length; t++) {
-      TermVectorOffsetInfo[] offsets = tpv.getOffsets(t);
-      if (offsets == null) {
+    termsEnum = tpv.iterator();
+    BytesRef text;
+    DocsAndPositionsEnum dpEnum = null;
+    while ((text = termsEnum.next()) != null) {
+
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      if (dpEnum == null || (!dpEnum.attributes().hasAttribute(OffsetAttribute.class))) {
         throw new IllegalArgumentException(
             "Required TermVector Offset information was not found");
       }
 
-      int[] pos = null;
-      if (tokenPositionsGuaranteedContiguous) {
-        // try get the token position info to speed up assembly of tokens into
-        // sorted sequence
-        pos = tpv.getTermPositions(t);
-      }
-      if (pos == null) {
-        // tokens NOT stored with positions or not guaranteed contiguous - must
-        // add to list and sort later
-        if (unsortedTokens == null) {
-          unsortedTokens = new ArrayList<Token>();
-        }
-        for (int tp = 0; tp < offsets.length; tp++) {
-          Token token = new Token(terms[t].utf8ToString(),
-              offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
+      final String term = text.utf8ToString();
+
+      final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+      dpEnum.nextDoc();
+      final int freq = dpEnum.freq();
+      for(int posUpto=0;posUpto<freq;posUpto++) {
+        final int pos = dpEnum.nextPosition();
+        final Token token = new Token(term,
+                                      offsetAtt.startOffset(),
+                                      offsetAtt.endOffset());
+        if (tokenPositionsGuaranteedContiguous && pos != -1) {
+          // We have positions stored and a guarantee that the token position
+          // information is contiguous
+
+          // This may be fast BUT wont work if Tokenizers used which create >1
+          // token in same position or
+          // creates jumps in position numbers - this code would fail under those
+          // circumstances
+
+          // tokens stored with positions - can use this to index straight into
+          // sorted array
+          tokensInOriginalOrder[pos] = token;
+        } else {
+          // tokens NOT stored with positions or not guaranteed contiguous - must
+          // add to list and sort later
+          if (unsortedTokens == null) {
+            unsortedTokens = new ArrayList<Token>();
+          }
           unsortedTokens.add(token);
         }
-      } else {
-        // We have positions stored and a guarantee that the token position
-        // information is contiguous
-
-        // This may be fast BUT wont work if Tokenizers used which create >1
-        // token in same position or
-        // creates jumps in position numbers - this code would fail under those
-        // circumstances
-
-        // tokens stored with positions - can use this to index straight into
-        // sorted array
-        for (int tp = 0; tp < pos.length; tp++) {
-          Token token = new Token(terms[t].utf8ToString(),
-              offsets[tp].getStartOffset(), offsets[tp].getEndOffset());
-          tokensInOriginalOrder[pos[tp]] = token;
-        }
       }
     }
+
     // If the field has been stored without position data we must perform a sort
     if (unsortedTokens != null) {
       tokensInOriginalOrder = unsortedTokens.toArray(new Token[unsortedTokens
@@ -250,18 +275,25 @@ public class TokenSources {
 
   public static TokenStream getTokenStream(IndexReader reader, int docId,
       String field) throws IOException {
-    TermFreqVector tfv = reader.getTermFreqVector(docId, field);
-    if (tfv == null) {
+
+    Fields vectors = reader.getTermVectors(docId);
+    if (vectors == null) {
       throw new IllegalArgumentException(field + " in doc #" + docId
           + "does not have any term position data stored");
     }
-    if (tfv instanceof TermPositionVector) {
-      TermPositionVector tpv = (TermPositionVector) reader.getTermFreqVector(
-          docId, field);
-      return getTokenStream(tpv);
+
+    Terms vector = vectors.terms(field);
+    if (vector == null) {
+      throw new IllegalArgumentException(field + " in doc #" + docId
+          + "does not have any term position data stored");
+    }
+
+    if (!hasPositions(vector)) {
+      throw new IllegalArgumentException(field + " in doc #" + docId
+          + "does not have any term position data stored");
     }
-    throw new IllegalArgumentException(field + " in doc #" + docId
-        + "does not have any term position data stored");
+    
+    return getTokenStream(vector);
   }
 
   // convenience method

Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermPositionVector.java
Sat Nov 12 21:08:53 2011
@@ -27,8 +27,9 @@ import org.apache.lucene.analysis.TokenS
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
 
@@ -51,27 +52,41 @@ public final class TokenStreamFromTermPo
    *        creating the TokenStream. Must have positions and offsets.
    */
   public TokenStreamFromTermPositionVector(
-      final TermPositionVector termPositionVector) {
+      final Terms vector) throws IOException {
     termAttribute = addAttribute(CharTermAttribute.class);
     positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
     offsetAttribute = addAttribute(OffsetAttribute.class);
-    final BytesRef[] terms = termPositionVector.getTerms();
-    for (int i = 0; i < terms.length; i++) {
-      final TermVectorOffsetInfo[] offsets = termPositionVector.getOffsets(i);
-      final int[] termPositions = termPositionVector.getTermPositions(i);
-      for (int j = 0; j < termPositions.length; j++) {
+    final TermsEnum termsEnum = vector.iterator();
+    BytesRef text;
+    // nocommit find all places where I "blindly" added
+    // calls to .getAttribute(OffsetAttr): these are wrong.
+    // instead i must check .hasAttr first
+    DocsAndPositionsEnum dpEnum = null;
+    while((text = termsEnum.next()) != null) {
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      dpEnum.nextDoc();
+      final int freq = dpEnum.freq();
+      final OffsetAttribute offsetAtt;
+      if (dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
+        offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+      } else {
+        offsetAtt = null;
+      }
+      for (int j = 0; j < freq; j++) {
+        int pos = dpEnum.nextPosition();
         Token token;
-        if (offsets != null) {
-          token = new Token(terms[i].utf8ToString(),
-              offsets[j].getStartOffset(), offsets[j].getEndOffset());
+        if (offsetAtt != null) {
+          token = new Token(text.utf8ToString(),
+                            offsetAtt.startOffset(),
+                            offsetAtt.endOffset());
         } else {
           token = new Token();
-          token.setEmpty().append(terms[i].utf8ToString());
+          token.setEmpty().append(text.utf8ToString());
         }
         // Yes - this is the position, not the increment! This is for
         // sorting. This value
         // will be corrected before use.
-        token.setPositionIncrement(termPositions[j]);
+        token.setPositionIncrement(pos);
         this.positionedTokens.add(token);
       }
     }

Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/java/org/apache/lucene/search/vectorhighlight/FieldTermStack.java
Sat Nov 12 21:08:53 2011
@@ -21,10 +21,12 @@ import java.util.Collections;
 import java.util.LinkedList;
 import java.util.Set;
 
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.TermFreqVector;
-import org.apache.lucene.index.TermPositionVector;
-import org.apache.lucene.index.TermVectorOffsetInfo;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 
@@ -76,30 +78,55 @@ public class FieldTermStack {
    // just return to make null snippet if un-matched fieldName specified when fieldMatch == true
     if( termSet == null ) return;
 
-    TermFreqVector tfv = reader.getTermFreqVector( docId, fieldName );
-    if( tfv == null ) return; // just return to make null snippets
-    TermPositionVector tpv = null;
-    try{
-      tpv = (TermPositionVector)tfv;
+    final Fields vectors = reader.getTermVectors(docId);
+    if (vectors == null) {
+      // null snippet
+      return;
     }
-    catch( ClassCastException e ){
-      return; // just return to make null snippets
+
+    final Terms vector = vectors.terms(fieldName);
+    if (vector == null) {
+      // null snippet
+      return;
     }
-    
+
     final CharsRef spare = new CharsRef();
-    for( BytesRef term : tpv.getTerms() ){
-      if( !termSet.contains( term.utf8ToChars(spare).toString() ) ) continue;
-      int index = tpv.indexOf( term );
-      TermVectorOffsetInfo[] tvois = tpv.getOffsets( index );
-      if( tvois == null ) return; // just return to make null snippets
-      int[] poss = tpv.getTermPositions( index );
-      if( poss == null ) return; // just return to make null snippets
-      for( int i = 0; i < tvois.length; i++ )
-        termList.add( new TermInfo( term.utf8ToChars(spare).toString(), tvois[i].getStartOffset(), tvois[i].getEndOffset(), poss[i] ) );
+    final TermsEnum termsEnum = vector.iterator();
+    DocsAndPositionsEnum dpEnum = null;
+    BytesRef text;
+    while ((text = termsEnum.next()) != null) {
+      final String term = text.utf8ToChars(spare).toString();
+      if (!termSet.contains(term)) {
+        continue;
+      }
+      dpEnum = termsEnum.docsAndPositions(null, dpEnum);
+      if (dpEnum == null) {
+        // null snippet
+        return;
+      }
+
+      if (!dpEnum.attributes().hasAttribute(OffsetAttribute.class)) {
+        // null snippet
+        return;
+      }
+      dpEnum.nextDoc();
+
+      final OffsetAttribute offsetAtt = dpEnum.attributes().getAttribute(OffsetAttribute.class);
+
+      final int freq = dpEnum.freq();
+      
+      for(int i = 0;i < freq;i++) {
+        final int pos = dpEnum.nextPosition();
+        if (pos == -1) {
+          // null snippet
+          return;
+        }
+        termList.add(new TermInfo(term, offsetAtt.startOffset(), offsetAtt.endOffset(), pos));
+      }
     }
     
     // sort by position
-    Collections.sort( termList );
+    Collections.sort(termList);
   }
 
   /**

Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
Sat Nov 12 21:08:53 2011
@@ -34,7 +34,6 @@ import org.apache.lucene.index.CorruptIn
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.index.IndexReader.AtomicReaderContext;
 import org.apache.lucene.search.Collector;
 import org.apache.lucene.search.IndexSearcher;
@@ -85,7 +84,7 @@ public class HighlighterPhraseTest exten
             new QueryScorer(phraseQuery));
 
         final TokenStream tokenStream = TokenSources
-            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+            .getTokenStream(indexReader.getTermVector(
                 0, FIELD), false);
         assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
             TEXT), highlighter.getBestFragment(tokenStream, TEXT));
@@ -160,7 +159,7 @@ public class HighlighterPhraseTest exten
             .nextSetBit(position + 1)) {
           assertEquals(0, position);
           final TokenStream tokenStream = TokenSources.getTokenStream(
-              (TermPositionVector) indexReader.getTermFreqVector(position,
+              indexReader.getTermVector(position,
                   FIELD), false);
           assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
               TEXT), highlighter.getBestFragment(tokenStream, TEXT));
@@ -207,7 +206,7 @@ public class HighlighterPhraseTest exten
             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
             new QueryScorer(phraseQuery));
         final TokenStream tokenStream = TokenSources
-            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+            .getTokenStream(indexReader.getTermVector(
                 0, FIELD), false);
         assertEquals(
             highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
@@ -253,7 +252,7 @@ public class HighlighterPhraseTest exten
             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
             new QueryScorer(phraseQuery));
         final TokenStream tokenStream = TokenSources.getTokenStream(
-            (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), true);
+            indexReader.getTermVector(0, FIELD), true);
         assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
             .getBestFragment(tokenStream, TEXT));
       } finally {
@@ -297,7 +296,7 @@ public class HighlighterPhraseTest exten
             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
             new QueryScorer(phraseQuery));
         final TokenStream tokenStream = TokenSources
-            .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
+            .getTokenStream(indexReader.getTermVector(
                 0, FIELD), false);
         assertEquals(
             highlighter.getBestFragment(new TokenStreamSparse(), TEXT),

Modified: lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
Sat Nov 12 21:08:53 2011
@@ -32,7 +32,6 @@ import org.apache.lucene.index.CorruptIn
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermPositionVector;
 import org.apache.lucene.search.DisjunctionMaxQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
@@ -133,7 +132,7 @@ public class TokenSourcesTest extends Lu
             new QueryScorer(query));
         final TokenStream tokenStream = TokenSources
             .getTokenStream(
-                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                indexReader.getTermVector(0, FIELD),
                 false);
         assertEquals("<B>the fox</B> did not jump",
             highlighter.getBestFragment(tokenStream, TEXT));
@@ -182,7 +181,7 @@ public class TokenSourcesTest extends Lu
             new QueryScorer(query));
         final TokenStream tokenStream = TokenSources
             .getTokenStream(
-                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                indexReader.getTermVector(0, FIELD),
                 false);
         assertEquals("<B>the fox</B> did not jump",
             highlighter.getBestFragment(tokenStream, TEXT));
@@ -230,7 +229,7 @@ public class TokenSourcesTest extends Lu
             new QueryScorer(phraseQuery));
         final TokenStream tokenStream = TokenSources
             .getTokenStream(
-                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                indexReader.getTermVector(0, FIELD),
                 false);
         assertEquals("<B>the fox</B> did not jump",
             highlighter.getBestFragment(tokenStream, TEXT));
@@ -279,7 +278,7 @@ public class TokenSourcesTest extends Lu
             new QueryScorer(phraseQuery));
         final TokenStream tokenStream = TokenSources
             .getTokenStream(
-                (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
+                indexReader.getTermVector(0, FIELD),
                 false);
         assertEquals("<B>the fox</B> did not jump",
             highlighter.getBestFragment(tokenStream, TEXT));

Modified: lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTermDocumentInformation.java
Sat Nov 12 21:08:53 2011
@@ -1,7 +1,5 @@
 package org.apache.lucene.store.instantiated;
 
-import org.apache.lucene.index.TermVectorOffsetInfo;
-
 import java.util.Comparator;
 
 /**

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/IndexReader.java
Sat Nov 12 21:08:53 2011
@@ -804,9 +804,19 @@ public abstract class IndexReader implem
   }
 
   // nocommit javadoc
-  abstract public Fields getTermVectors(int docNumber)
+  abstract public Fields getTermVectors(int docID)
           throws IOException;
 
+  // nocommit javadoc
+  public Terms getTermVector(int docID, String field)
+    throws IOException {
+    Fields vectors = getTermVectors(docID);
+    if (vectors == null) {
+      return null;
+    }
+    return vectors.terms(field);
+  }
+
   /**
    * Returns <code>true</code> if an index exists at the specified directory.
    * @param  directory the directory to check for an index

Modified: lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
(original)
+++ lucene/dev/branches/lucene2621/lucene/src/java/org/apache/lucene/index/codecs/DefaultTermVectorsReader.java
Sat Nov 12 21:08:53 2011
@@ -331,15 +331,19 @@ public class DefaultTermVectorsReader ex
   private class TVTerms extends Terms {
     private final int numTerms;
     private final int docID;
+    private final long tvfFPStart;
 
     public TVTerms(int docID, long tvfFP) throws IOException {
       this.docID = docID;
       tvf.seek(tvfFP);
       numTerms = tvf.readVInt();
+      tvfFPStart = tvf.getFilePointer();
     }
 
     @Override
     public TermsEnum iterator() throws IOException {
+      // nocommit -- to be "safe" we should clone tvf here...?
+      tvf.seek(tvfFPStart);
       return new TVTermsEnum(docID, numTerms);
     }
 

Modified: lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java?rev=1201328&r1=1201327&r2=1201328&view=diff
==============================================================================
--- lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
(original)
+++ lucene/dev/branches/lucene2621/modules/queries/src/java/org/apache/lucene/queries/mlt/MoreLikeThis.java
Sat Nov 12 21:08:53 2011
@@ -15,14 +15,19 @@
  */
 package org.apache.lucene.queries.mlt;
 
+import java.io.*;
+import java.util.*;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
 import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.Term;
-import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
 import org.apache.lucene.search.*;
 import org.apache.lucene.search.similarities.DefaultSimilarity;
 import org.apache.lucene.search.similarities.TFIDFSimilarity;
@@ -30,9 +35,6 @@ import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.PriorityQueue;
 
-import java.io.*;
-import java.util.*;
-
 
 /**
  * Generate "more like this" similarity queries.
@@ -701,7 +703,13 @@ public final class MoreLikeThis {
   public PriorityQueue<Object[]> retrieveTerms(int docNum) throws IOException {
     Map<String, Int> termFreqMap = new HashMap<String, Int>();
     for (String fieldName : fieldNames) {
-      TermFreqVector vector = ir.getTermFreqVector(docNum, fieldName);
+      final Fields vectors = ir.getTermVectors(docNum);
+      final Terms vector;
+      if (vectors != null) {
+        vector = vectors.terms(fieldName);
+      } else {
+        vector = null;
+      }
 
       // field does not store term vector info
       if (vector == null) {
@@ -716,7 +724,6 @@ public final class MoreLikeThis {
       } else {
         addTermFrequencies(termFreqMap, vector);
       }
-
     }
 
     return createQueue(termFreqMap);
@@ -728,24 +735,25 @@ public final class MoreLikeThis {
    * @param termFreqMap a Map of terms and their frequencies
    * @param vector List of terms and their frequencies for a doc/field
    */
-  private void addTermFrequencies(Map<String, Int> termFreqMap, TermFreqVector vector) {
-    BytesRef[] terms = vector.getTerms();
-    int freqs[] = vector.getTermFrequencies();
+  private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
+    final TermsEnum termsEnum = vector.iterator();
     final CharsRef spare = new CharsRef();
-    for (int j = 0; j < terms.length; j++) {
-      final String term = terms[j].utf8ToChars(spare).toString();
-
+    BytesRef text;
+    while((text = termsEnum.next()) != null) {
+      final String term = text.utf8ToChars(spare).toString();
       if (isNoiseWord(term)) {
         continue;
       }
+      final int freq = (int) termsEnum.totalTermFreq();
+
       // increment frequency
       Int cnt = termFreqMap.get(term);
       if (cnt == null) {
         cnt = new Int();
         termFreqMap.put(term, cnt);
-        cnt.x = freqs[j];
+        cnt.x = freq;
       } else {
-        cnt.x += freqs[j];
+        cnt.x += freq;
       }
     }
   }



Mime
View raw message