lucene-dev mailing list archives

From karl wettin <karl.wet...@gmail.com>
Subject Resolving term vector even when not stored?
Date Fri, 16 Mar 2007 01:47:34 GMT
I propose a change to the current IndexReader.getTermFreqVector/s 
code so that it /always/ returns the vector space model of a document,  
even when fields are set to Field.TermVector.NO.

Is that crazy? It could be really slow, but apart from that.. And if  
the vector is cached, that information is known by inspecting the fields.  
People don't go fetching term vectors without knowing what they are  
doing, do they?
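
To make it concrete, roughly what I have in mind (a minimal sketch; the
index path and field name are made up, and I assume "body" was indexed
with Field.TermVector.NO):

IndexReader reader = IndexReader.open("/tmp/example-index");
TermFreqVector vector = reader.getTermFreqVector(0, "body");
// today: vector is null, because no term vector was stored at index time.
// proposed: the reader resolves the vector from TermEnum/TermPositions instead,
// so vector.getTerms() and vector.getTermFrequencies() are always available.
reader.close();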

Whipped something up that builds the data using TermEnum and  
TermPositions. Very simple. Does the job. Some IndexReader  
implementations could do the job considerably faster than navigating  
the index that way.

Used this to build a tool that can copy one index to another  
using an IndexReader source and an IndexWriter target (and thus  
transparently allows converting one index to another, e.g. loading  
an index from an FSDirectory into an InstantiatedIndex and vice versa).  
Sort of like IndexWriter.addIndexes, but for any implementation.



Super-simple code (don't pay too much attention to it being a Map;  
it should of course be a TermFreqVector):

package org.apache.lucene.index;

import java.io.IOException;
import java.util.*;

/**
 * Resolves a term frequency vector from the inverted term index.
 *
 * @author Karl Wettin <mailto:karl.wettin@gmail.com>
 */
public class DocumentVectorSpaceModel
    // key: field name, value: term meta ordered by term
    extends HashMap<String, List<DocumentVectorSpaceModel.DocumentTermMeta>> {

  public DocumentVectorSpaceModel(int doc, IndexReader ir) throws IOException {

    TermEnum termEnum = ir.terms();
    while (termEnum.next()) {
      TermPositions termPositions = ir.termPositions(termEnum.term());
      // skipTo() stops at the first document >= doc, so make sure it really is doc
      if (termPositions.skipTo(doc) && termPositions.doc() == doc) {
        int[] positions = new int[termPositions.freq()];
        for (int i = 0; i < positions.length; i++) {
          positions[i] = termPositions.nextPosition();
        }
        DocumentTermMeta meta = new DocumentTermMeta(termEnum.term(), positions);
        List<DocumentTermMeta> termMetas = this.get(termEnum.term().field());
        if (termMetas == null) {
          termMetas = new LinkedList<DocumentTermMeta>();
          this.put(termEnum.term().field(), termMetas);
        }
        // binarySearch returns (-(insertion point) - 1) when the term is not yet in the list
        int pos = (Collections.binarySearch(termMetas, meta) * -1) - 1;
        termMetas.add(pos, meta);
      }
      termPositions.close();
    }
    termEnum.close();
  }

  public static class DocumentTermMeta implements Comparable<DocumentTermMeta> {
    private Term term;
    private int[] termPositions;

    public DocumentTermMeta(Term term, int[] termPositions) {
      this.term = term;
      this.termPositions = termPositions;
    }

    public int compareTo(DocumentTermMeta documentTermMeta) {
      return getTerm().compareTo(documentTermMeta.getTerm());
    }

    public Term getTerm() {
      return term;
    }

    public int[] getTermPositions() {
      return termPositions;
    }
  }
}
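
For what it's worth, used stand-alone it would look something like this
(index path and field name made up again):

IndexReader reader = IndexReader.open("/tmp/example-index");
// resolve the model for document 0, regardless of how its fields were indexed
DocumentVectorSpaceModel model = new DocumentVectorSpaceModel(0, reader);
List<DocumentVectorSpaceModel.DocumentTermMeta> metas = model.get("body");
if (metas != null) {
  for (DocumentVectorSpaceModel.DocumentTermMeta meta : metas) {
    System.out.println(meta.getTerm() + " at " + Arrays.toString(meta.getTermPositions()));
  }
}
reader.close();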

Bonus: my import/export code that uses the code above to tokenize a  
whole index and send it to a writer:


package org.apache.lucene.index;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;

import java.io.IOException;
import java.io.Reader;
import java.util.*;

/**
 * @author Karl Wettin <mailto:karl.wettin@gmail.com>
 */
public class IndexAppender {

   // private static Log log = LogFactory.getLog(IndexReplicator.class);
   // private static long serialVersionUID = 1l;


   /**
    * Adds the complete content of any one index (via an IndexReader) to any
    * other index (via an IndexWriter).
    * <p/>
    * The analyzer produces one complete token stream containing all values of
    * a field the first time that field name is requested, and an empty stream
    * for every subsequent request of the same name. todo: is this a problem?
    * <p/>
    * It can be buggy if the same token appears as a synonym to itself
    * (position increment 0). Not really something to worry about.. or?
    *
    * @param sourceReader the index from which all content is copied.
    * @param targetWriter the index to which the content is copied.
    * @throws java.io.IOException when accessing source or target.
    */
   public static void append(final IndexReader sourceReader,
                             IndexWriterInterface targetWriter) throws IOException {

     for (int documentNumber = 0; documentNumber < sourceReader.maxDoc(); documentNumber++) {

       // skip documents that have been deleted in the source index
       if (sourceReader.isDeleted(documentNumber)) {
         continue;
       }

       final int documentNumberInnerAccessHack = documentNumber;
       final Document document = sourceReader.document(documentNumber);
       final DocumentVectorSpaceModel documentVectorSpaceModel =
           new DocumentVectorSpaceModel(documentNumber, sourceReader);

       targetWriter.addDocument(document, new Analyzer() {

         private Set<String> processedFields = new HashSet<String>();

         public TokenStream tokenStream(final String fieldName, Reader reader) {

           if (!processedFields.add(fieldName)) {
             // this field name was already handled; return an empty stream
             return new TokenStream() {
               public Token next() throws IOException {
                 return null;
               }
             };
           } else {

             return new TokenStream() {

               Iterator<DocumentTermPosition> documentTermPositions;
               int previousPosition = -1;
               int startOffset = 0;
               int endOffset = 0;

               public Token next() throws IOException {

                 if (documentTermPositions == null) {
                   List<DocumentVectorSpaceModel.DocumentTermMeta> termMetas =
                       documentVectorSpaceModel.get(fieldName);
                   if (termMetas == null) {
                     // the field has no indexed terms (e.g. stored only)
                     return null;
                   }
                   List<DocumentTermPosition> list = new ArrayList<DocumentTermPosition>();
                   for (DocumentVectorSpaceModel.DocumentTermMeta documentTermMeta : termMetas) {
                     for (int position : documentTermMeta.getTermPositions()) {
                       list.add(new DocumentTermPosition(position, documentTermMeta));
                     }
                   }

                   Collections.sort(list);
                   documentTermPositions = list.iterator();
                 }

                 if (!documentTermPositions.hasNext()) {
                   return null;
                 }

                 DocumentTermPosition documentTermPosition = documentTermPositions.next();

                 int positionIncrement =
                     documentTermPosition.getPosition() - previousPosition;

                 Token token;

                 // all these suggested things are overkill.

                 Field field = document.getField(
                     documentTermPosition.getDocumentTermMeta().getTerm().field());
                 if (field != null && field.isStoreOffsetWithTermVector()) {
                   // todo: this can be buggy if the same token appears without
                   // position increment. not really something to worry about.. or?
                   int pos = Arrays.binarySearch(
                       documentTermPosition.getDocumentTermMeta().getTermPositions(),
                       documentTermPosition.getPosition());
                   TermPositionVector termPositionVector =
                       (TermPositionVector) sourceReader.getTermFreqVector(
                           documentNumberInnerAccessHack,
                           documentTermPosition.getDocumentTermMeta().getTerm().field());
                   TermVectorOffsetInfo offsetInfo =
                       termPositionVector.getOffsets(termPositionVector.indexOf(
                           documentTermPosition.getDocumentTermMeta().getTerm().text()))[pos];
                   token = new Token(
                       documentTermPosition.getDocumentTermMeta().getTerm().text(),
                       offsetInfo.getStartOffset(), offsetInfo.getEndOffset(),
                       "token with stored offset");
                 } else {
                   endOffset = startOffset +
                       documentTermPosition.getDocumentTermMeta().getTerm().text().length();
                   // todo: attempt to find the true offset in the stored field, if
                   // available. a mighty regexp matcher could be a dirty implementation.
                   token = new Token(
                       documentTermPosition.getDocumentTermMeta().getTerm().text(),
                       startOffset, endOffset, "token with estimated offsets");
                 }

                 if (positionIncrement == 0) {
                   // don't increase startOffset
                 } else if (positionIncrement == 1) {
                   // just a single whitespace. todo: getMeanTokenDistance()
                   startOffset = endOffset + 1;
                 } else if (positionIncrement > 1) {
                   // just single whitespaces. todo: positionIncrement * getMeanTokenDistance()
                   startOffset = endOffset + positionIncrement;
                 } else {
                   throw new RuntimeException(
                       "Position increment is a negative value: " + positionIncrement);
                 }

                 token.setPositionIncrement(positionIncrement);

                 previousPosition = documentTermPosition.getPosition();

                 return token;
               }
             };
           }
         }
       });
     }
   }

   private static class DocumentTermPosition implements Comparable<DocumentTermPosition> {
     private Integer position;
     private DocumentVectorSpaceModel.DocumentTermMeta documentTermMeta;

     public DocumentTermPosition(int position,
                                 DocumentVectorSpaceModel.DocumentTermMeta documentTermMeta) {
       this.position = position;
       this.documentTermMeta = documentTermMeta;
     }

     public int compareTo(DocumentTermPosition documentTermPosition) {
       return getPosition().compareTo(documentTermPosition.getPosition());
     }

     public Integer getPosition() {
       return position;
     }

     public DocumentVectorSpaceModel.DocumentTermMeta getDocumentTermMeta() {
       return documentTermMeta;
     }

     public String toString() {
       return getPosition() + " " + getDocumentTermMeta().getTerm();
     }
   }
}
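
And the intended call site, more or less (IndexWriterInterface is part of
this patch rather than trunk, so assume targetWriter is whatever
implementation of it wraps the target index; the source path is made up):

IndexReader sourceReader = IndexReader.open("/tmp/source-index");
try {
  // targetWriter: some IndexWriterInterface implementation wrapping the target,
  // e.g. an IndexWriter on an FSDirectory or an InstantiatedIndex
  IndexAppender.append(sourceReader, targetWriter);
} finally {
  sourceReader.close();
}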




---------------------------------------------------------------------
To unsubscribe, e-mail: java-dev-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-dev-help@lucene.apache.org

