lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Ivan Vasilev <ivasi...@sirma.bg>
Subject Term Positions added to one document forward
Date Mon, 29 Oct 2012 17:44:26 GMT
Hi Guys,

I use the following code to index documents and set Payloads to term 
positions:

public class TestPayloads_ {
     private static final String INDEX_DIR =
             "E:/Temp/Index";

     public static void main(String[] args) throws Exception {
         IndexWriterConfig iwc = new 
IndexWriterConfig(Version.LUCENE_40, new MyAnalyzer_());
         iwc.setOpenMode(OpenMode.CREATE);
         IndexWriter writer = new IndexWriter(FSDirectory.open(new 
File(INDEX_DIR)), iwc);

         FieldType fieldType = new FieldType();
         IndexOptions indexOptions = 
IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
         fieldType.setIndexOptions(indexOptions);
         fieldType.setIndexed(true);
         fieldType.setOmitNorms(true);
         fieldType.setStored(true);
         fieldType.freeze();

         Document doc = new Document();
         doc.add(new Field("content", "one two three four.", fieldType));
         writer.addDocument(doc);

         writer.addDocument(doc);
         writer.addDocument(doc);

         writer.close();

         DirectoryReader dr = DirectoryReader.open(FSDirectory.open(new 
File(INDEX_DIR)));
         AtomicReader sr = dr.leaves().get(0).reader();

         Bits liveDocs = sr.getLiveDocs();
         Fields fields = sr.fields();
         for (String currFieldName : fields) {
             Terms currTerms = fields.terms(currFieldName);
             TermsEnum currTermEnum = currTerms.iterator(null);
             boolean currTermsHasPayloads = currTerms.hasPayloads();
             BytesRef currFieldValue;
             while ((currFieldValue = currTermEnum.next()) != null) {
                 String currVfieldValueStr = currFieldValue.utf8ToString();
//                    DocsEnum currDocsEnum = 
currTermEnum.docs(liveDocs, null);
                 DocsAndPositionsEnum currDocsAndPositions =
                     currTermEnum.docsAndPositions(liveDocs, null,
                         DocsAndPositionsEnum.FLAG_PAYLOADS
                         | DocsAndPositionsEnum.FLAG_OFFSETS);
                 int docID;
                 while ((docID = currDocsAndPositions.nextDoc()) != 
DocsEnum.NO_MORE_DOCS) {
                     int freq = currDocsAndPositions.freq();
                     for (int i = 0; i < freq; i++) {
                         byte payload;
                         if (currTermsHasPayloads && 
currDocsAndPositions.getPayload() != null) {
                             payload = 
currDocsAndPositions.getPayload().bytes[0];
                         } else {
                             payload = -1;
                         }
                         System.out.println("Term: (" + currFieldName + 
":" + currVfieldValueStr + "); doc: "
                             + docID + "; position: " + 
currDocsAndPositions.nextPosition()
                             + "; payload: " + payload);
                     }
                 }
             }
         }
         dr.close();
     }

}

class MyAnalyzer_ extends Analyzer {
     @Override
     protected TokenStreamComponents createComponents(String fieldName, 
Reader reader) {
         Tokenizer tokenizer = new StandardTokenizer(Version.LUCENE_40, 
reader);
         return new TokenStreamComponents(tokenizer, new 
MyFilter_(tokenizer));
     }

}

class MyFilter_ extends TokenFilter {
     private PayloadAttribute payloadAttr;
     private byte[] payloadVal;

     MyFilter_(TokenStream in) {
         super(in);
         payloadAttr = addAttribute(PayloadAttribute.class);
         payloadVal = new byte[1];
     }
     public final boolean incrementToken() throws IOException {
         if (input.incrementToken()) {
             payloadVal[0]++;
             payloadAttr.setPayload(new BytesRef(payloadVal));
             return true;
         } else {
             return false;
         }
     }
}




The output is the following:

Term: (content:four); doc: 0; position: 3; payload: -1
Term: (content:four); doc: 1; position: 3; payload: 4
Term: (content:four); doc: 2; position: 3; payload: 8
Term: (content:one); doc: 0; position: 0; payload: -1
Term: (content:one); doc: 1; position: 0; payload: 1
Term: (content:one); doc: 2; position: 0; payload: 5
Term: (content:three); doc: 0; position: 2; payload: -1
Term: (content:three); doc: 1; position: 2; payload: 3
Term: (content:three); doc: 2; position: 2; payload: 7
Term: (content:two); doc: 0; position: 1; payload: -1
Term: (content:two); doc: 1; position: 1; payload: 2
Term: (content:two); doc: 2; position: 1; payload: 6


The payloads of document with Lucene ID #0 were not added. Payloads that 
were intended to doc #0 were added to doc #1, those intended for doc #1 
were added to doc #2.
With the debugger I see that during adding doc #0 payloadVal is 
incremented form 1 to 4, and after each incrementation is invoked 
payloadAttr.setPayload(..), but strangely when reading 
DocsAndPositionsEnumwe see those payloads (1 to 4) belong actually to 
doc #1.

Do I make some mistake with invoking setPayload(..) method or it is a bug?

Cheers,
Ivan Vasilev

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message