lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/index DocumentWriter.java
Date Mon, 22 Dec 2003 21:40:18 GMT
cutting     2003/12/22 13:40:18

  Modified:    src/java/org/apache/lucene/index DocumentWriter.java
  Log:
  Distinguish between positions and length when indexing a field.  The
  length of a field is now defined as the total number of tokens added
  to it, not the final token position; the two differ whenever a token
  has a position increment other than one.  Length is used for score
  normalization (Similarity.lengthNorm()) and for controlling memory
  usage (IndexWriter.maxFieldLength).  In both cases the total number
  of tokens is a more reasonable measure than the final position.
  Position, now tracked separately in fieldPositions, is used in phrase
  searching (see PhraseQuery and Token.setPositionIncrement()).
  
  Revision  Changes    Path
  1.7       +10 -5     jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java
  
  Index: DocumentWriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/index/DocumentWriter.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- DocumentWriter.java	20 Sep 2003 17:42:40 -0000	1.6
  +++ DocumentWriter.java	22 Dec 2003 21:40:18 -0000	1.7
  @@ -103,7 +103,8 @@
   
       // invert doc into postingTable
       postingTable.clear();			  // clear postingTable
  -    fieldLengths = new int[fieldInfos.size()];	  // init fieldLengths
  +    fieldLengths = new int[fieldInfos.size()];    // init fieldLengths
  +    fieldPositions = new int[fieldInfos.size()];  // init fieldPositions
   
       fieldBoosts = new float[fieldInfos.size()];	  // init fieldBoosts
       Arrays.fill(fieldBoosts, doc.getBoost());
  @@ -138,6 +139,7 @@
     // Used to buffer a document before it is written to the index.
     private final Hashtable postingTable = new Hashtable();
     private int[] fieldLengths;
  +  private int[] fieldPositions;
     private float[] fieldBoosts;
   
     // Tokenizes the fields of a document into Postings.
  @@ -149,11 +151,13 @@
         String fieldName = field.name();
         int fieldNumber = fieldInfos.fieldNumber(fieldName);
   
  -      int position = fieldLengths[fieldNumber];	  // position in field
  +      int length = fieldLengths[fieldNumber];     // length of field
  +      int position = fieldPositions[fieldNumber]; // position in field
   
         if (field.isIndexed()) {
           if (!field.isTokenized()) {		  // un-tokenized field
             addPosition(fieldName, field.stringValue(), position++);
  +          length++;
           } else {
             Reader reader;			  // find or make Reader
             if (field.readerValue() != null)
  @@ -170,14 +174,15 @@
               for (Token t = stream.next(); t != null; t = stream.next()) {
                 position += (t.getPositionIncrement() - 1);
                 addPosition(fieldName, t.termText(), position++);
  -              if (position > maxFieldLength) break;
  +              if (++length > maxFieldLength) break;
               }
             } finally {
               stream.close();
             }
           }
   
  -        fieldLengths[fieldNumber] = position;	  // save field length
  +        fieldLengths[fieldNumber] = length;	  // save field length
  +        fieldPositions[fieldNumber] = position;	  // save field position
           fieldBoosts[fieldNumber] *= field.getBoost();
         }
       }
  @@ -321,7 +326,7 @@
         if (field.isIndexed()) {
           int n = fieldInfos.fieldNumber(field.name());
           float norm =
  -          fieldBoosts[n] * similarity.lengthNorm(field.name(), fieldLengths[n]);
  +          fieldBoosts[n] * similarity.lengthNorm(field.name(),fieldLengths[n]);
           OutputStream norms = directory.createFile(segment + ".f" + n);
           try {
             norms.writeByte(similarity.encodeNorm(norm));
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message