lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From yo...@apache.org
Subject svn commit: r493641 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/index/ src/site/src/documentation/content/xdocs/
Date Sun, 07 Jan 2007 04:19:22 GMT
Author: yonik
Date: Sat Jan  6 20:19:21 2007
New Revision: 493641

URL: http://svn.apache.org/viewvc?view=rev&rev=493641
Log:
Maintain norms in a single file .nrm: LUCENE-756

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
    lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
    lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat Jan  6 20:19:21 2007
@@ -163,6 +163,13 @@
     small.  This changes the index file format and cannot be
     read by previous versions of Lucene.  (Doron Cohen via Yonik Seeley)
 
+13. LUCENE-756: Maintain all norms in a single .nrm file to reduce the
+    number of open files and file descriptors for the non-compound index
+    format.  This changes the index file format, but maintains the
+    ability to read and update older indices. The first segment merge
+    on an older format index will create a single .nrm file for the new
+    segment.  (Doron Cohen via Yonik Seeley)
+
 Bug fixes
 
  1. Fixed the web application demo (built with "ant war-demo") which

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexFileNames.java Sat Jan  6 20:19:21
2007
@@ -35,6 +35,9 @@
    * pre-lockless indices) */
   static final String DELETABLE = "deletable";
    
+  /** Extension of norms file */
+  static final String NORMS_EXTENSION = "nrm";
+  
   /**
    * This array contains all filename extensions used by
    * Lucene's index files, with two exceptions, namely the
@@ -45,7 +48,8 @@
    */
   static final String INDEX_EXTENSIONS[] = new String[] {
       "cfs", "fnm", "fdx", "fdt", "tii", "tis", "frq", "prx", "del",
-      "tvx", "tvd", "tvf", "tvp", "gen"};
+      "tvx", "tvd", "tvf", "tvp", "gen", "nrm" 
+  };
   
   /** File extensions of old-style index files */
   static final String COMPOUND_EXTENSIONS[] = new String[] {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/IndexWriter.java Sat Jan  6 20:19:21
2007
@@ -639,7 +639,7 @@
     String segmentName = newRAMSegmentName();
     dw.addDocument(segmentName, doc);
     synchronized (this) {
-      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false));
+      ramSegmentInfos.addElement(new SegmentInfo(segmentName, 1, ramDirectory, false, false));
       maybeFlushRamSegments();
     }
   }
@@ -772,10 +772,10 @@
     while (segmentInfos.size() > 1 ||
            (segmentInfos.size() == 1 &&
             (SegmentReader.hasDeletions(segmentInfos.info(0)) ||
+             SegmentReader.hasSeparateNorms(segmentInfos.info(0)) ||
              segmentInfos.info(0).dir != directory ||
              (useCompoundFile &&
-              (!SegmentReader.usesCompoundFile(segmentInfos.info(0)) ||
-                SegmentReader.hasSeparateNorms(segmentInfos.info(0))))))) {
+              (!SegmentReader.usesCompoundFile(segmentInfos.info(0))))))) {
       int minSegment = segmentInfos.size() - mergeFactor;
       mergeSegments(segmentInfos, minSegment < 0 ? 0 : minSegment, segmentInfos.size());
     }
@@ -1127,7 +1127,7 @@
       int docCount = merger.merge();                // merge 'em
 
       segmentInfos.setSize(0);                      // pop old infos & add new
-      info = new SegmentInfo(mergedName, docCount, directory, false);
+      info = new SegmentInfo(mergedName, docCount, directory, false, true);
       segmentInfos.addElement(info);
       commitPending = true;
 
@@ -1347,7 +1347,7 @@
         }
 
         newSegment = new SegmentInfo(mergedName, mergedDocCount,
-                                     directory, false);
+                                     directory, false, true);
 
 
         if (sourceSegments == ramSegmentInfos) {

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentInfo.java Sat Jan  6 20:19:21
2007
@@ -42,8 +42,13 @@
 
   private byte isCompoundFile;                    // -1 if it is not; 1 if it is; 0 if it's
                                                   // pre-2.1 (ie, must check file system
to see
-                                                  // if <name>.cfs exists)        

+                                                  // if <name>.cfs and <name>.nrm
exist)         
 
+  private byte withNrm;                           // 1 if this segment maintains norms in
a single file; 
+                                                  // -1 if not; 0 if check file is required
to tell.
+                                                  // would be -1 for segments populated by
DocumentWriter.
+                                                  // would be 1 for (newly created) merge
resulted segments (both compound and non compound).
+  
   public SegmentInfo(String name, int docCount, Directory dir) {
     this.name = name;
     this.docCount = docCount;
@@ -51,14 +56,13 @@
     delGen = -1;
     isCompoundFile = 0;
     preLockless = true;
+    withNrm = 0;
   }
-  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile) {
+
+  public SegmentInfo(String name, int docCount, Directory dir, boolean isCompoundFile, boolean
withNrm) { 
     this(name, docCount, dir);
-    if (isCompoundFile) {
-      this.isCompoundFile = 1;
-    } else {
-      this.isCompoundFile = -1;
-    }
+    this.isCompoundFile = (byte) (isCompoundFile ? 1 : -1);
+    this.withNrm = (byte) (withNrm ? 1 : -1);
     preLockless = false;
   }
 
@@ -78,6 +82,7 @@
       System.arraycopy(src.normGen, 0, normGen, 0, src.normGen.length);
     }
     isCompoundFile = src.isCompoundFile;
+    withNrm = src.withNrm;
   }
 
   /**
@@ -111,19 +116,20 @@
       isCompoundFile = 0;
       preLockless = true;
     }
+    withNrm = 0;
   }
   
-  void setNumField(int numField) {
+  void setNumFields(int numFields) {
     if (normGen == null) {
       // normGen is null if we loaded a pre-2.1 segment
       // file, or, if this segments file hasn't had any
       // norms set against it yet:
-      normGen = new long[numField];
+      normGen = new long[numFields];
 
       if (!preLockless) {
         // This is a FORMAT_LOCKLESS segment, which means
         // there are no norms:
-        for(int i=0;i<numField;i++) {
+        for(int i=0;i<numFields;i++) {
           normGen[i] = -1;
         }
       }
@@ -173,6 +179,7 @@
     si.isCompoundFile = isCompoundFile;
     si.delGen = delGen;
     si.preLockless = preLockless;
+    si.withNrm = withNrm;
     if (normGen != null) {
       si.normGen = (long[]) normGen.clone();
     }
@@ -245,7 +252,7 @@
       // pre-LOCKLESS and must be checked in directory:
       for(int i=0;i<normGen.length;i++) {
         if (normGen[i] == 0) {
-          if (dir.fileExists(getNormFileName(i))) {
+          if (hasSeparateNorms(i)) {
             return true;
           }
         }
@@ -285,12 +292,21 @@
     }
     
     if (hasSeparateNorms(number)) {
+      // case 1: separate norm
       prefix = ".s";
       return IndexFileNames.fileNameFromGeneration(name, prefix + number, gen);
-    } else {
-      prefix = ".f";
-      return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
     }
+    
+
+    if (withNrm()) {
+      // case 2: lockless (or nrm file exists) - single file for all norms 
+      prefix = "." + IndexFileNames.NORMS_EXTENSION;
+      return IndexFileNames.fileNameFromGeneration(name, prefix, 0);
+    }
+      
+    // case 3: norm file for each field
+    prefix = ".f";
+    return IndexFileNames.fileNameFromGeneration(name, prefix + number, 0);
   }
 
   /**
@@ -310,11 +326,6 @@
   /**
    * Returns true if this segment is stored as a compound
    * file; else, false.
-   *
-   * @param directory directory to check.  This parameter is
-   * only used when the segment was written before version
-   * 2.1 (at which point compound file or not became stored
-   * in the segments info file).
    */
   boolean getUseCompoundFile() throws IOException {
     if (isCompoundFile == -1) {
@@ -323,6 +334,32 @@
       return true;
     } else {
       return dir.fileExists(name + ".cfs");
+    }
+  }
+  
+  /**
+   * Returns true iff this segment stores field norms in a single .nrm file.
+   */
+  private boolean withNrm () throws IOException {
+    if (withNrm == -1) {
+      return false;
+    } 
+    if (withNrm == 1) {
+      return true;
+    }
+    Directory d = dir;
+    try {
+      if (getUseCompoundFile()) {
+        d = new CompoundFileReader(dir, name + ".cfs");
+      }
+      boolean res = d.fileExists(name + "." + IndexFileNames.NORMS_EXTENSION);
+      withNrm = (byte) (res ? 1 : -1); // avoid more file tests like this 
+      return res;
+    } finally {
+      if (d!=dir && d!=null) {
+        d.close();
+      }
+      
     }
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Sat Jan  6 20:19:21
2007
@@ -40,6 +40,10 @@
  * @see #add
  */
 final class SegmentMerger {
+  
+  /** norms header placeholder */
+  static final byte[] NORMS_HEADER = new byte[]{'N','R','M',-1}; 
+  
   private Directory directory;
   private String segment;
   private int termIndexInterval = IndexWriter.DEFAULT_TERM_INDEX_INTERVAL;
@@ -116,7 +120,7 @@
             new CompoundFileWriter(directory, fileName);
 
     Vector files =
-      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + fieldInfos.size());    
+      new Vector(IndexFileNames.COMPOUND_EXTENSIONS.length + 1);    
     
     // Basic files
     for (int i = 0; i < IndexFileNames.COMPOUND_EXTENSIONS.length; i++) {
@@ -127,7 +131,8 @@
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);
       if (fi.isIndexed && !fi.omitNorms) {
-        files.add(segment + ".f" + i);
+        files.add(segment + "." + IndexFileNames.NORMS_EXTENSION);
+        break;
       }
     }
 
@@ -408,11 +413,15 @@
 
   private void mergeNorms() throws IOException {
     byte[] normBuffer = null;
-    for (int i = 0; i < fieldInfos.size(); i++) {
-      FieldInfo fi = fieldInfos.fieldInfo(i);
-      if (fi.isIndexed && !fi.omitNorms) {
-        IndexOutput output = directory.createOutput(segment + ".f" + i);
-        try {
+    IndexOutput output = null;
+    try {
+      for (int i = 0; i < fieldInfos.size(); i++) {
+        FieldInfo fi = fieldInfos.fieldInfo(i);
+        if (fi.isIndexed && !fi.omitNorms) {
+          if (output == null) { 
+            output = directory.createOutput(segment + "." + IndexFileNames.NORMS_EXTENSION);
+            output.writeBytes(NORMS_HEADER,NORMS_HEADER.length);
+          }
           for (int j = 0; j < readers.size(); j++) {
             IndexReader reader = (IndexReader) readers.elementAt(j);
             int maxDoc = reader.maxDoc();
@@ -434,9 +443,11 @@
               }
             }
           }
-        } finally {
-          output.close();
         }
+      }
+    } finally {
+      if (output != null) { 
+        output.close();
       }
     }
   }

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentReader.java Sat Jan  6 20:19:21
2007
@@ -58,23 +58,25 @@
   CompoundFileReader cfsReader = null;
 
   private class Norm {
-    public Norm(IndexInput in, int number)
+    public Norm(IndexInput in, int number, long normSeek)
     {
       this.in = in;
       this.number = number;
+      this.normSeek = normSeek;
     }
 
     private IndexInput in;
     private byte[] bytes;
     private boolean dirty;
     private int number;
+    private long normSeek;
     private boolean rollbackDirty;
 
     private void reWrite(SegmentInfo si) throws IOException {
       // NOTE: norms are re-written in regular directory, not cfs
 
       String oldFileName = si.getNormFileName(this.number);
-      if (oldFileName != null) {
+      if (oldFileName != null && !oldFileName.endsWith("." + IndexFileNames.NORMS_EXTENSION))
{
         // Mark this file for deletion.  Note that we don't
         // actually try to delete it until the new segments files is
         // successfully written:
@@ -215,7 +217,7 @@
       si.clearDelGen();
     }
     if (normsDirty) {               // re-write norms
-      si.setNumField(fieldInfos.size());
+      si.setNumFields(fieldInfos.size());
       Enumeration values = norms.elements();
       while (values.hasMoreElements()) {
         Norm norm = (Norm) values.nextElement();
@@ -301,10 +303,16 @@
       files.addElement(si.getDelFileName());
     }
 
+    boolean addedNrm = false;
     for (int i = 0; i < fieldInfos.size(); i++) {
       String name = si.getNormFileName(i);
-      if (name != null && directory().fileExists(name))
+      if (name != null && directory().fileExists(name)) {
+        if (name.endsWith("." + IndexFileNames.NORMS_EXTENSION)) {
+          if (addedNrm) continue; // add .nrm just once
+          addedNrm = true;
+        }
             files.addElement(name);
+      }
     }
     return files;
   }
@@ -462,7 +470,7 @@
 
     IndexInput normStream = (IndexInput) norm.in.clone();
     try {                                         // read from disk
-      normStream.seek(0);
+      normStream.seek(norm.normSeek);
       normStream.readBytes(bytes, offset, maxDoc());
     } finally {
       normStream.close();
@@ -471,6 +479,8 @@
 
 
   private void openNorms(Directory cfsDir) throws IOException {
+    long nextNormSeek = SegmentMerger.NORMS_HEADER.length; //skip header (header unused for
now)
+    int maxDoc = maxDoc();
     for (int i = 0; i < fieldInfos.size(); i++) {
       FieldInfo fi = fieldInfos.fieldInfo(i);
       if (fi.isIndexed && !fi.omitNorms) {
@@ -479,7 +489,9 @@
         if (!si.hasSeparateNorms(fi.number)) {
           d = cfsDir;
         }
-        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number));
+        long normSeek = (fileName.endsWith("." + IndexFileNames.NORMS_EXTENSION) ? nextNormSeek
: 0);
+        norms.put(fi.name, new Norm(d.openInput(fileName), fi.number, normSeek));
+        nextNormSeek += maxDoc; // increment also if some norms are separate
       }
     }
   }

Modified: lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml?view=diff&rev=493641&r1=493640&r2=493641
==============================================================================
--- lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml (original)
+++ lucene/java/trunk/src/site/src/documentation/content/xdocs/fileformats.xml Sat Jan  6
20:19:21 2007
@@ -1397,7 +1397,9 @@
                 </p>
             </section>
             <section id="Normalization Factors"><title>Normalization Factors</title>
-                <p>There's a norm file for each indexed field with a byte for
+				<p>
+                    <b>Pre-2.1:</b>
+                    There's a norm file for each indexed field with a byte for
                     each document. The .f[0-9]* file contains,
                     for each document, a byte that encodes a value that is multiplied
                     into the score for hits on that field:
@@ -1406,6 +1408,27 @@
                     (.f[0-9]*) --&gt; &lt;Byte&gt;
                     <sup>SegSize</sup>
                 </p>
+				<p>
+                    <b>2.1 and above:</b>
+                    There's a single .nrm file containing all norms:
+                </p>
+                <p>AllNorms
+                    (.nrm) --&gt; NormsHeader,&lt;Norms&gt;
+                    <sup>NumFieldsWithNorms</sup>
+                </p>
+                <p>Norms
+                    --&gt; &lt;Byte&gt;
+                    <sup>SegSize</sup>
+                </p>
+                <p>NormsHeader
+                    --&gt; 'N','R','M',Version
+                </p>
+                <p>Version
+                    --&gt; Byte
+                </p>
+                <p>NormsHeader 
+					has 4 bytes, last of which is the format version for this file, currently -1.
+                </p>
                 <p>Each
                     byte encodes a floating point value. Bits 0-2 contain the 3-bit
                     mantissa, and bits 3-7 contain the 5-bit exponent.
@@ -1441,6 +1464,18 @@
                         </p>
                     </li>
                 </ol>
+                <p>A separate norm file is created when the norm values of an existing
segment are modified. 
+					When field <em>N</em> is modified, a separate norm file <em>.sN</em>

+					is created, to maintain the norm values for that field.
+                </p>
+				<p>
+                    <b>Pre-2.1:</b>
+                    Separate norm files are created only for compound segments.
+                </p>
+				<p>
+                    <b>2.1 and above:</b>
+                    Separate norm files are created (when needed) for both compound and
non-compound segments.
+                </p>
 
             </section>
             <section id="Term Vectors"><title>Term Vectors</title>



Mime
View raw message