lucene-java-commits mailing list archives

From o...@apache.org
Subject svn commit: r431148 - in /lucene/java/trunk: ./ src/java/org/apache/lucene/document/ src/java/org/apache/lucene/index/
Date Sun, 13 Aug 2006 06:12:07 GMT
Author: otis
Date: Sat Aug 12 23:12:07 2006
New Revision: 431148

URL: http://svn.apache.org/viewvc?rev=431148&view=rev
Log:
- LUCENE-629: indexing and optimizing performance improvements when working with compressed fields

Modified:
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/src/java/org/apache/lucene/document/FieldSelectorResult.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
    lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
    lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=431148&r1=431147&r2=431148&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Sat Aug 12 23:12:07 2006
@@ -108,6 +108,10 @@
      internal "files", allowing them to be GCed even if references to the
      RAMDirectory itself still exist. (Nadav Har'El via Chris Hostetter)
 
+  3. LUCENE-629: Compressed fields are no longer uncompressed and recompressed
+     during segment merges (e.g. during indexing or optimizing), thus improving
+     performance. (Michael Busch via Otis Gospodnetic)
+
 Release 2.0.0 2006-05-26
 
 API Changes

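For context only, and not part of this patch: a compressed stored field in this era of the API is created with Field.Store.COMPRESS, and it is merges of segments containing such fields (during indexing or optimize()) that LUCENE-629 speeds up. A minimal sketch, assuming a RAMDirectory and an arbitrary "body" field:

    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.Field;
    import org.apache.lucene.index.IndexWriter;
    import org.apache.lucene.store.RAMDirectory;

    public class CompressedFieldIndexing {
      public static void main(String[] args) throws Exception {
        RAMDirectory dir = new RAMDirectory();
        IndexWriter writer = new IndexWriter(dir, new StandardAnalyzer(), true);

        Document doc = new Document();
        // Field.Store.COMPRESS stores the field's data compressed in the index;
        // merging segments that contain such fields is what this change speeds up.
        doc.add(new Field("body", "some long text worth compressing ...",
                          Field.Store.COMPRESS, Field.Index.TOKENIZED));
        writer.addDocument(doc);

        // optimize() forces segment merges; with this patch the compressed bytes
        // are copied as-is instead of being inflated and deflated again.
        writer.optimize();
        writer.close();
      }
    }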
Modified: lucene/java/trunk/src/java/org/apache/lucene/document/FieldSelectorResult.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/document/FieldSelectorResult.java?rev=431148&r1=431147&r2=431148&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/document/FieldSelectorResult.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/document/FieldSelectorResult.java Sat Aug 12 23:12:07 2006
@@ -26,11 +26,11 @@
   public static final FieldSelectorResult LAZY_LOAD = new FieldSelectorResult(1);
   public static final FieldSelectorResult NO_LOAD = new FieldSelectorResult(2);
   public static final FieldSelectorResult LOAD_AND_BREAK = new FieldSelectorResult(3);
-  
+  public static final FieldSelectorResult LOAD_FOR_MERGE = new FieldSelectorResult(4);
+
   private int id;
 
-  private FieldSelectorResult(int id)
-  {
+  private FieldSelectorResult(int id) {
     this.id = id;
   }
 

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java?rev=431148&r1=431147&r2=431148&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsReader.java Sat Aug 12 23:12:07 2006
@@ -16,16 +16,21 @@
  * limitations under the License.
  */
 
-import org.apache.lucene.document.*;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.IndexInput;
-
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.Reader;
 import java.util.zip.DataFormatException;
 import java.util.zip.Inflater;
 
+import org.apache.lucene.document.AbstractField;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
+import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IndexInput;
+
 /**
  * Class responsible for access to stored document fields.
  * <p/>
@@ -89,6 +94,9 @@
       if (acceptField.equals(FieldSelectorResult.LOAD) == true) {
         addField(doc, fi, binary, compressed, tokenize);
       }
+      else if (acceptField.equals(FieldSelectorResult.LOAD_FOR_MERGE) == true) {
+        addFieldForMerge(doc, fi, binary, compressed, tokenize);
+      }
       else if (acceptField.equals(FieldSelectorResult.LOAD_AND_BREAK) == true){
         addField(doc, fi, binary, compressed, tokenize);
         break;//Get out of this loop
@@ -161,6 +169,22 @@
 
   }
 
+  // in merge mode we don't uncompress the data of a compressed field
+  private void addFieldForMerge(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
+    Object data;
+      
+    if (binary || compressed) {
+      int toRead = fieldsStream.readVInt();
+      final byte[] b = new byte[toRead];
+      fieldsStream.readBytes(b, 0, b.length);
+      data = b;
+    } else {
+      data = fieldsStream.readString();
+    }
+      
+    doc.add(new FieldForMerge(data, fi, binary, compressed, tokenize));
+  }
+  
   private void addField(Document doc, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) throws IOException {
 
     //we have a binary stored field, and it may be compressed
@@ -369,5 +393,38 @@
     
     // Get the decompressed data
     return bos.toByteArray();
+  }
+  
+  // Instances of this class hold field properties and data
+  // for merge
+  final static class FieldForMerge extends AbstractField {
+    public String stringValue() {
+      return (String) this.fieldsData;
+    }
+
+    public Reader readerValue() {
+      // not needed for merge
+      return null;
+    }
+
+    public byte[] binaryValue() {
+      return (byte[]) this.fieldsData;
+    }
+    
+    public FieldForMerge(Object value, FieldInfo fi, boolean binary, boolean compressed, boolean tokenize) {
+      this.isStored = true;  
+      this.fieldsData = value;
+      this.isCompressed = compressed;
+      this.isBinary = binary;
+      this.isTokenized = tokenize;
+
+      this.name = fi.name.intern();
+      this.isIndexed = fi.isIndexed;
+      this.omitNorms = fi.omitNorms;          
+      this.storeOffsetWithTermVector = fi.storeOffsetWithTermVector;
+      this.storePositionWithTermVector = fi.storePositionWithTermVector;
+      this.storeTermVector = fi.storeTermVector;            
+    }
+     
   }
 }

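For illustration only (not part of the patch): the same kind of selector that SegmentMerger installs below can be handed to IndexReader.document(int, FieldSelector); every stored field then comes back as a FieldsReader.FieldForMerge, and for a compressed field binaryValue() holds the still-compressed bytes. A hypothetical sketch (the reader and index contents are assumed; application code would normally let FieldsReader decompress):

    import java.util.Enumeration;

    import org.apache.lucene.document.Document;
    import org.apache.lucene.document.FieldSelector;
    import org.apache.lucene.document.FieldSelectorResult;
    import org.apache.lucene.document.Fieldable;
    import org.apache.lucene.index.IndexReader;

    public class MergeModeReadSketch {
      // 'reader' is assumed to be open over an index containing compressed stored fields
      static void dumpRawStoredFields(IndexReader reader) throws Exception {
        FieldSelector mergeSelector = new FieldSelector() {
          public FieldSelectorResult accept(String fieldName) {
            return FieldSelectorResult.LOAD_FOR_MERGE;   // never inflate stored data
          }
        };

        Document raw = reader.document(0, mergeSelector);
        Enumeration fields = raw.fields();
        while (fields.hasMoreElements()) {
          Fieldable f = (Fieldable) fields.nextElement();
          if (f.isBinary() || f.isCompressed()) {
            // for a compressed field this is the byte[] exactly as stored on disk
            byte[] stillCompressed = f.binaryValue();
            System.out.println(f.name() + ": " + stillCompressed.length + " stored bytes");
          }
        }
      }
    }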
Modified: lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java?rev=431148&r1=431147&r2=431148&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/FieldsWriter.java Sat Aug 12 23:12:07 2006
@@ -23,6 +23,7 @@
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field;
+import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 
@@ -55,7 +56,7 @@
         int storedCount = 0;
         Enumeration fields = doc.fields();
         while (fields.hasMoreElements()) {
-            Field field = (Field) fields.nextElement();
+            Fieldable field = (Fieldable) fields.nextElement();
             if (field.isStored())
                 storedCount++;
         }
@@ -63,7 +64,11 @@
 
         fields = doc.fields();
         while (fields.hasMoreElements()) {
-            Field field = (Field) fields.nextElement();
+            Fieldable field = (Fieldable) fields.nextElement();
+            // if the field is an instance of FieldsReader.FieldForMerge, we're in merge mode
+            // and field.binaryValue() already returns the compressed value for a field
+            // with isCompressed()==true, so we disable compression in that case
+            boolean disableCompression = (field instanceof FieldsReader.FieldForMerge);
             if (field.isStored()) {
                 fieldsStream.writeVInt(fieldInfos.fieldNumber(field.name()));
 
@@ -80,12 +85,19 @@
                 if (field.isCompressed()) {
                   // compression is enabled for the current field
                   byte[] data = null;
-                  // check if it is a binary field
-                  if (field.isBinary()) {
-                    data = compress(field.binaryValue());
-                  }
-                  else {
-                    data = compress(field.stringValue().getBytes("UTF-8"));
+                  
+                  if (disableCompression) {
+                      // optimized case for merging, the data
+                      // is already compressed
+                      data = field.binaryValue();
+                  } else {
+                      // check if it is a binary field
+                      if (field.isBinary()) {
+                        data = compress(field.binaryValue());
+                      }
+                      else {
+                        data = compress(field.stringValue().getBytes("UTF-8"));
+                      }
                   }
                   final int len = data.length;
                   fieldsStream.writeVInt(len);

Modified: lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java?rev=431148&r1=431147&r2=431148&view=diff
==============================================================================
--- lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java (original)
+++ lucene/java/trunk/src/java/org/apache/lucene/index/SegmentMerger.java Sat Aug 12 23:12:07 2006
@@ -21,6 +21,8 @@
 import java.util.Collection;
 import java.io.IOException;
 
+import org.apache.lucene.document.FieldSelector;
+import org.apache.lucene.document.FieldSelectorResult;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.RAMOutputStream;
@@ -177,13 +179,22 @@
 
     FieldsWriter fieldsWriter = // merge field values
             new FieldsWriter(directory, segment, fieldInfos);
+    
+    // for merging we don't want to compress/uncompress the data, so to tell the FieldsReader that we're
+    // in merge mode, we use this FieldSelector
+    FieldSelector fieldSelectorMerge = new FieldSelector() {
+      public FieldSelectorResult accept(String fieldName) {
+        return FieldSelectorResult.LOAD_FOR_MERGE;
+      }        
+    };
+    
     try {
       for (int i = 0; i < readers.size(); i++) {
         IndexReader reader = (IndexReader) readers.elementAt(i);
         int maxDoc = reader.maxDoc();
         for (int j = 0; j < maxDoc; j++)
           if (!reader.isDeleted(j)) {               // skip deleted docs
-            fieldsWriter.addDocument(reader.document(j));
+            fieldsWriter.addDocument(reader.document(j, fieldSelectorMerge));
             docCount++;
           }
       }

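An end-to-end sanity check, again illustrative and not part of the patch: after the optimize()-triggered merge, a normal document load still goes through the regular (non-merge) path, so compressed fields are decompressed transparently even though the merge copied their bytes verbatim. A sketch assuming the directory built in the earlier snippet:

    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.store.Directory;

    public class VerifyAfterMerge {
      // 'dir' is assumed to hold the optimized index with the compressed "body" field
      static void verify(Directory dir) throws Exception {
        IndexReader reader = IndexReader.open(dir);
        // a plain document(n) load does not use LOAD_FOR_MERGE, so FieldsReader
        // inflates the compressed field and returns the original text
        Document doc = reader.document(0);
        System.out.println(doc.get("body"));
        reader.close();
      }
    }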

