lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1228935 - in /lucene/dev/branches/branch_3x: ./ lucene/ lucene/CHANGES.txt lucene/src/java/org/apache/lucene/store/DataOutput.java lucene/src/java/org/apache/lucene/util/fst/FST.java
Date Sun, 08 Jan 2012 19:44:56 GMT
Author: mikemccand
Date: Sun Jan  8 19:44:56 2012
New Revision: 1228935

URL: http://svn.apache.org/viewvc?rev=1228935&view=rev
Log:
LUCENE-3681: use 2 bytes (unsigned short) to save label for FST.INPUT_TYPE.BYTE2 case

Modified:
    lucene/dev/branches/branch_3x/   (props changed)
    lucene/dev/branches/branch_3x/lucene/   (props changed)
    lucene/dev/branches/branch_3x/lucene/CHANGES.txt
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/store/DataOutput.java
    lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/FST.java

Modified: lucene/dev/branches/branch_3x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/CHANGES.txt?rev=1228935&r1=1228934&r2=1228935&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_3x/lucene/CHANGES.txt Sun Jan  8 19:44:56 2012
@@ -37,6 +37,11 @@ Changes in backwards compatibility polic
   contrib/queryparser. If you have used those classes in your code
   just add the lucene-queryparser.jar file to your classpath.
   (Uwe Schindler)
+
+* LUCENE-3681: FST now stores labels for BYTE2 input type as 2 bytes
+  instead of vInt; this can make FSTs smaller and faster, but it is a
+  break in the binary format so if you had built and saved any FSTs
+  then you need to rebuild them. (Robert Muir, Mike McCandless)
   
 Security fixes
 

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/store/DataOutput.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/store/DataOutput.java?rev=1228935&r1=1228934&r2=1228935&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/store/DataOutput.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/store/DataOutput.java Sun Jan  8 19:44:56 2012
@@ -61,6 +61,14 @@ public abstract class DataOutput {
     writeByte((byte) i);
   }
 
+  /** Writes a short as two bytes.
+   * @see DataInput#readShort()
+   */
+  public void writeShort(short i) throws IOException {
+    writeByte((byte)(i >>  8));
+    writeByte((byte) i);
+  }
+
   /** Writes an int in a variable-length format.  Writes between one and
    * five bytes.  Smaller values take fewer bytes.  Negative numbers are not
    * supported.

Modified: lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/FST.java?rev=1228935&r1=1228934&r2=1228935&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/FST.java (original)
+++ lucene/dev/branches/branch_3x/lucene/src/java/org/apache/lucene/util/fst/FST.java Sun Jan  8 19:44:56 2012
@@ -82,7 +82,10 @@ public class FST<T> {
   /** Changed numBytesPerArc for array'd case from byte to int. */
   private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
 
-  private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
+  /** Write BYTE2 labels as 2-byte short, not vInt. */
+  private final static int VERSION_SHORT_BYTE2_LABELS = 2;
+
+  private final static int VERSION_CURRENT = VERSION_SHORT_BYTE2_LABELS;
 
   // Never serialized; just used to represent the virtual
   // final node w/ no arcs:
@@ -189,7 +192,9 @@ public class FST<T> {
   public FST(DataInput in, Outputs<T> outputs) throws IOException {
     this.outputs = outputs;
     writer = null;
-    CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
+    // NOTE: only reads most recent format; we don't have
+    // back-compat promise for FSTs (they are experimental):
+    CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_SHORT_BYTE2_LABELS, VERSION_SHORT_BYTE2_LABELS);
     if (in.readByte() == 1) {
       // accepts empty string
       int numBytes = in.readVInt();
@@ -342,7 +347,7 @@ public class FST<T> {
       writer.writeByte((byte) v);
     } else if (inputType == INPUT_TYPE.BYTE2) {
       assert v <= 65535: "v=" + v;
-      writer.writeVInt(v);
+      writer.writeShort((short) v);
     } else {
       //writeInt(v);
       writer.writeVInt(v);
@@ -352,7 +357,11 @@ public class FST<T> {
   int readLabel(DataInput in) throws IOException {
     final int v;
     if (inputType == INPUT_TYPE.BYTE1) {
+      // Unsigned byte:
       v = in.readByte()&0xFF;
+    } else if (inputType == INPUT_TYPE.BYTE2) {
+      // Unsigned short:
+      v = in.readShort()&0xFFFF;
     } else { 
       v = in.readVInt();
     }



Mime
View raw message