lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1441214 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/core/ lucene/core/src/java/org/apache/lucene/util/fst/ lucene/core/src/test/org/apache/lucene/util/fst/
Date Thu, 31 Jan 2013 21:53:11 GMT
Author: mikemccand
Date: Thu Jan 31 21:53:11 2013
New Revision: 1441214

URL: http://svn.apache.org/viewvc?rev=1441214&view=rev
Log:
LUCENE-4739: fix FST.save/load to work with > 1.1 GB FSTs

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1441214&r1=1441213&r2=1441214&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Jan 31 21:53:11 2013
@@ -108,6 +108,9 @@ Bug Fixes
 * LUCENE-4732: Fixed TermsEnum.seekCeil/seekExact on term vectors.
   (Adrien Grand, Robert Muir)
 
+* LUCENE-4739: Fixed bugs that prevented FSTs more than ~1.1GB from
+  being saved and loaded (Adrien Grand, Mike McCandless)
+
 ======================= Lucene 4.1.0 =======================
 
 Changes in backwards compatibility policy

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java?rev=1441214&r1=1441213&r2=1441214&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/BytesStore.java
Thu Jan 31 21:53:11 2013
@@ -46,7 +46,7 @@ class BytesStore extends DataOutput {
   }
 
   /** Pulls bytes from the provided IndexInput.  */
-  public BytesStore(DataInput in, int numBytes, int maxBlockSize) throws IOException {
+  public BytesStore(DataInput in, long numBytes, int maxBlockSize) throws IOException {
     int blockSize = 2;
     int blockBits = 1;
     while(blockSize < numBytes && blockSize < maxBlockSize) {
@@ -56,9 +56,9 @@ class BytesStore extends DataOutput {
     this.blockBits = blockBits;
     this.blockSize = blockSize;
     this.blockMask = blockSize-1;
-    int left = numBytes;
+    long left = numBytes;
     while(left > 0) {
-      final int chunk = Math.min(blockSize, left);
+      final int chunk = (int) Math.min(blockSize, left);
       byte[] block = new byte[chunk];
       in.readBytes(block, 0, block.length);
       blocks.add(block);

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java?rev=1441214&r1=1441213&r2=1441214&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/FST.java
Thu Jan 31 21:53:11 2013
@@ -27,11 +27,6 @@ import java.io.InputStream;
 import java.io.OutputStream;
 import java.util.HashMap;
 import java.util.Map;
-/*
-import java.io.Writer;
-import java.io.OutputStreamWriter;
-import java.io.FileOutputStream;
-*/
 
 import org.apache.lucene.codecs.CodecUtil;
 import org.apache.lucene.store.ByteArrayDataOutput;
@@ -41,12 +36,15 @@ import org.apache.lucene.store.InputStre
 import org.apache.lucene.store.OutputStreamDataOutput;
 import org.apache.lucene.store.RAMOutputStream;
 import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.Constants;
 import org.apache.lucene.util.IOUtils;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.PriorityQueue;
 import org.apache.lucene.util.fst.Builder.UnCompiledNode;
 import org.apache.lucene.util.packed.GrowableWriter;
 import org.apache.lucene.util.packed.PackedInts;
+//import java.io.Writer;
+//import java.io.OutputStreamWriter;
 
 // TODO: break this into WritableFST and ReadOnlyFST.. then
 // we can have subclasses of ReadOnlyFST to handle the
@@ -276,7 +274,6 @@ public final class FST<T> {
     this.outputs = outputs;
     this.allowArrayArcs = allowArrayArcs;
     version = VERSION_CURRENT;
-    // 32 KB blocks:
     bytes = new BytesStore(bytesPageBits);
     // pad: ensure no node gets address 0 which is reserved to mean
     // the stop state w/ no arcs
@@ -295,9 +292,22 @@ public final class FST<T> {
     nodeRefToAddress = null;
   }
 
+  public static final int DEFAULT_MAX_BLOCK_BITS = Constants.JRE_IS_64BIT ? 30 : 28;
+
   /** Load a previously saved FST. */
   public FST(DataInput in, Outputs<T> outputs) throws IOException {
+    this(in, outputs, DEFAULT_MAX_BLOCK_BITS);
+  }
+
+  /** Load a previously saved FST; maxBlockBits allows you to
+   *  control the size of the byte[] pages used to hold the FST bytes. */
+  public FST(DataInput in, Outputs<T> outputs, int maxBlockBits) throws IOException
{
     this.outputs = outputs;
+
+    if (maxBlockBits < 1 || maxBlockBits > 30) {
+      throw new IllegalArgumentException("maxBlockBits should be 1 .. 30; got " + maxBlockBits);
+    }
+
     // NOTE: only reads most recent format; we don't have
     // back-compat promise for FSTs (they are experimental):
     version = CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_PACKED, VERSION_VINT_TARGET);
@@ -345,13 +355,13 @@ public final class FST<T> {
     } else {
       nodeRefToAddress = null;
     }
-    startNode = in.readVInt();
+    startNode = in.readVLong();
     nodeCount = in.readVLong();
     arcCount = in.readVLong();
     arcWithOutputCount = in.readVLong();
 
-    int numBytes = in.readVInt();
-    bytes = new BytesStore(in, numBytes, Integer.MAX_VALUE);
+    long numBytes = in.readVLong();
+    bytes = new BytesStore(in, numBytes, 1<<maxBlockBits);
     
     NO_OUTPUT = outputs.getNoOutput();
 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java?rev=1441214&r1=1441213&r2=1441214&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
Thu Jan 31 21:53:11 2013
@@ -20,10 +20,16 @@ package org.apache.lucene.util.fst;
 import java.util.Arrays;
 import java.util.Random;
 
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.IOContext;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.MMapDirectory;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.IntsRef;
 import org.apache.lucene.util.LuceneTestCase;
 import org.apache.lucene.util.TimeUnits;
+import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.packed.PackedInts;
 import org.junit.Ignore;
 import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
@@ -39,6 +45,8 @@ public class Test2BFST extends LuceneTes
     IntsRef input = new IntsRef(ints, 0, ints.length);
     long seed = random().nextLong();
 
+    Directory dir = new MMapDirectory(_TestUtil.getTempDir("2BFST"));
+
     for(int doPackIter=0;doPackIter<2;doPackIter++) {
       boolean doPack = doPackIter == 1;
 
@@ -72,42 +80,56 @@ public class Test2BFST extends LuceneTes
 
         FST<Object> fst = b.finish();
 
-        System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
+        for(int verify=0;verify<2;verify++) {
+          System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
 
-        Arrays.fill(ints2, 0);
-        r = new Random(seed);
+          Arrays.fill(ints2, 0);
+          r = new Random(seed);
 
-        for(int i=0;i<count;i++) {
-          if (i % 1000000 == 0) {
-            System.out.println(i + "...: ");
-          }
-          for(int j=10;j<ints2.length;j++) {
-            ints2[j] = r.nextInt(256);
+          for(int i=0;i<count;i++) {
+            if (i % 1000000 == 0) {
+              System.out.println(i + "...: ");
+            }
+            for(int j=10;j<ints2.length;j++) {
+              ints2[j] = r.nextInt(256);
+            }
+            assertEquals(NO_OUTPUT, Util.get(fst, input2));
+            nextInput(r, ints2);
+          }
+
+          System.out.println("\nTEST: enum all input/outputs");
+          IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
+
+          Arrays.fill(ints2, 0);
+          r = new Random(seed);
+          int upto = 0;
+          while(true) {
+            IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
+            if (pair == null) {
+              break;
+            }
+            for(int j=10;j<ints2.length;j++) {
+              ints2[j] = r.nextInt(256);
+            }
+            assertEquals(input2, pair.input);
+            assertEquals(NO_OUTPUT, pair.output);
+            upto++;
+            nextInput(r, ints2);
+          }
+          assertEquals(count, upto);
+
+          if (verify == 0) {
+            System.out.println("\nTEST: save/load FST and re-verify");
+            IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+            fst.save(out);
+            out.close();
+            IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+            fst = new FST<Object>(in, outputs);
+            in.close();
+          } else {
+            dir.deleteFile("fst");
           }
-          assertEquals(NO_OUTPUT, Util.get(fst, input2));
-          nextInput(r, ints2);
         }
-
-        System.out.println("\nTEST: enum all input/outputs");
-        IntsRefFSTEnum<Object> fstEnum = new IntsRefFSTEnum<Object>(fst);
-
-        Arrays.fill(ints2, 0);
-        r = new Random(seed);
-        int upto = 0;
-        while(true) {
-          IntsRefFSTEnum.InputOutput<Object> pair = fstEnum.next();
-          if (pair == null) {
-            break;
-          }
-          for(int j=10;j<ints2.length;j++) {
-            ints2[j] = r.nextInt(256);
-          }
-          assertEquals(input2, pair.input);
-          assertEquals(NO_OUTPUT, pair.output);
-          upto++;
-          nextInput(r, ints2);
-        }
-        assertEquals(count, upto);
       }
 
       // Build FST w/ ByteSequenceOutputs and stop when FST
@@ -138,39 +160,53 @@ public class Test2BFST extends LuceneTes
         }
 
         FST<BytesRef> fst = b.finish();
+        for(int verify=0;verify<2;verify++) {
 
-        System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
+          System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
 
-        r = new Random(seed);
-        Arrays.fill(ints, 0);
+          r = new Random(seed);
+          Arrays.fill(ints, 0);
 
-        for(int i=0;i<count;i++) {
-          if (i % 1000000 == 0) {
-            System.out.println(i + "...: ");
+          for(int i=0;i<count;i++) {
+            if (i % 1000000 == 0) {
+              System.out.println(i + "...: ");
+            }
+            r.nextBytes(outputBytes);
+            assertEquals(output, Util.get(fst, input));
+            nextInput(r, ints);
+          }
+
+          System.out.println("\nTEST: enum all input/outputs");
+          IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
+
+          Arrays.fill(ints, 0);
+          r = new Random(seed);
+          int upto = 0;
+          while(true) {
+            IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
+            if (pair == null) {
+              break;
+            }
+            assertEquals(input, pair.input);
+            r.nextBytes(outputBytes);
+            assertEquals(output, pair.output);
+            upto++;
+            nextInput(r, ints);
+          }
+          assertEquals(count, upto);
+
+          if (verify == 0) {
+            System.out.println("\nTEST: save/load FST and re-verify");
+            IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+            fst.save(out);
+            out.close();
+            IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+            fst = new FST<BytesRef>(in, outputs);
+            in.close();
+          } else {
+            dir.deleteFile("fst");
           }
-          r.nextBytes(outputBytes);
-          assertEquals(output, Util.get(fst, input));
-          nextInput(r, ints);
         }
-
-        System.out.println("\nTEST: enum all input/outputs");
-        IntsRefFSTEnum<BytesRef> fstEnum = new IntsRefFSTEnum<BytesRef>(fst);
-
-        Arrays.fill(ints, 0);
-        r = new Random(seed);
-        int upto = 0;
-        while(true) {
-          IntsRefFSTEnum.InputOutput<BytesRef> pair = fstEnum.next();
-          if (pair == null) {
-            break;
-          }
-          assertEquals(input, pair.input);
-          r.nextBytes(outputBytes);
-          assertEquals(output, pair.output);
-          upto++;
-          nextInput(r, ints);
-        }
-        assertEquals(count, upto);
       }
 
       // Build FST w/ PositiveIntOutputs and stop when FST
@@ -202,46 +238,62 @@ public class Test2BFST extends LuceneTes
 
         FST<Long> fst = b.finish();
 
-        System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
-
-        Arrays.fill(ints, 0);
-
-        output = 1;
-        r = new Random(seed);
-        for(int i=0;i<count;i++) {
-          if (i % 1000000 == 0) {
-            System.out.println(i + "...: ");
-          }
+        for(int verify=0;verify<2;verify++) {
 
-          // forward lookup:
-          assertEquals(output, Util.get(fst, input).longValue());
-          // reverse lookup:
-          assertEquals(input, Util.getByOutput(fst, output));
-          output += 1 + r.nextInt(10);
-          nextInput(r, ints);
-        }
+          System.out.println("\nTEST: now verify [fst size=" + fst.sizeInBytes() + "; nodeCount="
+ fst.getNodeCount() + "; arcCount=" + fst.getArcCount() + "]");
 
-        System.out.println("\nTEST: enum all input/outputs");
-        IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+          Arrays.fill(ints, 0);
 
-        Arrays.fill(ints, 0);
-        r = new Random(seed);
-        int upto = 0;
-        output = 1;
-        while(true) {
-          IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
-          if (pair == null) {
-            break;
+          output = 1;
+          r = new Random(seed);
+          for(int i=0;i<count;i++) {
+            if (i % 1000000 == 0) {
+              System.out.println(i + "...: ");
+            }
+
+            // forward lookup:
+            assertEquals(output, Util.get(fst, input).longValue());
+            // reverse lookup:
+            assertEquals(input, Util.getByOutput(fst, output));
+            output += 1 + r.nextInt(10);
+            nextInput(r, ints);
+          }
+
+          System.out.println("\nTEST: enum all input/outputs");
+          IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+
+          Arrays.fill(ints, 0);
+          r = new Random(seed);
+          int upto = 0;
+          output = 1;
+          while(true) {
+            IntsRefFSTEnum.InputOutput<Long> pair = fstEnum.next();
+            if (pair == null) {
+              break;
+            }
+            assertEquals(input, pair.input);
+            assertEquals(output, pair.output.longValue());
+            output += 1 + r.nextInt(10);
+            upto++;
+            nextInput(r, ints);
+          }
+          assertEquals(count, upto);
+
+          if (verify == 0) {
+            System.out.println("\nTEST: save/load FST and re-verify");
+            IndexOutput out = dir.createOutput("fst", IOContext.DEFAULT);
+            fst.save(out);
+            out.close();
+            IndexInput in = dir.openInput("fst", IOContext.DEFAULT);
+            fst = new FST<Long>(in, outputs);
+            in.close();
+          } else {
+            dir.deleteFile("fst");
           }
-          assertEquals(input, pair.input);
-          assertEquals(output, pair.output.longValue());
-          output += 1 + r.nextInt(10);
-          upto++;
-          nextInput(r, ints);
         }
-        assertEquals(count, upto);
       }
     }
+    dir.close();
   }
 
   private void nextInput(Random r, int[] ints) {



Mime
View raw message