lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1489064 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/CHANGES.txt lucene/core/ lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
Date Mon, 03 Jun 2013 17:09:21 GMT
Author: mikemccand
Date: Mon Jun  3 17:09:20 2013
New Revision: 1489064

URL: http://svn.apache.org/r1489064
Log:
LUCENE-5025: accept more than 2.1 billion tail nodes while building an FST

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/core/   (props changed)
    lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
    lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1489064&r1=1489063&r2=1489064&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Mon Jun  3 17:09:20 2013
@@ -148,6 +148,10 @@ New Features
 * LUCENE-5026: Added PagedGrowableWriter, a new internal packed-ints structure
   that grows the number of bits per value on demand, can store more than 2B
   values and supports random write and read access. (Adrien Grand)
+
+* LUCENE-5025: FST's Builder can now handle more than 2.1 billion
+  "tail nodes" while building a minimal FST.  (Aaron Binns, Adrien
+  Grand, Mike McCandless)
   
 Build
 

Modified: lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java?rev=1489064&r1=1489063&r2=1489064&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/java/org/apache/lucene/util/fst/NodeHash.java
Mon Jun  3 17:09:20 2013
@@ -19,21 +19,21 @@ package org.apache.lucene.util.fst;
 
 import java.io.IOException;
 
-import org.apache.lucene.util.packed.GrowableWriter;
 import org.apache.lucene.util.packed.PackedInts;
+import org.apache.lucene.util.packed.PagedGrowableWriter;
 
 // Used to dedup states (lookup already-frozen states)
 final class NodeHash<T> {
 
-  private GrowableWriter table;
-  private int count;
-  private int mask;
+  private PagedGrowableWriter table;
+  private long count;
+  private long mask;
   private final FST<T> fst;
   private final FST.Arc<T> scratchArc = new FST.Arc<T>();
   private final FST.BytesReader in;
 
   public NodeHash(FST<T> fst, FST.BytesReader in) {
-    table = new GrowableWriter(8, 16, PackedInts.COMPACT);
+    table = new PagedGrowableWriter(16, 1<<30, 8, PackedInts.COMPACT);
     mask = 15;
     this.fst = fst;
     this.in = in;
@@ -69,10 +69,10 @@ final class NodeHash<T> {
 
   // hash code for an unfrozen node.  This must be identical
   // to the un-frozen case (below)!!
-  private int hash(Builder.UnCompiledNode<T> node) {
+  private long hash(Builder.UnCompiledNode<T> node) {
     final int PRIME = 31;
     //System.out.println("hash unfrozen");
-    int h = 0;
+    long h = 0;
     // TODO: maybe if number of arcs is high we can safely subsample?
     for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
       final Builder.Arc<T> arc = node.arcs[arcIdx];
@@ -87,14 +87,14 @@ final class NodeHash<T> {
       }
     }
     //System.out.println("  ret " + (h&Integer.MAX_VALUE));
-    return h & Integer.MAX_VALUE;
+    return h & Long.MAX_VALUE;
   }
 
   // hash code for a frozen node
-  private int hash(long node) throws IOException {
+  private long hash(long node) throws IOException {
     final int PRIME = 31;
     //System.out.println("hash frozen node=" + node);
-    int h = 0;
+    long h = 0;
     fst.readFirstRealTargetArc(node, scratchArc, in);
     while(true) {
       //System.out.println("  label=" + scratchArc.label + " target=" + scratchArc.target
+ " h=" + h + " output=" + fst.outputs.outputToString(scratchArc.output) + " next?=" + scratchArc.flag(4)
+ " final?=" + scratchArc.isFinal() + " pos=" + in.getPosition());
@@ -111,13 +111,13 @@ final class NodeHash<T> {
       fst.readNextRealArc(scratchArc, in);
     }
     //System.out.println("  ret " + (h&Integer.MAX_VALUE));
-    return h & Integer.MAX_VALUE;
+    return h & Long.MAX_VALUE;
   }
 
   public long add(Builder.UnCompiledNode<T> nodeIn) throws IOException {
-    // System.out.println("hash: add count=" + count + " vs " + table.size());
-    final int h = hash(nodeIn);
-    int pos = h & mask;
+    //System.out.println("hash: add count=" + count + " vs " + table.size() + " mask=" +
mask);
+    final long h = hash(nodeIn);
+    long pos = h & mask;
     int c = 0;
     while(true) {
       final long v = table.get(pos);
@@ -128,7 +128,8 @@ final class NodeHash<T> {
         assert hash(node) == h : "frozenHash=" + hash(node) + " vs h=" + h;
         count++;
         table.set(pos, node);
-        if (table.size() < 2*count) {
+        // Rehash at 2/3 occupancy:
+        if (count > 2*table.size()/3) {
           rehash();
         }
         return node;
@@ -144,7 +145,7 @@ final class NodeHash<T> {
 
   // called only by rehash
   private void addNew(long address) throws IOException {
-    int pos = hash(address) & mask;
+    long pos = hash(address) & mask;
     int c = 0;
     while(true) {
       if (table.get(pos) == 0) {
@@ -158,23 +159,15 @@ final class NodeHash<T> {
   }
 
   private void rehash() throws IOException {
-    final GrowableWriter oldTable = table;
+    final PagedGrowableWriter oldTable = table;
 
-    if (oldTable.size() >= Integer.MAX_VALUE/2) {
-      throw new IllegalStateException("FST too large (> 2.1 GB)");
-    }
-
-    table = new GrowableWriter(oldTable.getBitsPerValue(), 2*oldTable.size(), PackedInts.COMPACT);
+    table = new PagedGrowableWriter(2*oldTable.size(), 1<<30, PackedInts.bitsRequired(count),
PackedInts.COMPACT);
     mask = table.size()-1;
-    for(int idx=0;idx<oldTable.size();idx++) {
+    for(long idx=0;idx<oldTable.size();idx++) {
       final long address = oldTable.get(idx);
       if (address != 0) {
         addNew(address);
       }
     }
   }
-
-  public int count() {
-    return count;
-  }
 }

Modified: lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java?rev=1489064&r1=1489063&r2=1489064&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/core/src/test/org/apache/lucene/util/fst/Test2BFST.java
Mon Jun  3 17:09:20 2013
@@ -34,7 +34,7 @@ import org.apache.lucene.util.packed.Pac
 import org.junit.Ignore;
 import com.carrotsearch.randomizedtesting.annotations.TimeoutSuite;
 
-@Ignore("Requires tons of heap to run (10G works)")
+@Ignore("Requires tons of heap to run (420G works)")
 @TimeoutSuite(millis = 100 * TimeUnits.HOUR)
 public class Test2BFST extends LuceneTestCase {
 
@@ -50,12 +50,12 @@ public class Test2BFST extends LuceneTes
     for(int doPackIter=0;doPackIter<2;doPackIter++) {
       boolean doPack = doPackIter == 1;
 
-      // Build FST w/ NoOutputs and stop when nodeCount > 3B
+      // Build FST w/ NoOutputs and stop when nodeCount > 2.2B
       if (!doPack) {
         System.out.println("\nTEST: 3B nodes; doPack=false output=NO_OUTPUTS");
         Outputs<Object> outputs = NoOutputs.getSingleton();
         Object NO_OUTPUT = outputs.getNoOutput();
-        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0,
0, false, false, Integer.MAX_VALUE, outputs,
+        final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, 0,
0, true, true, Integer.MAX_VALUE, outputs,
                                                       null, doPack, PackedInts.COMPACT, true,
15);
 
         int count = 0;
@@ -72,7 +72,7 @@ public class Test2BFST extends LuceneTes
           if (count % 100000 == 0) {
             System.out.println(count + ": " + b.fstSizeInBytes() + " bytes; " + b.getTotStateCount()
+ " nodes");
           }
-          if (b.getTotStateCount() > LIMIT) {
+          if (b.getTotStateCount() > Integer.MAX_VALUE + 100L * 1024 * 1024) {
             break;
           }
           nextInput(r, ints2);



Mime
View raw message