lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From h..@apache.org
Subject svn commit: r1519242 - in /lucene/dev/branches/lucene3069/lucene: codecs/src/java/org/apache/lucene/codecs/temp/ core/src/java/org/apache/lucene/codecs/ core/src/java/org/apache/lucene/codecs/lucene41/
Date Sun, 01 Sep 2013 04:32:02 GMT
Author: han
Date: Sun Sep  1 04:32:01 2013
New Revision: 1519242

URL: http://svn.apache.org/r1519242
Log:
cleanup codes, downgrade nocommits

Modified:
    lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsReader.java
    lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsWriter.java
    lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
    lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
    lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java

Modified: lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsReader.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsReader.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsReader.java
Sun Sep  1 04:32:01 2013
@@ -408,7 +408,7 @@ public class TempFSTOrdTermsReader exten
         return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
       }
 
-      // nocommit: this can be achieved by making use of Util.getByOutput()
+      // TODO: this can be achieved by making use of Util.getByOutput()
       //           and should have related tests
       @Override
       public void seekExact(long ord) throws IOException {
@@ -541,14 +541,6 @@ public class TempFSTOrdTermsReader exten
         this.fstReader = fst.getBytesReader();
         this.fstOutputs = index.outputs;
         this.fsa = compiled.runAutomaton;
-        /*
-        PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
-        Util.toDot(dict,pw1, false, false);
-        pw1.close();
-        PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
-        pw2.write(compiled.toDot());
-        pw2.close();
-        */
         this.level = -1;
         this.stack = new Frame[16];
         for (int i = 0 ; i < stack.length; i++) {
@@ -679,8 +671,6 @@ public class TempFSTOrdTermsReader exten
         return frame;
       }
 
-      // nocommit: expected to use readFirstTargetArc here?
-
       /** Load frame for target arc(node) on fst */
       Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
         if (!canGrow(top)) {
@@ -743,7 +733,6 @@ public class TempFSTOrdTermsReader exten
         return !frame.arc.isLast();
       }
 
-      // nocommit: need to load ord lazily?
       void pushFrame(Frame frame) {
         final FST.Arc<Long> arc = frame.arc;
         arc.output = fstOutputs.add(topFrame().arc.output, arc.output);

Modified: lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsWriter.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsWriter.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTOrdTermsWriter.java
Sun Sep  1 04:32:01 2013
@@ -55,13 +55,12 @@ public class TempFSTOrdTermsWriter exten
   public static final int TERMS_VERSION_START = 0;
   public static final int TERMS_VERSION_CURRENT = TERMS_VERSION_START;
   public static final int SKIP_INTERVAL = 8;
-  //static final boolean TEST = false;
   
   final PostingsWriterBase postingsWriter;
   final FieldInfos fieldInfos;
   final List<FieldMetaData> fields = new ArrayList<FieldMetaData>();
   IndexOutput blockOut = null;
-  IndexOutput indexOut = null;  // nocommit: hmm, do we really need two streams?
+  IndexOutput indexOut = null;
 
  public TempFSTOrdTermsWriter(SegmentWriteState state, PostingsWriterBase postingsWriter) throws IOException {
     final String termsIndexFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TERMS_INDEX_EXTENSION);
@@ -134,8 +133,6 @@ public class TempFSTOrdTermsWriter exten
     out.writeLong(dirStart);
   }
 
-  // nocommit: nuke this? we don't need to buffer so much data, 
-  // since close() can do this naturally
   private static class FieldMetaData {
     public FieldInfo fieldInfo;
     public long numTerms;
@@ -145,12 +142,16 @@ public class TempFSTOrdTermsWriter exten
     public int longsSize;
     public FST<Long> dict;
 
-    // nocommit: block encode each part 
-    // (so that we'll have metaLongsOut[])
-    public RAMOutputStream skipOut;       // vint encode next skip point (all values start from 0, fully decoded when reading)
-    public RAMOutputStream statsOut;      // vint encode df, (ttf-df)
-    public RAMOutputStream metaLongsOut;  // vint encode monotonic long[] and length for corresponding byte[]
-    public RAMOutputStream metaBytesOut;  // put all bytes blob here
+    // TODO: block encode each part 
+
+    // vint encode next skip point (fully decoded when reading)
+    public RAMOutputStream skipOut;
+    // vint encode df, (ttf-df)
+    public RAMOutputStream statsOut;
+    // vint encode monotonic long[] and length for corresponding byte[]
+    public RAMOutputStream metaLongsOut;
+    // generic byte[]
+    public RAMOutputStream metaBytesOut;
   }
 
   final class TermsWriter extends TermsConsumer {

Modified: lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempFSTTermsReader.java
Sun Sep  1 04:32:01 2013
@@ -63,7 +63,7 @@ public class TempFSTTermsReader extends 
   final TreeMap<String, TermsReader> fields = new TreeMap<String, TermsReader>();
   final PostingsReaderBase postingsReader;
   final IndexInput in;
-  //static boolean DEBUG = false;
+  //static boolean TEST = false;
 
  public TempFSTTermsReader(SegmentReadState state, PostingsReaderBase postingsReader) throws IOException {
     final String termsFileName = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, TempFSTTermsWriter.TERMS_EXTENSION);
@@ -283,12 +283,6 @@ public class TempFSTTermsReader extends 
         return postingsReader.docsAndPositions(fieldInfo, state, liveDocs, reuse, flags);
       }
 
-      // nocommit: do we need this? for SegmentTermsEnum, we can maintain
-      // a stack to record how current term is constructed on FST, (and ord on each alphabet)
-      // so that during seek we don't have to start from the first arc.
-      // however, we'll be implementing a new fstEnum instead of wrapping current one.
-      //
-      // nocommit: this can also be achieved by making use of Util.getByOutput()
       @Override
       public void seekExact(long ord) throws IOException {
         throw new UnsupportedOperationException();
@@ -428,19 +422,11 @@ public class TempFSTTermsReader extends 
 
       IntersectTermsEnum(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
         super();
-        //if (DEBUG) System.out.println("Enum init, startTerm=" + startTerm);
+        //if (TEST) System.out.println("Enum init, startTerm=" + startTerm);
         this.fst = dict;
         this.fstReader = fst.getBytesReader();
         this.fstOutputs = dict.outputs;
         this.fsa = compiled.runAutomaton;
-        /*
-        PrintWriter pw1 = new PrintWriter(new File("../temp/fst.txt"));
-        Util.toDot(dict,pw1, false, false);
-        pw1.close();
-        PrintWriter pw2 = new PrintWriter(new File("../temp/fsa.txt"));
-        pw2.write(compiled.toDot());
-        pw2.close();
-        */
         this.level = -1;
         this.stack = new Frame[16];
         for (int i = 0 ; i < stack.length; i++) {
@@ -511,7 +497,7 @@ public class TempFSTTermsReader extends 
 
       @Override
       public BytesRef next() throws IOException {
-        //if (DEBUG) System.out.println("Enum next()");
+        //if (TEST) System.out.println("Enum next()");
         if (pending) {
           pending = false;
           loadMetaData();
@@ -546,7 +532,7 @@ public class TempFSTTermsReader extends 
       }
 
       private BytesRef doSeekCeil(BytesRef target) throws IOException {
-        //if (DEBUG) System.out.println("Enum doSeekCeil()");
+        //if (TEST) System.out.println("Enum doSeekCeil()");
         Frame frame= null;
         int label, upto = 0, limit = target.length;
         while (upto < limit) {  // to target prefix, or ceil label (rewind prefix)
@@ -580,12 +566,6 @@ public class TempFSTTermsReader extends 
         return null;
       }
 
-      // nocommit: might be great if we can set flag BIT_LAST_ARC
-      // nocommit: actually we can use first arc as candidate...
-      // it always has NO_OUTPUT as output, and BIT_LAST_ARC set.
-      // but we'll have problem if later FST supports output sharing
-      // on first arc!
-
       /** Virtual frame, never pop */
       Frame loadVirtualFrame(Frame frame) throws IOException {
         frame.fstArc.output = fstOutputs.getNoOutput();
@@ -601,8 +581,6 @@ public class TempFSTTermsReader extends 
         return frame;
       }
 
-      // nocommit: expected to use readFirstTargetArc here?
-
       /** Load frame for target arc(node) on fst */
       Frame loadExpandFrame(Frame top, Frame frame) throws IOException {
         if (!canGrow(top)) {
@@ -610,18 +588,13 @@ public class TempFSTTermsReader extends 
         }
         frame.fstArc = fst.readFirstRealTargetArc(top.fstArc.target, frame.fstArc, fstReader);
         frame.fsaState = fsa.step(top.fsaState, frame.fstArc.label);
-        //if (DEBUG) System.out.println(" loadExpand frame="+frame);
+        //if (TEST) System.out.println(" loadExpand frame="+frame);
         if (frame.fsaState == -1) {
           return loadNextFrame(top, frame);
         }
         return frame;
       }
 
-      // nocommit: actually, here we're looking for a valid state for fsa, 
-      //           so if numArcs is large in fst, we should try a reverse lookup?
-      //           but we don have methods like advance(label) in fst, even 
-      //           binary search hurts. 
-      
       /** Load frame for sibling arc(node) on fst */
       Frame loadNextFrame(Frame top, Frame frame) throws IOException {
         if (!canRewind(frame)) {
@@ -634,7 +607,7 @@ public class TempFSTTermsReader extends 
             break;
           }
         }
-        //if (DEBUG) System.out.println(" loadNext frame="+frame);
+        //if (TEST) System.out.println(" loadNext frame="+frame);
         if (frame.fsaState == -1) {
           return null;
         }
@@ -650,7 +623,7 @@ public class TempFSTTermsReader extends 
           return null;
         }
         frame.fsaState = fsa.step(top.fsaState, arc.label);
-        //if (DEBUG) System.out.println(" loadCeil frame="+frame);
+        //if (TEST) System.out.println(" loadCeil frame="+frame);
         if (frame.fsaState == -1) {
           return loadNextFrame(top, frame);
         }
@@ -673,14 +646,14 @@ public class TempFSTTermsReader extends 
       void pushFrame(Frame frame) {
         term = grow(frame.fstArc.label);
         level++;
-        //if (DEBUG) System.out.println("  term=" + term + " level=" + level);
+        //if (TEST) System.out.println("  term=" + term + " level=" + level);
       }
 
       Frame popFrame() {
         term = shrink();
         level--;
         metaUpto = metaUpto > level ? level : metaUpto;
-        //if (DEBUG) System.out.println("  term=" + term + " level=" + level);
+        //if (TEST) System.out.println("  term=" + term + " level=" + level);
         return stack[level+1];
       }
 

Modified: lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/codecs/src/java/org/apache/lucene/codecs/temp/TempTermOutputs.java
Sun Sep  1 04:32:01 2013
@@ -22,8 +22,6 @@ import java.util.Arrays;
 
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfo.IndexOptions;
-import org.apache.lucene.store.IndexInput;
-import org.apache.lucene.store.IndexOutput;
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
 import org.apache.lucene.util.fst.Outputs;
@@ -128,16 +126,18 @@ public class TempTermOutputs extends Out
   @Override
   //
   // The return value will be the smaller one, when these two are 
-  // 'comparable', i.e. every value in long[] fits the same ordering.
+  // 'comparable', i.e. 
+  // 1. every value in t1 is not larger than in t2, or
+  // 2. every value in t1 is not smaller than t2.
   //
   // NOTE: 
   // Only long[] part is 'shared' and pushed towards root.
-  // byte[] and term stats will be on deeper arcs.
+  // byte[] and term stats will be kept on deeper arcs.
   //
   public TempMetaData common(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("common("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT || t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
+      //if (DEBUG) System.out.println("ret:"+NO_OUTPUT);
       return NO_OUTPUT;
     }
     assert t1.longs.length == t2.longs.length;
@@ -172,15 +172,15 @@ public class TempTermOutputs extends Out
         ret = new TempMetaData(min, null, 0, -1);
       }
     }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 
   @Override
   public TempMetaData subtract(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("subtract("+t1+", "+t2+") = ");
     if (t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t1);
+      //if (DEBUG) System.out.println("ret:"+t1);
       return t1;
     }
     assert t1.longs.length == t2.longs.length;
@@ -201,20 +201,21 @@ public class TempTermOutputs extends Out
     } else {
       ret = new TempMetaData(share, t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 
-  // nocommit: we might refactor out an 'addSelf' later, 
-  // which improves 5~7% for fuzzy queries
+  // TODO: if we refactor an 'addSelf(TempMetaData other)',
+  // we can gain about 5~7% for fuzzy queries, however on the other hand
+  // we seem to put much stress on FST Outputs decoding?
   @Override
   public TempMetaData add(TempMetaData t1, TempMetaData t2) {
-    if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
+    //if (DEBUG) System.out.print("add("+t1+", "+t2+") = ");
     if (t1 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t2);
+      //if (DEBUG) System.out.println("ret:"+t2);
       return t2;
     } else if (t2 == NO_OUTPUT) {
-      if (DEBUG) System.out.println("ret:"+t1);
+      //if (DEBUG) System.out.println("ret:"+t1);
       return t1;
     }
     assert t1.longs.length == t2.longs.length;
@@ -233,7 +234,7 @@ public class TempTermOutputs extends Out
     } else {
       ret = new TempMetaData(accum, t1.bytes, t1.docFreq, t1.totalTermFreq);
     }
-    if (DEBUG) System.out.println("ret:"+ret);
+    //if (DEBUG) System.out.println("ret:"+ret);
     return ret;
   }
 

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/PostingsWriterBase.java
Sun Sep  1 04:32:01 2013
@@ -55,7 +55,7 @@ public abstract class PostingsWriterBase
   public abstract BlockTermState newTermState() throws IOException;
 
   /** Start a new term.  Note that a matching call to {@link
-   *  #finishTerm(long[], DataOutput, TermStats)} is done, only if the term has at least one
+   *  #finishTerm(BlockTermState)} is done, only if the term has at least one
    *  document. */
   public abstract void startTerm() throws IOException;
 
@@ -67,12 +67,18 @@ public abstract class PostingsWriterBase
   /**
    * Encode metadata as long[] and byte[]. {@code absolute} controls 
    * whether current term is delta encoded according to latest term.
+   *
+   * NOTE: sometimes long[] might contain values that don't make sense,
+   * e.g. for Lucene41PostingsFormat, when singletonDocID != -1, docStartFP is not defined.
+   * Here postings side should always use the last docStartFP, to keep each element in 
+   * metadata long[] monotonic.
    */
   public abstract void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState state, boolean absolute) throws IOException;
 
   /** 
-   * Return the fixed length of longs,
+   * Return the fixed length of long[] metadata (which is fixed per field),
    * called when the writing switches to another field. */
+  // TODO: better name?
   public abstract int setField(FieldInfo fieldInfo);
 
   @Override

Modified: lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java?rev=1519242&r1=1519241&r2=1519242&view=diff
==============================================================================
--- lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
(original)
+++ lucene/dev/branches/lucene3069/lucene/core/src/java/org/apache/lucene/codecs/lucene41/Lucene41PostingsWriter.java
Sun Sep  1 04:32:01 2013
@@ -215,8 +215,6 @@ public final class Lucene41PostingsWrite
     termsOut.writeVInt(BLOCK_SIZE);
   }
 
-  // nocommit better name?
-
   @Override
   public int setField(FieldInfo fieldInfo) {
     IndexOptions indexOptions = fieldInfo.getIndexOptions();
@@ -540,16 +538,12 @@ public final class Lucene41PostingsWrite
     docCount = 0;
   }
   
-  // nocommit explain about the "don't care" values
-
   @Override
   public void encodeTerm(long[] longs, DataOutput out, FieldInfo fieldInfo, BlockTermState _state, boolean absolute) throws IOException {
     IntBlockTermState state = (IntBlockTermState)_state;
     if (absolute) {
       lastState = newTermState();
     }
-    //System.out.println("PW: state=" + state);
-    //System.out.println("     last=" + lastState);
     if (VERSION_CURRENT < VERSION_META_ARRAY) {  // impersonation
       _encodeTerm(out, fieldInfo, state);
       return;



Mime
View raw message