lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r1049200 - in /lucene/dev/branches/bulkpostings/lucene: contrib/instantiated/src/java/org/apache/lucene/store/instantiated/ src/java/org/apache/lucene/index/
Date Tue, 14 Dec 2010 18:08:56 GMT
Author: mikemccand
Date: Tue Dec 14 18:08:56 2010
New Revision: 1049200

URL: http://svn.apache.org/viewvc?rev=1049200&view=rev
Log:
add missing files

Added:
    lucene/dev/branches/bulkpostings/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java
  (with props)
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BulkPostingsEnum.java
  (with props)
    lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java
  (with props)

Added: lucene/dev/branches/bulkpostings/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java?rev=1049200&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java
(added)
+++ lucene/dev/branches/bulkpostings/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedBulkPostingsEnum.java
Tue Dec 14 18:08:56 2010
@@ -0,0 +1,268 @@
+package org.apache.lucene.store.instantiated;
+
+/**
+ * Copyright 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.index.BulkPostingsEnum;
+
+public class InstantiatedBulkPostingsEnum extends BulkPostingsEnum {
+
+  private final DocDeltasReader docDeltasReader;
+  private final FreqsReader freqsReader;
+  private final PositionDeltasReader positionDeltasReader;
+  private final String field;
+
+  private InstantiatedTerm term;
+
+  public InstantiatedBulkPostingsEnum(String field, boolean doFreq, boolean doPositions)
{
+    this.field = field;
+    docDeltasReader = new DocDeltasReader();
+    if (doFreq) {
+      freqsReader = new FreqsReader();
+    } else {
+      freqsReader = null;
+    }
+
+    if (doPositions) {
+      positionDeltasReader = new PositionDeltasReader();
+    } else {
+      positionDeltasReader = null;
+    }
+  }
+
+  public boolean canReuse(String field, boolean doFreq, boolean doPositions) {
+    return field.equals(this.field) && (doFreq == (freqsReader != null)) &&
(doPositions == (positionDeltasReader != null));
+  }
+
+  private class DocDeltasReader extends BlockReader {
+    private final int[] buffer = new int[64];
+    private InstantiatedTermDocumentInformation[] docs;
+    private int docUpto;
+    private int lastDocID;
+    private int limit;
+
+    public void reset(InstantiatedTerm term) {
+      docUpto = 0;
+      lastDocID = 0;
+      docs = term.getAssociatedDocuments();
+      fill();
+    }
+
+    public void jump(int docUpto, int lastDocID) {
+      this.lastDocID = lastDocID;
+      this.docUpto = docUpto;
+      this.limit = 0;
+    }
+
+    @Override
+    public int[] getBuffer() {
+      return buffer;
+    }
+
+    @Override
+    public int offset() {
+      return 0;
+    }
+
+    @Override
+    public void setOffset(int v) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int end() {
+      return limit;
+    }
+
+    @Override
+    public int fill() {
+      final int chunk = Math.min(buffer.length, docs.length-docUpto);
+      for(int i=0;i<chunk;i++) {
+        final int docID = docs[docUpto++].getDocument().getDocumentNumber();
+        buffer[i] = docID - lastDocID;
+        lastDocID = docID;
+      }
+      docUpto += chunk;
+      return limit = chunk;
+    }
+  }
+
+  private class FreqsReader extends BlockReader {
+    private final int[] buffer = new int[64];
+    private InstantiatedTermDocumentInformation[] docs;
+    private int docUpto;
+    private int limit;
+
+    public void reset(InstantiatedTerm term) {
+      docUpto = 0;
+      docs = term.getAssociatedDocuments();
+      fill();
+    }
+
+    public void jump(int docUpto, int lastDocID) {
+      this.docUpto = docUpto;
+      this.limit = 0;
+    }
+
+    @Override
+    public int[] getBuffer() {
+      return buffer;
+    }
+
+    @Override
+    public int offset() {
+      return 0;
+    }
+
+    @Override
+    public void setOffset(int v) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int end() {
+      return limit;
+    }
+
+    @Override
+    public int fill() {
+      final int chunk = Math.min(buffer.length, docs.length-docUpto);
+      for(int i=0;i<chunk;i++) {
+        buffer[i] =  docs[docUpto++].getTermPositions().length;
+      }
+      docUpto += chunk;
+      return limit = chunk;
+    }
+  }
+
+  private class PositionDeltasReader extends BlockReader {
+    private final int[] buffer = new int[64];
+    private InstantiatedTermDocumentInformation[] docs;
+    private int docUpto;
+    private int posUpto;
+    private int limit;
+
+    public void reset(InstantiatedTerm term) {
+      docUpto = posUpto = 0;
+      docs = term.getAssociatedDocuments();
+      fill();
+    }
+
+    public void jump(int docUpto, int lastDocID) {
+      this.docUpto = docUpto;
+      posUpto = 0;
+      this.limit = 0;
+    }
+
+    @Override
+    public int[] getBuffer() {
+      return buffer;
+    }
+
+    @Override
+    public int offset() {
+      return 0;
+    }
+
+    @Override
+    public void setOffset(int v) {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int end() {
+      return limit;
+    }
+
+    @Override
+    public int fill() {
+      int upto = 0;
+      while(docUpto < docs.length) {
+        final InstantiatedTermDocumentInformation doc = docs[docUpto];
+        final int[] positions = doc.getTermPositions();
+        final int chunk = Math.min(buffer.length - upto, positions.length - posUpto);
+        System.arraycopy(positions, posUpto, buffer, upto, chunk);
+        
+        upto += chunk;
+        posUpto += chunk;
+        if (posUpto == positions.length) {
+          docUpto++;
+          posUpto = 0;
+        }
+        if (upto == buffer.length) {
+          break;
+        }
+      }
+      return limit = upto;
+    }
+  }
+
+  public InstantiatedBulkPostingsEnum reset(InstantiatedTerm term) {
+    this.term = term;
+
+    docDeltasReader.reset(term);
+    
+    if (freqsReader != null) {
+      freqsReader.reset(term);
+    }
+    if (positionDeltasReader != null) {
+      positionDeltasReader.reset(term);
+    }
+    return this;
+  }
+
+  @Override
+  public BlockReader getDocDeltasReader() {
+    return docDeltasReader;
+  }
+
+  @Override
+  public BlockReader getPositionDeltasReader() {
+    return positionDeltasReader;
+  }
+
+  @Override
+  public BlockReader getFreqsReader() {
+    return freqsReader;
+  }
+
+  private final JumpResult jumpResult = new JumpResult();
+
+  @Override
+  public JumpResult jump(int target, int curCount) {
+    int docUpto = term.seekCeilingDocumentInformationIndex(target, 0);
+    if (docUpto == -1) {
+      // TODO: the bulk API currently can't express this
+      // ("jumped beyond last doc")... because the skip data
+      // for the core codecs doesn't "know" the last doc
+      return null;
+    }
+
+    final int lastDocID = docUpto == 0 ? 0 : term.getAssociatedDocuments()[docUpto-1].getDocument().getDocumentNumber();
+    docDeltasReader.jump(lastDocID, docUpto);
+    if (freqsReader != null) {
+      freqsReader.jump(lastDocID, docUpto);
+    }
+    if (positionDeltasReader != null) {
+      positionDeltasReader.jump(lastDocID, docUpto);
+    }
+
+    jumpResult.docID = term.getAssociatedDocuments()[docUpto].getDocument().getDocumentNumber();
+    jumpResult.count = docUpto;
+
+    return jumpResult;
+  }
+}

Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BulkPostingsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BulkPostingsEnum.java?rev=1049200&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BulkPostingsEnum.java
(added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/BulkPostingsEnum.java
Tue Dec 14 18:08:56 2010
@@ -0,0 +1,118 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+/** Low level bulk iterator through postings (documents,
+ *  term freq and positions).  This API shifts much
+ *  responsibility to the caller, in order to maximize
+ *  performance:
+ *
+ *    * Caller must track and enforce docFreq limit
+ *    * If omitTFAP is on, caller must handle null
+ *      freq/pos reader
+ *    * Accum docDeltas to get docIDs
+ *    * Accum posDeltas to get positions
+ *    * Correlate positions to docs by adding up termFreqs
+ *    * Enforce skipDocs
+ *    * Jump is not precise -- caller must still scan after
+ *      a successful jump
+ *    * Avoid reading too many ints, ie, impls of this API
+ *      do not do bounds checking
+ *
+ *  @lucene.experimental */
public abstract class BulkPostingsEnum {

  /** NOTE: on first obtaining the BlockReader it's possible
   *  there's data in the buffer.  Use offset/end to check. */
  public static abstract class BlockReader {

    /** Returns int[] that holds each block. Call this once
     *  up front before you start iterating. */
    public abstract int[] getBuffer();

    /** Read another block. Returns the count read, or 0 on
     *  EOF. */
    public abstract int fill() throws IOException;

    /** End index plus 1 of valid data in the buffer */
    public abstract int end();

    /** Start index of valid data in the buffer */
    public abstract int offset();

    // nocommit messy
    public abstract void setOffset(int offset);

    // nocommit messy
    /** Convenience: returns the next single int, refilling when the
     *  buffer is exhausted.  Throws IOException at EOF.
     *  NOTE(review): assumes fill() leaves valid data starting at
     *  index 0 — confirm against implementations. */
    public int next() throws IOException {
      final int[] buffer = getBuffer();
      int offset = offset();
      int end = end();
      if (offset >= end) {
        offset = 0;
        end = fill();
        if (offset >= end) {
          // nocommit cleanup
          throw new IOException("no more ints");
        }
      }
      setOffset(1+offset);
      return buffer[offset];
    }

    /** Reads long as 1 or 2 ints, and can only use 61 of
     *  the 64 long bits.  The low bit of the first int flags
     *  whether a second int follows. */
    public long readVLong() throws IOException {
      // (removed an unused "int offset = offset();" local)
      final int v = next();
      if ((v & 1) == 0) {
        return v >> 1;
      } else {
        final long v2 = next();
        return (v2 << 30) | (v >> 1);
      }
    }
  }

  public abstract BlockReader getDocDeltasReader() throws IOException;

  /** Returns null if per-document term freq is not indexed */
  public abstract BlockReader getFreqsReader() throws IOException;

  /** Returns null if positions are not indexed */
  public abstract BlockReader getPositionDeltasReader() throws IOException;

  /** Result of a successful {@link #jump}: the docID landed on and the
   *  count of docs consumed up to that point. */
  public static class JumpResult {
    public int count;
    public int docID;
  }

  /** Only call this if the docID you seek is after the last
   *  document in the buffer.  This call does not position
   *  exactly; instead, it jumps forward when possible,
   *  returning the docID and ord it had jumped to, seeking
   *  all of the BlockReaders accordingly.  Note that if a
   *  seek did occur, you must call .offset() and .end()
   *  on each BlockReader.  If null is returned then
   *  skipping is not possible, ie you should just scan
   *  yourself. */
  abstract public JumpResult jump(int target, int curCount) throws IOException;
}

Added: lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java?rev=1049200&view=auto
==============================================================================
--- lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java
(added)
+++ lucene/dev/branches/bulkpostings/lucene/src/java/org/apache/lucene/index/MultiBulkPostingsEnum.java
Tue Dec 14 18:08:56 2010
@@ -0,0 +1,274 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.ReaderUtil;
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+import java.io.IOException;
+
/** Concatenates the bulk postings of multiple sub-enums (one per segment
 *  slice) into a single logical stream.  Doc deltas are remapped across
 *  segment boundaries in onFill() using each sub's slice start.  Work in
 *  progress: see the nocommit markers below. */
public final class MultiBulkPostingsEnum extends BulkPostingsEnum {
  private EnumWithSlice[] subs;
  int numSubs;

  private final DocDeltasReader docDeltasReader = new DocDeltasReader();
  private final FreqsReader freqsReader = new FreqsReader();
  private final PositionsReader positionsReader = new PositionsReader();

  /** (Re)initializes this enum over the given subs and pre-fills the
   *  requested readers.
   *  NOTE(review): the loop copies subs.length entries even though only
   *  the first numSubs are valid per this.numSubs — confirm intended. */
  MultiBulkPostingsEnum reset(final EnumWithSlice[] subs, final int numSubs, boolean doFreqs, boolean doPositions) throws IOException {
    this.numSubs = numSubs;
    this.subs = new EnumWithSlice[subs.length];
    for(int i=0;i<subs.length;i++) {
      // Shallow defensive copy of each slot
      this.subs[i] = new EnumWithSlice();
      this.subs[i].postingsEnum = subs[i].postingsEnum;
      this.subs[i].slice = subs[i].slice;
      this.subs[i].docFreq = subs[i].docFreq;
    }
    //System.out.println("MULTI bulk enum init numSub=" + numSubs);
    docDeltasReader.init();
    if (doFreqs) {
      freqsReader.init();
    }
    if (doPositions) {
      positionsReader.init();
    }
    return this;
  }

  /** Base reader that walks the sub-readers in order, copying each sub's
   *  buffered ints into this reader's own buffer.  currentLeft counts the
   *  ints still expected from the current sub (its docFreq). */
  private abstract class MultiBlockReader extends BlockReader {
    // nocommit -- size must be max of all subs
    int[] buffer;          // destination buffer, sized via getBufferSize()
    int upto;              // index of the current sub
    int currentLeft;       // ints remaining to consume from current sub
    int currentEnd;        // count of valid ints in buffer
    BlockReader current;   // sub reader being drained; null when exhausted

    @Override
    public int[] getBuffer() {
      return buffer;
    }

    @Override
    public int offset() {
      // Data always starts at index 0 after a copy
      return 0;
    }

    @Override
    public void setOffset(int offset) {
      assert offset == 0;
    }

    @Override
    public int end() {
      return currentEnd;
    }

    /** Max buffer length across the relevant sub readers. */
    protected abstract int getBufferSize() throws IOException;

    /** Positions on the first sub and copies any already-buffered data
     *  (per the BlockReader contract, data may be present up front). */
    public void init() throws IOException {
      final int bufferSize = getBufferSize();
      if (buffer == null || buffer.length < bufferSize) {
        buffer = new int[ArrayUtil.oversize(bufferSize, RamUsageEstimator.NUM_BYTES_INT)];
      }
      upto = 0;
      //System.out.println("mbr.init this=" + this);
      // nocommit -- must get max buffer size and maybe grow our buffer
      current = getBlockReader(upto);
      currentLeft = subs[upto].docFreq;
      int limit = current.end();
      int offset = current.offset();
      if (offset >= limit) {
        //System.out.println("prefill limit=" + limit + " offs=" + offset + " current=" + current);
        limit = current.fill();
        //System.out.println("  new limit=" + limit);
      } else {
        //System.out.println("  no prefill offset=" + offset + " limit=" + limit);
      }
      if (limit > offset) {
        doCopy(offset, limit);
      }
    }

    /** Copies [offset, limit) from the current sub's buffer to index 0 of
     *  our buffer, capped at currentLeft; marks the sub exhausted when its
     *  docFreq worth of ints has been consumed, then runs the onFill hook.
     *  NOTE(review): asserts chunk > 0, so callers must not pass
     *  limit == offset (e.g. when a sub's fill() returns 0) — verify the
     *  call sites in fill(). */
    private int doCopy(int offset, int limit) {
      int chunk = limit - offset;
      assert chunk > 0;
      if (chunk > currentLeft) {
        chunk = currentLeft;
      }
      //System.out.println("  doCopy chunk=" + chunk + " offset=" + offset + " limit=" + limit + " this=" + this);
      System.arraycopy(current.getBuffer(),
                       offset,
                       buffer,
                       0,
                       chunk);
      currentLeft -= chunk;
      //System.out.println("  currentLeft=" + currentLeft);
      if (currentLeft == 0) {
        current = null;
        //System.out.println("    set current null");
      }
      currentEnd = chunk;
      onFill();
      return currentEnd;
    }

    @Override
    public int fill() throws IOException {
      //System.out.println("fill this=" + this);
      while(true) {
        if (current == null) {
          // Current sub exhausted: either EOF or advance to the next sub
          if (upto == numSubs-1) {
            currentEnd = 0;
            return 0;
          } else {
            upto++;
            current = getBlockReader(upto);
            currentLeft = subs[upto].docFreq;
            //System.out.println("  fill current=" + current + " upto=" + upto + " this=" + this);
            int limit = current.end();
            int offset = current.offset();
            if (offset >= limit) {
              //System.out.println("prefill2");
              limit = current.fill();
            }
            if (limit > offset) {
              return doCopy(offset, limit);
            }
          }
        }
        // Drain more data from the (possibly just-advanced) current sub
        int limit = current.fill();
        //int offset = current.offset();
        return doCopy(0, limit);
      }
    }

    /** Returns the relevant BlockReader of sub number upto. */
    protected abstract BlockReader getBlockReader(int upto) throws IOException;
    /** Hook run after each copy into buffer; default no-op. */
    protected void onFill() {};
  }

  /** Doc-delta reader: rebases the first delta of each new segment so the
   *  concatenated stream stays gap-encoded across segment boundaries. */
  private class DocDeltasReader extends MultiBlockReader {
    int lastDocID; // last absolute (remapped) docID emitted
    int lastSeg;   // sub index of the previous copy, to detect boundaries

    @Override
    protected int getBufferSize() throws IOException {
      int maxBufferSize = 0;
      for(int sub=0;sub<numSubs;sub++) {
        maxBufferSize = Math.max(maxBufferSize, subs[sub].postingsEnum.getDocDeltasReader().getBuffer().length);
      }
      return maxBufferSize;
    }

    @Override
    public void init() throws IOException {
      lastDocID = 0;
      lastSeg = -1;
      //System.out.println("docDeltasInit");
      super.init();
      //System.out.println("docDeltasInit done");
    }

    @Override
    protected BlockReader getBlockReader(int upto) throws IOException {
      return subs[upto].postingsEnum.getDocDeltasReader();
    }

    @Override
    protected void onFill() {
      if (upto != lastSeg) {
        // First chunk from a new segment: fold the segment's base docID
        // (slice.start) into the first delta so decoding stays cumulative
        assert lastDocID < subs[upto].slice.start || lastDocID == 0;
        buffer[0] += subs[upto].slice.start - lastDocID;
        //System.out.println("  add delta to [0] " + (subs[upto].slice.start - lastDocID) + " nextStart=" + subs[upto].slice.start + " vs lastDocID=" + lastDocID + " now buffer[0]=" + buffer[0]);
        lastSeg = upto;
      }
      // Track the absolute docID reached by the end of this chunk
      for(int deltaUpto=0;deltaUpto<currentEnd;deltaUpto++) {
        lastDocID += buffer[deltaUpto];
      }
      //System.out.println("  now lastDocID=" + lastDocID);
    }
  }

  /** Freq reader: plain concatenation, no remapping needed. */
  private class FreqsReader extends MultiBlockReader {
    @Override
    protected int getBufferSize() throws IOException {
      int maxBufferSize = 0;
      for(int sub=0;sub<numSubs;sub++) {
        maxBufferSize = Math.max(maxBufferSize, subs[sub].postingsEnum.getFreqsReader().getBuffer().length);
      }
      return maxBufferSize;
    }

    @Override
    protected BlockReader getBlockReader(int upto) throws IOException {
      return subs[upto].postingsEnum.getFreqsReader();
    }
  }

  /** Position-delta reader: plain concatenation, no remapping needed. */
  private class PositionsReader extends MultiBlockReader {
    @Override
    protected int getBufferSize() throws IOException {
      int maxBufferSize = 0;
      for(int sub=0;sub<numSubs;sub++) {
        maxBufferSize = Math.max(maxBufferSize, subs[sub].postingsEnum.getPositionDeltasReader().getBuffer().length);
      }
      return maxBufferSize;
    }
    @Override
    protected BlockReader getBlockReader(int upto) throws IOException {
      return subs[upto].postingsEnum.getPositionDeltasReader();
    }
  }

  @Override
  public BlockReader getDocDeltasReader() {
    return docDeltasReader;
  }

  @Override
  public BlockReader getFreqsReader() {
    return freqsReader;
  }

  @Override
  public BlockReader getPositionDeltasReader() {
    return positionsReader;
  }

  public int getNumSubs() {
    return numSubs;
  }

  public EnumWithSlice[] getSubs() {
    return subs;
  }

  @Override
  public JumpResult jump(int target, int curCount) throws IOException {
    // nocommit
    // Skipping across subs is not implemented yet; null tells the caller
    // to scan (see BulkPostingsEnum.jump contract).
    return null;
  }

  // TODO: implement bulk read more efficiently than super
  /** A sub enum paired with its reader slice and docFreq. */
  public final static class EnumWithSlice {
    public BulkPostingsEnum postingsEnum;
    public ReaderUtil.Slice slice;
    public int docFreq;
  }
}
+



Mime
View raw message