lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mharw...@apache.org
Subject svn commit: r1368567 - in /lucene/dev/trunk/lucene: ./ core/src/java/org/apache/lucene/codecs/ core/src/java/org/apache/lucene/codecs/bloom/ core/src/java/org/apache/lucene/util/ core/src/java/org/apache/lucene/util/hash/ core/src/resources/META-INF/se...
Date Thu, 02 Aug 2012 16:04:21 GMT
Author: mharwood
Date: Thu Aug  2 16:04:20 2012
New Revision: 1368567

URL: http://svn.apache.org/viewvc?rev=1368567&view=rev
Log:
New addition: Lucene-4069 BloomFilterPostingsFormat for faster access to low-frequency terms such as primary keys.

Added:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/package.html
    lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/package.html
Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
    lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
    lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
    lucene/dev/trunk/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Thu Aug  2 16:04:20 2012
@@ -15,6 +15,9 @@ New features
   underlying PayloadFunction's explanation as the explanation
   for the payload score. (Scott Smerchek via Robert Muir)
   
+* LUCENE-4069: Added BloomFilteringPostingsFormat for use with low-frequency terms
+  such as primary keys (Mark Harwood, Mike McCandless) 
+  
 * LUCENE-4201: Added JapaneseIterationMarkCharFilter to normalize Japanese
   iteration marks. (Robert Muir, Christian Moen)
 

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/PostingsFormat.java Thu Aug  2 16:04:20 2012
@@ -53,7 +53,13 @@ public abstract class PostingsFormat imp
 
   /** Reads a segment.  NOTE: by the time this call
    *  returns, it must hold open any files it will need to
-   *  use; else, those files may be deleted. */
+   *  use; else, those files may be deleted. 
+   *  Additionally, required files may be deleted during the execution of 
+   *  this call before there is a chance to open them. Under these 
+   *  circumstances an IOException should be thrown by the implementation. 
+   *  IOExceptions are expected and will automatically cause a retry of the 
+   *  segment opening logic with the newly revised segments.
+   *  */
   public abstract FieldsProducer fieldsProducer(SegmentReadState state) throws IOException;
 
   @Override

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilterFactory.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,63 @@
+package org.apache.lucene.codecs.bloom;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+
+
+/**
+ * Class used to create index-time {@link FuzzySet} appropriately configured for
+ * each field. Also called to right-size bitsets for serialization.
+ * @lucene.experimental
+ */
+public abstract class BloomFilterFactory {
+  
+  /**
+   * 
+   * @param state  The content to be indexed
+   * @param info
+   *          the field requiring a BloomFilter
+   * @return An appropriately sized set or null if no BloomFiltering required
+   */
+  public abstract FuzzySet getSetForField(SegmentWriteState state, FieldInfo info);
+  
+  /**
+   * Called when downsizing bitsets for serialization
+   * 
+   * @param fieldInfo
+   *          The field with sparse set bits
+   * @param initialSet
+   *          The bits accumulated
+   * @return null or a hopefully more densely packed, smaller bitset
+   */
+  public FuzzySet downsize(FieldInfo fieldInfo, FuzzySet initialSet) {
+    // Aim for a bitset size that would have 10% of bits set (so 90% of searches
+    // would fail-fast)
+    float targetMaxSaturation = 0.1f;
+    return initialSet.downsize(targetMaxSaturation);
+  }
+
+  /**
+   * Used to determine if the given filter has reached saturation and should be retired i.e. not saved any more
+   * @param bloomFilter The bloomFilter being tested
+   * @param fieldInfo The field with which this filter is associated
+   * @return true if the set has reached saturation and should be retired
+   */
+  public abstract boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo);
+  
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/BloomFilteringPostingsFormat.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,514 @@
+package org.apache.lucene.codecs.bloom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.lucene.codecs.CodecUtil;
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsConsumer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.TermStats;
+import org.apache.lucene.codecs.TermsConsumer;
+import org.apache.lucene.index.DocsAndPositionsEnum;
+import org.apache.lucene.index.DocsEnum;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldsEnum;
+import org.apache.lucene.index.IndexFileNames;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.index.TermsEnum;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Bits;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.FuzzySet.ContainsResult;
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.automaton.CompiledAutomaton;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * <p>
+ * A {@link PostingsFormat} useful for low doc-frequency fields such as primary
+ * keys. Bloom filters are maintained in a ".blm" file which offers "fast-fail"
+ * for reads in segments known to have no record of the key. A choice of
+ * delegate PostingsFormat is used to record all other Postings data.
+ * </p>
+ * <p>
+ * A choice of {@link BloomFilterFactory} can be passed to tailor Bloom Filter
+ * settings on a per-field basis. The default configuration is
+ * {@link DefaultBloomFilterFactory} which allocates a ~8mb bitset and hashes
+ * values using {@link MurmurHash2}. This should be suitable for most purposes.
+ * </p>
+ * <p>
+ * The format of the blm file is as follows:
+ * </p>
+ * <ul>
+ * <li>BloomFilter (.blm) --&gt; Header, DelegatePostingsFormatName,
+ * NumFilteredFields, Filter<sup>NumFilteredFields</sup></li>
+ * <li>Filter --&gt; FieldNumber, FuzzySet</li>
+ * <li>FuzzySet --&gt;See {@link FuzzySet#serialize(DataOutput)}</li>
+ * <li>Header --&gt; {@link CodecUtil#writeHeader CodecHeader}</li>
+ * <li>DelegatePostingsFormatName --&gt; {@link DataOutput#writeString(String)
+ * String} The name of a ServiceProvider registered {@link PostingsFormat}</li>
+ * <li>NumFilteredFields --&gt; {@link DataOutput#writeInt Uint32}</li>
+ * <li>FieldNumber --&gt; {@link DataOutput#writeInt Uint32} The number of the
+ * field in this segment</li>
+ * </ul>
+ * @lucene.experimental
+ */
+public class BloomFilteringPostingsFormat extends PostingsFormat {
+  
+  public static final String BLOOM_CODEC_NAME = "BloomFilter";
+  public static final int BLOOM_CODEC_VERSION = 1;
+  
+  /** Extension of Bloom Filters file */
+  static final String BLOOM_EXTENSION = "blm";
+  
+  BloomFilterFactory bloomFilterFactory = new DefaultBloomFilterFactory();
+  private PostingsFormat delegatePostingsFormat;
+  
+  /**
+   * Creates Bloom filters for a selection of fields created in the index. This
+   * is recorded as a set of Bitsets held as a segment summary in an additional
+   * "blm" file. This PostingsFormat delegates to a choice of delegate
+   * PostingsFormat for encoding all other postings data.
+   * 
+   * @param delegatePostingsFormat
+   *          The PostingsFormat that records all the non-bloom filter data i.e.
+   *          postings info.
+   * @param bloomFilterFactory
+   *          The {@link BloomFilterFactory} responsible for sizing BloomFilters
+   *          appropriately
+   */
+  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat,
+      BloomFilterFactory bloomFilterFactory) {
+    super(BLOOM_CODEC_NAME);
+    this.delegatePostingsFormat = delegatePostingsFormat;
+    this.bloomFilterFactory = bloomFilterFactory;
+  }
+  
+  /**
+   * Creates Bloom filters for a selection of fields created in the index. This
+   * is recorded as a set of Bitsets held as a segment summary in an additional
+   * "blm" file. This PostingsFormat delegates to a choice of delegate
+   * PostingsFormat for encoding all other postings data. This choice of
+   * constructor defaults to the {@link DefaultBloomFilterFactory} for
+   * configuring per-field BloomFilters.
+   * 
+   * @param delegatePostingsFormat
+   *          The PostingsFormat that records all the non-bloom filter data i.e.
+   *          postings info.
+   */
+  public BloomFilteringPostingsFormat(PostingsFormat delegatePostingsFormat) {
+    this(delegatePostingsFormat, new DefaultBloomFilterFactory());
+  }
+  
+  // Used only by core Lucene at read-time via Service Provider instantiation -
+  // do not use at Write-time in application code.
+  public BloomFilteringPostingsFormat() {
+    super(BLOOM_CODEC_NAME);
+  }
+  
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state)
+      throws IOException {
+    if (delegatePostingsFormat == null) {
+      throw new UnsupportedOperationException("Error - " + getClass().getName()
+          + " has been constructed without a choice of PostingsFormat");
+    }
+    return new BloomFilteredFieldsConsumer(
+        delegatePostingsFormat.fieldsConsumer(state), state,
+        delegatePostingsFormat);
+  }
+  
+  public FieldsProducer fieldsProducer(SegmentReadState state)
+      throws IOException {
+    return new BloomFilteredFieldsProducer(state);
+  }
+  
+  public class BloomFilteredFieldsProducer extends FieldsProducer {
+    private FieldsProducer delegateFieldsProducer;
+    HashMap<String,FuzzySet> bloomsByFieldName = new HashMap<String,FuzzySet>();
+    
+    public BloomFilteredFieldsProducer(SegmentReadState state)
+        throws IOException {
+      
+      String bloomFileName = IndexFileNames.segmentFileName(
+          state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+      IndexInput bloomIn = null;
+      try {
+        bloomIn = state.dir.openInput(bloomFileName, state.context);
+        CodecUtil.checkHeader(bloomIn, BLOOM_CODEC_NAME, BLOOM_CODEC_VERSION,
+            BLOOM_CODEC_VERSION);
+        // // Load the hash function used in the BloomFilter
+        // hashFunction = HashFunction.forName(bloomIn.readString());
+        // Load the delegate postings format
+        PostingsFormat delegatePostingsFormat = PostingsFormat.forName(bloomIn
+            .readString());
+        
+        this.delegateFieldsProducer = delegatePostingsFormat
+            .fieldsProducer(state);
+        int numBlooms = bloomIn.readInt();
+        for (int i = 0; i < numBlooms; i++) {
+          int fieldNum = bloomIn.readInt();
+          FuzzySet bloom = FuzzySet.deserialize(bloomIn);
+          FieldInfo fieldInfo = state.fieldInfos.fieldInfo(fieldNum);
+          bloomsByFieldName.put(fieldInfo.name, bloom);
+        }
+      } finally {
+        IOUtils.close(bloomIn);
+      }
+      
+    }
+    
+    public FieldsEnum iterator() throws IOException {
+      return new BloomFilteredFieldsEnum(delegateFieldsProducer.iterator(),
+          bloomsByFieldName);
+    }
+    
+    public void close() throws IOException {
+      delegateFieldsProducer.close();
+    }
+    
+    public Terms terms(String field) throws IOException {
+      FuzzySet filter = bloomsByFieldName.get(field);
+      if (filter == null) {
+        return delegateFieldsProducer.terms(field);
+      } else {
+        Terms result = delegateFieldsProducer.terms(field);
+        if (result == null) {
+          return null;
+        }
+        return new BloomFilteredTerms(result, filter);
+      }
+    }
+    
+    public int size() throws IOException {
+      return delegateFieldsProducer.size();
+    }
+    
+    public long getUniqueTermCount() throws IOException {
+      return delegateFieldsProducer.getUniqueTermCount();
+    }
+    
+    // Not all fields in a segment may be subject to a bloom filter. This class
+    // wraps Terms objects appropriately if a filtering request is present
+    class BloomFilteredFieldsEnum extends FieldsEnum {
+      private FieldsEnum delegateFieldsEnum;
+      private HashMap<String,FuzzySet> bloomsByFieldName;
+      private String currentFieldName;
+      
+      public BloomFilteredFieldsEnum(FieldsEnum iterator,
+          HashMap<String,FuzzySet> bloomsByFieldName) {
+        this.delegateFieldsEnum = iterator;
+        this.bloomsByFieldName = bloomsByFieldName;
+      }
+      
+      public AttributeSource attributes() {
+        return delegateFieldsEnum.attributes();
+      }
+      
+      public String next() throws IOException {
+        currentFieldName = delegateFieldsEnum.next();
+        return currentFieldName;
+      }
+      
+      public Terms terms() throws IOException {
+        FuzzySet filter = bloomsByFieldName.get(currentFieldName);
+        if (filter == null) {
+          return delegateFieldsEnum.terms();
+        } else {
+          Terms result = delegateFieldsEnum.terms();
+          if (result == null) {
+            return null;
+          }
+          // wrap the terms object with a bloom filter
+          return new BloomFilteredTerms(result, filter);
+        }
+      }
+      
+    }
+    
+    class BloomFilteredTerms extends Terms {
+      private Terms delegateTerms;
+      private FuzzySet filter;
+      
+      public BloomFilteredTerms(Terms terms, FuzzySet filter) {
+        this.delegateTerms = terms;
+        this.filter = filter;
+      }
+      
+      @Override
+      public TermsEnum intersect(CompiledAutomaton compiled,
+          final BytesRef startTerm) throws IOException {
+        return delegateTerms.intersect(compiled, startTerm);
+      }
+      
+      @Override
+      public TermsEnum iterator(TermsEnum reuse) throws IOException {
+        TermsEnum result;
+        if ((reuse != null) && (reuse instanceof BloomFilteredTermsEnum)) {
+          // recycle the existing BloomFilteredTermsEnum by asking the delegate
+          // to recycle its contained TermsEnum
+          BloomFilteredTermsEnum bfte = (BloomFilteredTermsEnum) reuse;
+          if (bfte.filter == filter) {
+            bfte.delegateTermsEnum = delegateTerms
+                .iterator(bfte.delegateTermsEnum);
+            return bfte;
+          }
+        }
+        // We have been handed something we cannot reuse (either null, wrong
+        // class or wrong filter) so allocate a new object
+        result = new BloomFilteredTermsEnum(delegateTerms.iterator(reuse),
+            filter);
+        return result;
+      }
+      
+      @Override
+      public Comparator<BytesRef> getComparator() throws IOException {
+        return delegateTerms.getComparator();
+      }
+      
+      @Override
+      public long size() throws IOException {
+        return delegateTerms.size();
+      }
+      
+      @Override
+      public long getSumTotalTermFreq() throws IOException {
+        return delegateTerms.getSumTotalTermFreq();
+      }
+      
+      @Override
+      public long getSumDocFreq() throws IOException {
+        return delegateTerms.getSumDocFreq();
+      }
+      
+      @Override
+      public int getDocCount() throws IOException {
+        return delegateTerms.getDocCount();
+      }
+    }
+    
+    class BloomFilteredTermsEnum extends TermsEnum {
+      
+      TermsEnum delegateTermsEnum;
+      private FuzzySet filter;
+      
+      public BloomFilteredTermsEnum(TermsEnum iterator, FuzzySet filter) {
+        this.delegateTermsEnum = iterator;
+        this.filter = filter;
+      }
+      
+      @Override
+      public final BytesRef next() throws IOException {
+        return delegateTermsEnum.next();
+      }
+      
+      @Override
+      public final Comparator<BytesRef> getComparator() {
+        return delegateTermsEnum.getComparator();
+      }
+      
+      @Override
+      public final boolean seekExact(BytesRef text, boolean useCache)
+          throws IOException {
+        // The magical fail-fast speed up that is the entire point of all of
+        // this code - save a disk seek if there is a match on an in-memory
+        // structure
+        // that may occasionally give a false positive but guaranteed no false
+        // negatives
+        if (filter.contains(text) == ContainsResult.NO) {
+          return false;
+        }
+        return delegateTermsEnum.seekExact(text, useCache);
+      }
+      
+      @Override
+      public final SeekStatus seekCeil(BytesRef text, boolean useCache)
+          throws IOException {
+        return delegateTermsEnum.seekCeil(text, useCache);
+      }
+      
+      @Override
+      public final void seekExact(long ord) throws IOException {
+        delegateTermsEnum.seekExact(ord);
+      }
+      
+      @Override
+      public final BytesRef term() throws IOException {
+        return delegateTermsEnum.term();
+      }
+      
+      @Override
+      public final long ord() throws IOException {
+        return delegateTermsEnum.ord();
+      }
+      
+      @Override
+      public final int docFreq() throws IOException {
+        return delegateTermsEnum.docFreq();
+      }
+      
+      @Override
+      public final long totalTermFreq() throws IOException {
+        return delegateTermsEnum.totalTermFreq();
+      }
+      
+
+      @Override
+      public DocsAndPositionsEnum docsAndPositions(Bits liveDocs,
+          DocsAndPositionsEnum reuse, int flags) throws IOException {
+        return delegateTermsEnum.docsAndPositions(liveDocs, reuse, flags);
+      }
+
+      @Override
+      public DocsEnum docs(Bits liveDocs, DocsEnum reuse, int flags)
+          throws IOException {
+        return delegateTermsEnum.docs(liveDocs, reuse, flags);
+      }
+      
+      
+    }
+    
+  }
+  
+  class BloomFilteredFieldsConsumer extends FieldsConsumer {
+    private FieldsConsumer delegateFieldsConsumer;
+    private Map<FieldInfo,FuzzySet> bloomFilters = new HashMap<FieldInfo,FuzzySet>();
+    private SegmentWriteState state;
+    
+    // private PostingsFormat delegatePostingsFormat;
+    
+    public BloomFilteredFieldsConsumer(FieldsConsumer fieldsConsumer,
+        SegmentWriteState state, PostingsFormat delegatePostingsFormat) {
+      this.delegateFieldsConsumer = fieldsConsumer;
+      // this.delegatePostingsFormat=delegatePostingsFormat;
+      this.state = state;
+    }
+    
+    @Override
+    public TermsConsumer addField(FieldInfo field) throws IOException {
+      FuzzySet bloomFilter = bloomFilterFactory.getSetForField(state,field);
+      if (bloomFilter != null) {
+        assert bloomFilters.containsKey(field) == false;
+        bloomFilters.put(field, bloomFilter);
+        return new WrappedTermsConsumer(delegateFieldsConsumer.addField(field),bloomFilter);
+      } else {
+        // No, use the unfiltered fieldsConsumer - we are not interested in
+        // recording any term Bitsets.
+        return delegateFieldsConsumer.addField(field);
+      }
+    }
+    
+    @Override
+    public void close() throws IOException {
+      delegateFieldsConsumer.close();
+      // Now we are done accumulating values for these fields
+      List<Entry<FieldInfo,FuzzySet>> nonSaturatedBlooms = new ArrayList<Map.Entry<FieldInfo,FuzzySet>>();
+      
+      for (Entry<FieldInfo,FuzzySet> entry : bloomFilters.entrySet()) {
+        FuzzySet bloomFilter = entry.getValue();
+        if(!bloomFilterFactory.isSaturated(bloomFilter,entry.getKey())){          
+          nonSaturatedBlooms.add(entry);
+        }
+      }
+      String bloomFileName = IndexFileNames.segmentFileName(
+          state.segmentInfo.name, state.segmentSuffix, BLOOM_EXTENSION);
+      IndexOutput bloomOutput = null;
+      try {
+        bloomOutput = state.directory
+            .createOutput(bloomFileName, state.context);
+        CodecUtil.writeHeader(bloomOutput, BLOOM_CODEC_NAME,
+            BLOOM_CODEC_VERSION);
+        // remember the name of the postings format we will delegate to
+        bloomOutput.writeString(delegatePostingsFormat.getName());
+        
+        // First field in the output file is the number of fields+blooms saved
+        bloomOutput.writeInt(nonSaturatedBlooms.size());
+        for (Entry<FieldInfo,FuzzySet> entry : nonSaturatedBlooms) {
+          FieldInfo fieldInfo = entry.getKey();
+          FuzzySet bloomFilter = entry.getValue();
+          bloomOutput.writeInt(fieldInfo.number);
+          saveAppropriatelySizedBloomFilter(bloomOutput, bloomFilter, fieldInfo);
+        }
+      } finally {
+        IOUtils.close(bloomOutput);
+      }
+      //We are done with large bitsets so no need to keep them hanging around
+      bloomFilters.clear(); 
+    }
+    
+    private void saveAppropriatelySizedBloomFilter(IndexOutput bloomOutput,
+        FuzzySet bloomFilter, FieldInfo fieldInfo) throws IOException {
+      
+      FuzzySet rightSizedSet = bloomFilterFactory.downsize(fieldInfo,
+          bloomFilter);
+      if (rightSizedSet == null) {
+        rightSizedSet = bloomFilter;
+      }
+      rightSizedSet.serialize(bloomOutput);
+    }
+    
+  }
+  
+  class WrappedTermsConsumer extends TermsConsumer {
+    private TermsConsumer delegateTermsConsumer;
+    private FuzzySet bloomFilter;
+    
+    public WrappedTermsConsumer(TermsConsumer termsConsumer,FuzzySet bloomFilter) {
+      this.delegateTermsConsumer = termsConsumer;
+      this.bloomFilter = bloomFilter;
+    }
+    
+    public PostingsConsumer startTerm(BytesRef text) throws IOException {
+      return delegateTermsConsumer.startTerm(text);
+    }
+    
+    public void finishTerm(BytesRef text, TermStats stats) throws IOException {
+      
+      // Record this term in our BloomFilter
+      if (stats.docFreq > 0) {
+        bloomFilter.addValue(text);
+      }
+      delegateTermsConsumer.finishTerm(text, stats);
+    }
+    
+    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount)
+        throws IOException {
+      delegateTermsConsumer.finish(sumTotalTermFreq, sumDocFreq, docCount);
+    }
+    
+    public Comparator<BytesRef> getComparator() throws IOException {
+      return delegateTermsConsumer.getComparator();
+    }
+    
+  }
+  
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/DefaultBloomFilterFactory.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,44 @@
+package org.apache.lucene.codecs.bloom;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.hash.HashFunction;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * Default policy is to allocate a bitset with 10% saturation given a unique term per document.
+ * Bits are set via MurmurHash2 hashing function.
+ *  @lucene.experimental
+ */
+public class DefaultBloomFilterFactory extends BloomFilterFactory {
+  
+  @Override
+  public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
+    //Assume all of the docs have a unique term (e.g. a primary key) and we hope to maintain a set with 10% of bits set
+    return FuzzySet.createSetBasedOnQuality(state.segmentInfo.getDocCount(), 0.10f,  new MurmurHash2());
+  }
+  
+  @Override
+  public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
+    // Don't bother saving bitsets if >90% of bits are set - we don't want to
+    // throw any more memory at this problem.
+    return bloomFilter.getSaturation() > 0.9f;
+  }
+  
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/bloom/package.html Thu Aug  2 16:04:20 2012
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Codec PostingsFormat for fast access to low-frequency terms such as primary key fields.
+</body>
+</html>
\ No newline at end of file

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FixedBitSet.java Thu Aug  2 16:04:20 2012
@@ -54,6 +54,11 @@ public final class FixedBitSet extends D
     bits = new long[bits2words(numBits)];
   }
 
+  public FixedBitSet(long[]storedBits,int numBits) {
+    this.numBits = numBits;
+    this.bits = storedBits;
+  }      
+  
   /** Makes full copy. */
   public FixedBitSet(FixedBitSet other) {
     bits = new long[other.bits.length];

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/FuzzySet.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,292 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.io.IOException;
+
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.hash.HashFunction;
+
+/**
+ * <p>
+ * A class used to represent a set of many, potentially large, values (e.g. many
+ * long strings such as URLs), using a significantly smaller amount of memory.
+ * </p>
+ * <p>
+ * The set is "lossy" in that it cannot definitively state that is does contain
+ * a value but it <em>can</em> definitively say if a value is <em>not</em> in
+ * the set. It can therefore be used as a Bloom Filter.
+ * </p> 
+ * Another application of the set is that it can be used to perform fuzzy counting because
+ * it can estimate reasonably accurately how many unique values are contained in the set. 
+ * </p>
+ * <p>This class is NOT threadsafe.</p>
+ * <p>
+ * Internally a Bitset is used to record values and once a client has finished recording
+ * a stream of values the {@link #downsize(float)} method can be used to create a suitably smaller set that
+ * is sized appropriately for the number of values recorded and desired saturation levels. 
+ * 
+ * </p>
+ * @lucene.experimental
+ */
+public class FuzzySet {
+  
+  public static final int FUZZY_SERIALIZATION_VERSION=1;
+  
+  public enum ContainsResult {
+    MAYBE, NO
+  };
+  private HashFunction hashFunction;
+  private FixedBitSet filter;
+  private int bloomSize;
+  
+  //The sizes of BitSet used are all numbers that, when expressed in binary form,
+  //are all ones. This is to enable fast downsizing from one bitset to another
+  //by simply ANDing each set index in one bitset with the size of the target bitset
+  // - this provides a fast modulo of the number. Values previously accumulated in
+  // a large bitset and then mapped to a smaller set can be looked up using a single
+  // AND operation of the query term's hash rather than needing to perform a 2-step
+  // translation of the query term that mirrors the stored content's reprojections.
+  static final int usableBitSetSizes[];
+  static
+  {
+    usableBitSetSizes=new int[30];
+    int mask=1;
+    int size=mask;
+    for (int i = 0; i < usableBitSetSizes.length; i++) {
+        size=(size<<1)|mask;
+        usableBitSetSizes[i]=size;
+    }    
+  }
+
+  /**
+   * Rounds down required maxNumberOfBits to the nearest number that is made up
+   * of all ones as a binary number.  
+   * Use this method where controlling memory use is paramount.
+   */
+  public static int getNearestSetSize(int maxNumberOfBits)
+  {
+    int result=usableBitSetSizes[0];
+    for (int i = 0; i < usableBitSetSizes.length; i++) {
+      if(usableBitSetSizes[i]<=maxNumberOfBits)
+      {
+        result=usableBitSetSizes[i];
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Use this method to choose a set size where accuracy (low content saturation) is more important
+   * than deciding how much memory to throw at the problem.
+   * @param maxNumberOfValuesExpected
+   * @param desiredSaturation A number between 0 and 1 expressing the % of bits set once all values have been recorded
+   * @return The size of the set nearest to the required size
+   */
+  public static int getNearestSetSize(int maxNumberOfValuesExpected,
+      float desiredSaturation) {
+    // Iterate around the various scales of bitset from smallest to largest looking for the first that
+    // satisfies value volumes at the chosen saturation level
+    for (int i = 0; i < usableBitSetSizes.length; i++) {
+      int numSetBitsAtDesiredSaturation = (int) (usableBitSetSizes[i] * desiredSaturation);
+      int estimatedNumUniqueValues = getEstimatedNumberUniqueValuesAllowingForCollisions(
+          usableBitSetSizes[i], numSetBitsAtDesiredSaturation);
+      if (estimatedNumUniqueValues > maxNumberOfValuesExpected) {
+        return usableBitSetSizes[i];
+      }
+    }
+    return -1;    
+  }
+  
+  public static FuzzySet createSetBasedOnMaxMemory(int maxNumBytes, HashFunction hashFunction)
+  {
+      int setSize=getNearestSetSize(maxNumBytes);
+      return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
+  }
+  
+  public static FuzzySet createSetBasedOnQuality(int maxNumUniqueValues, float desiredMaxSaturation, HashFunction hashFunction)
+  {
+      int setSize=getNearestSetSize(maxNumUniqueValues,desiredMaxSaturation);
+      return new FuzzySet(new FixedBitSet(setSize+1),setSize,hashFunction);
+  }
+  
+
+  
+  
+  private FuzzySet(FixedBitSet filter, int bloomSize, HashFunction hashFunction) {
+    super();
+    this.filter = filter;
+    this.bloomSize = bloomSize;
+    this.hashFunction=hashFunction;
+  }
+  
+  /**
+   * The main method required for a Bloom filter which, given a value determines set membership.
+   * Unlike a conventional set, the fuzzy set returns NO or MAYBE rather than true or false.
+   * @param value
+   * @return NO or MAYBE
+   */
+  public ContainsResult contains(BytesRef value) {
+    int hash = hashFunction.hash(value);
+    if (hash < 0) {
+      hash = hash * -1;
+    }
+    return mayContainValue(hash);
+  }
+  
+  /**
+   * Serializes the data set to file using the following format:
+   * <ul>
+   *  <li>FuzzySet --&gt;FuzzySetVersion,HashFunctionName,BloomSize,
+   * NumBitSetWords,BitSetWord<sup>NumBitSetWords</sup></li> 
+   * <li>HashFunctionName --&gt; {@link DataOutput#writeString(String) String} The
+   * name of a ServiceProvider registered {@link HashFunction}</li>
+   * <li>FuzzySetVersion --&gt; {@link DataOutput#writeInt Uint32} The version number of the {@link FuzzySet} class</li>
+   * <li>BloomSize --&gt; {@link DataOutput#writeInt Uint32} The modulo value used
+   * to project hashes into the field's Bitset</li>
+   * <li>NumBitSetWords --&gt; {@link DataOutput#writeInt Uint32} The number of
+   * longs (as returned from {@link FixedBitSet#getBits})</li>
+   * <li>BitSetWord --&gt; {@link DataOutput#writeLong Long} A long from the array
+   * returned by {@link FixedBitSet#getBits}</li>
+   * </ul>
+   * @param out Data output stream
+   * @throws IOException
+   */
+  public void serialize(DataOutput out) throws IOException
+  {
+      out.writeInt(FUZZY_SERIALIZATION_VERSION);
+      out.writeString(hashFunction.getName());
+      out.writeInt(bloomSize);
+      long[] bits = filter.getBits();
+      out.writeInt(bits.length);
+      for (int i = 0; i < bits.length; i++) {
+        // Can't used VLong encoding because cant cope with negative numbers
+        // output by FixedBitSet
+        out.writeLong(bits[i]);
+      }
+  }
+  public static FuzzySet deserialize(DataInput in) throws IOException
+  {
+    int version=in.readInt();
+    if(version!=FUZZY_SERIALIZATION_VERSION)
+    {
+      throw new IOException("Error deserializing: set version is not "+FUZZY_SERIALIZATION_VERSION);
+    }
+    HashFunction hashFunction=HashFunction.forName(in.readString());
+    int bloomSize=in.readInt();
+    int numLongs=in.readInt();
+    long[]longs=new long[numLongs];
+    for (int i = 0; i < numLongs; i++) {
+      longs[i]=in.readLong();
+    }
+    FixedBitSet bits = new FixedBitSet(longs,bloomSize+1);
+    return new FuzzySet(bits,bloomSize,hashFunction);
+  }
+  
+  private ContainsResult mayContainValue(int positiveHash) {
+    assert positiveHash >= 0;
+    // Bloom sizes are always base 2 and so can be ANDed for a fast modulo
+    int pos = positiveHash & bloomSize;
+    if (filter.get(pos)) {
+      // This term may be recorded in this index (but could be a collision)
+      return ContainsResult.MAYBE;
+    }
+    // definitely NOT in this segment
+    return ContainsResult.NO;
+  }
+  
+  /**
+   * Records a value in the set. The referenced bytes are hashed and then modulo n'd where n is the
+   * chosen size of the internal bitset.
+   * @param value the key value to be hashed
+   * @throws IOException
+   */
+  public void addValue(BytesRef value) throws IOException {    
+      int hash = hashFunction.hash(value);
+      if (hash < 0) {
+        hash = hash * -1;
+      }
+      // Bitmasking using bloomSize is effectively a modulo operation.
+      int bloomPos = hash & bloomSize;
+      filter.set(bloomPos);
+  }  
+  
+  
+  /**
+   * 
+   * @param targetMaxSaturation A number between 0 and 1 describing the % of bits that would ideally be set in the 
+   * result. Lower values have better qccuracy but require more space.
+   * @return a smaller FuzzySet or null if the current set is already over-saturated
+   */
+  public FuzzySet downsize(float targetMaxSaturation)
+  {
+    int numBitsSet = filter.cardinality();
+    FixedBitSet rightSizedBitSet = filter;
+    int rightSizedBitSetSize = bloomSize;
+    //Hopefully find a smaller size bitset into which we can project accumulated values while maintaining desired saturation level
+    for (int i = 0; i < usableBitSetSizes.length; i++) {
+      int candidateBitsetSize = usableBitSetSizes[i];
+      float candidateSaturation = (float) numBitsSet
+          / (float) candidateBitsetSize;
+      if (candidateSaturation <= targetMaxSaturation) {
+        rightSizedBitSetSize = candidateBitsetSize;
+        break;
+      }
+    }
+    // Re-project the numbers to a smaller space if necessary
+    if (rightSizedBitSetSize < bloomSize) {
+      // Reset the choice of bitset to the smaller version
+      rightSizedBitSet = new FixedBitSet(rightSizedBitSetSize + 1);
+      // Map across the bits from the large set to the smaller one
+      int bitIndex = 0;
+      do {
+        bitIndex = filter.nextSetBit(bitIndex);
+        if (bitIndex >= 0) {
+          // Project the larger number into a smaller one effectively
+          // modulo-ing by using the target bitset size as a mask
+          int downSizedBitIndex = bitIndex & rightSizedBitSetSize;
+          rightSizedBitSet.set(downSizedBitIndex);
+          bitIndex++;
+        }
+      } while ( (bitIndex >= 0)&&(bitIndex<=bloomSize));
+    } else {
+      return null;
+    }
+    return new FuzzySet(rightSizedBitSet,rightSizedBitSetSize, hashFunction);
+  }
+  
+  public int getEstimatedUniqueValues()
+  {
+       return getEstimatedNumberUniqueValuesAllowingForCollisions(bloomSize, filter.cardinality());
+  }
+  
+  // Given a set size and a the number of set bits, produces an estimate of the number of unique values recorded
+  public static int getEstimatedNumberUniqueValuesAllowingForCollisions(
+      int setSize, int numRecordedBits) {
+    double setSizeAsDouble = setSize;
+    double numRecordedBitsAsDouble = numRecordedBits;
+    double saturation = numRecordedBitsAsDouble / setSizeAsDouble;
+    double logInverseSaturation = Math.log(1 - saturation) * -1;
+    return (int) (setSizeAsDouble * logInverseSaturation);
+  }
+
+  public float getSaturation() {
+    int numBitsSet = filter.cardinality();
+    return (float) numBitsSet / (float) bloomSize;
+  }
+}
\ No newline at end of file

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/HashFunction.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,70 @@
+package org.apache.lucene.util.hash;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+import java.util.Set;
+
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.NamedSPILoader;
+
+
+/**
+ * Base class for hashing functions that can be referred to by name.
+ * Subclasses are expected to provide threadsafe implementations of the hash function
+ * on the range of bytes referenced in the provided {@link BytesRef}
+ * @lucene.experimental
+ */
+public abstract class HashFunction implements NamedSPILoader.NamedSPI {
+
+  /**
+   * Hashes the contents of the referenced bytes
+   * @param bytes the data to be hashed
+   * @return the hash of the bytes referenced by bytes.offset and length bytes.length
+   */
+  public abstract int hash(BytesRef bytes);
+  
+  private static final NamedSPILoader<HashFunction> loader =
+    new NamedSPILoader<HashFunction>(HashFunction.class);
+
+  private final String name;
+
+  public HashFunction(String name) {
+    NamedSPILoader.checkServiceName(name);
+    this.name = name;
+  }
+  
+  /** Returns this codec's name */
+  @Override
+  public final String getName() {
+    return name;
+  }
+  
+  /** looks up a hash function by name */
+  public static HashFunction forName(String name) {
+    return loader.lookup(name);
+  }
+  
+  /** returns a list of all available hash function names */
+  public static Set<String> availableHashFunctionNames() {
+    return loader.availableServices();
+  }
+  
+
+  @Override
+  public String toString() {
+    return name;
+  }  
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/MurmurHash2.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,105 @@
+package org.apache.lucene.util.hash;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.util.BytesRef;
+
+/**
+ * This is a very fast, non-cryptographic hash suitable for general hash-based
+ * lookup. See http://murmurhash.googlepages.com/ for more details.
+ * <p>
+ * The C version of MurmurHash 2.0 found at that site was ported to Java by
+ * Andrzej Bialecki (ab at getopt org).
+ * </p>
+ * <p>
+ *  The code from getopt.org was adapted by Mark Harwood in the form here as one of a pluggable choice of 
+ *  hashing functions as the core function had to be adapted to work with BytesRefs with offsets and lengths
+ *  rather than raw byte arrays.  
+ * </p>
+ * @lucene.experimental
+ */
+public class MurmurHash2 extends HashFunction{
+  
+  
+  public static final String HASH_NAME="MurmurHash2";
+  
+  public MurmurHash2() {
+    super(HASH_NAME);
+  }
+
+  public static int hash(byte[] data, int seed, int offset, int len) {
+    int m = 0x5bd1e995;
+    int r = 24;
+    int h = seed ^ len;
+    int len_4 = len >> 2;
+    for (int i = 0; i < len_4; i++) {
+      int i_4 = offset + (i << 2);
+      int k = data[i_4 + 3];
+      k = k << 8;
+      k = k | (data[i_4 + 2] & 0xff);
+      k = k << 8;
+      k = k | (data[i_4 + 1] & 0xff);
+      k = k << 8;
+      k = k | (data[i_4 + 0] & 0xff);
+      k *= m;
+      k ^= k >>> r;
+      k *= m;
+      h *= m;
+      h ^= k;
+    }
+    int len_m = len_4 << 2;
+    int left = len - len_m;
+    if (left != 0) {
+      if (left >= 3) {
+        h ^= data[offset + len - 3] << 16;
+      }
+      if (left >= 2) {
+        h ^= data[offset + len - 2] << 8;
+      }
+      if (left >= 1) {
+        h ^= data[offset + len - 1];
+      }
+      h *= m;
+    }
+    h ^= h >>> 13;
+    h *= m;
+    h ^= h >>> 15;
+    return h;
+  }
+  
+  /**
+   * Generates 32 bit hash from byte array with default seed value.
+   * 
+   * @param data 
+   *          byte array to hash
+   * @param offset
+   *          the start position in the array to hash
+   * @param len
+   *          length of the array elements to hash
+   * @return 32 bit hash of the given array
+   */
+  public static final int hash32(final byte[] data, int offset, int len) {
+    return MurmurHash2.hash(data, 0x9747b28c, offset, len);
+  }
+  
+
+  @Override
+  public final int hash(BytesRef br) {
+    return hash32(br.bytes, br.offset, br.length);
+  }
+  
+}

Added: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/package.html?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/package.html (added)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/util/hash/package.html Thu Aug  2 16:04:20 2012
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Hashing functions load-able via SPI service
+</body>
+</html>
\ No newline at end of file

Modified: lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Thu Aug  2 16:04:20 2012
@@ -17,4 +17,5 @@ org.apache.lucene.codecs.lucene40.Lucene
 org.apache.lucene.codecs.pulsing.Pulsing40PostingsFormat
 org.apache.lucene.codecs.simpletext.SimpleTextPostingsFormat
 org.apache.lucene.codecs.memory.MemoryPostingsFormat
+org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat
 org.apache.lucene.codecs.memory.DirectPostingsFormat

Added: lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction (added)
+++ lucene/dev/trunk/lucene/core/src/resources/META-INF/services/org.apache.lucene.util.hash.HashFunction Thu Aug  2 16:04:20 2012
@@ -0,0 +1,16 @@
+#  Licensed to the Apache Software Foundation (ASF) under one or more
+#  contributor license agreements.  See the NOTICE file distributed with
+#  this work for additional information regarding copyright ownership.
+#  The ASF licenses this file to You under the Apache License, Version 2.0
+#  (the "License"); you may not use this file except in compliance with
+#  the License.  You may obtain a copy of the License at
+#
+#       http://www.apache.org/licenses/LICENSE-2.0
+#
+#  Unless required by applicable law or agreed to in writing, software
+#  distributed under the License is distributed on an "AS IS" BASIS,
+#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#  See the License for the specific language governing permissions and
+#  limitations under the License.
+
+org.apache.lucene.util.hash.MurmurHash2

Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/TestBloomFilteredLucene40Postings.java Thu Aug  2 16:04:20 2012
@@ -0,0 +1,77 @@
+package org.apache.lucene.codecs.bloom;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.codecs.FieldsConsumer;
+import org.apache.lucene.codecs.FieldsProducer;
+import org.apache.lucene.codecs.PostingsFormat;
+import org.apache.lucene.codecs.bloom.BloomFilteringPostingsFormat;
+import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.SegmentReadState;
+import org.apache.lucene.index.SegmentWriteState;
+import org.apache.lucene.util.FuzzySet;
+import org.apache.lucene.util.hash.MurmurHash2;
+
+/**
+ * A class used for testing {@link BloomFilteringPostingsFormat} with a concrete
+ * delegate (Lucene40). Creates a Bloom filter on ALL fields and with tiny
+ * amounts of memory reserved for the filter. DO NOT USE IN A PRODUCTION
+ * APPLICATION This is not a realistic application of Bloom Filters as they
+ * ordinarily are larger and operate on only primary key type fields.
+ */
+public class TestBloomFilteredLucene40Postings extends PostingsFormat {
+  
+  private BloomFilteringPostingsFormat delegate;
+  
+  // Special class used to avoid OOM exceptions where Junit tests create many
+  // fields.
+  static class LowMemoryBloomFactory extends BloomFilterFactory {
+    @Override
+    public FuzzySet getSetForField(SegmentWriteState state,FieldInfo info) {
+      return FuzzySet.createSetBasedOnMaxMemory(1024, new MurmurHash2());
+    }
+    
+    @Override
+    public boolean isSaturated(FuzzySet bloomFilter, FieldInfo fieldInfo) {
+      // For test purposes always maintain the BloomFilter - even past the point
+      // of usefulness when all bits are set
+      return false;
+    }
+  }
+  
+  public TestBloomFilteredLucene40Postings() {
+    super("TestBloomFilteredLucene40Postings");
+    delegate = new BloomFilteringPostingsFormat(new Lucene40PostingsFormat(),
+        new LowMemoryBloomFactory());
+  }
+  
+  @Override
+  public FieldsConsumer fieldsConsumer(SegmentWriteState state)
+      throws IOException {
+    return delegate.fieldsConsumer(state);
+  }
+  
+  @Override
+  public FieldsProducer fieldsProducer(SegmentReadState state)
+      throws IOException {
+    return delegate.fieldsProducer(state);
+  }
+}

Added: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/package.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/package.html?rev=1368567&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/package.html (added)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/codecs/bloom/package.html Thu Aug  2 16:04:20 2012
@@ -0,0 +1,25 @@
+<!doctype html public "-//w3c//dtd html 4.0 transitional//en">
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+<head>
+   <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
+</head>
+<body>
+Support for generating test indexes using the BloomFilteringPostingsFormat
+</body>
+</html>

Modified: lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java (original)
+++ lucene/dev/trunk/lucene/test-framework/src/java/org/apache/lucene/index/RandomCodec.java Thu Aug  2 16:04:20 2012
@@ -29,6 +29,7 @@ import java.util.Set;
 
 import org.apache.lucene.codecs.PostingsFormat;
 import org.apache.lucene.codecs.asserting.AssertingPostingsFormat;
+import org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings;
 import org.apache.lucene.codecs.lucene40.Lucene40Codec;
 import org.apache.lucene.codecs.lucene40.Lucene40PostingsFormat;
 import org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds;
@@ -98,6 +99,10 @@ public class RandomCodec extends Lucene4
         new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
         // add pulsing again with (usually) different parameters
         new Pulsing40PostingsFormat(1 + random.nextInt(20), minItemsPerBlock, maxItemsPerBlock),
+        //TODO as a PostingsFormat which wraps others, we should allow TestBloomFilteredLucene40Postings to be constructed 
+        //with a choice of concrete PostingsFormats. Maybe useful to have a generic means of marking and dealing 
+        //with such "wrapper" classes?
+        new TestBloomFilteredLucene40Postings(),                
         new MockSepPostingsFormat(),
         new MockFixedIntBlockPostingsFormat(_TestUtil.nextInt(random, 1, 2000)),
         new MockVariableIntBlockPostingsFormat( _TestUtil.nextInt(random, 1, 127)),

Modified: lucene/dev/trunk/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat?rev=1368567&r1=1368566&r2=1368567&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat (original)
+++ lucene/dev/trunk/lucene/test-framework/src/resources/META-INF/services/org.apache.lucene.codecs.PostingsFormat Thu Aug  2 16:04:20 2012
@@ -20,5 +20,6 @@ org.apache.lucene.codecs.mocksep.MockSep
 org.apache.lucene.codecs.nestedpulsing.NestedPulsingPostingsFormat
 org.apache.lucene.codecs.ramonly.RAMOnlyPostingsFormat
 org.apache.lucene.codecs.lucene40ords.Lucene40WithOrds
+org.apache.lucene.codecs.bloom.TestBloomFilteredLucene40Postings
 org.apache.lucene.codecs.asserting.AssertingPostingsFormat
 



Mime
View raw message