lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Yann-Erwan Perio <ye.pe...@gmail.com>
Subject Re: Filter based on the sum of values of two fields
Date Wed, 27 Mar 2013 00:22:17 GMT
On Sun, Mar 24, 2013 at 10:46 AM, Wei Wang <welshwang@gmail.com> wrote:

Hi,

> For example, assume we have fields F1 and F2, we would like to find
> all documents with condition F1+F2 > 5.0. This filter may be combined
> with other filters to form a BooleanFilter.
>
> The question is, is there any way to construct an efficient filter to do this?

I don't know - but the API looked interesting, so I gave it a try (see
below). I had never worked with search filters before writing this
code, so please proceed with caution, as I am not sure about several
things: whether iterating over all documents is appropriate, how deleted
documents should be treated, what the "acceptDocs" parameter is for, and
which threading constraints must be respected.

---
// add your package declaration


import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

import java.io.IOException;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeSet;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.IntField;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.AtomicReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.DocIdSet;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.FieldCache;
import org.apache.lucene.search.FieldCache.Ints;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;
import org.junit.Before;
import org.junit.Test;

public class FilterTest {

	private static final Version VERSION = Version.LUCENE_42;

	private static final String FIELD_ID = "id";
	private static final String FIELD_ALPHA = "alpha";
	private static final String FIELD_OMEGA = "omega";

	private static final int SUM_THRESHOLD = 5;
	private static final int[] VALUES_ALPHA = new int[] { 1, 2, 3, 4, 5 };
	private static final int[] VALUES_OMEGA = new int[] { 5, 0, 5, 0, 5 };
	private static final Set<Integer> EXPECTED_MATCHED_DOCUMENT_IDS = new
HashSet<Integer>(Arrays.asList(0, 2, 4));

	private Directory directory;

	@Before
	public void setUp() throws IOException {
		directory = new RAMDirectory();

		Analyzer analyzer = new StandardAnalyzer(VERSION);
		IndexWriterConfig config = new IndexWriterConfig(VERSION, analyzer);
		config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
		IndexWriter writer = new IndexWriter(directory, config);

		for (int ii = 0; ii < VALUES_ALPHA.length; ii++) {
			Document doc = new Document();
			Field id = new IntField(FIELD_ID, ii, IntField.Store.YES);
			Field alpha = new IntField(FIELD_ALPHA, VALUES_ALPHA[ii],
IntField.Store.YES);
			Field omega = new IntField(FIELD_OMEGA, VALUES_OMEGA[ii],
IntField.Store.YES);
			doc.add(id);
			doc.add(alpha);
			doc.add(omega);
			writer.addDocument(doc);
		}

		writer.close();
	}

	@Test
	public void testSumFilter() throws IOException {
		IndexReader reader = DirectoryReader.open(directory);
		IndexSearcher searcher = new IndexSearcher(reader);
		TopDocs results = searcher.search(new MatchAllDocsQuery(), new
SumFilter(SUM_THRESHOLD), VALUES_ALPHA.length);

		try {
			assertEquals(EXPECTED_MATCHED_DOCUMENT_IDS.size(), results.totalHits);
			for (int ii = 0; ii < results.scoreDocs.length; ii++) {
				int docId = results.scoreDocs[ii].doc;
				Document doc = reader.document(docId);
				int idValue = doc.getField(FIELD_ID).numericValue().intValue();
				int alphaValue = doc.getField(FIELD_ALPHA).numericValue().intValue();
				int omegaValue = doc.getField(FIELD_OMEGA).numericValue().intValue();

				assertTrue(EXPECTED_MATCHED_DOCUMENT_IDS.contains(idValue));
				assertTrue(alphaValue + omegaValue > SUM_THRESHOLD);
			}
		} finally {
			reader.close();
		}
	}

	private class SumFilter extends Filter {

		private int minValue;

		public SumFilter(int minValue) {
			this.minValue = minValue;
		}

		@Override
		public DocIdSet getDocIdSet(AtomicReaderContext context, Bits
acceptDocs) throws IOException {
			AtomicReader reader = context.reader();
			Ints alphaCache = FieldCache.DEFAULT.getInts(reader, FIELD_ALPHA, false);
			Ints omegaCache = FieldCache.DEFAULT.getInts(reader, FIELD_OMEGA, false);
			SimpleDocIdSet docIdSet = new SimpleDocIdSet();

			int maxDoc = reader.maxDoc();
			for (int docId = 0; docId < maxDoc; docId++) {
				int sum = alphaCache.get(docId) + omegaCache.get(docId);
				if (sum > minValue) {
					docIdSet.add(docId);
				}
			}

			return docIdSet;
		}
	}

	private class SimpleDocIdSet extends DocIdSet {

		private final TreeSet<Integer> sortedDocIdSet = new TreeSet<Integer>();

		public void add(int docId) {
			sortedDocIdSet.add(docId);
		}

		@Override
		public DocIdSetIterator iterator() throws IOException {
			return new DocIdSetIterator() {

				private Iterator<Integer> sortedDocIdSetIterator =
sortedDocIdSet.iterator();
				private int currentDocId = -1;

				@Override
				public int advance(int target) throws IOException {
					while ((currentDocId = nextDoc()) < target) {
					}
					return currentDocId;
				}

				@Override
				public int docID() {
					if (currentDocId == -1) {
						return -1;
					}
					if (!sortedDocIdSetIterator.hasNext()) {
						return NO_MORE_DOCS;
					}
					return currentDocId;
				}

				@Override
				public int nextDoc() throws IOException {
					if (!sortedDocIdSetIterator.hasNext()) {
						return NO_MORE_DOCS;
					}
					currentDocId = sortedDocIdSetIterator.next();
					return currentDocId;
				}
			};
		}
	}

}

---

Regards,
Yep.

---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message