lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Bauer, Herbert S. (Scott)" <Bauer.Sc...@mayo.edu>
Subject Re: Exception when attempting to query using ToParentBlockJoinQuery in Lucene 5.1
Date Mon, 29 Jun 2015 13:33:29 GMT
I’m able to reproduce this problem with a smaller, self-contained example
consisting of the two classes below.  This suggests a bug, and unless someone
has clearer direction on this implementation I’m planning to file it as one.


package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class SmallTestIndexBuilder {
	public enum Code{
		C1234,C23432,C4234,C2308, C8958;
	}
	public SmallTestIndexBuilder() {
		// TODO Auto-generated constructor stub
	}

	public void init(){
		try {
			LuceneContentBuilder builder = new LuceneContentBuilder();
			Path path = Paths.get("/Users/m029206/Desktop/index");
			Directory dir = new MMapDirectory(path);
			Analyzer analyzer=new StandardAnalyzer(new CharArraySet( 0, true));
			IndexWriterConfig iwc= new IndexWriterConfig(analyzer);
			IndexWriter writer = new IndexWriter(dir, iwc);
			createCodingSchemeIndex(builder, writer );
			writer.commit();
			writer.close();
		} catch (IOException e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		}
	}
	private void createCodingSchemeIndex(LuceneContentBuilder builder,
			IndexWriter writer) throws IOException {
			for(Code c :Code.values()){
			List<Document> list = createBlockJoin(c.name());
			writer.addDocuments(list);
			list = createBlockJoin2(c.name());
			writer.addDocuments(list);
			}
	}

	private List<Document> createBlockJoin(String code) {
		List<Document> list = new ArrayList<Document>();
	
		Document doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue", "Mud",
Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Suds", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"coagulant", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"hepa", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"hematoma", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"normal", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"abnormal", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"notfound", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue", "red
blood cells", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		
		Document par = new Document();
		par.add(new org.apache.lucene.document.TextField("codingSchemeName",
"TestScheme", Field.Store.YES));
		par.add(new org.apache.lucene.document.TextField("parentDoc", "yes",
Field.Store.YES));
		par.add(new org.apache.lucene.document.TextField("entityCode", code,
Field.Store.YES));
		list.add(par);
		return list;
	}

	private List<Document> createBlockJoin2(String code) {
		List<Document> list = new ArrayList<Document>();
	
		Document doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue", "Mud",
Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Suds", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"coagulant", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"hepa", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"hematoma", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"normal", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"abnormal", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"notfound", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue", "red
blood cells", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		doc = new Document();
		doc.add(new org.apache.lucene.document.TextField("propertyValue",
"Blood", Field.Store.YES));
		list.add(doc);
		
		Document par = new Document();
		par.add(new org.apache.lucene.document.TextField("codingSchemeName",
"TestSchemeToo", Field.Store.YES));
		par.add(new org.apache.lucene.document.TextField("parentDoc", "yes",
Field.Store.YES));
		par.add(new org.apache.lucene.document.TextField("entityCode", code,
Field.Store.YES));
		list.add(par);
		return list;
	}
	public static void main(String[] args) {
		new SmallTestIndexBuilder().init();

	}

}



package org.lexevs.lucene.prototype;

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.QueryWrapperFilter;
import org.apache.lucene.search.Sort;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.grouping.GroupDocs;
import org.apache.lucene.search.grouping.TopGroups;
import org.apache.lucene.search.join.BitDocIdSetCachingWrapperFilter;
import org.apache.lucene.search.join.BitDocIdSetFilter;
import org.apache.lucene.search.join.ScoreMode;
import org.apache.lucene.search.join.ToParentBlockJoinCollector;
import org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MMapDirectory;

public class BlockJoinTestQuery {

	public BlockJoinTestQuery() {
		// TODO Auto-generated constructor stub
	}
	
	public void run(){
	Path path = Paths.get("/Users/m029206/Desktop/index");
	Directory index;
	try {
		index = new MMapDirectory(path);

	IndexReader reader =  DirectoryReader.open(index);
	IndexSearcher searcher = new ToParentBlockJoinIndexSearcher(reader);
	ToParentBlockJoinCollector collector = new
ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
	BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
              new QueryWrapperFilter(new QueryParser("codingSchemeName",
new StandardAnalyzer(new CharArraySet( 0, true))).parse("TestScheme")));

	  Query query = new QueryParser(null, new StandardAnalyzer(new
CharArraySet( 0, true))).createBooleanQuery("propertyValue", "Blood",
Occur.MUST);
	  ToParentBlockJoinQuery termJoinQuery = new ToParentBlockJoinQuery(
			    query, 
			    codingScheme,
			    ScoreMode.Avg);
	  searcher.search(termJoinQuery, collector);
	  TopGroups<Integer> getTopGroupsResults =
collector.getTopGroups(termJoinQuery, null, 0, 10, 0, true);
	  String ecode = null;
	  for (GroupDocs<Integer> result : getTopGroupsResults.groups) {
		  Document parent = searcher.doc(result.groupValue);
		 ecode = parent.get("entityCode");
		 System.out.println("entityCode: " + ecode);
	  }
	} catch (IOException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	} catch (ParseException e) {
		// TODO Auto-generated catch block
		e.printStackTrace();
	}
	}
	
	public static void main(String[] args) {
		new BlockJoinTestQuery().run();

	}

}



On 6/23/15, 4:17 PM, "Bauer, Herbert S. (Scott)" <Bauer.Scott@mayo.edu>
wrote:

>I’m guessing this issue may be related to the SOLR error described here:
>https://issues.apache.org/jira/browse/SOLR-7606.  I can find at least one
>group of documents with a missing parent in my generated index.  This
>doesn’t explain why I didn’t see a similar issue in 4.10.4.  I can see
>that the BitSet implementation isn’t the issue but the filtered bit set
>inside it may be causing the problem given a missing parent.  I have to
>say I’m a little concerned about the lack of feedback on this list.  Is
>there another forum that is a little more active on this subject or is the
>block join implementation just not used or supported that much?
>
>On 6/22/15, 2:21 PM, "Bauer, Herbert S. (Scott)" <Bauer.Scott@mayo.edu>
>wrote:
>
>>Well it’s clear that this is just giving a return value of
>>Integer.MAX_VALUE for the parentDoc.  Given the recent changes noted
>>here:
>> https://issues.apache.org/jira/browse/LUCENE-6021 where FixedBitSet now
>>returns Integer.MAX_VALUE instead of -1 I wonder if a bug wasn’t
>>introduced to the BlockJoinScorer.nextDoc method.  Unfortunately I have
>>yet to come up with an example to make this fail on a smaller test index.
>>The child document in question does have a parent, which is doc #4823684,
>>so I’m confused as to how the NO_MORE_DOCS value would be applied.  Is
>>there something obvious I’m missing here?
>>
>>On 6/5/15, 12:05 PM, "Bauer, Herbert S. (Scott)" <Bauer.Scott@mayo.edu>
>>wrote:
>>
>>>One correction, it looks like the parentBits call has 4823680 passed to
>>>it
>>>to generate the erroneous docId.
>>>
>>>On 6/5/15, 10:34 AM, "Bauer, Herbert S. (Scott)" <Bauer.Scott@mayo.edu>
>>>wrote:
>>>
>>>>I should mention that this worked in 4.10.4 using a very similar code
>>>>base.  -scott
>>>>
>>>>On 6/4/15, 4:51 PM, "Bauer, Herbert S. (Scott)" <Bauer.Scott@mayo.edu>
>>>>wrote:
>>>>
>>>>>I'm working with Lucene 5.1 to try to make use of the relational
>>>>>structure of the block join index and query mechanisms.  I'm querying
>>>>>with the following code:
>>>>>
>>>>>IndexReader reader =  DirectoryReader.open(index);
>>>>>
>>>>>ToParentBlockJoinIndexSearcher searcher = new
>>>>>ToParentBlockJoinIndexSearcher(reader);
>>>>>
>>>>>ToParentBlockJoinCollector collector = new
>>>>>ToParentBlockJoinCollector(Sort.RELEVANCE, 2, true, true);
>>>>>
>>>>>BitDocIdSetFilter codingScheme = new BitDocIdSetCachingWrapperFilter(
>>>>>
>>>>>                  new QueryWrapperFilter(new
>>>>>QueryParser("codingSchemeName", new StandardAnalyzer(new CharArraySet(
>>>>>0,
>>>>>true))).parse(scheme.getCodingSchemeName())));
>>>>>
>>>>>  Query query = new QueryParser(null, new StandardAnalyzer(new
>>>>>CharArraySet( 0, true))).createBooleanQuery("propertyValue",
>>>>>term.getTerm(), Occur.MUST);
>>>>>
>>>>>  ToParentBlockJoinQuery termJoinQuery = new ToParentBlockJoinQuery(
>>>>>
>>>>>    query,
>>>>>
>>>>>    codingScheme,
>>>>>
>>>>>    ScoreMode.Avg);
>>>>>
>>>>>  searcher.search(termJoinQuery, collector);
>>>>>
>>>>>
>>>>>I'm trying to get parent values, but the search fails on the final
>>>>>line with the following stack trace:
>>>>>
>>>>>
>>>>>Exception in thread "main" java.lang.IllegalStateException: child query
>>>>>must only match non-parent docs, but parent docID=2147483647 matched
>>>>>childScorer=class org.apache.lucene.search.TermScorer
>>>>>
>>>>>at org.apache.lucene.search.join.ToParentBlockJoinQuery$BlockJoinScorer.nextDoc(ToParentBlockJoinQuery.java:330)
>>>>>
>>>>>at org.apache.lucene.search.join.ToParentBlockJoinIndexSearcher.search(ToParentBlockJoinIndexSearcher.java:63)
>>>>>
>>>>>at org.apache.lucene.search.IndexSearcher.search(IndexSearcher.java:428)
>>>>>
>>>>>at org.lexevs.lucene.prototype.LuceneQueryTrial.luceneToParentJoinQuery(LuceneQueryTrial.java:78)
>>>>>
>>>>>at org.lexevs.lucene.prototype.LuceneQueryTrial.main(LuceneQueryTrial.java:327)
>>>>>
>>>>>
>>>>>I build indexes up to about 36Gb using a code similar to the
>>>>>following:
>>>>>
>>>>>
>>>>>List<Document> list = new ArrayList<Document>();
>>>>>
>>>>>//need a static
>>>>>
>>>>>int staticCount = count;
>>>>>
>>>>>ParentDocObject parent =
>>>>>builder.generateParentDoc(cs.getCodingSchemeName(),
>>>>>
>>>>>cs.getVersion(), cs.getURI(), "description");
>>>>>
>>>>>if 
>>>>>(cs.codingSchemeName.equals(CodingScheme.THESSCHEME.codingSchemeName))
>>>>>{
>>>>>
>>>>>//One per coding Scheme
>>>>>
>>>>>int numberOfProperties = 12;
>>>>>
>>>>>if(!thesExactMatchDone){
>>>>>
>>>>>ChildDocObject child1 =
>>>>>builder.generateChildDocWithSalt(parent,SearchTerms.BLOOD.getTerm());
>>>>>
>>>>>Document doc1 = builder.mapToDocumentExactMatch(child1);
>>>>>
>>>>>list.add(doc1);
>>>>>
>>>>>count++;
>>>>>
>>>>>numberOfProperties--;
>>>>>
>>>>>ChildDocObject child =
>>>>>builder.generateChildDocWithSalt(parent,SearchTerms.CHAR.term);
>>>>>
>>>>>Document doc = builder.mapToDocumentExactMatch(child);
>>>>>
>>>>>count++;
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>numberOfProperties--;
>>>>>
>>>>>thesExactMatchDone = true;
>>>>>
>>>>>}
>>>>>
>>>>>while (numberOfProperties > 0) {
>>>>>
>>>>>if(count % 547 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGenerator(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.BLOOD.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 233 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGenerator(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.CHAR.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 71 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGenerator(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.ARTICLE.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 2237 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGenerator(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.LUNG_CANCER.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 5077 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGenerator(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.LIVER_CARCINOMA.getTerm())
>>>>>)
>>>>>;
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 2371 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGeneratorStartsWith(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.BLOOD.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocumentExactMatch(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 79 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGeneratorStartsWith(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.ARTICLE.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocumentExactMatch(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 3581 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGeneratorStartsWith(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.LUNG_CANCER.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocumentExactMatch(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>}else if(count % 23 == 0){
>>>>>
>>>>>ChildDocObject child = builder.generateChildDocWithSalt(parent,
>>>>>
>>>>>builder.randomTextGeneratorStartsWith(
>>>>>
>>>>>builder.randomNumberGenerator(),SearchTerms.CHAR.getTerm()));
>>>>>
>>>>>Document doc = builder.mapToDocumentExactMatch(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;numberOfProperties--;
>>>>>
>>>>>} else {
>>>>>
>>>>>ChildDocObject child = builder.generateChildDoc(parent);
>>>>>
>>>>>Document doc = builder.mapToDocument(child);
>>>>>
>>>>>list.add(doc);
>>>>>
>>>>>count++;
>>>>>
>>>>>numberOfProperties--;
>>>>>
>>>>>}
>>>>>
>>>>>}
>>>>>
>>>>>}
>>>>>
>>>>>Document par = builder.mapToDocument(parent);
>>>>>
>>>>>list.add(par);
>>>>>
>>>>>writer.addDocuments(list);
>>>>>
>>>>>}
>>>>>
>>>>>
>>>>>Which works pretty well until I scale it up using several instances of
>>>>>this.  When the nextChildDoc document retrieved gets to id 5874902 the
>>>>>line in ToParentBlockJoinQuery
>>>>>
>>>>>
>>>>>        parentDoc = parentBits.nextSetBit(nextChildDoc);
>>>>>
>>>>>
>>>>>Gives the value  2147483647 to the parentDoc, which is not a document
>>>>>id
>>>>>in my index if I understand lucene and Luke correctly since my index
>>>>>has
>>>>>only 42716877 documents.
>>>>>
>>>>>Can someone shed some light on this exception?
>>>>>
>>>>>
>>>>>Thanks,
>>>>>
>>>>>Scott Bauer
>>>>>
>>>>>
>>>>>
>>>>>
>>>>
>>>>
>>>>---------------------------------------------------------------------
>>>>To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
>>>>For additional commands, e-mail: java-user-help@lucene.apache.org
>>>>
>>>
>>
>

Mime
View raw message