lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1481116 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/queryparser/ lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/ lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/ lucene/queryparser/src/test/o...
Date Fri, 10 May 2013 17:39:33 GMT
Author: rmuir
Date: Fri May 10 17:39:30 2013
New Revision: 1481116

URL: http://svn.apache.org/r1481116
Log:
LUCENE-4991: QueryParser doesnt handle synonyms correctly for chinese

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/   (props changed)
    lucene/dev/branches/branch_4x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
    lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
    lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1481116&r1=1481115&r2=1481116&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Fri May 10 17:39:30 2013
@@ -97,6 +97,11 @@ Bug Fixes
 
 * LUCENE-4996: Ensure DocInverterPerField always includes field name
   in exception messages.  (Markus Jelsma via Robert Muir)
+
+* LUCENE-4991: Fix handling of synonyms in classic QueryParser.getFieldQuery for 
+  terms not separated by whitespace. PositionIncrementAttribute was ignored, so with 
+  default AND synonyms wrongly became mandatory clauses, and with OR, the 
+  coordination factor was wrong.  (李威, Robert Muir)
   
 Optimizations
 

Modified: lucene/dev/branches/branch_4x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java?rev=1481116&r1=1481115&r2=1481116&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/queryparser/src/java/org/apache/lucene/queryparser/classic/QueryParserBase.java
Fri May 10 17:39:30 2013
@@ -576,24 +576,53 @@ public abstract class QueryParserBase im
       if (severalTokensAtSamePosition || (!quoted && !autoGeneratePhraseQueries))
{
         if (positionCount == 1 || (!quoted && !autoGeneratePhraseQueries)) {
           // no phrase query:
-          BooleanQuery q = newBooleanQuery(positionCount == 1);
-
-          BooleanClause.Occur occur = positionCount > 1 && operator == AND_OPERATOR
?
-            BooleanClause.Occur.MUST : BooleanClause.Occur.SHOULD;
-
-          for (int i = 0; i < numTokens; i++) {
-            try {
-              boolean hasNext = buffer.incrementToken();
-              assert hasNext == true;
-              termAtt.fillBytesRef();
-            } catch (IOException e) {
-              // safe to ignore, because we know the number of tokens
+          
+          if (positionCount == 1) {
+            // simple case: only one position, with synonyms
+            BooleanQuery q = newBooleanQuery(true);
+            for (int i = 0; i < numTokens; i++) {
+              try {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                termAtt.fillBytesRef();
+              } catch (IOException e) {
+                // safe to ignore, because we know the number of tokens
+              }
+              Query currentQuery = newTermQuery(
+                  new Term(field, BytesRef.deepCopyOf(bytes)));
+              q.add(currentQuery, BooleanClause.Occur.SHOULD);
+            }
+            return q;
+          } else {
+            // multiple positions
+            BooleanQuery q = newBooleanQuery(false);
+            final BooleanClause.Occur occur = operator == Operator.AND ? BooleanClause.Occur.MUST
: BooleanClause.Occur.SHOULD;
+            Query currentQuery = null;
+            for (int i = 0; i < numTokens; i++) {
+              try {
+                boolean hasNext = buffer.incrementToken();
+                assert hasNext == true;
+                termAtt.fillBytesRef();
+              } catch (IOException e) {
+                // safe to ignore, because we know the number of tokens
+              }
+              if (posIncrAtt != null && posIncrAtt.getPositionIncrement() == 0) {
+                if (!(currentQuery instanceof BooleanQuery)) {
+                  Query t = currentQuery;
+                  currentQuery = newBooleanQuery(true);
+                  ((BooleanQuery)currentQuery).add(t, BooleanClause.Occur.SHOULD);
+                }
+                ((BooleanQuery)currentQuery).add(newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes))),
BooleanClause.Occur.SHOULD);
+              } else {
+                if (currentQuery != null) {
+                  q.add(currentQuery, occur);
+                }
+                currentQuery = newTermQuery(new Term(field, BytesRef.deepCopyOf(bytes)));
+              }
             }
-            Query currentQuery = newTermQuery(
-                new Term(field, BytesRef.deepCopyOf(bytes)));
             q.add(currentQuery, occur);
+            return q;
           }
-          return q;
         }
         else {
           // phrase query:

Modified: lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java?rev=1481116&r1=1481115&r2=1481116&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/classic/TestQueryParser.java
Fri May 10 17:39:30 2013
@@ -17,9 +17,17 @@ package org.apache.lucene.queryparser.cl
  * limitations under the License.
  */
 
+import java.io.IOException;
+import java.io.Reader;
+
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.MockAnalyzer;
 import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
 import org.apache.lucene.document.DateTools.Resolution;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryparser.classic.QueryParser.Operator;
@@ -27,6 +35,7 @@ import org.apache.lucene.queryparser.fle
 import org.apache.lucene.queryparser.util.QueryParserTestBase;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.TermQuery;
 
@@ -307,4 +316,178 @@ public class TestQueryParser extends Que
     assertEquals(unexpanded, smart.parse("\"dogs\""));
   }
   
+  // TODO: fold these into QueryParserTestBase
+  
+  /** adds synonym of "dog" for "dogs". */
+  static class MockSynonymAnalyzer extends Analyzer {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      MockTokenizer tokenizer = new MockTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new MockSynonymFilter(tokenizer));
+    }
+  }
+  
+  /** simple synonyms test */
+  public void testSynonyms() throws Exception {
+    BooleanQuery expected = new BooleanQuery(true);
+    expected.add(new TermQuery(new Term("field", "dogs")), BooleanClause.Occur.SHOULD);
+    expected.add(new TermQuery(new Term("field", "dog")), BooleanClause.Occur.SHOULD);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockSynonymAnalyzer());
+    assertEquals(expected, qp.parse("dogs"));
+    assertEquals(expected, qp.parse("\"dogs\""));
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("dogs"));
+    assertEquals(expected, qp.parse("\"dogs\""));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("dogs^2"));
+    assertEquals(expected, qp.parse("\"dogs\"^2"));
+  }
+  
+  /** forms multiphrase query */
+  public void testSynonymsPhrase() throws Exception {
+    MultiPhraseQuery expected = new MultiPhraseQuery();
+    expected.add(new Term("field", "old"));
+    expected.add(new Term[] { new Term("field", "dogs"), new Term("field", "dog") });
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockSynonymAnalyzer());
+    assertEquals(expected, qp.parse("\"old dogs\""));
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("\"old dogs\""));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("\"old dogs\"^2"));
+    expected.setSlop(3);
+    assertEquals(expected, qp.parse("\"old dogs\"~3^2"));
+  }
+  
+  /**
+   * adds synonym of "國" for "国".
+   */
+  protected static class MockCJKSynonymFilter extends TokenFilter {
+    CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+    boolean addSynonym = false;
+    
+    public MockCJKSynonymFilter(TokenStream input) {
+      super(input);
+    }
+
+    @Override
+    public final boolean incrementToken() throws IOException {
+      if (addSynonym) { // inject our synonym
+        clearAttributes();
+        termAtt.setEmpty().append("國");
+        posIncAtt.setPositionIncrement(0);
+        addSynonym = false;
+        return true;
+      }
+      
+      if (input.incrementToken()) {
+        addSynonym = termAtt.toString().equals("国");
+        return true;
+      } else {
+        return false;
+      }
+    } 
+  }
+  
+  static class MockCJKSynonymAnalyzer extends Analyzer {
+    @Override
+    protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
+      Tokenizer tokenizer = new SimpleCJKTokenizer(reader);
+      return new TokenStreamComponents(tokenizer, new MockCJKSynonymFilter(tokenizer));
+    }
+  }
+  
+  /** simple CJK synonym test */
+  public void testCJKSynonym() throws Exception {
+    BooleanQuery expected = new BooleanQuery(true);
+    expected.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    expected.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    assertEquals(expected, qp.parse("国"));
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("国"));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("国^2"));
+  }
+  
+  /** synonyms with default OR operator */
+  public void testCJKSynonymsOR() throws Exception {
+    BooleanQuery expected = new BooleanQuery();
+    expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.SHOULD);
+    BooleanQuery inner = new BooleanQuery(true);
+    inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner, BooleanClause.Occur.SHOULD);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    assertEquals(expected, qp.parse("中国"));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("中国^2"));
+  }
+  
+  /** more complex synonyms with default OR operator */
+  public void testCJKSynonymsOR2() throws Exception {
+    BooleanQuery expected = new BooleanQuery();
+    expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.SHOULD);
+    BooleanQuery inner = new BooleanQuery(true);
+    inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner, BooleanClause.Occur.SHOULD);
+    BooleanQuery inner2 = new BooleanQuery(true);
+    inner2.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner2.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner2, BooleanClause.Occur.SHOULD);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    assertEquals(expected, qp.parse("中国国"));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("中国国^2"));
+  }
+  
+  /** synonyms with default AND operator */
+  public void testCJKSynonymsAND() throws Exception {
+    BooleanQuery expected = new BooleanQuery();
+    expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.MUST);
+    BooleanQuery inner = new BooleanQuery(true);
+    inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner, BooleanClause.Occur.MUST);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("中国"));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("中国^2"));
+  }
+  
+  /** more complex synonyms with default AND operator */
+  public void testCJKSynonymsAND2() throws Exception {
+    BooleanQuery expected = new BooleanQuery();
+    expected.add(new TermQuery(new Term("field", "中")), BooleanClause.Occur.MUST);
+    BooleanQuery inner = new BooleanQuery(true);
+    inner.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner, BooleanClause.Occur.MUST);
+    BooleanQuery inner2 = new BooleanQuery(true);
+    inner2.add(new TermQuery(new Term("field", "国")), BooleanClause.Occur.SHOULD);
+    inner2.add(new TermQuery(new Term("field", "國")), BooleanClause.Occur.SHOULD);
+    expected.add(inner2, BooleanClause.Occur.MUST);
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("中国国"));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("中国国^2"));
+  }
+  
+  /** forms multiphrase query */
+  public void testCJKSynonymsPhrase() throws Exception {
+    MultiPhraseQuery expected = new MultiPhraseQuery();
+    expected.add(new Term("field", "中"));
+    expected.add(new Term[] { new Term("field", "国"), new Term("field", "國")});
+    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, "field", new MockCJKSynonymAnalyzer());
+    qp.setDefaultOperator(Operator.AND);
+    assertEquals(expected, qp.parse("\"中国\""));
+    expected.setBoost(2.0f);
+    assertEquals(expected, qp.parse("\"中国\"^2"));
+    expected.setSlop(3);
+    assertEquals(expected, qp.parse("\"中国\"~3^2"));
+  }
+  
 }

Modified: lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java?rev=1481116&r1=1481115&r2=1481116&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/queryparser/src/test/org/apache/lucene/queryparser/util/QueryParserTestBase.java
Fri May 10 17:39:30 2013
@@ -236,7 +236,7 @@ public abstract class QueryParserTestBas
   }
 
   //individual CJK chars as terms, like StandardAnalyzer
-  private class SimpleCJKTokenizer extends Tokenizer {
+  protected static class SimpleCJKTokenizer extends Tokenizer {
     private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 
     public SimpleCJKTokenizer(Reader input) {
@@ -244,7 +244,7 @@ public abstract class QueryParserTestBas
     }
 
     @Override
-    public boolean incrementToken() throws IOException {
+    public final boolean incrementToken() throws IOException {
       int ch = input.read();
       if (ch < 0)
         return false;
@@ -1088,7 +1088,7 @@ public abstract class QueryParserTestBas
   /**
    * adds synonym of "dog" for "dogs".
    */
-  private class MockSynonymFilter extends TokenFilter {
+  protected static class MockSynonymFilter extends TokenFilter {
     CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
     PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
     boolean addSynonym = false;



Mime
View raw message