incubator-blur-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From amccu...@apache.org
Subject [2/2] git commit: Fixing issues with the parser. Now analyzes query input correctly for fuzzy, prefix, wildcard, range, regex, etc.
Date Mon, 21 Sep 2015 19:47:16 GMT
Fixing issues with the parser.  Now analyzes query input correctly for fuzzy, prefix, wildcard,
range, regex, etc.


Project: http://git-wip-us.apache.org/repos/asf/incubator-blur/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-blur/commit/f8c45c38
Tree: http://git-wip-us.apache.org/repos/asf/incubator-blur/tree/f8c45c38
Diff: http://git-wip-us.apache.org/repos/asf/incubator-blur/diff/f8c45c38

Branch: refs/heads/master
Commit: f8c45c38051d02e775cb826c502b7b1e25270a67
Parents: 828c127
Author: Aaron McCurry <amccurry@gmail.com>
Authored: Mon Sep 21 15:46:46 2015 -0400
Committer: Aaron McCurry <amccurry@gmail.com>
Committed: Mon Sep 21 15:46:46 2015 -0400

----------------------------------------------------------------------
 .../blur/lucene/search/BlurQueryParser.java     | 203 ++++++++++++++++++-
 .../apache/blur/lucene/search/SuperParser.java  |  24 ++-
 .../blur/lucene/search/SuperParserTest.java     | 155 +++++++++++++-
 3 files changed, 377 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java
----------------------------------------------------------------------
diff --git a/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java b/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java
index 801c73f..dd95aaa 100644
--- a/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java
+++ b/blur-query/src/main/java/org/apache/blur/lucene/search/BlurQueryParser.java
@@ -17,14 +17,28 @@ package org.apache.blur.lucene.search;
  * limitations under the License.
  */
 import java.io.IOException;
+import java.io.StringReader;
+import java.text.DateFormat;
+import java.util.Calendar;
+import java.util.Date;
 import java.util.HashMap;
+import java.util.Locale;
 import java.util.Map;
+import java.util.TimeZone;
 import java.util.UUID;
 
 import org.apache.blur.analysis.FieldManager;
+import org.apache.blur.analysis.FieldTypeDefinition;
 import org.apache.blur.utils.BlurConstants;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.core.KeywordAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.DateTools;
 import org.apache.lucene.index.Term;
+import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
@@ -39,11 +53,17 @@ public class BlurQueryParser extends QueryParser {
   protected final Map<Query, String> _fieldNames;
   protected final FieldManager _fieldManager;
 
+  protected final Locale _locale = Locale.getDefault();
+  protected final TimeZone _timeZone = TimeZone.getDefault();
+  protected final boolean _allowLeadingWildcard;
+  protected final int _fuzzyPrefixLength = FuzzyQuery.defaultPrefixLength;
+
   public BlurQueryParser(Version matchVersion, String f, Map<Query, String> fieldNames,
FieldManager fieldManager) {
     super(matchVersion, f, fieldManager.getAnalyzerForQuery());
     _fieldNames = fieldNames == null ? new HashMap<Query, String>() : fieldNames;
     _fieldManager = fieldManager;
-    setAllowLeadingWildcard(true);
+    _allowLeadingWildcard = true;
+    setAllowLeadingWildcard(_allowLeadingWildcard);
     setAutoGeneratePhraseQueries(true);
   }
 
@@ -129,7 +149,7 @@ public class BlurQueryParser extends QueryParser {
     return addField(super.newRangeQuery(resolvedField, part1, part2, startInclusive, endInclusive),
resolvedField);
   }
 
-  private void customQueryCheck(String field) {
+  protected void customQueryCheck(String field) {
     try {
       Boolean b = _fieldManager.checkSupportForCustomQuery(field);
       if (b != null && b) {
@@ -206,9 +226,186 @@ public class BlurQueryParser extends QueryParser {
     return addField(super.newRegexpQuery(new Term(resolvedField, t.text())), resolvedField);
   }
 
-  private Query addField(Query q, String field) {
+  protected Query addField(Query q, String field) {
     _fieldNames.put(q, field);
     return q;
   }
 
+  /**
+   * Runs the query-time analyzer of {@code field} over the literal pieces of
+   * {@code text} while copying unescaped query-syntax characters through
+   * untouched.
+   * <p>
+   * The text is returned as-is when the field has no type definition or when
+   * its query analyzer is a {@link KeywordAnalyzer}. Otherwise every run of
+   * characters between unescaped special characters is analyzed on its own,
+   * and a range opener is copied verbatim through the closing character
+   * reported by {@code getClosingChar}.
+   *
+   * @param field the field whose query analyzer is applied.
+   * @param text the raw term text taken from the query.
+   * @return the text with its literal pieces replaced by their analyzed form.
+   * @throws ParseException if a range opener has no closing character, if a
+   *           literal piece analyzes to more than one token, or if the analyzer
+   *           fails with an {@link IOException} (attached as the cause).
+   */
+  protected String analyzeField(String field, String text) throws ParseException {
+    try {
+      FieldTypeDefinition fieldTypeDefinition = _fieldManager.getFieldTypeDefinition(field);
+      if (fieldTypeDefinition == null) {
+        return text;
+      }
+      Analyzer analyzerForQuery = fieldTypeDefinition.getAnalyzerForQuery(field);
+      if (analyzerForQuery instanceof KeywordAnalyzer) {
+        return text;
+      }
+
+      StringBuilder literal = new StringBuilder();
+      StringBuilder result = new StringBuilder();
+      for (int i = 0; i < text.length(); i++) {
+        char c = text.charAt(i);
+        if (!isSpecialChar(c) || isEscaped(text, i - 1)) {
+          // Ordinary (or escaped) character: part of the next literal piece.
+          literal.append(c);
+          continue;
+        }
+        // Syntax character: flush the pending literal through the analyzer.
+        if (literal.length() > 0) {
+          result.append(analyze(field, literal.toString(), analyzerForQuery));
+          literal.setLength(0);
+        }
+        if (isSpecialRange(c)) {
+          char closingChar = getClosingChar(c);
+          int indexOf = text.indexOf(closingChar, i);
+          if (indexOf < 0) {
+            throw new ParseException("Could not find closing char [" + closingChar + "] in text [" + text + "]");
+          }
+          // Copy the bracketed span untouched and resume after it.
+          result.append(text, i, indexOf + 1);
+          i = indexOf;
+        } else {
+          result.append(c);
+        }
+      }
+      if (literal.length() > 0) {
+        result.append(analyze(field, literal.toString(), analyzerForQuery));
+      }
+      return result.toString();
+    } catch (IOException e) {
+      // ParseException has no (message, cause) constructor; attach the cause
+      // explicitly so the original stack trace is not lost.
+      ParseException parseException = new ParseException(e.getMessage());
+      parseException.initCause(e);
+      throw parseException;
+    }
+  }
+
+  /**
+   * Returns the character that closes a span opened by {@code c}.
+   * <p>
+   * Both openers accepted by {@code isSpecialRange} are handled, so an
+   * unescaped '{' (for example a regexp repetition such as {@code a{2}}) no
+   * longer fails with a "not found" error.
+   *
+   * @param c the opening character.
+   * @return the matching closing character.
+   * @throws ParseException if {@code c} is not a known opening character.
+   */
+  private char getClosingChar(char c) throws ParseException {
+    switch (c) {
+    case '[':
+      return ']';
+    case '{':
+      return '}';
+    default:
+      throw new ParseException("Closing char for " + c + " not found.");
+    }
+  }
+
+  /**
+   * Tells whether {@code c} opens a bracketed span.
+   *
+   * @param c the character to test.
+   * @return {@code true} for '[' and '{', {@code false} otherwise.
+   */
+  private boolean isSpecialRange(char c) {
+    return c == '[' || c == '{';
+  }
+
+  /**
+   * Tells whether {@code c} is one of the syntax characters that must not be
+   * handed to the analyzer: {@code ? / [ ] } { *}.
+   *
+   * @param c the character to test.
+   * @return {@code true} if {@code c} is one of those characters.
+   */
+  private boolean isSpecialChar(char c) {
+    return "?/[]}{*".indexOf(c) >= 0;
+  }
+
+  /**
+   * Tells whether the character following position {@code pos} is escaped,
+   * i.e. preceded by an odd number of consecutive backslashes ending at
+   * {@code pos}.
+   * <p>
+   * {@code pos} is the index just before the character being examined, so it
+   * is {@code -1} when that character is the first one in {@code text}; in
+   * that case nothing can escape it and {@code false} is returned. An even
+   * run of backslashes is a sequence of escaped backslashes and does not
+   * escape the following character.
+   *
+   * @param text the text being scanned.
+   * @param pos index of the character immediately before the one examined;
+   *          may be {@code -1}.
+   * @return {@code true} if the examined character is escaped.
+   */
+  private boolean isEscaped(String text, int pos) {
+    int backslashes = 0;
+    for (int i = pos; i >= 0 && text.charAt(i) == '\\'; i--) {
+      backslashes++;
+    }
+    return (backslashes & 1) == 1;
+  }
+
+  /**
+   * Analyzes {@code text} with the given analyzer and returns the single
+   * token it produces, or the empty string when it produces none.
+   * <p>
+   * The stream is consumed with the full reset / incrementToken / end / close
+   * sequence so that the analyzer's reusable components are released even
+   * when an exception is thrown.
+   *
+   * @param field the field name handed to the analyzer.
+   * @param text the literal text to analyze.
+   * @param analyzerForQuery the analyzer to run.
+   * @return the first token's text, or {@code ""} if there is no token.
+   * @throws IOException if the token stream fails.
+   * @throws ParseException if the text analyzes to more than one token.
+   */
+  private String analyze(String field, String text, Analyzer analyzerForQuery) throws IOException, ParseException {
+    StringBuilder result = new StringBuilder();
+    TokenStream tokenStream = analyzerForQuery.tokenStream(field, new StringReader(text));
+    try {
+      CharTermAttribute termAttribute = tokenStream.getAttribute(CharTermAttribute.class);
+      tokenStream.reset();
+      if (tokenStream.incrementToken()) {
+        result.append(termAttribute.toString());
+      }
+      if (tokenStream.incrementToken()) {
+        throw new ParseException("Should not have multiple tokens in text [" + text + "] for field [" + field + "].");
+      }
+      tokenStream.end();
+    } finally {
+      // Always release the stream; a stream left open cannot be reused.
+      tokenStream.close();
+    }
+    return result.toString();
+  }
+
+  /**
+   * Builds a range query after analyzing both bounds and, where a bound
+   * parses as a date in the parser locale's SHORT format, converting it to
+   * its {@link DateTools} string form.
+   *
+   * @param field the field being queried.
+   * @param part1 the lower bound, may be {@code null}.
+   * @param part2 the upper bound, may be {@code null}.
+   * @param startInclusive whether the lower bound is inclusive.
+   * @param endInclusive whether the upper bound is inclusive.
+   * @return the query produced by {@code newRangeQuery}.
+   * @throws ParseException if analyzing a bound fails.
+   */
+  @Override
+  protected Query getRangeQuery(String field, String part1, String part2, boolean startInclusive, boolean endInclusive)
+      throws ParseException {
+    if (part1 != null) {
+      part1 = analyzeField(field, part1);
+    }
+    if (part2 != null) {
+      part2 = analyzeField(field, part2);
+    }
+
+    DateFormat shortDate = DateFormat.getDateInstance(DateFormat.SHORT, _locale);
+    shortDate.setLenient(true);
+    DateTools.Resolution resolution = getDateResolution(field);
+
+    try {
+      part1 = DateTools.dateToString(shortDate.parse(part1), resolution);
+    } catch (Exception ignored) {
+      // Deliberately broad: any failure (unparsable text, a null bound, ...)
+      // simply means the bound is used as it stands.
+    }
+
+    try {
+      Date upper = shortDate.parse(part2);
+      if (endInclusive) {
+        // Only a date can be typed, never a time of day, so push an inclusive
+        // upper bound to the last millisecond of that day to cover all of it.
+        Calendar endOfDay = Calendar.getInstance(_timeZone, _locale);
+        endOfDay.setTime(upper);
+        endOfDay.set(Calendar.HOUR_OF_DAY, 23);
+        endOfDay.set(Calendar.MINUTE, 59);
+        endOfDay.set(Calendar.SECOND, 59);
+        endOfDay.set(Calendar.MILLISECOND, 999);
+        upper = endOfDay.getTime();
+      }
+      part2 = DateTools.dateToString(upper, resolution);
+    } catch (Exception ignored) {
+      // Same best-effort policy as for the lower bound.
+    }
+    return newRangeQuery(field, part1, part2, startInclusive, endInclusive);
+  }
+
+  /**
+   * Builds a wildcard query, analyzing the literal parts of the term first.
+   * <p>
+   * {@code *:*} becomes a match-all query, and a bare {@code *} term is passed
+   * through without analysis.
+   *
+   * @param field the field being queried.
+   * @param termStr the wildcard term as written in the query.
+   * @return the resulting query.
+   * @throws ParseException if leading wildcards are disallowed and the term
+   *           starts with one, or if analysis fails.
+   */
+  @Override
+  protected Query getWildcardQuery(String field, String termStr) throws ParseException {
+    boolean bareStar = "*".equals(termStr);
+    if ("*".equals(field) && bareStar) {
+      return newMatchAllDocsQuery();
+    }
+    if (!_allowLeadingWildcard && (termStr.startsWith("*") || termStr.startsWith("?"))) {
+      throw new ParseException("'*' or '?' not allowed as first character in WildcardQuery");
+    }
+    String analyzed = bareStar ? termStr : analyzeField(field, termStr);
+    return newWildcardQuery(new Term(field, analyzed));
+  }
+
+  /**
+   * Builds a regexp query from the analyzed form of {@code termStr}.
+   *
+   * @param field the field being queried.
+   * @param termStr the regular expression as written in the query.
+   * @return the resulting query.
+   * @throws ParseException if analysis fails.
+   */
+  @Override
+  protected Query getRegexpQuery(String field, String termStr) throws ParseException {
+    String analyzed = analyzeField(field, termStr);
+    return newRegexpQuery(new Term(field, analyzed));
+  }
+
+  /**
+   * Builds a prefix query from the analyzed form of {@code termStr}.
+   *
+   * @param field the field being queried.
+   * @param termStr the prefix text.
+   * @return the resulting query.
+   * @throws ParseException if leading wildcards are disallowed and the prefix
+   *           starts with '*', or if analysis fails.
+   */
+  @Override
+  protected Query getPrefixQuery(String field, String termStr) throws ParseException {
+    boolean leadingStar = termStr.startsWith("*");
+    if (leadingStar && !_allowLeadingWildcard) {
+      throw new ParseException("'*' not allowed as first character in PrefixQuery");
+    }
+    String analyzed = analyzeField(field, termStr);
+    return newPrefixQuery(new Term(field, analyzed));
+  }
+
+  /**
+   * Builds a fuzzy query from the analyzed form of {@code termStr}, using the
+   * parser's fixed fuzzy prefix length.
+   *
+   * @param field the field being queried.
+   * @param termStr the term text.
+   * @param minSimilarity the similarity value given in the query.
+   * @return the resulting query.
+   * @throws ParseException if analysis fails.
+   */
+  @Override
+  protected Query getFuzzyQuery(String field, String termStr, float minSimilarity) throws ParseException {
+    String analyzed = analyzeField(field, termStr);
+    return newFuzzyQuery(new Term(field, analyzed), minSimilarity, _fuzzyPrefixLength);
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java
----------------------------------------------------------------------
diff --git a/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java b/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java
index b2bc871..40cde15 100644
--- a/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java
+++ b/blur-query/src/main/java/org/apache/blur/lucene/search/SuperParser.java
@@ -274,11 +274,30 @@ public class SuperParser extends BlurQueryParser {
   private boolean isSameGroupName(BooleanQuery booleanQuery) {
     String groupName = findFirstGroupName(booleanQuery);
     if (groupName == null) {
+      if (allFieldQueriesAreSystemFields(booleanQuery)) {
+        return true;
+      }
       return false;
     }
     return isSameGroupName(booleanQuery, groupName);
   }
 
+  /**
+   * Walks the query tree and reports whether every leaf query is registered
+   * against a system field.
+   *
+   * @param query the query (boolean, super, or leaf) to inspect.
+   * @return {@code true} if {@code isSystemField} holds for the field name
+   *         recorded for every leaf; a boolean query without clauses yields
+   *         {@code true}.
+   */
+  private boolean allFieldQueriesAreSystemFields(Query query) {
+    if (query instanceof BooleanQuery) {
+      for (BooleanClause clause : ((BooleanQuery) query).clauses()) {
+        boolean systemOnly = allFieldQueriesAreSystemFields(clause.getQuery());
+        if (!systemOnly) {
+          return false;
+        }
+      }
+      return true;
+    }
+    if (query instanceof SuperQuery) {
+      // Unwrap and inspect the wrapped query.
+      Query inner = ((SuperQuery) query).getQuery();
+      return allFieldQueriesAreSystemFields(inner);
+    }
+    String fieldName = _fieldNames.get(query);
+    return isSystemField(fieldName);
+  }
+
   private boolean isSameGroupName(Query query, String groupName) {
     if (query instanceof BooleanQuery) {
       BooleanQuery booleanQuery = (BooleanQuery) query;
@@ -317,7 +336,10 @@ public class SuperParser extends BlurQueryParser {
     if (query instanceof BooleanQuery) {
       BooleanQuery booleanQuery = (BooleanQuery) query;
       for (BooleanClause clause : booleanQuery.clauses()) {
-        return findFirstGroupName(clause.getQuery());
+        String groupName = findFirstGroupName(clause.getQuery());
+        if (groupName != null) {
+          return groupName;
+        }
       }
       return null;
     } else if (query instanceof SuperQuery) {

http://git-wip-us.apache.org/repos/asf/incubator-blur/blob/f8c45c38/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java
----------------------------------------------------------------------
diff --git a/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java b/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java
index 937589f..f3d8326 100644
--- a/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java
+++ b/blur-query/src/test/java/org/apache/blur/lucene/search/SuperParserTest.java
@@ -22,8 +22,10 @@ import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
 
 import java.io.IOException;
+import java.lang.reflect.Field;
 import java.text.SimpleDateFormat;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Date;
 import java.util.List;
 import java.util.concurrent.TimeUnit;
@@ -41,12 +43,18 @@ import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queryparser.classic.ParseException;
+import org.apache.lucene.search.AutomatonQuery;
 import org.apache.lucene.search.BooleanClause;
 import org.apache.lucene.search.BooleanClause.Occur;
 import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FuzzyQuery;
 import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.NumericRangeQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
+import org.apache.lucene.search.RegexpQuery;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.search.WildcardQuery;
@@ -96,6 +104,7 @@ public class SuperParserTest {
     fieldManager.addColumnDefinitionDouble("a", "id_d");
     fieldManager.addColumnDefinitionFloat("a", "id_f");
     fieldManager.addColumnDefinitionLong("a", "id_l");
+    fieldManager.addColumnDefinitionString("a", "id_s");
     fieldManager.addColumnDefinitionDate("a", "id_date", "yyyy-MM-dd");
     fieldManager.addColumnDefinitionGisRecursivePrefixTree("a", "id_gis");
     return fieldManager;
@@ -565,6 +574,84 @@ public class SuperParserTest {
     assertTrue(equals);
   }
 
+  // A trailing '*' on the string column a.id_s is expected to give a prefix
+  // query whose text is left exactly as typed ("ABC").
+  @Test
+  public void test48() throws ParseException {
+    Query parsed = parseSq("<a.id_s:ABC*>");
+    Query expected = sq(pq("a.id_s", "ABC"));
+    assertQuery(parsed, expected);
+  }
+
+  // Plain words on the default "super" field are expected as lower-cased
+  // term queries ("Jumped" -> "jumped", "Over" -> "over").
+  @Test
+  public void test49() throws ParseException {
+    Query parsed = parseSq("<the cow Jumped Over the moon>");
+    Query expected = sq(bq(
+        bc(tq("super", "the")),
+        bc(tq("super", "cow")),
+        bc(tq("super", "jumped")),
+        bc(tq("super", "over")),
+        bc(tq("super", "the")),
+        bc(tq("super", "moon"))));
+    assertQuery(parsed, expected);
+  }
+
+  // Same as test49 plus a clause on the string column a.id_s, whose text is
+  // expected to keep its case ("ABC").
+  @Test
+  public void test50() throws ParseException {
+    Query parsed = parseSq("<the cow Jumped Over the moon a.id_s:ABC>");
+    Query expected = sq(bq(
+        bc(tq("super", "the")),
+        bc(tq("super", "cow")),
+        bc(tq("super", "jumped")),
+        bc(tq("super", "over")),
+        bc(tq("super", "the")),
+        bc(tq("super", "moon")),
+        bc(tq("a.id_s", "ABC"))));
+    assertQuery(parsed, expected);
+  }
+
+  // Prefix terms on "super" are expected to be lower-cased before the prefix
+  // query is built ("Here*" -> "here", "Go*" -> "go").
+  @Test
+  public void test51() throws ParseException {
+    Query parsed = parseSq("<Here* We Go*>");
+    Query expected = sq(bq(
+        bc(pq("super", "here")),
+        bc(tq("super", "we")),
+        bc(pq("super", "go"))));
+    assertQuery(parsed, expected);
+  }
+
+  // A '?' wildcard term is expected to be lower-cased around the wildcard
+  // character ("He?e" -> "he?e").
+  @Test
+  public void test52() throws ParseException {
+    Query parsed = parseSq("<He?e We Go*>");
+    Query expected = sq(bq(
+        bc(wq("super", "he?e")),
+        bc(tq("super", "we")),
+        bc(pq("super", "go"))));
+    assertQuery(parsed, expected);
+  }
+
+  // In a regexp term the bracketed class is expected to stay untouched while
+  // the text around it is lower-cased ("He[rR]e" -> "he[rR]e").
+  @Test
+  public void test53() throws ParseException {
+    Query parsed = parseSq("</He[rR]e/ We Go*>");
+    Query expected = sq(bq(
+        bc(rxq("super", "he[rR]e")),
+        bc(tq("super", "we")),
+        bc(pq("super", "go"))));
+    assertQuery(parsed, expected);
+  }
+
+  // A fuzzy term is expected to be lower-cased and to carry the requested
+  // edit distance ("Here~1" -> "here", max edits 1).
+  @Test
+  public void test54() throws ParseException {
+    Query parsed = parseSq("<Here~1 We Go*>");
+    Query expected = sq(bq(
+        bc(fzq("super", "here", 1)),
+        bc(tq("super", "we")),
+        bc(pq("super", "go"))));
+    assertQuery(parsed, expected);
+  }
+
+  // Builds the expected fuzzy query for a field/text pair and edit distance.
+  private Query fzq(String field, String text, int maxEdits) {
+    Term term = new Term(field, text);
+    return new FuzzyQuery(term, maxEdits);
+  }
+
+  // A mixed-bracket range on the string column is expected to give a term
+  // range with an inclusive lower and an exclusive upper bound.
+  @Test
+  public void test55() throws ParseException {
+    Query parsed = parseSq("<a.id_s:[A TO Z}>");
+    Query expected = sq(rq_ie("a.id_s", "A", "Z"));
+    assertQuery(parsed, expected);
+  }
+
+  // Builds the expected string range: lower bound inclusive, upper exclusive.
+  private Query rq_ie(String field, String part1, String part2) {
+    boolean includeLower = true;
+    boolean includeUpper = false;
+    return TermRangeQuery.newStringRange(field, part1, part2, includeLower, includeUpper);
+  }
+
+  // Builds the expected regexp query for a field/text pair.
+  private RegexpQuery rxq(String field, String text) {
+    Term term = new Term(field, text);
+    return new RegexpQuery(term);
+  }
+
+  // Builds the expected wildcard query for a field/text pair.
+  private WildcardQuery wq(String field, String text) {
+    Term term = new Term(field, text);
+    return new WildcardQuery(term);
+  }
+
+  // Builds the expected prefix query for a field/text pair.
+  private PrefixQuery pq(String field, String text) {
+    Term term = new Term(field, text);
+    return new PrefixQuery(term);
+  }
+
   public static BooleanClause bc_m(Query q) {
     return new BooleanClause(q, Occur.MUST);
   }
@@ -591,6 +678,22 @@ public class SuperParserTest {
       assertEqualsSuperQuery((SuperQuery) expected, (SuperQuery) actual);
     } else if (expected instanceof TermQuery) {
       assertEqualsTermQuery((TermQuery) expected, (TermQuery) actual);
+    } else if (expected instanceof PrefixQuery) {
+      assertEqualsPrefixQuery((PrefixQuery) expected, (PrefixQuery) actual);
+    } else if (expected instanceof WildcardQuery) {
+      assertEqualsWildcardQuery((WildcardQuery) expected, (WildcardQuery) actual);
+    } else if (expected instanceof FuzzyQuery) {
+      assertEqualsFuzzyQuery((FuzzyQuery) expected, (FuzzyQuery) actual);
+    } else if (expected instanceof RegexpQuery) {
+      assertEqualsRegexpQuery((RegexpQuery) expected, (RegexpQuery) actual);
+    } else if (expected instanceof TermRangeQuery) {
+      assertEqualsTermRangeQuery((TermRangeQuery) expected, (TermRangeQuery) actual);
+    } else if (expected instanceof MatchAllDocsQuery) {
+      assertEqualsMatchAllDocsQuery((MatchAllDocsQuery) expected, (MatchAllDocsQuery) actual);
+    } else if (expected instanceof MultiPhraseQuery) {
+      assertEqualsMultiPhraseQuery((MultiPhraseQuery) expected, (MultiPhraseQuery) actual);
+    } else if (expected instanceof PhraseQuery) {
+      assertEqualsPhraseQuery((PhraseQuery) expected, (PhraseQuery) actual);
     } else if (expected instanceof NumericRangeQuery<?>) {
       assertEqualsNumericRangeQuery((NumericRangeQuery<?>) expected, (NumericRangeQuery<?>)
actual);
     } else {
@@ -598,6 +701,56 @@ public class SuperParserTest {
     }
   }
 
+  // Two fuzzy queries match when field, term and max edits all agree.
+  private static void assertEqualsFuzzyQuery(FuzzyQuery expected, FuzzyQuery actual) {
+    String expectedField = expected.getField();
+    assertEquals(expectedField, actual.getField());
+    Term expectedTerm = expected.getTerm();
+    assertEquals(expectedTerm, actual.getTerm());
+    int expectedMaxEdits = expected.getMaxEdits();
+    assertEquals(expectedMaxEdits, actual.getMaxEdits());
+  }
+
+  // Two phrase queries match when their term arrays and position arrays are
+  // element-wise equal.
+  private static void assertEqualsPhraseQuery(PhraseQuery expected, PhraseQuery actual) {
+    boolean sameTerms = Arrays.equals(expected.getTerms(), actual.getTerms());
+    assertTrue(sameTerms);
+    boolean samePositions = Arrays.equals(expected.getPositions(), actual.getPositions());
+    assertTrue(samePositions);
+  }
+
+  // Two multi-phrase queries match when they hold the same number of term
+  // arrays, each array is element-wise equal, and the positions agree. This
+  // mirrors the phrase-query check instead of failing with "Not Implemented".
+  private static void assertEqualsMultiPhraseQuery(MultiPhraseQuery expected, MultiPhraseQuery actual) {
+    List<Term[]> expectedTerms = expected.getTermArrays();
+    List<Term[]> actualTerms = actual.getTermArrays();
+    assertEquals(expectedTerms.size(), actualTerms.size());
+    for (int i = 0; i < expectedTerms.size(); i++) {
+      assertTrue(Arrays.equals(expectedTerms.get(i), actualTerms.get(i)));
+    }
+    assertTrue(Arrays.equals(expected.getPositions(), actual.getPositions()));
+  }
+
+  // Intentionally empty: nothing is compared for match-all queries.
+  private static void assertEqualsMatchAllDocsQuery(MatchAllDocsQuery expected, MatchAllDocsQuery actual) {
+  }
+
+  // Two term ranges match when field, both bounds and the inclusiveness of
+  // both bounds agree. Inclusiveness is checked so that a range such as
+  // "[A TO Z}" cannot pass against a query with the wrong bracket semantics.
+  private static void assertEqualsTermRangeQuery(TermRangeQuery expected, TermRangeQuery actual) {
+    assertEquals(expected.getField(), actual.getField());
+    assertEquals(expected.getLowerTerm(), actual.getLowerTerm());
+    assertEquals(expected.getUpperTerm(), actual.getUpperTerm());
+    assertEquals(expected.includesLower(), actual.includesLower());
+    assertEquals(expected.includesUpper(), actual.includesUpper());
+  }
+
+  // Two regexp queries match when their field and their term (read through
+  // the getTerm helper) agree.
+  private static void assertEqualsRegexpQuery(RegexpQuery expected, RegexpQuery actual) {
+    String expectedField = expected.getField();
+    assertEquals(expectedField, actual.getField());
+    Term expectedTerm = getTerm(expected);
+    assertEquals(expectedTerm, getTerm(actual));
+  }
+
+  // Reads the "term" field declared on AutomatonQuery through reflection;
+  // any reflection failure is rethrown wrapped in a RuntimeException.
+  private static Term getTerm(RegexpQuery regexpQuery) {
+    try {
+      Field termField = AutomatonQuery.class.getDeclaredField("term");
+      termField.setAccessible(true);
+      Object value = termField.get(regexpQuery);
+      return (Term) value;
+    } catch (Exception e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  // Two wildcard queries match when field and term agree.
+  private static void assertEqualsWildcardQuery(WildcardQuery expected, WildcardQuery actual) {
+    String expectedField = expected.getField();
+    assertEquals(expectedField, actual.getField());
+    Term expectedTerm = expected.getTerm();
+    assertEquals(expectedTerm, actual.getTerm());
+  }
+
+  // Two prefix queries match when field and prefix term agree.
+  private static void assertEqualsPrefixQuery(PrefixQuery expected, PrefixQuery actual) {
+    String expectedField = expected.getField();
+    assertEquals(expectedField, actual.getField());
+    Term expectedPrefix = expected.getPrefix();
+    assertEquals(expectedPrefix, actual.getPrefix());
+  }
+
   public static void assertEqualsTermQuery(TermQuery expected, TermQuery actual) {
     Term term1 = expected.getTerm();
     Term term2 = actual.getTerm();
@@ -609,7 +762,7 @@ public class SuperParserTest {
   }
 
   public static void assertEqualsSuperQuery(SuperQuery expected, SuperQuery actual) {
-    assertEquals(expected.getQuery(), actual.getQuery());
+    assertEqualsQuery(expected.getQuery(), actual.getQuery());
   }
 
   public static void assertEqualsBooleanQuery(BooleanQuery expected, BooleanQuery actual)
{


Mime
View raw message