lucenenet-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mhern...@apache.org
Subject [08/16] git commit: Port ComplexPhraseQueryParser
Date Sun, 06 Oct 2013 23:47:56 GMT
Port ComplexPhraseQueryParser


Project: http://git-wip-us.apache.org/repos/asf/lucenenet/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucenenet/commit/2245f83e
Tree: http://git-wip-us.apache.org/repos/asf/lucenenet/tree/2245f83e
Diff: http://git-wip-us.apache.org/repos/asf/lucenenet/diff/2245f83e

Branch: refs/heads/branch_4x
Commit: 2245f83e43d7ebe12b2aac4e403aa9471264e1c5
Parents: c5e9e25
Author: Paul Irwin <paulirwin@gmail.com>
Authored: Thu Oct 3 13:09:31 2013 -0400
Committer: Paul Irwin <paulirwin@gmail.com>
Committed: Sat Oct 5 16:37:26 2013 -0400

----------------------------------------------------------------------
 .../QueryParsers/Classic/QueryParserBase.cs     |  20 +-
 .../ComplexPhrase/ComplexPhraseQueryParser.cs   | 395 +++++++++++++++++++
 .../QueryParsers/Contrib.QueryParsers.csproj    |   1 +
 3 files changed, 406 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/Classic/QueryParserBase.cs
----------------------------------------------------------------------
diff --git a/src/contrib/QueryParsers/Classic/QueryParserBase.cs b/src/contrib/QueryParsers/Classic/QueryParserBase.cs
index 7e0a9fb..42520a1 100644
--- a/src/contrib/QueryParsers/Classic/QueryParserBase.cs
+++ b/src/contrib/QueryParsers/Classic/QueryParserBase.cs
@@ -88,7 +88,7 @@ namespace Lucene.Net.QueryParsers.Classic
         public abstract void ReInit(ICharStream stream);
         public abstract Query TopLevelQuery(String field);
 
-        public Query Parse(String query)
+        public virtual Query Parse(String query)
         {
             ReInit(new FastCharStream(new StringReader(query)));
             try
@@ -615,46 +615,46 @@ namespace Lucene.Net.QueryParsers.Classic
             return NewRangeQuery(field, part1, part2, startInclusive, endInclusive);
         }
 
-        protected BooleanQuery NewBooleanQuery(bool disableCoord)
+        protected virtual BooleanQuery NewBooleanQuery(bool disableCoord)
         {
             return new BooleanQuery(disableCoord);
         }
 
-        protected BooleanClause NewBooleanClause(Query q, Occur occur)
+        protected virtual BooleanClause NewBooleanClause(Query q, Occur occur)
         {
             return new BooleanClause(q, occur);
         }
 
-        protected Query NewTermQuery(Term term)
+        protected virtual Query NewTermQuery(Term term)
         {
             return new TermQuery(term);
         }
 
-        protected PhraseQuery NewPhraseQuery()
+        protected virtual PhraseQuery NewPhraseQuery()
         {
             return new PhraseQuery();
         }
 
-        protected MultiPhraseQuery NewMultiPhraseQuery()
+        protected virtual MultiPhraseQuery NewMultiPhraseQuery()
         {
             return new MultiPhraseQuery();
         }
 
-        protected Query NewPrefixQuery(Term prefix)
+        protected virtual Query NewPrefixQuery(Term prefix)
         {
             PrefixQuery query = new PrefixQuery(prefix);
             query.SetRewriteMethod(multiTermRewriteMethod);
             return query;
         }
 
-        protected Query NewRegexpQuery(Term regexp)
+        protected virtual Query NewRegexpQuery(Term regexp)
         {
             RegexpQuery query = new RegexpQuery(regexp);
             query.SetRewriteMethod(multiTermRewriteMethod);
             return query;
         }
 
-        protected Query NewFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
+        protected virtual Query NewFuzzyQuery(Term term, float minimumSimilarity, int prefixLength)
         {
             // FuzzyQuery doesn't yet allow constant score rewrite
             String text = term.Text;
@@ -714,7 +714,7 @@ namespace Lucene.Net.QueryParsers.Classic
             return BytesRef.DeepCopyOf(bytes);
         }
 
-        protected Query NewRangeQuery(String field, String part1, String part2, bool startInclusive, bool endInclusive)
+        protected virtual Query NewRangeQuery(String field, String part1, String part2, bool startInclusive, bool endInclusive)
         {
             BytesRef start;
             BytesRef end;

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs
----------------------------------------------------------------------
diff --git a/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs b/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs
new file mode 100644
index 0000000..fe5d893
--- /dev/null
+++ b/src/contrib/QueryParsers/ComplexPhrase/ComplexPhraseQueryParser.cs
@@ -0,0 +1,395 @@
+using Lucene.Net.Analysis;
+using Lucene.Net.Index;
+using Lucene.Net.QueryParsers.Classic;
+using Lucene.Net.Search;
+using Lucene.Net.Search.Spans;
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using System.Text;
+using System.Threading.Tasks;
+using Version = Lucene.Net.Util.Version;
+
+namespace Lucene.Net.QueryParsers.ComplexPhrase
+{
+    public class ComplexPhraseQueryParser : QueryParser
+    {
+        private List<ComplexPhraseQuery> complexPhrases = null;
+
+        private bool isPass2ResolvingPhrases;
+
+        private ComplexPhraseQuery currentPhraseQuery = null;
+
+        public ComplexPhraseQueryParser(Version matchVersion, String f, Analyzer a)
+            : base(matchVersion, f, a)
+        {
+        }
+
+        protected override Query GetFieldQuery(string field, string queryText, int slop)
+        {
+            ComplexPhraseQuery cpq = new ComplexPhraseQuery(field, queryText, slop);
+            complexPhrases.Add(cpq); // add to list of phrases to be parsed once
+            // we
+            // are through with this pass
+            return cpq;
+        }
+
+        /// <summary>
+        /// Parses <paramref name="query"/> in two passes. The first pass runs the
+        /// base parser; every <see cref="ComplexPhraseQuery"/> created by
+        /// GetFieldQuery during that pass is collected in complexPhrases. The
+        /// second pass asks each collected phrase to parse its own contents, which
+        /// re-enters this method with isPass2ResolvingPhrases set.
+        /// </summary>
+        /// <param name="query">The query string to parse.</param>
+        /// <returns>The query produced by the first (top-level) pass.</returns>
+        public override Query Parse(string query)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                MultiTermQuery.RewriteMethod oldMethod = this.MultiTermRewriteMethod;
+                try
+                {
+                    // Temporarily force BooleanQuery rewrite so that Parser will
+                    // generate visible
+                    // collection of terms which we can convert into SpanQueries.
+                    // ConstantScoreRewrite mode produces an
+                    // opaque ConstantScoreQuery object which cannot be interrogated for
+                    // terms in the same way a BooleanQuery can.
+                    // QueryParser is not guaranteed threadsafe anyway so this temporary
+                    // state change should not
+                    // present an issue
+                    this.MultiTermRewriteMethod = MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE;
+                    return base.Parse(query);
+                }
+                finally
+                {
+                    // Always restore the caller's rewrite method, even if parsing throws.
+                    this.MultiTermRewriteMethod = oldMethod;
+                }
+            }
+
+            // First pass - parse the top-level query recording any PhraseQuerys
+            // which will need to be resolved
+            complexPhrases = new List<ComplexPhraseQuery>();
+            Query q = base.Parse(query);
+
+            // Perform second pass, using this QueryParser to parse any nested
+            // PhraseQueries with different
+            // set of syntax restrictions (i.e. all fields must be same)
+            isPass2ResolvingPhrases = true;
+            try
+            {
+                foreach (ComplexPhraseQuery phrase in complexPhrases)
+                {
+                    // Store the phrase being resolved in the currentPhraseQuery field.
+                    // A loop variable of the same name would hide the field and leave
+                    // it null, making CheckPhraseClauseIsForSameField throw a
+                    // NullReferenceException on the first nested clause.
+                    currentPhraseQuery = phrase;
+
+                    // in each phrase, now parse the contents between quotes as a
+                    // separate parse operation
+                    currentPhraseQuery.ParsePhraseElements(this);
+                }
+            }
+            finally
+            {
+                isPass2ResolvingPhrases = false;
+            }
+            return q;
+        }
+
+        protected override Query NewTermQuery(Term term)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                try
+                {
+                    CheckPhraseClauseIsForSameField(term.Field);
+                }
+                catch (ParseException pe)
+                {
+                    throw new SystemException("Error parsing complex phrase", pe);
+                }
+            }
+            return base.NewTermQuery(term);
+        }
+
+        private void CheckPhraseClauseIsForSameField(string field)
+        {
+            if (!field.Equals(currentPhraseQuery.field))
+            {
+                throw new ParseException("Cannot have clause for field \"" + field
+                    + "\" nested in phrase " + " for field \"" + currentPhraseQuery.field
+                    + "\"");
+            }
+        }
+
+        protected override Query GetWildcardQuery(string field, string termStr)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                CheckPhraseClauseIsForSameField(field);
+            }
+            return base.GetWildcardQuery(field, termStr);
+        }
+
+        protected override Query GetRangeQuery(string field, string part1, string part2, bool startInclusive, bool endInclusive)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                CheckPhraseClauseIsForSameField(field);
+            }
+            return base.GetRangeQuery(field, part1, part2, startInclusive, endInclusive);
+        }
+
+        protected override Query NewRangeQuery(string field, string part1, string part2, bool startInclusive, bool endInclusive)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                // Must use old-style RangeQuery in order to produce a BooleanQuery
+                // that can be turned into SpanOr clause
+                TermRangeQuery rangeQuery = TermRangeQuery.NewStringRange(field, part1, part2, startInclusive, endInclusive);
+                rangeQuery.SetRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+                return rangeQuery;
+            }
+            return base.NewRangeQuery(field, part1, part2, startInclusive, endInclusive);
+        }
+
+        protected override Query GetFuzzyQuery(string field, string termStr, float minSimilarity)
+        {
+            if (isPass2ResolvingPhrases)
+            {
+                CheckPhraseClauseIsForSameField(field);
+            }
+            return base.GetFuzzyQuery(field, termStr, minSimilarity);
+        }
+
+        public class ComplexPhraseQuery : Query
+        {
+            protected internal string field;
+
+            protected internal string phrasedQueryStringContents;
+
+            protected internal int slopFactor;
+
+            private Query contents;
+
+            public ComplexPhraseQuery(string field, string phrasedQueryStringContents, int slopFactor)
+                : base()
+            {
+                this.field = field;
+                this.phrasedQueryStringContents = phrasedQueryStringContents;
+                this.slopFactor = slopFactor;
+            }
+
+            // Called by ComplexPhraseQueryParser for each phrase after the main
+            // parse
+            // thread is through
+            protected internal void ParsePhraseElements(QueryParser qp)
+            {
+                // TODO ensure that field-sensitivity is preserved ie the query
+                // string below is parsed as
+                // field+":("+phrasedQueryStringContents+")"
+                // but this will need code in rewrite to unwrap the first layer of
+                // boolean query
+                contents = qp.Parse(phrasedQueryStringContents);
+            }
+
+            public override Query Rewrite(IndexReader reader)
+            {
+                // ArrayList spanClauses = new ArrayList();
+                if (contents is TermQuery)
+                {
+                    return contents;
+                }
+                // Build a sequence of Span clauses arranged in a SpanNear - child
+                // clauses can be complex
+                // Booleans e.g. nots and ors etc
+                int numNegatives = 0;
+                if (!(contents is BooleanQuery))
+                {
+                    throw new ArgumentException("Unknown query type \""
+                        + contents.GetType().Name
+                        + "\" found in phrase query string \"" + phrasedQueryStringContents
+                        + "\"");
+                }
+                BooleanQuery bq = (BooleanQuery)contents;
+                BooleanClause[] bclauses = bq.Clauses;
+                SpanQuery[] allSpanClauses = new SpanQuery[bclauses.Length];
+                // For all clauses e.g. one* two~
+                for (int i = 0; i < bclauses.Length; i++)
+                {
+                    // HashSet bclauseterms=new HashSet();
+                    Query qc = bclauses[i].Query;
+                    // Rewrite this clause e.g one* becomes (one OR onerous)
+                    qc = qc.Rewrite(reader);
+                    if (bclauses[i].Occur.Equals(Occur.MUST_NOT))
+                    {
+                        numNegatives++;
+                    }
+
+                    if (qc is BooleanQuery)
+                    {
+                        List<SpanQuery> sc = new List<SpanQuery>();
+                        AddComplexPhraseClause(sc, (BooleanQuery)qc);
+                        if (sc.Count > 0)
+                        {
+                            allSpanClauses[i] = sc[0];
+                        }
+                        else
+                        {
+                            // Insert fake term e.g. phrase query was for "Fred Smithe*" and
+                            // there were no "Smithe*" terms - need to
+                            // prevent match on just "Fred".
+                            allSpanClauses[i] = new SpanTermQuery(new Term(field,
+                                "Dummy clause because no terms found - must match nothing"));
+                        }
+                    }
+                    else
+                    {
+                        if (qc is TermQuery)
+                        {
+                            TermQuery tq = (TermQuery)qc;
+                            allSpanClauses[i] = new SpanTermQuery(tq.Term);
+                        }
+                        else
+                        {
+                            throw new ArgumentException("Unknown query type \""
+                                + qc.GetType().Name
+                                + "\" found in phrase query string \""
+                                + phrasedQueryStringContents + "\"");
+                        }
+
+                    }
+                }
+                if (numNegatives == 0)
+                {
+                    // The simple case - no negative elements in phrase
+                    return new SpanNearQuery(allSpanClauses, slopFactor, true);
+                }
+                // Complex case - we have mixed positives and negatives in the
+                // sequence.
+                // Need to return a SpanNotQuery
+                List<SpanQuery> positiveClauses = new List<SpanQuery>();
+                for (int j = 0; j < allSpanClauses.Length; j++)
+                {
+                    if (!bclauses[j].Occur.Equals(Occur.MUST_NOT))
+                    {
+                        positiveClauses.Add(allSpanClauses[j]);
+                    }
+                }
+
+                SpanQuery[] includeClauses = positiveClauses.ToArray();
+
+                SpanQuery include = null;
+                if (includeClauses.Length == 1)
+                {
+                    include = includeClauses[0]; // only one positive clause
+                }
+                else
+                {
+                    // need to increase slop factor based on gaps introduced by
+                    // negatives
+                    include = new SpanNearQuery(includeClauses, slopFactor + numNegatives,
+                        true);
+                }
+                // Use sequence of positive and negative values as the exclude.
+                SpanNearQuery exclude = new SpanNearQuery(allSpanClauses, slopFactor,
+                    true);
+                SpanNotQuery snot = new SpanNotQuery(include, exclude);
+                return snot;
+            }
+
+            private void AddComplexPhraseClause(IList<SpanQuery> spanClauses, BooleanQuery qc)
+            {
+                List<SpanQuery> ors = new List<SpanQuery>();
+                List<SpanQuery> nots = new List<SpanQuery>();
+                BooleanClause[] bclauses = qc.Clauses;
+
+                // For all clauses e.g. one* two~
+                for (int i = 0; i < bclauses.Length; i++)
+                {
+                    Query childQuery = bclauses[i].Query;
+
+                    // select the list to which we will add these options
+                    List<SpanQuery> chosenList = ors;
+                    if (bclauses[i].Occur == Occur.MUST_NOT)
+                    {
+                        chosenList = nots;
+                    }
+
+                    if (childQuery is TermQuery)
+                    {
+                        TermQuery tq = (TermQuery)childQuery;
+                        SpanTermQuery stq = new SpanTermQuery(tq.Term);
+                        stq.Boost = tq.Boost;
+                        chosenList.Add(stq);
+                    }
+                    else if (childQuery is BooleanQuery)
+                    {
+                        BooleanQuery cbq = (BooleanQuery)childQuery;
+                        AddComplexPhraseClause(chosenList, cbq);
+                    }
+                    else
+                    {
+                        // TODO alternatively could call extract terms here?
+                        throw new ArgumentException("Unknown query type:"
+                            + childQuery.GetType().Name);
+                    }
+                }
+                if (ors.Count == 0)
+                {
+                    return;
+                }
+                SpanOrQuery soq = new SpanOrQuery(ors.ToArray());
+                if (nots.Count == 0)
+                {
+                    spanClauses.Add(soq);
+                }
+                else
+                {
+                    SpanOrQuery snqs = new SpanOrQuery(nots.ToArray());
+                    SpanNotQuery snq = new SpanNotQuery(soq, snqs);
+                    spanClauses.Add(snq);
+                }
+            }
+
+            public override string ToString(string field)
+            {
+                return "\"" + phrasedQueryStringContents + "\"";
+            }
+
+            public override int GetHashCode()
+            {
+                int prime = 31;
+                int result = base.GetHashCode();
+                result = prime * result + ((field == null) ? 0 : field.GetHashCode());
+                result = prime
+                    * result
+                    + ((phrasedQueryStringContents == null) ? 0
+                        : phrasedQueryStringContents.GetHashCode());
+                result = prime * result + slopFactor;
+                return result;
+            }
+
+            public override bool Equals(object obj)
+            {
+                if (this == obj)
+                    return true;
+                if (obj == null)
+                    return false;
+                if (GetType() != obj.GetType())
+                    return false;
+                if (!base.Equals(obj))
+                {
+                    return false;
+                }
+                ComplexPhraseQuery other = (ComplexPhraseQuery)obj;
+                if (field == null)
+                {
+                    if (other.field != null)
+                        return false;
+                }
+                else if (!field.Equals(other.field))
+                    return false;
+                if (phrasedQueryStringContents == null)
+                {
+                    if (other.phrasedQueryStringContents != null)
+                        return false;
+                }
+                else if (!phrasedQueryStringContents
+                  .Equals(other.phrasedQueryStringContents))
+                    return false;
+                if (slopFactor != other.slopFactor)
+                    return false;
+                return true;
+            }
+        }
+    }
+}

http://git-wip-us.apache.org/repos/asf/lucenenet/blob/2245f83e/src/contrib/QueryParsers/Contrib.QueryParsers.csproj
----------------------------------------------------------------------
diff --git a/src/contrib/QueryParsers/Contrib.QueryParsers.csproj b/src/contrib/QueryParsers/Contrib.QueryParsers.csproj
index 46d8216..2d29fad 100644
--- a/src/contrib/QueryParsers/Contrib.QueryParsers.csproj
+++ b/src/contrib/QueryParsers/Contrib.QueryParsers.csproj
@@ -51,6 +51,7 @@
     <Compile Include="Classic\QueryParserTokenManager.cs" />
     <Compile Include="Classic\Token.cs" />
     <Compile Include="Classic\TokenMgrError.cs" />
+    <Compile Include="ComplexPhrase\ComplexPhraseQueryParser.cs" />
     <Compile Include="Ext\ExtendableQueryParser.cs" />
     <Compile Include="Ext\ExtensionQuery.cs" />
     <Compile Include="Ext\Extensions.cs" />


Mime
View raw message