lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From briango...@apache.org
Subject cvs commit: jakarta-lucene/src/test/org/apache/lucene/queryParser TestQueryParser.java
Date Thu, 01 Nov 2001 01:12:37 GMT
briangoetz    01/10/31 17:12:37

  Modified:    src/java/org/apache/lucene/queryParser QueryParser.jj
               src/test/org/apache/lucene/queryParser TestQueryParser.java
  Added:       src/java/org/apache/lucene/queryParser FastCharStream.java
  Log:
  Fix query parser so it accepts queries with Unicode characters
  
  Revision  Changes    Path
  1.4       +40 -25    jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj
  
  Index: QueryParser.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- QueryParser.jj	2001/09/26 14:38:34	1.3
  +++ QueryParser.jj	2001/11/01 01:12:37	1.4
  @@ -54,7 +54,9 @@
   
   
   options {
  -  STATIC= false;
  +  STATIC=false;
  +  JAVA_UNICODE_ESCAPE=true;
  +  USER_CHAR_STREAM=true;
   }
   
   PARSER_BEGIN(QueryParser)
  @@ -94,6 +96,8 @@
    *   Query  ::= ( Clause )*
    *   Clause ::= ["+", "-"] [<TERM> ":"] ( <TERM> | "(" Query ")" )
    * </pre>
  + *
  + * @author Brian Goetz
    */
   
   public class QueryParser {
  @@ -118,7 +122,7 @@
      *  @param analyzer   used to find terms in the query text.
      */
     public QueryParser(String f, Analyzer a) {
  -    this(new StringReader(""));
  +    this(new FastCharStream(new StringReader("")));
       analyzer = a;
       field = f;
     }
  @@ -128,7 +132,7 @@
      *  @param query	the query string to be parsed.
      */
     public Query parse(String query) throws ParseException {
  -    ReInit(new StringReader(query));
  +    ReInit(new FastCharStream(new StringReader(query)));
       return Query(field);
     }
   
  @@ -168,7 +172,8 @@
       // Use the analyzer to get all the tokens, and then build a TermQuery,
       // PhraseQuery, or nothing based on the term count
       
  -    TokenStream source = analyzer.tokenStream(field, new StringReader(queryText));
  +    TokenStream source = analyzer.tokenStream(field, 
  +                                              new StringReader(queryText));
       Vector v = new Vector();
       org.apache.lucene.analysis.Token t;
   
  @@ -252,8 +257,8 @@
     <#_ALPHA_CHAR: ["a"-"z", "A"-"Z"] >
   | <#_NUM_CHAR:   ["0"-"9"] >
   | <#_ALPHANUM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9" ] >
  -| <#_IDENTIFIER_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_" ] >
  -| <#_IDENTIFIER: <_ALPHA_CHAR> (<_IDENTIFIER_CHAR>)* >
  +| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
  +| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", "\u0080"-"\uFFFE" ] >
   | <#_NEWLINE:    ( "\r\n" | "\r" | "\n" ) >
   | <#_WHITESPACE: ( " " | "\t" ) >
   | <#_QCHAR:      ( "\\" (<_NEWLINE> | ~["a"-"z", "A"-"Z", "0"-"9"] ) ) >
  @@ -272,12 +277,11 @@
   | <CARAT:     "^" >
   | <STAR:      "*" >
   | <QUOTED:     "\"" (~["\""])+ "\"">
  -| <NUMBER:    (<_NUM_CHAR>)+ "." (<_NUM_CHAR>)+ >
  -| <TERM:      <_IDENTIFIER_CHAR> 
  -              ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "*", "?", "~", "{",
"}", "[", "]" ] )* >
  +| <NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
  +| <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
   | <FUZZY:     "~" >
  -| <WILDTERM:  <_IDENTIFIER_CHAR>
  -              ( ~["\"", " ", "\t", "(", ")", ":", "&", "|", "^", "~", "{", "}", "[",
"]" ] )* <_IDENTIFIER_CHAR>>
  +| <WILDTERM:  <_TERM_START_CHAR> 
  +              (<_TERM_CHAR> | ( [ "*", "?" ] ))* >
   | <RANGEIN:   "[" (~["]"])+ "]">
   | <RANGEEX:   "{" (~["}"])+ "}">
   }
  @@ -363,23 +367,34 @@
   }
   {
     ( 
  -     (term=<TERM>|term=<WILDTERM>{wildcard=true;}|term=<NUMBER>)[<STAR>{prefix=true;}|<FUZZY>{fuzzy=true;}][<CARAT>
boost=<NUMBER>]
  -      { if (wildcard)
  -          q = new WildcardQuery(new Term(field, term.image));
  -        else if (prefix) 
  -          q = new PrefixQuery(new Term(field, term.image));
  -        else if (fuzzy)
  -          q = new FuzzyQuery(new Term(field, term.image));
  -        else
  -          q = getFieldQuery(field, analyzer, term.image); }
  -    | (term=<RANGEIN>{rangein=true;}|term=<RANGEEX>)
  +     (
  +       term=<TERM>
  +       | term=<WILDTERM> { wildcard=true; }
  +       | term=<NUMBER>
  +     )
  +     [ <STAR> { prefix=true; } | <FUZZY> { fuzzy=true; } ]
  +     [ <CARAT> boost=<NUMBER> ]
  +     { 
  +       if (wildcard)
  +         q = new WildcardQuery(new Term(field, term.image));
  +       else if (prefix) 
  +         q = new PrefixQuery(new Term(field, term.image));
  +       else if (fuzzy)
  +         q = new FuzzyQuery(new Term(field, term.image));
  +       else
  +         q = getFieldQuery(field, analyzer, term.image); 
  +     }
  +     | ( term=<RANGEIN> { rangein=true; } | term=<RANGEEX> )
           {
             q = getRangeQuery(field, analyzer, 
  -                            term.image.substring(1, term.image.length()-1), rangein);
  +                            term.image.substring(1, term.image.length()-1), 
  +                            rangein);
           }
  -    | term=<QUOTED> 
  -      { q = getFieldQuery(field, analyzer, 
  -                          term.image.substring(1, term.image.length()-1)); }
  +     | term=<QUOTED> 
  +       { 
  +         q = getFieldQuery(field, analyzer, 
  +                           term.image.substring(1, term.image.length()-1)); 
  +       }
     )
     { 
       if (boost != null) {
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/queryParser/FastCharStream.java
  
  Index: FastCharStream.java
  ===================================================================
  // FastCharStream.java
  package org.apache.lucene.queryParser;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.*;
  
  /** An efficient implementation of JavaCC's CharStream interface.  <p>This
   * stream does not count line numbers; it keeps track of the character
   * position of the token in the input instead, as required by Lucene's {@link
   * org.apache.lucene.analysis.Token} API. */
  public final class FastCharStream implements CharStream {
    private static final int INITIAL_CAPACITY = 2048; // size of first buffer

    char[] buffer;                  // chars read from input; null until first refill

    int bufferLength;               // end of valid chars
    int bufferPosition;             // next char to read

    int tokenStart;                 // offset in buffer of current token
    int bufferStart;                // position in input of buffer[0]

    Reader input;                   // source of chars

    /** Constructs a stream that takes its characters from the given Reader.
     * Nothing is read until the first character is requested. */
    public FastCharStream(Reader r) {
      input = r;
    }

    /** Returns the next character, refilling the buffer from the Reader when
     * every buffered character has already been consumed.
     *
     * @throws IOException if the refill fails or hits the end of the input
     */
    public final char readChar() throws IOException {
      if (bufferPosition >= bufferLength) {
        refill();
      }
      return buffer[bufferPosition++];
    }

    /** Makes room in the buffer and reads more characters into it.  The chars
     * of the current token (everything from tokenStart on) are retained: they
     * are moved to the front of the buffer when the token starts past index 0;
     * otherwise the buffer is allocated on first use, or doubled in size when
     * it is completely full.
     *
     * @throws IOException with message "read past eof" when the Reader reports
     *         end of input, or whatever the Reader itself throws
     */
    private void refill() throws IOException {
      final int retained = bufferLength - tokenStart; // chars of current token

      if (tokenStart != 0) {                          // shift token to front
        System.arraycopy(buffer, tokenStart, buffer, 0, retained);
      } else if (buffer == null) {                    // first time: alloc buffer
        buffer = new char[INITIAL_CAPACITY];
      } else if (bufferLength == buffer.length) {     // token fills buffer: grow
        char[] larger = new char[buffer.length * 2];
        System.arraycopy(buffer, 0, larger, 0, bufferLength);
        buffer = larger;
      }

      // The token now begins at index 0; account for the chars dropped in
      // front of it so that input positions stay correct.
      bufferStart += tokenStart;
      tokenStart = 0;
      bufferLength = retained;
      bufferPosition = retained;

      // fill the free space behind the retained chars
      int count = input.read(buffer, retained, buffer.length - retained);
      if (count == -1) {
        throw new IOException("read past eof");
      }
      bufferLength += count;
    }

    /** Marks the next unread character as the start of a new token and
     * returns that character.
     *
     * @throws IOException if the character cannot be read
     */
    public final char BeginToken() throws IOException {
      tokenStart = bufferPosition;
      return readChar();
    }

    /** Moves the read position back by <code>amount</code> characters, so
     * that they are returned again by subsequent reads. */
    public final void backup(int amount) {
      bufferPosition = bufferPosition - amount;
    }

    /** Returns the text of the current token: the chars from the token start
     * up to, but not including, the next char to read. */
    public final String GetImage() {
      int length = bufferPosition - tokenStart;
      return new String(buffer, tokenStart, length);
    }

    /** Returns a copy of the last <code>len</code> chars that were read. */
    public final char[] GetSuffix(int len) {
      char[] suffix = new char[len];
      int from = bufferPosition - len;
      System.arraycopy(buffer, from, suffix, 0, len);
      return suffix;
    }

    /** Closes the underlying Reader.  An IOException raised by the close is
     * reported on System.err and otherwise ignored. */
    public final void Done() {
      try {
        input.close();
      } catch (IOException e) {
        System.err.println("Caught: " + e + "; ignoring.");
      }
    }

    /** Returns the same value as {@link #getEndColumn()}. */
    public final int getColumn() {
      return getEndColumn();
    }

    /** Lines are not counted, so this is always 1. */
    public final int getLine() {
      return getEndLine();
    }

    /** Returns the position in the input of the next char to read. */
    public final int getEndColumn() {
      return bufferStart + bufferPosition;
    }

    /** Lines are not counted, so this is always 1. */
    public final int getEndLine() {
      return 1;
    }

    /** Returns the position in the input at which the current token starts. */
    public final int getBeginColumn() {
      return bufferStart + tokenStart;
    }

    /** Lines are not counted, so this is always 1. */
    public final int getBeginLine() {
      return getEndLine();
    }
  }
  
  
  
  1.3       +3 -0      jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java
  
  Index: TestQueryParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- TestQueryParser.java	2001/09/18 17:35:57	1.2
  +++ TestQueryParser.java	2001/11/01 01:12:37	1.3
  @@ -138,6 +138,8 @@
   
     public void testSimple() throws Exception {
       assertQueryEquals("term term term", null, "term term term");
  +    assertQueryEquals("türm term term", null, "türm term term");
  +    assertQueryEquals("ümlaut", null, "ümlaut");
       assertQueryEquals("term term1 term2", null, "term term term");
       assertQueryEquals("term 1.0 1 2", null, "term");
   
  @@ -163,6 +165,7 @@
   
       assertQueryEquals("germ term^2.0", null, "germ term^2.0");
       assertQueryEquals("term^2.0", null, "term^2.0");
  +    assertQueryEquals("term^2", null, "term^2.0");
   
       assertQueryEquals("(foo OR bar) AND (baz OR boo)", null, 
                         "+(foo bar) +(baz boo)");
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message