lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From briango...@apache.org
Subject cvs commit: jakarta-lucene/src/test/org/apache/lucene/queryParser TestQueryParser.java
Date Thu, 17 Jan 2002 02:49:22 GMT
briangoetz    02/01/16 18:49:22

  Modified:    src/java/org/apache/lucene/queryParser QueryParser.jj
               src/test/org/apache/lucene/analysis TestAnalyzers.java
               src/test/org/apache/lucene/queryParser TestQueryParser.java
  Added:       src/java/org/apache/lucene/analysis NullAnalyzer.java
                        NullTokenizer.java
  Log:
  Fix query parser (finally) to be much more lenient about queries that have funny characters;
added new test cases to test new rules; added NullTokenizer/NullAnalyzer which just pass through
space-separated tokens unmodified (mostly for testing purposes).
  
  Revision  Changes    Path
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/NullAnalyzer.java
  
  Index: NullAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  
  /** An Analyzer whose token stream is a bare NullTokenizer: no filters are
    applied, so tokens are returned exactly as NullTokenizer produces them. */
  
  public final class NullAnalyzer extends Analyzer {
    /** Returns a new NullTokenizer reading from reader; fieldName is not used. */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
      return new NullTokenizer(reader);
    }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/NullTokenizer.java
  
  Index: NullTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  
  /** NullTokenizer divides text at whitespace (as defined by
    Character.isWhitespace) and returns every run of non-whitespace characters
    as a token, completely unmodified: case, digits and punctuation are all
    preserved.  It exists mostly for testing purposes.

    Note: a run longer than MAX_WORD_LEN characters is returned as several
    consecutive tokens of at most MAX_WORD_LEN characters each. */

  public final class NullTokenizer extends Tokenizer {
    public NullTokenizer(Reader in) {
      input = in;
    }

    // offset:      number of characters consumed so far (basis for token offsets)
    // bufferIndex: index of the next unread character in ioBuffer
    // dataLen:     number of valid characters in ioBuffer, or -1 at end of input
    private int offset = 0, bufferIndex = 0, dataLen = 0;
    private final static int MAX_WORD_LEN = 255;
    private final static int IO_BUFFER_SIZE = 1024;
    private final char[] buffer = new char[MAX_WORD_LEN];      // text of the token being built
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];  // chunk most recently read from input

    /** Returns the next whitespace-delimited token, or null once the input is
      exhausted.  The token's start is the zero-based offset of its first
      character in the input and its end is start plus the token length. */
    public final Token next() throws java.io.IOException {
      int length = 0;
      int start = offset;
      while (true) {
        offset++;
        if (bufferIndex >= dataLen) {             // ioBuffer used up: refill it
          dataLen = input.read(ioBuffer);
          bufferIndex = 0;
        }
        if (dataLen == -1) {                      // end of input
          if (length > 0)
            break;                                // flush the pending token
          return null;
        }

        final char c = ioBuffer[bufferIndex++];

        if (Character.isWhitespace(c)) {
          if (length > 0)
            break;                                // whitespace ends the token
          continue;                               // skip leading whitespace
        }

        if (length == 0)                          // start of token
          start = offset - 1;

        buffer[length++] = c;                     // buffer it

        if (length == MAX_WORD_LEN)               // buffer full: emit what we have
          break;
      }

      return new Token(new String(buffer, 0, length), start, start + length);
    }
  }
  
  
  
  1.9       +10 -6     jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj
  
  Index: QueryParser.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- QueryParser.jj	17 Jan 2002 02:00:09 -0000	1.8
  +++ QueryParser.jj	17 Jan 2002 02:49:22 -0000	1.9
  @@ -261,11 +261,16 @@
   
   <*> TOKEN : {
     <#_NUM_CHAR:   ["0"-"9"] >
  -| <#_TERM_START_CHAR: [ "a"-"z", "A"-"Z", "_", "\u0080"-"\uFFFE" ] >
  -| <#_TERM_CHAR: [ "a"-"z", "A"-"Z", "0"-"9", "_", ".", "\u0080"-"\uFFFE" ] >
  +| <#_TERM_START_CHAR: ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", 
  +                         "[", "]", "\"", "{", "}", "~", "*" ] >
  +| <#_TERM_CHAR: <_TERM_START_CHAR> >
   | <#_WHITESPACE: ( " " | "\t" ) >
   }
   
  +<DEFAULT> SKIP : {
  +  <<_WHITESPACE>>
  +}
  +
   <DEFAULT> TOKEN : {
     <AND:       ("AND" | "&&") >
   | <OR:        ("OR" | "||") >
  @@ -275,9 +280,8 @@
   | <LPAREN:    "(" >
   | <RPAREN:    ")" >
   | <COLON:     ":" >
  -| <CARAT:     "^" >
  +| <CARAT:     "^" > : Boost
   | <QUOTED:     "\"" (~["\""])+ "\"">
  -| <NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? >
   | <TERM:      <_TERM_START_CHAR> (<_TERM_CHAR>)*  >
   | <FUZZY:     "~" >
   | <PREFIXTERM:  <_TERM_START_CHAR> (<_TERM_CHAR>)* "*" >
  @@ -287,8 +291,8 @@
   | <RANGEEX:   "{" ( ~[ "}" ] )+ "}">
   }
   
  -<DEFAULT> SKIP : {
  -  <<_WHITESPACE>>
  +<Boost> TOKEN : {
  +<NUMBER:    (<_NUM_CHAR>)+ ( "." (<_NUM_CHAR>)+ )? > : DEFAULT
   }
   
   // *   Query  ::= ( Clause )*
  
  
  
  1.2       +20 -0     jakarta-lucene/src/test/org/apache/lucene/analysis/TestAnalyzers.java
  
  Index: TestAnalyzers.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/analysis/TestAnalyzers.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- TestAnalyzers.java	3 Nov 2001 02:33:46 -0000	1.1
  +++ TestAnalyzers.java	17 Jan 2002 02:49:22 -0000	1.2
  @@ -100,6 +100,26 @@
                        new String[] { "quoted", "word" });
     }
   
  +  public void testNull() throws Exception {
  +    Analyzer a = new NullAnalyzer();
  +    assertAnalyzesTo(a, "foo bar FOO BAR", 
  +                     new String[] { "foo", "bar", "FOO", "BAR" });
  +    assertAnalyzesTo(a, "foo      bar .  FOO <> BAR", 
  +                     new String[] { "foo", "bar", ".", "FOO", "<>", "BAR" });
  +    assertAnalyzesTo(a, "foo.bar.FOO.BAR", 
  +                     new String[] { "foo.bar.FOO.BAR" });
  +    assertAnalyzesTo(a, "U.S.A.", 
  +                     new String[] { "U.S.A." });
  +    assertAnalyzesTo(a, "C++", 
  +                     new String[] { "C++" });
  +    assertAnalyzesTo(a, "B2B", 
  +                     new String[] { "B2B" });
  +    assertAnalyzesTo(a, "2B", 
  +                     new String[] { "2B" });
  +    assertAnalyzesTo(a, "\"QUOTED\" word", 
  +                     new String[] { "\"QUOTED\"", "word" });
  +  }
  +
     public void testStop() throws Exception {
       Analyzer a = new StopAnalyzer();
       assertAnalyzesTo(a, "foo bar FOO BAR", 
  
  
  
  1.7       +20 -3     jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java
  
  Index: TestQueryParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/test/org/apache/lucene/queryParser/TestQueryParser.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- TestQueryParser.java	17 Jan 2002 02:00:09 -0000	1.6
  +++ TestQueryParser.java	17 Jan 2002 02:49:22 -0000	1.7
  @@ -61,6 +61,7 @@
   import org.apache.lucene.queryParser.*;
   import org.apache.lucene.search.*;
   import org.apache.lucene.analysis.*;
  +import org.apache.lucene.analysis.standard.*;
   import org.apache.lucene.analysis.Token;
   
   public class TestQueryParser extends TestCase {
  @@ -135,8 +136,6 @@
       assertQueryEquals("term term term", null, "term term term");
       assertQueryEquals("türm term term", null, "türm term term");
       assertQueryEquals("ümlaut", null, "ümlaut");
  -    assertQueryEquals("term term1 term2", null, "term term term");
  -    assertQueryEquals("term 1.0 1 2", null, "term");
   
       assertQueryEquals("a AND b", null, "+a +b");
       assertQueryEquals("(a AND b)", null, "+a +b");
  @@ -145,7 +144,6 @@
       assertQueryEquals("a AND -b", null, "+a -b");
       assertQueryEquals("a AND !b", null, "+a -b");
       assertQueryEquals("a && b", null, "+a +b");
  -    assertQueryEquals("a&&b", null, "+a +b");
       assertQueryEquals("a && ! b", null, "+a -b");
   
       assertQueryEquals("a OR b", null, "a b");
  @@ -177,6 +175,25 @@
                         "+(apple \"steve jobs\") -(foo bar baz)");
       assertQueryEquals("+title:(dog OR cat) -author:\"bob dole\"", null, 
                         "+(title:dog title:cat) -author:\"bob dole\"");
  +  }
  +
  +  public void testPunct() throws Exception {
  +    Analyzer a = new NullAnalyzer();
  +    assertQueryEquals("a&b", a, "a&b");
  +    assertQueryEquals("a&&b", a, "a&&b");
  +    assertQueryEquals(".NET", a, ".NET");
  +  }
  +
  +  public void testNumber() throws Exception {
  +    // The numbers go away because SimpleAnalyzer ignores them
  +    assertQueryEquals("3", null, "");
  +    assertQueryEquals("term 1.0 1 2", null, "term");
  +    assertQueryEquals("term term1 term2", null, "term term term");
  +
  +    Analyzer a = new StandardAnalyzer();
  +    assertQueryEquals("3", a, "3");
  +    assertQueryEquals("term 1.0 1 2", a, "term 1.0 1 2");
  +    assertQueryEquals("term term1 term2", a, "term term1 term2");
     }
   
     public void testWildcard() throws Exception {
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message