lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From cutt...@apache.org
Subject cvs commit: jakarta-lucene/src/java/org/apache/lucene/analysis CharTokenizer.java WhitespaceAnalyzer.java WhitespaceTokenizer.java LetterTokenizer.java LowerCaseTokenizer.java NullAnalyzer.java NullTokenizer.java
Date Thu, 24 Jan 2002 19:02:52 GMT
cutting     02/01/24 11:02:52

  Modified:    src/java/org/apache/lucene/analysis LetterTokenizer.java
                        LowerCaseTokenizer.java
  Added:       src/java/org/apache/lucene/analysis CharTokenizer.java
                        WhitespaceAnalyzer.java WhitespaceTokenizer.java
  Removed:     src/java/org/apache/lucene/analysis NullAnalyzer.java
                        NullTokenizer.java
  Log:
  Renamed NullTokenizer and Analyzer to WhitespaceTokenizer and Analyzer.
  Also re-structured the implementation of several tokenizers so that they
  share code, basing them on the new class CharTokenizer.
  
  Revision  Changes    Path
  1.2       +7 -44     jakarta-lucene/src/java/org/apache/lucene/analysis/LetterTokenizer.java
  
  Index: LetterTokenizer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/LetterTokenizer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- LetterTokenizer.java	18 Sep 2001 16:29:49 -0000	1.1
  +++ LetterTokenizer.java	24 Jan 2002 19:02:52 -0000	1.2
  @@ -63,52 +63,15 @@
     Note: this does a decent job for most European languages, but does a terrible
     job for some Asian languages, where words are not separated by spaces. */
   
  -public final class LetterTokenizer extends Tokenizer {
  +public class LetterTokenizer extends CharTokenizer {
  +  /** Construct a new LetterTokenizer. */
     public LetterTokenizer(Reader in) {
  -    input = in;
  +    super(in);
     }
   
  -  private int offset = 0, bufferIndex=0, dataLen=0;
  -  private final static int MAX_WORD_LEN = 255;
  -  private final static int IO_BUFFER_SIZE = 1024;
  -  private final char[] buffer = new char[MAX_WORD_LEN];
  -  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
  -
  -  public final Token next() throws java.io.IOException {
  -    int length = 0;
  -    int start = offset;
  -    while (true) {
  -      final char c;
  -
  -      offset++;
  -      if (bufferIndex >= dataLen) {
  -        dataLen = input.read(ioBuffer);
  -        bufferIndex = 0;
  -      };
  -      if (dataLen == -1) {
  -	if (length > 0)
  -	  break;
  -	else
  -	  return null;
  -      }
  -      else
  -        c = (char) ioBuffer[bufferIndex++];
  -      
  -      if (Character.isLetter(c)) {		  // if it's a letter
  -
  -	if (length == 0)			  // start of token
  -	  start = offset-1;
  -
  -	buffer[length++] = c;			  // buffer it
  -
  -	if (length == MAX_WORD_LEN)		  // buffer overflow!
  -	  break;
  -
  -      } else if (length > 0)			  // at non-Letter w/ chars
  -	break;					  // return 'em
  -
  -    }
  -
  -    return new Token(new String(buffer, 0, length), start, start+length);
  +  /** Collects only characters which satisfy
  +   * {@link Character#isLetter(char)}.*/
  +  protected boolean isTokenChar(char c) {
  +    return Character.isLetter(c);
     }
   }
  
  
  
  1.2       +7 -44     jakarta-lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java
  
  Index: LowerCaseTokenizer.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/java/org/apache/lucene/analysis/LowerCaseTokenizer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- LowerCaseTokenizer.java	18 Sep 2001 16:29:50 -0000	1.1
  +++ LowerCaseTokenizer.java	24 Jan 2002 19:02:52 -0000	1.2
  @@ -65,52 +65,15 @@
     Note: this does a decent job for most European languages, but does a terrible
     job for some Asian languages, where words are not separated by spaces. */
   
  -public final class LowerCaseTokenizer extends Tokenizer {
  +public final class LowerCaseTokenizer extends LetterTokenizer {
  +  /** Construct a new LowerCaseTokenizer. */
     public LowerCaseTokenizer(Reader in) {
  -    input = in;
  +    super(in);
     }
   
  -  private int offset = 0, bufferIndex=0, dataLen=0;
  -  private final static int MAX_WORD_LEN = 255;
  -  private final static int IO_BUFFER_SIZE = 1024;
  -  private final char[] buffer = new char[MAX_WORD_LEN];
  -  private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
  -
  -  public final Token next() throws java.io.IOException {
  -    int length = 0;
  -    int start = offset;
  -    while (true) {
  -      final char c;
  -
  -      offset++;
  -      if (bufferIndex >= dataLen) {
  -        dataLen = input.read(ioBuffer);
  -        bufferIndex = 0;
  -      };
  -      if (dataLen == -1) {
  -	if (length > 0)
  -	  break;
  -	else
  -	  return null;
  -      }
  -      else
  -        c = (char) ioBuffer[bufferIndex++];
  -      
  -      if (Character.isLetter(c)) {		  // if it's a letter
  -
  -	if (length == 0)			  // start of token
  -	  start = offset-1;
  -
  -	buffer[length++] = Character.toLowerCase(c);
  -                                                  // buffer it
  -	if (length == MAX_WORD_LEN)		  // buffer overflow!
  -	  break;
  -
  -      } else if (length > 0)			  // at non-Letter w/ chars
  -	break;					  // return 'em
  -
  -    }
  -
  -    return new Token(new String(buffer, 0, length), start, start+length);
  +  /** Converts each token character to lower case using
  +   * {@link Character#toLowerCase(char)}.*/
  +  protected char normalize(char c) {
  +    return Character.toLowerCase(c);
     }
   }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/CharTokenizer.java
  
  Index: CharTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  
  /** Abstract base for tokenizers that classify input one character at a
   * time.  Subclasses supply {@link #isTokenChar(char)} to say which characters
   * belong to tokens and may override {@link #normalize(char)} to rewrite each
   * accepted character before it is stored. */
  public abstract class CharTokenizer extends Tokenizer {
    /** Maximum number of characters held by a single token. */
    private final static int MAX_WORD_LEN = 255;
    /** Number of characters requested from the reader per refill. */
    private final static int IO_BUFFER_SIZE = 1024;
  
    /** Characters of the token currently being assembled. */
    private final char[] buffer = new char[MAX_WORD_LEN];
    /** Most recent chunk of characters read from the input. */
    private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
  
    /** Count of characters consumed from the input so far. */
    private int offset = 0;
    /** Position of the next unread character in {@code ioBuffer}. */
    private int bufferIndex = 0;
    /** Result of the last read into {@code ioBuffer}; -1 signals end of input. */
    private int dataLen = 0;
  
    /** Creates a tokenizer that reads its characters from {@code input}. */
    public CharTokenizer(Reader input) {
      this.input = input;
    }
  
    /** Decides whether {@code c} is part of a token.  Maximal runs of
     * characters for which this returns true become tokens; any character for
     * which it returns false ends the current token and is discarded. */
    protected abstract boolean isTokenChar(char c);
  
    /** Hook applied to every accepted character before it is appended to the
     * token text.  This implementation returns {@code c} unchanged; a subclass
     * can override it, for instance to lowercase the token. */
    protected char normalize(char c) {
      return c;
    }
  
    /** Produces the next token from the input.  A token is cut short once it
     * holds {@code MAX_WORD_LEN} characters; the remaining characters are picked
     * up by the following call.
     *
     * @return the next token, or null when the input is exhausted and no token
     *         characters are pending
     * @throws java.io.IOException if reading from the input fails */
    public final Token next() throws java.io.IOException {
      int length = 0;
      int start = offset;
  
      for (;;) {
        // Counted up front, so the current character sits at offset - 1.
        offset++;
  
        if (bufferIndex >= dataLen) {             // chunk used up: refill
          dataLen = input.read(ioBuffer);
          bufferIndex = 0;
        }
  
        if (dataLen == -1) {                      // end of input
          if (length == 0) {
            return null;
          }
          break;                                  // flush the pending token
        }
  
        final char c = ioBuffer[bufferIndex++];
  
        if (!isTokenChar(c)) {
          if (length > 0) {
            break;                                // separator closes the token
          }
          continue;                               // skip leading separators
        }
  
        if (length == 0) {
          start = offset - 1;                     // first character of token
        }
  
        buffer[length++] = normalize(c);
  
        if (length == MAX_WORD_LEN) {
          break;                                  // token buffer is full
        }
      }
  
      return new Token(new String(buffer, 0, length), start, start + length);
    }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/WhitespaceAnalyzer.java
  
  Index: WhitespaceAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  
  /** Analyzer whose token streams are produced by a
   * {@link WhitespaceTokenizer}. */
  public final class WhitespaceAnalyzer extends Analyzer {
    /** Wraps {@code reader} in a new {@link WhitespaceTokenizer}.  The
     * {@code fieldName} argument is not consulted. */
    public final TokenStream tokenStream(String fieldName, Reader reader) {
      final TokenStream stream = new WhitespaceTokenizer(reader);
      return stream;
    }
  }
  
  
  
  1.1                  jakarta-lucene/src/java/org/apache/lucene/analysis/WhitespaceTokenizer.java
  
  Index: WhitespaceTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  
  /** Tokenizer that splits its input at whitespace: each maximal run of
   * non-whitespace characters is one token. */
  public class WhitespaceTokenizer extends CharTokenizer {
    /** Creates a WhitespaceTokenizer reading from {@code in}. */
    public WhitespaceTokenizer(Reader in) {
      super(in);
    }
  
    /** Accepts every character for which
     * {@link Character#isWhitespace(char)} is false. */
    protected boolean isTokenChar(char c) {
      if (Character.isWhitespace(c)) {
        return false;
      }
      return true;
    }
  }
  
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message