lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From o...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn ChineseAnalyzer.java ChineseFilter.java ChineseTokenizer.java
Date Tue, 23 Dec 2003 18:57:27 GMT
otis        2003/12/23 10:57:27

  Added:       contributions/analyzers/src/java/org/apache/lucene/analysis/cn
                        ChineseAnalyzer.java ChineseFilter.java
                        ChineseTokenizer.java
  Log:
  - Initial commit (http://nagoya.apache.org/bugzilla/show_bug.cgi?id=23786)
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseAnalyzer.java
  
  Index: ChineseAnalyzer.java
  ===================================================================
  package org.apache.lucene.analysis.cn;
  
  import java.io.Reader;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  
  /**
   * Title: ChineseAnalyzer
   * Description:
   *   Subclass of org.apache.lucene.analysis.Analyzer whose token stream is
   *   built from a ChineseTokenizer and then filtered with a ChineseFilter.
   * Copyright:   Copyright (c) 2001
   * Company:
   * @author Yiyi Sun
   * @version 1.0
   *
   */

  public class ChineseAnalyzer extends Analyzer {

      /**
      * Creates the analyzer. There is no state to initialise.
      */
      public ChineseAnalyzer() {
      }

      /**
      * Builds the analysis chain for the text supplied by the given Reader.
      *
      * @param fieldName  name of the field being analyzed; not consulted here
      * @param reader     source of the text to tokenize
      * @return  A ChineseFilter wrapping a ChineseTokenizer that reads from
      *          <code>reader</code>.
      */
      public final TokenStream tokenStream(String fieldName, Reader reader) {
          // Tokenizer first, filter on top of it.
          return new ChineseFilter(new ChineseTokenizer(reader));
      }
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseFilter.java
  
  Index: ChineseFilter.java
  ===================================================================
  package org.apache.lucene.analysis.cn;
  
  import java.util.Hashtable;
  import org.apache.lucene.analysis.*;
  
  /**
   * Title: ChineseFilter
   * Description: Filter with a stop word table
   *              Rules: Tokens found in the stop word table are dropped.
   *                     Tokens starting with a lower- or upper-case letter
   *                     (English words) must be longer than 1 character.
   *                     Tokens starting with an OTHER_LETTER character
   *                     (one Chinese character as one Chinese word) are kept.
   *                     Everything else, including tokens starting with a
   *                     digit and empty tokens, is dropped.
   * TO DO:
   *   1. Add Chinese stop words, such as \ue400
   *   2. Dictionary based Chinese word extraction
   *   3. Intelligent Chinese word extraction
   *
   * Copyright:    Copyright (c) 2001
   * Company:
   * @author Yiyi Sun
   * @version 1.0
   *
   */

  public final class ChineseFilter extends TokenFilter {


      // Only English now, Chinese to be added later.
      // Lookup is an exact (case-sensitive) match against these entries.
      public static final String[] STOP_WORDS = {
      "and", "are", "as", "at", "be", "but", "by",
      "for", "if", "in", "into", "is", "it",
      "no", "not", "of", "on", "or", "such",
      "that", "the", "their", "then", "there", "these",
      "they", "this", "to", "was", "will", "with"
      };


      // Stop words keyed by their own text; filled from STOP_WORDS at construction.
      private final Hashtable stopTable;

      /**
      * Wraps the given stream and loads the stop word table from STOP_WORDS.
      *
      * @param in  the token stream to filter
      */
      public ChineseFilter(TokenStream in) {
          input = in;

          stopTable = new Hashtable(STOP_WORDS.length);
          for (int i = 0; i < STOP_WORDS.length; i++) {
              stopTable.put(STOP_WORDS[i], STOP_WORDS[i]);
          }
      }

      /**
      * Returns the next token of the wrapped stream that passes the filter
      * rules, skipping every token that does not.
      *
      * @return  the next accepted token, or null once the wrapped stream is
      *          exhausted
      * @throws java.io.IOException  if the wrapped stream fails while reading
      */
      public final Token next() throws java.io.IOException {

          for (Token token = input.next(); token != null; token = input.next()) {
              String text = token.termText();

              // An empty term has no first character to classify; skip it
              // instead of failing in charAt(0).
              if (text.length() == 0) {
                  continue;
              }

              if (stopTable.containsKey(text)) {
                  continue;
              }

              switch (Character.getType(text.charAt(0))) {

              case Character.LOWERCASE_LETTER:
              case Character.UPPERCASE_LETTER:

                  // English word/token should be longer than 1 character.
                  if (text.length() > 1) {
                      return token;
                  }
                  break;

              case Character.OTHER_LETTER:

                  // One Chinese character as one Chinese word.
                  // Chinese word extraction to be added later here.
                  return token;

              default:
                  // Digits and any other leading character type: drop the token.
                  break;
              }
          }
          return null;
      }

  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/analyzers/src/java/org/apache/lucene/analysis/cn/ChineseTokenizer.java
  
  Index: ChineseTokenizer.java
  ===================================================================
  package org.apache.lucene.analysis.cn;
  
  import java.io.Reader;
  import org.apache.lucene.analysis.*;
  
  
  /**
   * Title: ChineseTokenizer
   * Description: Extract tokens from the Stream using Character.getType()
   *              Rule: A Chinese character as a single token
   *
   *   Characters are classified as follows:
   *     - decimal digits, lower-case and upper-case letters are collected
   *       into runs (lower-cased), cut at MAX_WORD_LEN characters;
   *     - a character of type OTHER_LETTER (which includes CJK ideographs)
   *       becomes a token on its own;
   *     - every other character separates tokens and is discarded.
   *
   * Copyright:   Copyright (c) 2001
   * Company:
   * @author Yiyi Sun
   * @version 1.0
   *
   */

  public final class ChineseTokenizer extends Tokenizer {


      /**
      * Creates a tokenizer reading from the given Reader.
      *
      * @param in  source of the characters to tokenize
      */
      public ChineseTokenizer(Reader in) {
          input = in;
      }

      // offset:      number of input characters consumed so far
      // bufferIndex: next unread position in ioBuffer
      // dataLen:     number of valid characters in ioBuffer, -1 after end of input
      private int offset = 0, bufferIndex=0, dataLen=0;
      private final static int MAX_WORD_LEN = 255;
      private final static int IO_BUFFER_SIZE = 1024;
      // Characters of the token currently being assembled.
      private final char[] buffer = new char[MAX_WORD_LEN];
      // Chunk most recently read from the input.
      private final char[] ioBuffer = new char[IO_BUFFER_SIZE];


      // Number of characters currently held in buffer.
      private int length;
      // Input offset of the first character of the current token.
      private int start;


      /**
      * Appends the lower-cased character to the current token. Must be called
      * right after the character has been consumed, because the token start
      * is derived from the already advanced offset.
      */
      private final void push(char c) {

          if (length == 0) {
              start = offset - 1;                       // start of token
          }
          buffer[length++] = Character.toLowerCase(c);  // buffer it

      }

      /**
      * Turns the buffered characters into a Token spanning start to
      * start+length.
      *
      * @return  the token, or null when nothing has been buffered
      */
      private final Token flush() {

          if (length > 0) {
              return new Token(new String(buffer, 0, length), start, start + length);
          }
          return null;
      }

      /**
      * Reads characters until one complete token has been assembled.
      *
      * @return  the next token, or null when the input is exhausted and no
      *          characters are pending
      * @throws java.io.IOException  if reading from the input fails
      */
      public final Token next() throws java.io.IOException {

          length = 0;
          start = offset;


          while (true) {

              final char c;
              offset++;

              // Refill the I/O buffer once it has been fully consumed.
              if (bufferIndex >= dataLen) {
                  dataLen = input.read(ioBuffer);
                  bufferIndex = 0;
              }

              if (dataLen == -1) {
                  // Nothing was consumed this round: undo the increment so
                  // offset keeps matching the number of characters read.
                  offset--;
                  return flush();
              }
              c = ioBuffer[bufferIndex++];


              switch (Character.getType(c)) {

              case Character.DECIMAL_DIGIT_NUMBER:
              case Character.LOWERCASE_LETTER:
              case Character.UPPERCASE_LETTER:
                  push(c);
                  if (length == MAX_WORD_LEN) {
                      return flush();
                  }
                  break;

              case Character.OTHER_LETTER:
                  if (length > 0) {
                      // The character ends the pending run and must be seen
                      // again by the next call. Un-read it from the buffer
                      // AND from the offset; otherwise every later token
                      // would be reported one position too far to the right.
                      bufferIndex--;
                      offset--;
                      return flush();
                  }
                  push(c);
                  return flush();

              default:
                  // Separator: finish a pending run, otherwise keep scanning.
                  if (length > 0) {
                      return flush();
                  }
                  break;
              }
          }

      }
  }
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message