From: mikemccand@apache.org
To: java-commits@lucene.apache.org
Reply-To: java-dev@lucene.apache.org
Subject: svn commit: r785287 - in /lucene/java/trunk/contrib/analyzers/src: java/org/apache/lucene/analysis/cjk/CJKTokenizer.java test/org/apache/lucene/analysis/cjk/ test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
Date: Tue, 16 Jun 2009 16:38:39 -0000
Message-Id: <20090616163839.CFA1B2388895@eris.apache.org>

Author: mikemccand
Date: Tue Jun 16 16:38:39 2009
New Revision: 785287

URL: http://svn.apache.org/viewvc?rev=785287&view=rev
Log:
LUCENE-973: add test case for CJKAnalyzer; fix trailing empty string bug

Added:
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java   (with props)
Modified:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java?rev=785287&r1=785286&r2=785287&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cjk/CJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -37,7 +37,18 @@
  */
 public final class CJKTokenizer extends Tokenizer {
     //~ Static fields/initializers ---------------------------------------------
-
+    /** Word token type */
+    static final int WORD_TYPE = 0;
+
+    /** Single byte token type */
+    static final int SINGLE_TOKEN_TYPE = 1;
+
+    /** Double byte token type */
+    static final int DOUBLE_TOKEN_TYPE = 2;
+
+    /** Names for token types */
+    static final String[] TOKEN_TYPE_NAMES = { "word", "single", "double" };
+
     /** Max word length */
     private static final int MAX_WORD_LEN = 255;
 
@@ -68,7 +79,7 @@
     private final char[] ioBuffer = new char[IO_BUFFER_SIZE];
 
     /** word type: single=>ASCII double=>non-ASCII word=>default */
-    private String tokenType = "word";
+    private int tokenType = WORD_TYPE;
 
     /**
      * tag: previous character is a cached double-byte character "C1C2C3C4"
@@ -105,12 +116,15 @@
     public final Token next(final Token reusableToken) throws java.io.IOException {
         /** how many character(s) has been stored in buffer */
         assert reusableToken != null;
-        int length = 0;
 
-        /** the position used to create Token */
-        int start = offset;
+        while(true) { // loop until we find a non-empty token
+
+            int length = 0;
+
+            /** the position used to create Token */
+            int start = offset;
 
-        while (true) {
+            while (true) { // loop until we've found a full token
 
             /** current character */
             char c;
@@ -150,7 +164,7 @@
                 if (ub == Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS) {
                     int i = (int) c;
                     if (i >= 65281 && i <= 65374) {
-                        /** convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN */
+                        // convert certain HALFWIDTH_AND_FULLWIDTH_FORMS to BASIC_LATIN
                         i = i - 65248;
                         c = (char) i;
                     }
@@ -165,19 +179,17 @@
                 //      ^--: the current character begin to token the ASCII
                 //      letter
                 start = offset - 1;
-            } else if (tokenType == "double") {
+            } else if (tokenType == DOUBLE_TOKEN_TYPE) {
                 // "javaC1C2C3C4linux"
                 //      ^--: the previous non-ASCII
                 //      : the current character
                 offset--;
                 bufferIndex--;
-                tokenType = "single";
 
                 if (preIsTokened == true) {
                     // there is only one non-ASCII has been stored
                     length = 0;
                     preIsTokened = false;
 
-                    break;
                 } else {
                     break;
@@ -186,7 +198,7 @@
 
                 // store the LowerCase(c) in the buffer
                 buffer[length++] = Character.toLowerCase(c);
-                tokenType = "single";
+                tokenType = SINGLE_TOKEN_TYPE;
 
                 // break the procedure if buffer overflowed!
                 if (length == MAX_WORD_LEN) {
@@ -206,9 +218,9 @@
                 if (length == 0) {
                     start = offset - 1;
                     buffer[length++] = c;
-                    tokenType = "double";
+                    tokenType = DOUBLE_TOKEN_TYPE;
                 } else {
-                    if (tokenType == "single") {
+                    if (tokenType == SINGLE_TOKEN_TYPE) {
                         offset--;
                         bufferIndex--;
 
@@ -216,7 +228,7 @@
                         break;
                     } else {
                         buffer[length++] = c;
-                        tokenType = "double";
+                        tokenType = DOUBLE_TOKEN_TYPE;
 
                         if (length == 2) {
                             offset--;
@@ -238,7 +250,16 @@
                 }
             }
         }
+
+        if (length > 0) {
+            return reusableToken.reinit
+                (buffer, 0, length, start, start+length, TOKEN_TYPE_NAMES[tokenType]);
+        } else if (dataLen == -1) {
+            return null;
+        }
 
-        return reusableToken.reinit(buffer, 0, length, start, start+length, tokenType);
+        // Cycle back and try for the next token (don't
+        // return an empty string)
+        }
     }
 }

Added: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java?rev=785287&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java Tue Jun 16 16:38:39 2009
@@ -0,0 +1,155 @@
+package org.apache.lucene.analysis.cjk;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Token;
+
+
+public class TestCJKTokenizer extends TestCase{
+
+  public Token newToken(String termText, int start, int end, int type) {
+    Token token = new Token(start, end);
+    token.setTermBuffer(termText);
+    token.setType(CJKTokenizer.TOKEN_TYPE_NAMES[type]);
+    return token;
+  }
+
+  public void checkCJKToken(final String str, final Token[] out_tokens) throws IOException {
+    CJKTokenizer tokenizer = new CJKTokenizer(new StringReader(str));
+    int i = 0;
+    System.out.println("string[" + str + "]");
+    System.out.print("tokens[");
+    final Token reusableToken = new Token();
+    for (Token token = tokenizer.next(reusableToken) ;
+         token != null ;
+         token = tokenizer.next(reusableToken) ) {
+      if (token.term().equals(out_tokens[i].term())
+          && token.startOffset() == out_tokens[i].startOffset()
+          && token.endOffset() == out_tokens[i].endOffset()
+          && token.type().equals(out_tokens[i].type()) ) {
+        System.out.print( token.term() + " ");
+      }
+      else {
+        fail(token.term() + " (start: " + token.startOffset()
+             + " end: " + token.endOffset() + " type: " + token.type() + ") != "
+             + out_tokens[i].term() + " (start: " + out_tokens[i].startOffset()
+             + " end: " + out_tokens[i].endOffset()
+             + " type: " + out_tokens[i].type() + ")");
+        break;
+      }
+      ++i;
+    }
+    System.out.println("]" + System.getProperty("line.separator"));
+  }
+
+  public void testJa1() throws IOException {
+    String str = "\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00\u4e8c", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u56db\u4e94", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 4, 6, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 5, 7, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e5d\u5341", 8,10, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  public void testJa2() throws IOException {
+    String str = "\u4e00 \u4e8c\u4e09\u56db \u4e94\u516d\u4e03\u516b\u4e5d \u5341";
+
+    Token[] out_tokens = {
+      newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e8c\u4e09", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e09\u56db", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e94\u516d", 6, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516d\u4e03", 7, 9, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u4e03\u516b", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u516b\u4e5d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE),
+      newToken("\u5341", 12,13, CJKTokenizer.DOUBLE_TOKEN_TYPE)
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  public void testC() throws IOException {
+    String str = "abc defgh ijklmn opqrstu vwxy z";
+
+    Token[] out_tokens = {
+      newToken("abc", 0, 3, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("defgh", 4, 9, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("ijklmn", 10, 16, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("opqrstu", 17, 24, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("vwxy", 25, 29, CJKTokenizer.SINGLE_TOKEN_TYPE),
+      newToken("z", 30, 31, CJKTokenizer.SINGLE_TOKEN_TYPE),
+    };
+    checkCJKToken(str, out_tokens);
+  }
+
+  public void testMix() throws IOException {
"\u3042\u3044\u3046\u3048\u304aabc\u304b\u304d\u304f\u3051\u3053"; + + Token[] out_tokens = { + newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("abc", 5, 8, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u304b\u304d", 8, 10, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304d\u304f", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304f\u3051", 10,12, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3051\u3053", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKToken(str, out_tokens); + } + + public void testMix2() throws IOException { + String str = "\u3042\u3044\u3046\u3048\u304aab\u3093c\u304b\u304d\u304f\u3051 \u3053"; + + Token[] out_tokens = { + newToken("\u3042\u3044", 0, 2, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3044\u3046", 1, 3, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3046\u3048", 2, 4, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3048\u304a", 3, 5, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("ab", 5, 7, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u3093", 7, 8, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("c", 8, 9, CJKTokenizer.SINGLE_TOKEN_TYPE), + newToken("\u304b\u304d", 9, 11, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304d\u304f", 10, 12, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u304f\u3051", 11,13, CJKTokenizer.DOUBLE_TOKEN_TYPE), + newToken("\u3053", 14,15, CJKTokenizer.DOUBLE_TOKEN_TYPE) + }; + checkCJKToken(str, out_tokens); + } + + public void testSingleChar() throws IOException { + String str = "\u4e00"; + + Token[] out_tokens = { + newToken("\u4e00", 0, 1, CJKTokenizer.DOUBLE_TOKEN_TYPE), + }; + checkCJKToken(str, out_tokens); + } +} Propchange: lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cjk/TestCJKTokenizer.java ------------------------------------------------------------------------------ svn:eol-style = native