lucene-java-commits mailing list archives

From mikemcc...@apache.org
Subject svn commit: r774718 [1/3] - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/ contrib/analyzers/...
Date Thu, 14 May 2009 10:09:24 GMT
Author: mikemccand
Date: Thu May 14 10:09:22 2009
New Revision: 774718

URL: http://svn.apache.org/viewvc?rev=774718&view=rev
Log:
LUCENE-1629: adding new contrib analyzer SmartChineseAnalyzer

Added:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java   (with props)
    lucene/java/trunk/contrib/analyzers/src/resources/
    lucene/java/trunk/contrib/analyzers/src/resources/org/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/ar/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/ar/stopwords.txt   (props changed)
      - copied unchanged from r773521, lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/bigramdict.mem   (with props)
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/coredict.mem   (with props)
    lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt   (with props)
    lucene/java/trunk/contrib/analyzers/src/test/org/apache/lucene/analysis/cn/TestSmartChineseAnalyzer.java
Removed:
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/ar/stopwords.txt
Modified:
    lucene/java/trunk/BUILD.txt
    lucene/java/trunk/CHANGES.txt
    lucene/java/trunk/common-build.xml
    lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html

Modified: lucene/java/trunk/BUILD.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/BUILD.txt?rev=774718&r1=774717&r2=774718&view=diff
==============================================================================
--- lucene/java/trunk/BUILD.txt (original)
+++ lucene/java/trunk/BUILD.txt Thu May 14 10:09:22 2009
@@ -3,14 +3,14 @@
 $Id$
 
 Basic steps:
-  0) Install JDK 1.4 (or greater), Ant 1.6.2 (or greater)
+  0) Install JDK 1.4 (or greater), Ant 1.6.3 (or greater)
   1) Download Lucene from Apache and unpack it
   2) Connect to the top-level of your Lucene installation
   3) Install JavaCC (optional)
   4) Run ant
 
 Step 0) Set up your development environment (JDK 1.4 or greater,
-Ant 1.6.2 or greater)
+Ant 1.6.3 or greater)
 
 We'll assume that you know how to get and set up the JDK - if you
 don't, then we suggest starting at http://java.sun.com and learning
@@ -18,7 +18,7 @@
 JDK 1.4 and later.
 
 Like many Open Source java projects, Lucene uses Apache Ant for build
-control.  Specifically, you MUST use Ant version 1.6.2 or greater.
+control.  Specifically, you MUST use Ant version 1.6.3 or greater.
 
 Ant is "kind of like make without make's wrinkles".  Ant is
 implemented in java and uses XML-based configuration files.  You can

Modified: lucene/java/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/CHANGES.txt?rev=774718&r1=774717&r2=774718&view=diff
==============================================================================
--- lucene/java/trunk/CHANGES.txt (original)
+++ lucene/java/trunk/CHANGES.txt Thu May 14 10:09:22 2009
@@ -308,6 +308,12 @@
     cross-correlate Spans from different fields.
     (Paul Cowan and Chris Hostetter)
     
+25. LUCENE-1629: Add SmartChineseAnalyzer to contrib/analyzers.  It
+    improves on CJKAnalyzer and ChineseAnalyzer by handling Chinese
+    sentences properly.  SmartChineseAnalyzer uses a Hidden Markov
+    Model to tokenize Chinese words in a more intelligent way.
+    (Xiaoping Gao via Mike McCandless)
+  
 Optimizations
 
  1. LUCENE-1427: Fixed QueryWrapperFilter to not waste time computing

Modified: lucene/java/trunk/common-build.xml
URL: http://svn.apache.org/viewvc/lucene/java/trunk/common-build.xml?rev=774718&r1=774717&r2=774718&view=diff
==============================================================================
--- lucene/java/trunk/common-build.xml (original)
+++ lucene/java/trunk/common-build.xml Thu May 14 10:09:22 2009
@@ -233,6 +233,12 @@
       destdir="${build.dir}/classes/java">
       <classpath refid="classpath"/>
     </compile>
+
+    <!-- Copy the resources folder (if present) -->
+    <copy todir="${build.dir}/classes/java" includeEmptyDirs="false">
+      <globmapper from="resources/*" to="*" handledirsep="yes"/>
+      <fileset dir="src" includes="resources/**"/>
+    </copy>
   </target>
 
   <target name="compile" depends="compile-core">

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java Thu May 14 10:09:22 2009
@@ -0,0 +1,129 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.PorterStemFilter;
+import org.apache.lucene.analysis.StopFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
+import org.apache.lucene.analysis.cn.smart.WordSegmenter;
+import org.apache.lucene.analysis.cn.smart.WordTokenizer;
+
+/**
+ * 
+ * SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. It uses
+ * probabilities to compute the optimal segmentation of a Chinese sentence, and it
+ * embeds an English tokenizer, so it handles mixed Chinese/English text
+ * effectively.
+ * 
+ * It is based on the Hidden Markov Model (HMM) from natural language processing:
+ * word frequencies and transition probabilities of Chinese words are estimated by
+ * training on a large corpus, and from these statistics the most likely
+ * (maximum-likelihood) segmentation of the whole sentence is computed.
+ * 
+ * Because intelligent segmentation needs dictionaries holding the word statistics,
+ * SmartChineseAnalyzer needs to be told where the dictionaries are; see
+ * org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to specify this.
+ * 
+ * The algorithm and corpus dictionaries of SmartChineseAnalyzer come from the
+ * ictclas1.0 project (http://www.ictclas.org). The dictionaries have been licensed
+ * by www.ictclas.org under the Apache License v2 (APLv2), and users are welcome to
+ * use them under the terms of the APLv2. Many thanks to www.ictclas.org and the
+ * people behind the ictclas segmenter for their generous contribution!
+ * 
+ * @see org.apache.lucene.analysis.cn.smart.AnalyzerProfile
+ * 
+ */
+public class SmartChineseAnalyzer extends Analyzer {
+
+  private Set stopWords = null;
+
+  private WordSegmenter wordSegment;
+
+  public SmartChineseAnalyzer() {
+    this(false);
+  }
+
+  /**
+   * SmartChineseAnalyzer ships with a default stop-word list, consisting mainly of
+   * punctuation. If punctuation should not appear in the results, pass
+   * useDefaultStopWords as true; when useDefaultStopWords is false, no stop words
+   * are used at all.
+   * 
+   * @param useDefaultStopWords
+   */
+  public SmartChineseAnalyzer(boolean useDefaultStopWords) {
+    if (useDefaultStopWords) {
+      stopWords = loadStopWords(this.getClass().getResourceAsStream(
+          "stopwords.txt"));
+    }
+    wordSegment = new WordSegmenter();
+  }
+
+  /**
+   * Uses a custom stop-word set instead of the built-in one; the stop words can be
+   * loaded with SmartChineseAnalyzer.loadStopWords(InputStream).
+   * 
+   * @param stopWords
+   * @see SmartChineseAnalyzer.loadStopWords(InputStream)
+   */
+  public SmartChineseAnalyzer(Set stopWords) {
+    this.stopWords = stopWords;
+    wordSegment = new WordSegmenter();
+  }
+
+  public TokenStream tokenStream(String fieldName, Reader reader) {
+    TokenStream result = new SentenceTokenizer(reader);
+    result = new WordTokenizer(result, wordSegment);
+    // result = new LowerCaseFilter(result);
+    // LowerCaseFilter is no longer needed, because SegTokenFilter already
+    // lower-cases all English characters.
+    // The stemmer is quite aggressive here; this is not a bug, this is a feature :)
+    result = new PorterStemFilter(result);
+    if (stopWords != null) {
+      result = new StopFilter(result, stopWords, false);
+    }
+    return result;
+  }
+
+  /**
+   * Loads stop words from a stop-word file, a plain UTF-8 text file with one stop
+   * word per line; comments are introduced with "//". The stop words include
+   * Chinese punctuation, the Chinese space character, and words that occur too
+   * frequently to be of much value for indexing.
+   * 
+   * @param input the stop-word file
+   * @return a HashSet of the stop words
+   */
+  public static Set loadStopWords(InputStream input) {
+    String line;
+    Set stopWords = new HashSet();
+    try {
+      BufferedReader br = new BufferedReader(new InputStreamReader(input,
+          "UTF-8"));
+      while ((line = br.readLine()) != null) {
+        if (line.indexOf("//") != -1) {
+          line = line.substring(0, line.indexOf("//"));
+        }
+        line = line.trim();
+        if (line.length() != 0)
+          stopWords.add(line.toLowerCase());
+      }
+      br.close();
+    } catch (IOException e) {
+      System.err.println("WARNING: cannot open stop words list!");
+    }
+    return stopWords;
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/SmartChineseAnalyzer.java
------------------------------------------------------------------------------
    svn:eol-style = native
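
A minimal usage sketch (not part of the patch): it only uses the constructor and
loadStopWords(InputStream) added above, together with the Lucene 2.4-era
TokenStream.next()/Token.term() API this code targets; the stop-word file path is
hypothetical, and the built-in dictionaries bundled as resources in this commit are
assumed to be on the classpath.

    import java.io.FileInputStream;
    import java.io.StringReader;
    import java.util.Set;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    public class CustomStopWordsDemo {
      public static void main(String[] args) throws Exception {
        // Load a custom stop-word list: UTF-8, one word per line, "//" comments.
        Set stopWords = SmartChineseAnalyzer.loadStopWords(
            new FileInputStream("/path/to/stopwords.txt")); // hypothetical path
        SmartChineseAnalyzer analyzer = new SmartChineseAnalyzer(stopWords);
        TokenStream ts = analyzer.tokenStream("content",
            new StringReader("我是中国人。Hello world!"));
        for (Token t = ts.next(); t != null; t = ts.next()) {
          System.out.println(t.term());
        }
      }
    }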

Modified: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html?rev=774718&r1=774717&r2=774718&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html (original)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/package.html Thu May 14 10:09:22 2009
@@ -1,5 +1,51 @@
-<html><head></head>
+<html>
+<head></head>
 <body>
 Analyzer for Chinese.
+
+
+<h2>About SmartChineseAnalyzer</h2>
+<p>SmartChineseAnalyzer is an intelligent Chinese word-segmentation module. Unlike
+ChineseAnalyzer (which splits out every single Chinese character) and CJKAnalyzer
+(which combines every two adjacent characters), it uses probabilities to compute
+the optimal segmentation of a Chinese sentence, and it embeds an English tokenizer,
+so it handles mixed Chinese/English text effectively. At present the dictionaries
+of SmartChineseAnalyzer only support Simplified Chinese.</p>
+
+<p>It is based on the Hidden Markov Model (HMM) from natural language processing:
+word frequencies and transition probabilities of Chinese words are estimated by
+training on a large corpus, and from these statistics the most likely
+(maximum-likelihood) segmentation of the whole sentence is computed.</p>
+
+<p>Comparing the results of the three segmentation modules shows that intelligent
+segmentation matches the original meaning of the sentence more closely, which
+improves search precision.
+<pre>Sentence: 我是中国人 ("I am Chinese")</pre>
+<ol>
+	<li>SmartChineseAnalyzer: 我-是-中国-人</li>
+	<li>ChineseAnalyzer: 我-是-中-国-人</li>
+	<li>CJKAnalyzer: 我是-是中-中国-国人</li>
+</ol>
+</p>
+
+<h3>Configuring the segmentation dictionaries</h3>
+<p>Intelligent segmentation needs dictionaries to hold the word statistics. By
+default SmartChineseAnalyzer uses its built-in dictionaries; to use a specific
+dictionary set, its location must be specified. See
+org.apache.lucene.analysis.cn.smart.AnalyzerProfile for how to do this.</p>
+
+<p><b>The dictionaries can be downloaded from: <a
+	href="http://code.google.com/p/imdict-chinese-analyzer/downloads/list">http://code.google.com/p/imdict-chinese-analyzer/downloads/list</a>
+</b> Download analysis-data.zip, save it locally, and unzip it; it is then ready to
+use.</p>
+
+<p>The simplest way to specify the dictionary location is to add the
+-Danalysis.data.dir parameter at run time:
+<pre>e.g.: java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication</pre>
+</p>
+
+<h3>Version requirements</h3>
+<p>SmartChineseAnalyzer requires Java 1.4 or later and Lucene 2.4.0 or later.
+Lucene 2.3.x should also work, but this has not been tested; users who need it can
+test it themselves.</p>
+
+<h3>Source file and text encoding</h3>
+Except for certain binary files, all text files and Java sources of
+SmartChineseAnalyzer are encoded in UTF-8. Take care to read the text and compile
+the Java sources with the correct encoding, to avoid garbled-character errors.
+
+<h3>Licensing of SmartChineseAnalyzer</h3>
+<p>The algorithm and corpus dictionaries of SmartChineseAnalyzer come from the
+ictclas1.0 project (<a
+	href="http://www.ictclas.org">http://www.ictclas.org</a>).
+The dictionaries are released under the Apache License v2 (APLv2) with the
+permission of the copyright holder, www.ictclas.org; users are welcome to use them
+under the terms of the APLv2. Many thanks to www.ictclas.org and the people behind
+the ictclas segmenter for their hard work and generous contribution!</p>
 </body>
 </html>
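
A sketch (not part of the patch) that reproduces the comparison documented above;
it assumes the ChineseAnalyzer and CJKAnalyzer classes from the same
contrib/analyzers module and the Lucene 2.4-era Token API:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cjk.CJKAnalyzer;
    import org.apache.lucene.analysis.cn.ChineseAnalyzer;
    import org.apache.lucene.analysis.cn.SmartChineseAnalyzer;

    public class SegmentationComparison {
      static void print(String name, Analyzer a, String text) throws Exception {
        TokenStream ts = a.tokenStream("f", new StringReader(text));
        StringBuffer sb = new StringBuffer(name + ":");
        for (Token t = ts.next(); t != null; t = ts.next()) {
          sb.append(' ').append(t.term());
        }
        System.out.println(sb);
      }

      public static void main(String[] args) throws Exception {
        String sentence = "我是中国人";
        print("SmartChineseAnalyzer", new SmartChineseAnalyzer(), sentence); // 我 是 中国 人
        print("ChineseAnalyzer", new ChineseAnalyzer(), sentence);           // 我 是 中 国 人
        print("CJKAnalyzer", new CJKAnalyzer(), sentence);                   // 我是 是中 中国 国人
      }
    }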

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java Thu May 14 10:09:22 2009
@@ -0,0 +1,112 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.util.Properties;
+
+/**
+ * By default, SmartChineseAnalyzer ships with built-in dictionaries and a default
+ * stop-word list, already packaged so that users can use it directly.
+ * 
+ * In special cases, users may need to use their own dictionaries and stop-word
+ * lists. To do so, delete coredict.mem and bigramdict.mem under
+ * org.apache.lucene.analysis.cn.smart.hhmm, then use AnalyzerProfile to specify
+ * the dictionary directory.
+ * 
+ * AnalyzerProfile locates the directory holding the segmentation dictionaries and
+ * the stop-word data; that directory should contain bigramdict.dct, coredict.dct
+ * and stopwords_utf8.txt. The lookup proceeds in the following order:
+ * 
+ * <ol>
+ * <li>Read the runtime system property -Danalysis.data.dir=/path/to/analysis-data;
+ * if it is not set, continue with the next step</li>
+ * <li>Look for an analysis-data directory in the current working directory</li>
+ * <li>Look for an analysis-data directory in the lib/ directory</li>
+ * <li>Look for an analysis.properties file in the current working directory</li>
+ * <li>Look for an analysis.properties file in the lib/ directory</li>
+ * </ol>
+ * 
+ * In analysis.properties, the analysis.data.dir property names the location of the
+ * analysis-data directory. Example content of analysis.properties:
+ * 
+ * <pre>
+ * analysis.data.dir=D:/path/to/analysis-data/
+ * </pre>
+ * 
+ * When no analysis-data directory can be found, ANALYSIS_DATA_DIR is set to "", so
+ * before use the data directory must be specified explicitly in the program, e.g.:
+ * 
+ * <pre>
+ * AnalyzerProfile.ANALYSIS_DATA_DIR = &quot;/path/to/analysis-data&quot;;
+ * </pre>
+ * 
+ */
+public class AnalyzerProfile {
+
+  public static String ANALYSIS_DATA_DIR = "";
+
+  static {
+    init();
+  }
+
+  private static void init() {
+    String dirName = "analysis-data";
+    String propName = "analysis.properties";
+
+    // Read the system property; add -Danalysis.data.dir=/path/to/analysis-data at run time
+    ANALYSIS_DATA_DIR = System.getProperty("analysis.data.dir", "");
+    if (ANALYSIS_DATA_DIR.length() != 0)
+      return;
+
+    File[] candidateFiles = new File[] { new File("./" + dirName),
+        new File("./lib/" + dirName), new File("./" + propName),
+        new File("./lib/" + propName) };
+    for (int i = 0; i < candidateFiles.length; i++) {
+      File file = candidateFiles[i];
+      if (file.exists()) {
+        if (file.isDirectory()) {
+          ANALYSIS_DATA_DIR = file.getAbsolutePath();
+        } else if (file.isFile() && getAnalysisDataDir(file).length() != 0) {
+          ANALYSIS_DATA_DIR = getAnalysisDataDir(file);
+        }
+        break;
+      }
+    }
+
+    if (ANALYSIS_DATA_DIR.length() == 0) {
+      // Tell the user that the dictionary directory was not found
+      System.err
+          .println("WARNING: Cannot find the lexical dictionary directory!");
+      System.err
+          .println("WARNING: This will cause unpredictable exceptions in your application!");
+      System.err
+          .println("WARNING: Please refer to the manual to download the dictionaries.");
+    }
+
+  }
+
+  private static String getAnalysisDataDir(File propFile) {
+    Properties prop = new Properties();
+    try {
+      FileInputStream input = new FileInputStream(propFile);
+      prop.load(input);
+      String dir = prop.getProperty("analysis.data.dir", "");
+      input.close();
+      return dir;
+    } catch (IOException e) {
+    }
+    return "";
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/AnalyzerProfile.java
------------------------------------------------------------------------------
    svn:eol-style = native
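
A configuration sketch (not part of the patch) showing the two explicit options
described above; the paths are hypothetical, and both must run before the
dictionary classes are first loaded:

    // Option 1: the system property, either on the command line ...
    //   java -Danalysis.data.dir=/path/to/analysis-data com.example.YourApplication
    // ... or set programmatically before AnalyzerProfile is initialized:
    System.setProperty("analysis.data.dir", "/path/to/analysis-data");

    // Option 2: assign the public field directly:
    AnalyzerProfile.ANALYSIS_DATA_DIR = "/path/to/analysis-data";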

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java Thu May 14 10:09:22 2009
@@ -0,0 +1,38 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+public class CharType {
+
+  public final static int DELIMITER = 0;
+
+  public final static int LETTER = 1;
+
+  public final static int DIGIT = 2;
+
+  public final static int HANZI = 3;
+
+  public final static int SPACE_LIKE = 4;
+
+  // The types above: punctuation (full- and half-width), half-width letters and
+  // digits, hanzi (Chinese characters), and space-like characters such as spaces
+  // and the "\t\r\n" line-break characters
+  public final static int FULLWIDTH_LETTER = 5;
+
+  public final static int FULLWIDTH_DIGIT = 6; // full-width letters and digits
+
+  public final static int OTHER = 7;
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/CharType.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java Thu May 14 10:09:22 2009
@@ -0,0 +1,102 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.Tokenizer;
+
+/**
+ * 
+ * Emits a Token containing one complete sentence read from the input; the sentence
+ * is then the unit of work for the next segmentation step.
+ * 
+ */
+public class SentenceTokenizer extends Tokenizer {
+
+  /**
+   * Punctuation characters used to cut sentences: 。,!?;,!?;
+   */
+  public final static String PUNCTION = "。,!?;,!?;";
+
+  private StringBuffer buffer = new StringBuffer();
+
+  private BufferedReader bufferInput;
+
+  private int tokenStart = 0, tokenEnd = 0;
+
+  private Token t = new Token();
+
+  public SentenceTokenizer(Reader reader) {
+    bufferInput = new BufferedReader(reader, 2048);
+  }
+
+  public Token next() throws IOException {
+    buffer.setLength(0);
+    int ci;
+    char ch, pch;
+    boolean atBegin = true;
+    tokenStart = tokenEnd;
+    ci = bufferInput.read();
+    ch = (char) ci;
+
+    while (true) {
+      if (ci == -1) {
+        break;
+      } else if (PUNCTION.indexOf(ch) != -1) {
+        // Found the end of the sentence
+        buffer.append(ch);
+        tokenEnd++;
+        break;
+      } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
+        tokenStart++;
+        tokenEnd++;
+        ci = bufferInput.read();
+        ch = (char) ci;
+      } else {
+        buffer.append(ch);
+        atBegin = false;
+        tokenEnd++;
+        pch = ch;
+        ci = bufferInput.read();
+        ch = (char) ci;
+        // If two consecutive skip characters appear (two newlines, two spaces, a
+        // newline plus a space, etc.), treat them as the end of the sentence, so
+        // that an overly long sentence does not run out of memory.
+        if (Utility.SPACES.indexOf(ch) != -1
+            && Utility.SPACES.indexOf(pch) != -1) {
+          // buffer.append(ch);
+          tokenEnd++;
+          break;
+        }
+      }
+    }
+    if (buffer.length() == 0)
+      return null;
+    else {
+      t.clear();
+      t.reinit(buffer.toString(), tokenStart, tokenEnd, "sentence");
+      return t;
+    }
+  }
+
+  public void close() throws IOException {
+    bufferInput.close();
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native
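
A minimal sketch (not part of the patch) of driving SentenceTokenizer on its own,
using the Lucene 2.4-era next() API; the sample text is arbitrary:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;

    public class SentenceSplitDemo {
      public static void main(String[] args) throws Exception {
        SentenceTokenizer st = new SentenceTokenizer(
            new StringReader("今天天气很好。我们去公园吧!How about you?"));
        for (Token t = st.next(); t != null; t = st.next()) {
          // Each Token holds one sentence, cut at the PUNCTION characters.
          System.out.println(t.term());
        }
        st.close();
      }
    }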

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java Thu May 14 10:09:22 2009
@@ -0,0 +1,165 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+public class Utility {
+
+  public static final char[] STRING_CHAR_ARRAY = new String("未##串")
+      .toCharArray();
+
+  public static final char[] NUMBER_CHAR_ARRAY = new String("未##数")
+      .toCharArray();
+
+  public static final char[] START_CHAR_ARRAY = new String("始##始")
+      .toCharArray();
+
+  public static final char[] END_CHAR_ARRAY = new String("末##末").toCharArray();
+
+  public static final char[] COMMON_DELIMITER = new char[] { ',' };
+
+  /**
+   * Characters to skip over, such as tab, carriage return, line feed, etc.
+   */
+  public static final String SPACES = "  \t\r\n";
+
+  public static final int MAX_FREQUENCE = 2079997 + 80000;
+
+  /**
+   * Compares two character arrays element by element, each starting from a given
+   * position. If all elements match and both arrays end at the same time, they are
+   * equal; otherwise an array that has not reached its end is greater than one
+   * that has. If a mismatch occurs before either end, the array with the greater
+   * element at that position is the greater array.
+   * 
+   * @param larray
+   * @param lstartIndex the start position in larray
+   * @param rarray
+   * @param rstartIndex the start position in rarray
+   * @return 0 if equal, 1 if larray > rarray, -1 if larray < rarray
+   */
+  public static int compareArray(char[] larray, int lstartIndex, char[] rarray,
+      int rstartIndex) {
+
+    if (larray == null) {
+      if (rarray == null || rstartIndex >= rarray.length)
+        return 0;
+      else
+        return -1;
+    } else {
+      // larray != null
+      if (rarray == null) {
+        if (lstartIndex >= larray.length)
+          return 0;
+        else
+          return 1;
+      }
+    }
+
+    int li = lstartIndex, ri = rstartIndex;
+    while (li < larray.length && ri < rarray.length && larray[li] == rarray[ri]) {
+      li++;
+      ri++;
+    }
+    if (li == larray.length) {
+      if (ri == rarray.length) {
+        // Equal all the way to the end of both arrays, so return 0 (equal)
+        return 0;
+      } else {
+        // ri > rarray.length is impossible here, so ri < rarray.length:
+        // larray has ended but rarray has not, so larray < rarray; return -1
+        return -1;
+      }
+    } else {
+      // li > larray.length is impossible here, so li < larray.length: li has not reached the end of larray
+      if (ri == rarray.length) {
+        // larray has not ended but rarray has, so larray > rarray
+        return 1;
+      } else {
+        // ri > rarray.length is impossible here, so ri < rarray.length:
+        // neither larray nor rarray has ended, so compare by the next element
+        if (larray[li] > rarray[ri])
+          return 1;
+        else
+          return -1;
+      }
+    }
+  }
+
+  /**
+   * Compares two character arrays by prefix: if the former is a prefix of the
+   * latter, they compare as equal; otherwise they are compared like ordinary
+   * strings.
+   * 
+   * @param shortArray
+   * @param shortIndex
+   * @param longArray
+   * @param longIndex
+   * @return
+   */
+  public static int compareArrayByPrefix(char[] shortArray, int shortIndex,
+      char[] longArray, int longIndex) {
+
+    // An empty array is a prefix of every array, regardless of index
+    if (shortArray == null)
+      return 0;
+    else if (longArray == null)
+      return (shortIndex < shortArray.length) ? 1 : 0;
+
+    int si = shortIndex, li = longIndex;
+    while (si < shortArray.length && li < longArray.length
+        && shortArray[si] == longArray[li]) {
+      si++;
+      li++;
+    }
+    if (si == shortArray.length) {
+      // shortArray is a prefix of longArray
+      return 0;
+    } else {
+      // si > shortArray.length is impossible here, so si < shortArray.length:
+      // si has not reached the end of shortArray
+
+      // shortArray has not ended but longArray has, so shortArray > longArray
+      if (li == longArray.length)
+        return 1;
+      else
+        // li > longArray.length is impossible here, so li < longArray.length:
+        // neither shortArray nor longArray has ended, so compare by the next element
+        return (shortArray[si] > longArray[li]) ? 1 : -1;
+    }
+  }
+
+  public static int getCharType(char ch) {
+    // Hanzi (Chinese characters) are by far the most common case
+    if (ch >= 0x4E00 && ch <= 0x9FA5)
+      return CharType.HANZI;
+    if ((ch >= 0x0041 && ch <= 0x005A) || (ch >= 0x0061 && ch <= 0x007A))
+      return CharType.LETTER;
+    if (ch >= 0x0030 && ch <= 0x0039)
+      return CharType.DIGIT;
+    if (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n' || ch == '　') // the last literal is the full-width space (U+3000)
+      return CharType.SPACE_LIKE;
+    // Everything else in the lower ranges is punctuation
+    if ((ch >= 0x0021 && ch <= 0x00BB) || (ch >= 0x2010 && ch <= 0x2642)
+        || (ch >= 0x3001 && ch <= 0x301E))
+      return CharType.DELIMITER;
+
+    // Full-width character area
+    if ((ch >= 0xFF21 && ch <= 0xFF3A) || (ch >= 0xFF41 && ch <= 0xFF5A))
+      return CharType.FULLWIDTH_LETTER;
+    if (ch >= 0xFF10 && ch <= 0xFF19)
+      return CharType.FULLWIDTH_DIGIT;
+    if (ch >= 0xFE30 && ch <= 0xFF63)
+      return CharType.DELIMITER;
+    return CharType.OTHER;
+
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/Utility.java
------------------------------------------------------------------------------
    svn:eol-style = native
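
A small sketch (not part of the patch) exercising Utility.getCharType against the
CharType constants, plus the prefix comparison; the sample characters are
arbitrary:

    import org.apache.lucene.analysis.cn.smart.CharType;
    import org.apache.lucene.analysis.cn.smart.Utility;

    public class CharTypeDemo {
      public static void main(String[] args) {
        System.out.println(Utility.getCharType('中') == CharType.HANZI);     // true
        System.out.println(Utility.getCharType('A') == CharType.LETTER);    // true
        System.out.println(Utility.getCharType('7') == CharType.DIGIT);     // true
        System.out.println(Utility.getCharType(',') == CharType.DELIMITER); // true
        // "中国" is a prefix of "中国人", so compareArrayByPrefix returns 0 (equal).
        System.out.println(Utility.compareArrayByPrefix(
            "中国".toCharArray(), 0, "中国人".toCharArray(), 0)); // 0
      }
    }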

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Thu May 14 10:09:22 2009
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
+import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
+import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
+
+public class WordSegmenter {
+
+  private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
+
+  private SegTokenFilter tokenFilter = new SegTokenFilter();
+
+  /**
+   * Segments the given sentence Token by invoking the HHMM segmenter and returns
+   * the result as a List of Tokens.
+   * 
+   * @param sentenceToken the Token holding the sentence
+   * @param shortPathCount the number of shortest paths the HHMM algorithm keeps
+   *          before optimization; larger values generally give more accurate
+   *          segmentation at a higher computational cost
+   * @return the segmentation result as a List of Tokens
+   */
+  public List segmentSentence(Token sentenceToken, int shortPathCount) {
+    String sentence = sentenceToken.term();
+
+    List segTokenList = hhmmSegmenter.process(sentence);
+
+    List result = new ArrayList();
+
+    // i runs from 1 to segTokenList.size() - 2, i.e. the two sentinel tokens
+    // "始##始" (start) and "末##末" (end) are dropped
+    for (int i = 1; i < segTokenList.size() - 1; i++) {
+      result.add(convertSegToken((SegToken) segTokenList.get(i), sentence,
+          sentenceToken.startOffset(), "word"));
+    }
+    return result;
+
+  }
+
+  /**
+   * 
+   * Converts a SegToken into the Token type needed for indexing. Indexing needs
+   * the token's content as it appears in the original sentence, so the original
+   * sentence must be supplied for the conversion.
+   * 
+   * @param st the SegToken to convert
+   * @param sentence the sentence content needed for the conversion
+   * @param sentenceStartOffset the start offset of the sentence within the document
+   * @param type the token type, normally "word"
+   * @return
+   */
+  public Token convertSegToken(SegToken st, String sentence,
+      int sentenceStartOffset, String type) {
+    Token result;
+    switch (st.wordType) {
+      case WordType.STRING:
+      case WordType.NUMBER:
+      case WordType.FULLWIDTH_NUMBER:
+      case WordType.FULLWIDTH_STRING:
+        st.charArray = sentence.substring(st.startOffset, st.endOffset)
+            .toCharArray();
+        break;
+      default:
+        break;
+    }
+
+    st = tokenFilter.filter(st);
+
+    result = new Token(st.charArray, 0, st.charArray.length, st.startOffset
+        + sentenceStartOffset, st.endOffset + sentenceStartOffset);
+    return result;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
------------------------------------------------------------------------------
    svn:eol-style = native
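
A sketch (not part of the patch) of using WordSegmenter directly on a single
sentence Token, the way WordTokenizer does internally; shortPathCount = 1 mirrors
the call in WordTokenizer.processNextSentence(), and the dictionaries must be
reachable (see AnalyzerProfile above):

    import java.util.Iterator;
    import java.util.List;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.cn.smart.WordSegmenter;

    public class SegmentSentenceDemo {
      public static void main(String[] args) {
        String sentence = "我是中国人。";
        // Build a sentence Token the same way SentenceTokenizer does.
        Token sentenceToken = new Token();
        sentenceToken.reinit(sentence, 0, sentence.length(), "sentence");
        List words = new WordSegmenter().segmentSentence(sentenceToken, 1);
        for (Iterator it = words.iterator(); it.hasNext();) {
          Token word = (Token) it.next();
          System.out.println(word.term()
              + " [" + word.startOffset() + "," + word.endOffset() + ")");
        }
      }
    }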

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java Thu May 14 10:09:22 2009
@@ -0,0 +1,87 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.List;
+
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.Tokenizer;
+
+public class WordTokenizer extends Tokenizer {
+
+  /**
+   * The main segmentation engine, supplied when the WordTokenizer is created.
+   */
+  private WordSegmenter wordSegmenter;
+
+  private TokenStream in;
+
+  private Iterator tokenIter;
+
+  private List tokenBuffer;
+
+  private Token sentenceToken = new Token();
+
+  /**
+   * By design this is the processing layer above SentenceTokenizer: it reads the
+   * sentences produced by SentenceTokenizer, segments each one with the HHMM
+   * segmenter, and returns the resulting word tokens.
+   * 
+   * @param in the stream of sentence Tokens
+   * @param wordSegmenter the segmenter used to split each sentence into words
+   */
+  public WordTokenizer(TokenStream in, WordSegmenter wordSegmenter) {
+    this.in = in;
+    this.wordSegmenter = wordSegmenter;
+  }
+
+  public Token next() throws IOException {
+    if (tokenIter != null && tokenIter.hasNext())
+      return (Token) tokenIter.next();
+    else {
+      if (processNextSentence()) {
+        return (Token) tokenIter.next();
+      } else
+        return null;
+    }
+  }
+
+  /**
+   * When the current sentence has been fully segmented and consumed, the next
+   * sentence Token must be read. This method asks the upstream SentenceTokenizer
+   * to load the next sentence, segments it, and stores the resulting Tokens in
+   * tokenBuffer.
+   * 
+   * @return whether the next sentence was read and processed successfully; if
+   *         not, the input is exhausted and there are no more Tokens
+   * @throws IOException
+   */
+  private boolean processNextSentence() throws IOException {
+    sentenceToken = in.next(sentenceToken);
+    if (sentenceToken == null)
+      return false;
+    tokenBuffer = wordSegmenter.segmentSentence(sentenceToken, 1);
+    tokenIter = tokenBuffer.iterator();
+    return tokenBuffer != null && tokenIter.hasNext();
+  }
+
+  public void close() throws IOException {
+    in.close();
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native
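
A pipeline sketch (not part of the patch) wiring SentenceTokenizer into
WordTokenizer; this is the same chain SmartChineseAnalyzer.tokenStream() builds
before stemming and stop-word filtering, and it assumes the dictionaries are
reachable:

    import java.io.StringReader;

    import org.apache.lucene.analysis.Token;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.cn.smart.SentenceTokenizer;
    import org.apache.lucene.analysis.cn.smart.WordSegmenter;
    import org.apache.lucene.analysis.cn.smart.WordTokenizer;

    public class PipelineDemo {
      public static void main(String[] args) throws Exception {
        TokenStream ts = new SentenceTokenizer(
            new StringReader("阳光海岸很美。I like it!"));
        ts = new WordTokenizer(ts, new WordSegmenter());
        for (Token t = ts.next(); t != null; t = ts.next()) {
          System.out.println(t.term());
        }
        ts.close();
      }
    }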

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java Thu May 14 10:09:22 2009
@@ -0,0 +1,37 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+public class WordType {
+
+  public final static int SENTENCE_BEGIN = 0;
+
+  public final static int SENTENCE_END = 1; // sentence start and end markers
+
+  public final static int CHINESE_WORD = 2; // a Chinese word
+
+  public final static int STRING = 3;
+
+  public final static int NUMBER = 4; // ASCII strings and numbers
+
+  public final static int DELIMITER = 5; // all punctuation
+
+  public final static int FULLWIDTH_STRING = 6;
+
+  public final static int FULLWIDTH_NUMBER = 7; // strings containing full-width characters, numbers containing full-width digits
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/WordType.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java Thu May 14 10:09:22 2009
@@ -0,0 +1,195 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.UnsupportedEncodingException;
+
+public abstract class AbstractDictionary {
+  /**
+   * The first Chinese character is "啊"; 15 regions precede it, i.e. 15*94 characters.
+   */
+  public static final int GB2312_FIRST_CHAR = 1410;
+
+  /**
+   * Only regions 01~87 of the GB2312 character set can be valid: 87*94 = 8178 characters.
+   */
+  public static final int GB2312_CHAR_NUM = 87 * 94;
+
+  /**
+   * The dictionary file contains frequency statistics for 6768 Chinese characters.
+   */
+  public static final int CHAR_NUM_IN_FILE = 6768;
+
+  // =====================================================
+  // code +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +A +B +C +D +E +F
+  // B0A0 啊 阿 埃 挨 哎 唉 哀 皑 癌 蔼 矮 艾 碍 爱 隘
+  // B0B0 鞍 氨 安 俺 按 暗 岸 胺 案 肮 昂 盎 凹 敖 熬 翱
+  // B0C0 袄 傲 奥 懊 澳 芭 捌 扒 叭 吧 笆 八 疤 巴 拔 跋
+  // B0D0 靶 把 耙 坝 霸 罢 爸 白 柏 百 摆 佰 败 拜 稗 斑
+  // B0E0 班 搬 扳 般 颁 板 版 扮 拌 伴 瓣 半 办 绊 邦 帮
+  // B0F0 梆 榜 膀 绑 棒 磅 蚌 镑 傍 谤 苞 胞 包 褒 剥
+  // =====================================================
+  //
+  // Region/position layout of the GB2312 character set:
+  // region  count  character class
+  // 01      94     general symbols
+  // 02      72     sequence numbers
+  // 03      94     Latin letters
+  // 04      83     Japanese hiragana
+  // 05      86     katakana
+  // 06      48     Greek letters
+  // 07      66     Cyrillic letters
+  // 08      63     pinyin symbols
+  // 09      76     graphical symbols
+  // 10-15          reserved
+  // 16-55   3755   level-1 hanzi, ordered by pinyin
+  // 56-87   3008   level-2 hanzi, ordered by stroke count
+  // 88-94          reserved
+  // ======================================================
+
+  /**
+   * GB2312 encodes 7445 characters in total: 6763 simplified Chinese characters
+   * and 682 letters and symbols.
+   * 
+   * GB2312 divides its characters into 94 regions, numbered 01 to 94; each region
+   * holds 94 characters at positions 01 to 94. Position 01 starts at 0xA1 and
+   * position 94 ends at 0xFE. Every GB2312 character is uniquely identified by
+   * its region and position. For example, the character "啊" is at region 16,
+   * position 01.
+   * 
+   * @param ccid the GB2312 id (region/position index) of the character
+   * @return the character as a String, or "" if ccid is out of range or cannot
+   *         be decoded
+   */
+  public String getCCByGB2312Id(int ccid) {
+    if (ccid < 0 || ccid > WordDictionary.GB2312_CHAR_NUM)
+      return "";
+    int cc1 = ccid / 94 + 161;
+    int cc2 = ccid % 94 + 161;
+    byte[] buffer = new byte[2];
+    buffer[0] = (byte) cc1;
+    buffer[1] = (byte) cc2;
+    try {
+      String cchar = new String(buffer, "GB2312");
+      return cchar;
+    } catch (UnsupportedEncodingException e) {
+      return "";
+    }
+  }
+
+  /**
+   * Given a Unicode character, returns its GB2312 id (or its ASCII code).
+   * 
+   * @param ch a GB2312 Chinese character, or one of the 128 ASCII characters
+   * @return the position of ch within GB2312, or -1 if the character is unknown
+   */
+  public short getGB2312Id(char ch) {
+    try {
+      byte[] buffer = Character.toString(ch).getBytes("GB2312");
+      if (buffer.length != 2) {
+        // Normally the buffer holds two bytes; otherwise ch is not a GB2312
+        // character (it was encoded as '?'), i.e. the character is unknown
+        return -1;
+      }
+      int b0 = (int) (buffer[0] & 0x0FF) - 161; // the encoding starts at 0xA1, so subtract 0xA1 = 161
+      int b1 = (int) (buffer[1] & 0x0FF) - 161; // the first and last code points carry no characters, so each region holds only 16*6-2 = 94 characters
+      return (short) (b0 * 94 + b1);
+    } catch (UnsupportedEncodingException e) {
+      e.printStackTrace();
+    }
+    return -1;
+  }
+
+  /**
+   * An adapted FNV-1 hash (with the 64-bit FNV constants), used as the first hash
+   * function in this program. The first and second hash functions are used
+   * together to address the hash table, keeping it evenly distributed and
+   * avoiding the long probe sequences an overly dense table would cause.
+   * 
+   * @param c the Unicode character to hash
+   * @return the hash value of c
+   * @see #hash2(char)
+   */
+  public long hash1(char c) {
+    final long p = 1099511628211L;
+    long hash = 0xcbf29ce484222325L;
+    hash = (hash ^ (c & 0x00FF)) * p;
+    hash = (hash ^ (c >> 8)) * p;
+    hash += hash << 13;
+    hash ^= hash >> 7;
+    hash += hash << 3;
+    hash ^= hash >> 17;
+    hash += hash << 5;
+    return hash;
+  }
+
+  /**
+   * @see #hash1(char)
+   * @param carray
+   * @return
+   */
+  public long hash1(char carray[]) {
+    final long p = 1099511628211L;
+    long hash = 0xcbf29ce484222325L;
+    for (int i = 0; i < carray.length; i++) {
+      char d = carray[i];
+      hash = (hash ^ (d & 0x00FF)) * p;
+      hash = (hash ^ (d >> 8)) * p;
+    }
+
+    // hash += hash << 13;
+    // hash ^= hash >> 7;
+    // hash += hash << 3;
+    // hash ^= hash >> 17;
+    // hash += hash << 5;
+    return hash;
+  }
+
+  /**
+   * The djb2 hash algorithm, used as the second hash function in this program.
+   * 
+   * The djb2 algorithm (k=33) was first reported by Dan Bernstein many years ago
+   * in comp.lang.c. Another version of this algorithm (now favored by Bernstein)
+   * uses xor: hash(i) = hash(i - 1) * 33 ^ str[i]; the magic of number 33 (why it
+   * works better than many other constants, prime or not) has never been
+   * adequately explained.
+   * 
+   * @param c
+   * @return
+   */
+  public int hash2(char c) {
+    int hash = 5381;
+
+    /* hash 33 + c */
+    hash = ((hash << 5) + hash) + c & 0x00FF;
+    hash = ((hash << 5) + hash) + c >> 8;
+
+    return hash;
+  }
+
+  /**
+   * @see #hash2(char)
+   * @param carray
+   * @return
+   */
+  public int hash2(char carray[]) {
+    int hash = 5381;
+
+    /* hash 33 + c */
+    for (int i = 0; i < carray.length; i++) {
+      char d = carray[i];
+      hash = ((hash << 5) + hash) + d & 0x00FF;
+      hash = ((hash << 5) + hash) + d >> 8;
+    }
+
+    return hash;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/AbstractDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native
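
A round-trip sketch (not part of the patch) of the GB2312 id scheme implemented by
getGB2312Id and getCCByGB2312Id, i.e. id = (b0 - 0xA1) * 94 + (b1 - 0xA1) over the
two GB2312 bytes; the subclass is hypothetical, since AbstractDictionary is
abstract:

    import org.apache.lucene.analysis.cn.smart.hhmm.AbstractDictionary;

    public class GB2312Demo extends AbstractDictionary {
      public static void main(String[] args) {
        GB2312Demo d = new GB2312Demo();
        // "啊" sits at region 16, position 01: id = (16-1)*94 + (1-1) = 1410,
        // which is exactly GB2312_FIRST_CHAR.
        short id = d.getGB2312Id('啊');
        System.out.println(id);                    // 1410
        System.out.println(d.getCCByGB2312Id(id)); // 啊
      }
    }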

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Thu May 14 10:09:22 2009
@@ -0,0 +1,237 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.cn.smart.Utility;
+
+public class BiSegGraph {
+
+  private Map tokenPairListTable = new HashMap();
+
+  private List segTokenList;
+
+  private static BigramDictionary bigramDict = BigramDictionary.getInstance();
+
+  public BiSegGraph(SegGraph segGraph) {
+    segTokenList = segGraph.makeIndex();
+    generateBiSegGraph(segGraph);
+  }
+
+  /**
+   * Generates the bigram graph over all pairs of adjacent words and stores the
+   * result in tokenPairListTable, keyed by each pair's end position.
+   * 
+   * @param segGraph the graph holding all Tokens of the sentence
+   */
+  private void generateBiSegGraph(SegGraph segGraph) {
+    double smooth = 0.1;
+    int wordPairFreq = 0;
+    int maxStart = segGraph.getMaxStart();
+    double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
+
+    int next;
+    char[] idBuffer;
+    // Assign an index to every element of segGraph
+    segTokenList = segGraph.makeIndex();
+    // The start token ("始##始") has start offset -1, so key == -1 retrieves it
+    int key = -1;
+    List nextTokens = null;
+    while (key < maxStart) {
+      if (segGraph.isStartExist(key)) {
+
+        List tokenList = segGraph.getStartList(key);
+
+        // Process every Token that starts at this key
+        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
+          SegToken t1 = (SegToken) iter.next();
+          oneWordFreq = t1.weight;
+          next = t1.endOffset;
+          nextTokens = null;
+          // Find the next adjacent Tokens: e.g. in "阳光海岸", if the current
+          // Token is "阳光", the next Token can be "海" or "海岸".
+          // If no next Token is found, the end has been reached; keep looping.
+          while (next <= maxStart) {
+            // The end token starts at sentenceLen, so next == sentenceLen still finds it
+            if (segGraph.isStartExist(next)) {
+              nextTokens = segGraph.getStartList(next);
+              break;
+            }
+            next++;
+          }
+          if (nextTokens == null) {
+            break;
+          }
+          for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
+            SegToken t2 = (SegToken) iter2.next();
+            idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
+            System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
+            idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
+            System.arraycopy(t2.charArray, 0, idBuffer,
+                t1.charArray.length + 1, t2.charArray.length);
+
+            // Two linked Words frequency
+            wordPairFreq = bigramDict.getFrequency(idBuffer);
+
+            // Smoothing
+
+            // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
+            weight = -Math
+                .log(smooth
+                    * (1.0 + oneWordFreq)
+                    / (Utility.MAX_FREQUENCE + 0.0)
+                    + (1.0 - smooth)
+                    * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
+
+            SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index,
+                t2.index, weight);
+            this.addSegTokenPair(tokenPair);
+          }
+        }
+      }
+      key++;
+    }
+
+  }
+
+  /**
+   * Checks whether a SegTokenPair ending at position to (SegTokenPair.to == to)
+   * exists; if not, there is no SegTokenPair at to, or none has been added yet.
+   * 
+   * @param to SegTokenPair.to
+   * @return
+   */
+  public boolean isToExist(int to) {
+    return tokenPairListTable.get(new Integer(to)) != null;
+  }
+
+  /**
+   * Returns all SegTokenPairs whose SegTokenPair.to equals to, or null if there
+   * are none.
+   * 
+   * @param to
+   * @return the list of all SegTokenPairs sharing the same SegTokenPair.to
+   */
+  public List getToList(int to) {
+    return (List) tokenPairListTable.get(new Integer(to));
+  }
+
+  /**
+   * Adds a SegTokenPair to the BiSegGraph; SegTokenPairs with the same
+   * SegTokenPair.to are kept in the same ArrayList.
+   * 
+   * @param tokenPair
+   */
+  public void addSegTokenPair(SegTokenPair tokenPair) {
+    int to = tokenPair.to;
+    if (!isToExist(to)) {
+      ArrayList newlist = new ArrayList();
+      newlist.add(tokenPair);
+      tokenPairListTable.put(new Integer(to), newlist);
+    } else {
+      List tokenPairList = (List) tokenPairListTable.get(new Integer(to));
+      tokenPairList.add(tokenPair);
+    }
+  }
+
+  /**
+   * @return the number of distinct end positions, i.e. the number of distinct SegTokenPair.to keys in the map
+   */
+  public int getToCount() {
+    return tokenPairListTable.size();
+  }
+
+  /**
+   * Computes the shortest path from start to end using the Viterbi algorithm.
+   * 
+   * @return
+   */
+  public List getShortPath() {
+    int current;
+    int nodeCount = getToCount();
+    List path = new ArrayList();
+    PathNode zeroPath = new PathNode();
+    zeroPath.weight = 0;
+    zeroPath.preNode = 0;
+    path.add(zeroPath);
+    for (current = 1; current <= nodeCount; current++) {
+      double weight;
+      List edges = getToList(current);
+
+      double minWeight = Double.MAX_VALUE;
+      SegTokenPair minEdge = null;
+      for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
+        SegTokenPair edge = (SegTokenPair) iter1.next();
+        weight = edge.weight;
+        PathNode preNode = (PathNode) path.get(edge.from);
+        if (preNode.weight + weight < minWeight) {
+          minWeight = preNode.weight + weight;
+          minEdge = edge;
+        }
+      }
+      PathNode newNode = new PathNode();
+      newNode.weight = minWeight;
+      newNode.preNode = minEdge.from;
+      path.add(newNode);
+    }
+
+    // Now recover the actual start-to-end path from the computed path nodes
+    int preNode, lastNode;
+    lastNode = path.size() - 1;
+    current = lastNode;
+    List rpath = new ArrayList();
+    List resultPath = new ArrayList();
+
+    rpath.add(new Integer(current));
+    while (current != 0) {
+      PathNode currentPathNode = (PathNode) path.get(current);
+      preNode = currentPathNode.preNode;
+      rpath.add(new Integer(preNode));
+      current = preNode;
+    }
+    for (int j = rpath.size() - 1; j >= 0; j--) {
+      Integer idInteger = (Integer) rpath.get(j);
+      int id = idInteger.intValue();
+      SegToken t = (SegToken) segTokenList.get(id);
+      resultPath.add(t);
+    }
+    return resultPath;
+
+  }
+
+  public String toString() {
+    StringBuffer sb = new StringBuffer();
+    Collection values = tokenPairListTable.values();
+    for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
+      List segList = (List) iter1.next();
+      for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
+        SegTokenPair pair = (SegTokenPair) iter2.next();
+        sb.append(pair + "\n");
+      }
+    }
+    return sb.toString();
+  }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
------------------------------------------------------------------------------
    svn:eol-style = native
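
The edge weight computed in generateBiSegGraph above is
-log{a*P(Ci-1) + (1-a)*P(Ci|Ci-1)} with a = 0.1, smoothed so that unseen bigrams
still get a small probability. A standalone sketch (not part of the patch) with
purely illustrative frequencies:

    public class BigramWeightDemo {
      public static void main(String[] args) {
        double smooth = 0.1;            // a in the formula above
        double max = 2079997 + 80000;   // Utility.MAX_FREQUENCE
        double tinyDouble = 1.0 / max;  // floor so unseen bigrams keep some mass
        double oneWordFreq = 500.0;     // frequency of the first word, illustrative
        int wordPairFreq = 25;          // frequency of the bigram, illustrative
        double weight = -Math.log(smooth * (1.0 + oneWordFreq) / max
            + (1.0 - smooth)
            * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
        // Smaller weight = more probable transition; getShortPath() then finds the
        // minimum-weight path through these edges.
        System.out.println(weight);
      }
    }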

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java Thu May 14 10:09:22 2009
@@ -0,0 +1,321 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+
+public class BigramDictionary extends AbstractDictionary {
+
+  private BigramDictionary() {
+  }
+
+  public static final char WORD_SEGMENT_CHAR = '@';
+
+  private static BigramDictionary singleInstance;
+
+  public static final int PRIME_BIGRAM_LENGTH = 402137;
+
+  /**
+   * The bigram table stores the transition frequencies between pairs of words;
+   * bigramHashTable and frequencyTable are the data structures holding these
+   * frequencies. To speed up lookups and save memory, a hash value replaces the
+   * linked-word key itself, where the key is (fromWord + '@' + toWord); the FNV1
+   * hash algorithm computes the hash of each key, which is stored in
+   * bigramHashTable. Replacing keys with hashes can in principle cause
+   * collisions, but the 64-bit long hash value makes that probability extremely
+   * low. bigramHashTable[i] corresponds one-to-one with frequencyTable[i].
+   */
+  private long[] bigramHashTable;
+
+  private int[] frequencyTable;
+
+  private int max = 0;
+
+  private int repeat = 0;
+
+  // static Logger log = Logger.getLogger(BigramDictionary.class);
+
+  public synchronized static BigramDictionary getInstance() {
+    if (singleInstance == null) {
+      singleInstance = new BigramDictionary();
+      try {
+        singleInstance.load();
+      } catch (IOException e) {
+        String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+        singleInstance.load(dictRoot);
+      } catch (ClassNotFoundException e) {
+        throw new RuntimeException(e);
+      }
+    }
+    return singleInstance;
+  }
+
+  private boolean loadFromObj(File serialObj) {
+    try {
+      loadFromInputStream(new FileInputStream(serialObj));
+      return true;
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+    } catch (IOException e) {
+      e.printStackTrace();
+    } catch (ClassNotFoundException e) {
+      e.printStackTrace();
+    }
+    return false;
+  }
+
+  private void loadFromInputStream(InputStream serialObjectInputStream)
+      throws IOException, ClassNotFoundException {
+    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
+    bigramHashTable = (long[]) input.readObject();
+    frequencyTable = (int[]) input.readObject();
+    // log.info("load bigram dict from serialization.");
+    input.close();
+  }
+
+  private void saveToObj(File serialObj) {
+    try {
+      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
+          serialObj));
+      output.writeObject(bigramHashTable);
+      output.writeObject(frequencyTable);
+      output.close();
+      // log.info("serialize bigram dict.");
+    } catch (Exception e) {
+      // log.warn(e.getMessage());
+    }
+  }
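+  // Note: failures in saveToObj are deliberately swallowed. The serialized
+  // .mem file is only a cache; load(String) can always rebuild the tables
+  // from the raw .dct file on the next run.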
+
+  private void load() throws IOException, ClassNotFoundException {
+    InputStream input = this.getClass().getResourceAsStream("bigramdict.mem");
+    if (input == null) {
+      // No bundled resource: throw IOException (rather than the NPE the
+      // original would raise) so getInstance() falls back to the data dir.
+      throw new IOException("bigramdict.mem not found in classpath");
+    }
+    loadFromInputStream(input);
+  }
+
+  private void load(String dictRoot) {
+    String bigramDictPath = dictRoot + "/bigramdict.dct";
+
+    File serialObj = new File(dictRoot + "/bigramdict.mem");
+
+    // Prefer the pre-serialized .mem cache; otherwise parse the raw .dct
+    // file and cache the result for subsequent loads.
+    if (!(serialObj.exists() && loadFromObj(serialObj))) {
+      try {
+        bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
+        frequencyTable = new int[PRIME_BIGRAM_LENGTH];
+        // Using 0 as the "empty" marker is slightly problematic, since a
+        // string could in principle hash to 0, but the probability is tiny,
+        // so the impact is negligible. (Java zero-initializes arrays, so
+        // this loop is redundant but harmless.)
+        for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
+          bigramHashTable[i] = 0;
+          frequencyTable[i] = 0;
+        }
+        loadFromFile(bigramDictPath);
+      } catch (IOException e) {
+        throw new RuntimeException(e);
+      }
+      saveToObj(serialObj);
+    }
+  }
+
+  /**
+   * Loads the dictionary file into this dictionary's data structures. This
+   * only loads; no merging or modification of entries is performed.
+   * 
+   * @param dctFilePath path of the .dct dictionary file
+   * @throws FileNotFoundException if the file does not exist
+   * @throws IOException if the file cannot be read
+   * @throws UnsupportedEncodingException if GB2312 decoding is unsupported
+   */
+  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
+      IOException, UnsupportedEncodingException {
+
+    int i, cnt, length, total = 0;
+    // The file covers only the 6763 GB2312 Chinese characters plus 5 empty
+    // character slots 3756~3760, of which slot 3756 stores symbol information.
+    int[] buffer = new int[3];
+    byte[] intBuffer = new byte[4];
+    String tmpword;
+    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+
+    // In the dictionary file the first Chinese character is at position 0 and
+    // the last at 6768.
+    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      String currentStr = getCCByGB2312Id(i);
+      // if (i == 5231)
+      // System.out.println(i);
+
+      // The original dictionary file was developed in C, so its integers are
+      // written little-endian; Java is big-endian, so convert on read.
+      dctFile.read(intBuffer);
+      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+      if (cnt <= 0) {
+        continue;
+      }
+      total += cnt;
+      int j = 0;
+      while (j < cnt) {
+        dctFile.read(intBuffer);
+        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// frequency
+        dctFile.read(intBuffer);
+        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// length
+        dctFile.read(intBuffer);
+        // buffer[2] = ByteBuffer.wrap(intBuffer).order(
+        // ByteOrder.LITTLE_ENDIAN).getInt();// handle
+
+        length = buffer[1];
+        if (length > 0) {
+          byte[] lchBuffer = new byte[length];
+          dctFile.read(lchBuffer);
+          tmpword = new String(lchBuffer, "GB2312");
+          if (i != 3755 + GB2312_FIRST_CHAR) {
+            tmpword = currentStr + tmpword;
+          }
+          char carray[] = tmpword.toCharArray();
+          long hashId = hash1(carray);
+          int index = getAvaliableIndex(hashId, carray);
+          if (index != -1) {
+            if (bigramHashTable[index] == 0) {
+              bigramHashTable[index] = hashId;
+              // bigramStringTable[index] = tmpword;
+            }
+            frequencyTable[index] += buffer[0];
+          }
+        }
+        j++;
+      }
+    }
+    dctFile.close();
+    // log.info("load dictionary done! " + dctFilePath + " total:" + total);
+  }
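+  // On-disk layout of the .dct file, as read above (all ints little-endian):
+  //
+  //   for each of the CHAR_NUM_IN_FILE lead characters:
+  //     cnt : int32                  number of entries for this lead char
+  //     then cnt entries of:
+  //       frequency : int32
+  //       length    : int32          byte length of the trailing word
+  //       handle    : int32          read above but discarded
+  //       word      : byte[length]   GB2312-encoded text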
+
+  /*
+   * Commented-out consistency check, kept for reference:
+   *
+   * public void test(String dctFilePath) throws IOException {
+   *   int i, cnt, length, total = 0;
+   *   int corrupt = 0, notFound = 0;
+   *   // The file covers only the 6763 GB2312 Chinese characters plus 5
+   *   // empty character slots 3756~3760; slot 3756 stores symbol info.
+   *   int[] buffer = new int[3];
+   *   byte[] intBuffer = new byte[4];
+   *   String tmpword;
+   *   RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+   *
+   *   // The first Chinese character is at position 0, the last at 6768.
+   *   for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+   *     String currentStr = getCCByGB2312Id(i);
+   *     // if (i == 5231)
+   *     // System.out.println(i);
+   *
+   *     // The raw file was written little-endian by a C program; convert.
+   *     dctFile.read(intBuffer);
+   *     cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+   *     if (cnt <= 0)
+   *       continue;
+   *     total += cnt;
+   *     int j = 0;
+   *     while (j < cnt) {
+   *       dctFile.read(intBuffer);
+   *       buffer[0] = ByteBuffer.wrap(intBuffer)
+   *           .order(ByteOrder.LITTLE_ENDIAN).getInt(); // frequency
+   *       dctFile.read(intBuffer);
+   *       buffer[1] = ByteBuffer.wrap(intBuffer)
+   *           .order(ByteOrder.LITTLE_ENDIAN).getInt(); // length
+   *       dctFile.read(intBuffer);
+   *       // buffer[2] = ByteBuffer.wrap(intBuffer)
+   *       //     .order(ByteOrder.LITTLE_ENDIAN).getInt(); // handle
+   *
+   *       length = buffer[1];
+   *       if (length > 0) {
+   *         byte[] lchBuffer = new byte[length];
+   *         dctFile.read(lchBuffer);
+   *         tmpword = new String(lchBuffer, "GB2312");
+   *         if (i != 3755 + GB2312_FIRST_CHAR) {
+   *           tmpword = currentStr + tmpword;
+   *         }
+   *         char carray[] = tmpword.toCharArray();
+   *         int index = getBigramItemIndex(carray);
+   *         if (index != -1) {
+   *           // if (!bigramStringTable[index].equals(tmpword)) {
+   *           //   System.out.println("corrupt: " + tmpword + "<->"
+   *           //       + bigramStringTable[index]);
+   *           //   corrupt++;
+   *           // }
+   *         } else {
+   *           System.out.println("not found: " + tmpword);
+   *           notFound++;
+   *         }
+   *       }
+   *       j++;
+   *     }
+   *   }
+   *   dctFile.close();
+   *   System.out.println("num not found:" + notFound);
+   *   System.out.println("num corrupt:" + corrupt);
+   *
+   *   log.info("test dictionary done! " + dctFilePath + " total:" + total);
+   *   cnt = 0;
+   *   for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) {
+   *     if (bigramHashTable[j] != 0)
+   *       cnt++;
+   *   }
+   *   System.out.println("total num in bigramTable: " + cnt);
+   * }
+   */
+
+  private int getAvaliableIndex(long hashId, char carray[]) {
+    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+    if (hash1 < 0)
+      hash1 = PRIME_BIGRAM_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_BIGRAM_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+        && i < PRIME_BIGRAM_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+      i++;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_BIGRAM_LENGTH
+        && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
+      return index;
+    } else
+      return -1;
+  }
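+  // The probe above is open addressing with double hashing: probe k inspects
+  // slot (hash1 + k * hash2) mod PRIME_BIGRAM_LENGTH. Because the table
+  // length is prime, any non-zero step size visits every slot before
+  // repeating.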
+
+  /**
+   * Looks up the hash-table slot holding the given bigram.
+   * 
+   * @param carray characters of the bigram string (fromWord + '@' + toWord)
+   * @return index into bigramHashTable, or -1 if the bigram is not present
+   */
+  private int getBigramItemIndex(char carray[]) {
+    long hashId = hash1(carray);
+    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+    if (hash1 < 0)
+      hash1 = PRIME_BIGRAM_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_BIGRAM_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    repeat++;
+    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+        && i < PRIME_BIGRAM_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+      i++;
+      repeat++;
+      if (i > max)
+        max = i;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
+      return index;
+    } else
+      return -1;
+  }
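+  // Unlike getAvaliableIndex, this lookup succeeds only when the stored hash
+  // matches exactly; the max and repeat fields are instrumentation counters
+  // that main() uses to report worst-case and average probe lengths.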
+
+  public int getFrequency(char[] carray) {
+    int index = getBigramItemIndex(carray);
+    if (index != -1)
+      return frequencyTable[index];
+    return 0;
+  }
+
+  public static void main(String[] args) throws FileNotFoundException,
+      UnsupportedEncodingException, IOException {
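+    // Developer harness: loads from a hard-coded local path and prints the
+    // probe statistics gathered by getBigramItemIndex.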
+    BigramDictionary dic = new BigramDictionary();
+    dic.load("D:/analysis-data");
+    // dic.test("D:/analysis-data/BigramDict.dct");
+    System.out.println("max:" + dic.max);
+    System.out.println("average repeat:" + (double) dic.repeat / 328856);
+    System.out.println("end");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BigramDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native


