lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mikemcc...@apache.org
Subject svn commit: r774718 [2/3] - in /lucene/java/trunk: ./ contrib/analyzers/src/java/org/apache/lucene/analysis/ar/ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/ contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/ contrib/analyzers/...
Date Thu, 14 May 2009 10:09:24 GMT
Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java Thu May 14 10:09:22 2009
@@ -0,0 +1,302 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+
+/**
+ * Bigram (word-pair) transition-frequency dictionary for the HHMM Chinese
+ * segmenter.  Frequencies are loaded from <code>bigramdict.dct</code> (or a
+ * serialized cache, <code>bigramdict.mem</code>) and stored in an
+ * open-addressed hash table probed by double hashing.
+ *
+ * NOTE(review): the "CopyOf" name suggests an accidental IDE duplicate of
+ * BigramDictionary -- confirm this file is meant to be committed.
+ */
+public class CopyOfBigramDictionary extends AbstractDictionary {
+
+  // Singleton: instances are created only via getInstance() (or main below).
+  private CopyOfBigramDictionary() {
+  }
+
+  // Separator inserted between the two words when hashing a word pair.
+  public static final char WORD_SEGMENT_CHAR = '@';
+
+  private static CopyOfBigramDictionary singleInstance;
+
+  // Prime table length so the double-hash probe can visit every slot.
+  public static final int PRIME_BIGRAM_LENGTH = 402137;
+
+  /**
+   * The bigram table stores word-to-word transition frequencies;
+   * bigramHashTable and frequencyTable are the data structures that hold
+   * them.  To speed up lookup and save memory, a hash value replaces the
+   * word pair itself as the lookup key.  The pair is the string
+   * (formWord+'@'+toWord), hashed with the FNV1 algorithm and stored in
+   * bigramHashTable.  Using a hash instead of the pair can collide with some
+   * very small probability, but the 64-bit (long) hash keeps that
+   * probability extremely low.  bigramHashTable[i] corresponds one-to-one
+   * with frequencyTable[i].
+   */
+  private long[] bigramHashTable;
+
+  private int[] frequencyTable;
+
+  // Probe-length statistics collected by getBigramItemIndex; printed by main().
+  private int max = 0;
+
+  private int repeat = 0;
+
+  // static Logger log = Logger.getLogger(BigramDictionary.class);
+
+  /**
+   * Returns the lazily created shared instance, loading its data from
+   * AnalyzerProfile.ANALYSIS_DATA_DIR on first use.  Synchronized so the
+   * lazy initialization is safe across threads.
+   */
+  public synchronized static CopyOfBigramDictionary getInstance() {
+    if (singleInstance == null) {
+      String dictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+      singleInstance = new CopyOfBigramDictionary();
+      singleInstance.load(dictRoot);
+    }
+    return singleInstance;
+  }
+
+  /**
+   * Best-effort load of both tables from a previously serialized cache file.
+   *
+   * @param serialObj the .mem cache file
+   * @return true if the cache was read successfully, false otherwise
+   * NOTE(review): any exception is swallowed so the caller can fall back to
+   * parsing the .dct file; the stream is not closed if readObject throws.
+   */
+  private boolean loadFromObj(File serialObj) {
+    boolean loadFromObject = false;
+    try {
+      ObjectInputStream input = new ObjectInputStream(new FileInputStream(
+          serialObj));
+      bigramHashTable = (long[]) input.readObject();
+      frequencyTable = (int[]) input.readObject();
+      // log.info("load bigram dict from serialization.");
+      loadFromObject = true;
+      input.close();
+    } catch (Exception e) {
+      // log.warn(e.getMessage());
+    }
+  return loadFromObject;
+  }
+
+  /**
+   * Serializes both tables to the cache file so later runs can skip the
+   * expensive .dct parse.  Failures are silently ignored (caching is
+   * best-effort only); the stream is not closed if a write throws.
+   */
+  private void saveToObj(File serialObj) {
+    try {
+      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
+          serialObj));
+      output.writeObject(bigramHashTable);
+      output.writeObject(frequencyTable);
+      output.close();
+      // log.info("serialize bigram dict.");
+    } catch (Exception e) {
+      // log.warn(e.getMessage());
+    }
+  }
+
+  /**
+   * Loads the dictionary from dictRoot: prefers the serialized cache
+   * (bigramdict.mem); otherwise parses bigramdict.dct and writes the cache.
+   */
+  private void load(String dictRoot) {
+    String bigramDictPath = dictRoot + "/bigramdict.dct";
+
+    File serialObj = new File(dictRoot + "/bigramdict.mem");
+
+    if (serialObj.exists() && loadFromObj(serialObj)) {
+
+    } else {
+      try {
+        bigramHashTable = new long[PRIME_BIGRAM_LENGTH];
+        frequencyTable = new int[PRIME_BIGRAM_LENGTH];
+        for (int i = 0; i < PRIME_BIGRAM_LENGTH; i++) {
+          // Using 0 as the "empty" marker is slightly problematic, since a
+          // string could in principle hash to 0, but the probability is
+          // tiny, so the impact is negligible.
+          bigramHashTable[i] = 0;
+          frequencyTable[i] = 0;
+        }
+        loadFromFile(bigramDictPath);
+      } catch (IOException e) {
+        throw new RuntimeException(e.getMessage());
+      }
+      saveToObj(serialObj);
+    }
+  }
+
+  /**
+   * Loads the dictionary file into this dictionary's data structures; this
+   * only loads -- no merging or modification is performed.
+   * 
+   * @param dctFilePath path of the .dct dictionary file
+   * @throws FileNotFoundException
+   * @throws IOException
+   * @throws UnsupportedEncodingException
+   */
+  public void loadFromFile(String dctFilePath) throws FileNotFoundException,
+      IOException, UnsupportedEncodingException {
+
+    int i, cnt, length, total = 0;
+    // The file covers only the 6763 GB2312 Chinese characters plus 5 empty
+    // slots 3756~3760, of which slot 3756 stores symbol (punctuation) data.
+    int[] buffer = new int[3];
+    byte[] intBuffer = new byte[4];
+    String tmpword;
+    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+
+    // In the dictionary file the first Chinese character is at position 0
+    // and the last at 6768.
+    // NOTE(review): read() return values are ignored throughout; a short
+    // read would silently corrupt parsing.
+    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      String currentStr = getCCByGB2312Id(i);
+      // if (i == 5231)
+      // System.out.println(i);
+
+      dctFile.read(intBuffer);// the original dictionary was built by a C
+      // program, so the file is little endian; Java is big endian -- convert.
+      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+      if (cnt <= 0) {
+        continue;
+      }
+      total += cnt;
+      int j = 0;
+      while (j < cnt) {
+        dctFile.read(intBuffer);
+        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// frequency
+        dctFile.read(intBuffer);
+        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// length
+        dctFile.read(intBuffer);
+        // buffer[2] = ByteBuffer.wrap(intBuffer).order(
+        // ByteOrder.LITTLE_ENDIAN).getInt();// handle
+
+        length = buffer[1];
+        if (length > 0) {
+          byte[] lchBuffer = new byte[length];
+          dctFile.read(lchBuffer);
+          tmpword = new String(lchBuffer, "GB2312");
+          // Slot 3755 holds the symbol entries, which appear to carry their
+          // own leading character already; every other slot's entries get
+          // the slot's character prepended -- TODO confirm .dct format.
+          if (i != 3755 + GB2312_FIRST_CHAR) {
+            tmpword = currentStr + tmpword;
+          }
+          char carray[] = tmpword.toCharArray();
+          long hashId = hash1(carray);
+          int index = getAvaliableIndex(hashId, carray);
+          if (index != -1) {
+            if (bigramHashTable[index] == 0) {
+              bigramHashTable[index] = hashId;
+              // bigramStringTable[index] = tmpword;
+            }
+            frequencyTable[index] += buffer[0];
+          }
+        }
+        j++;
+      }
+    }
+    dctFile.close();
+    // log.info("load dictionary done! " + dctFilePath + " total:" + total);
+  }
+
+  // Commented-out consistency test retained from the original source;
+  // candidate for deletion.
+  /*
+   * public void test(String dctFilePath) throws IOException { int i, cnt,
+   * length, total = 0; int corrupt = 0, notFound = 0; //
+   * 文件中只统计了6763个汉字加5个空汉字符3756~3760,其中第3756个用来存储符号信息。 int[] buffer = new int[3];
+   * byte[] intBuffer = new byte[4]; String tmpword; RandomAccessFile dctFile =
+   * new RandomAccessFile(dctFilePath, "r");
+   * 
+   * // 字典文件中第一个汉字出现的位置是0,最后一个是6768 for (i = GB2312_FIRST_CHAR; i <
+   * GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { String currentStr =
+   * getCCByGB2312Id(i); // if (i == 5231) // System.out.println(i);
+   * 
+   * dctFile.read(intBuffer);// 原词库文件在c下开发,所以写入的文件为little // endian编码,而java为big
+   * endian,必须转换过来 cnt =
+   * ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) .getInt(); if
+   * (cnt <= 0) { continue; } total += cnt; int j = 0; while (j < cnt) {
+   * dctFile.read(intBuffer); buffer[0] = ByteBuffer.wrap(intBuffer).order(
+   * ByteOrder.LITTLE_ENDIAN).getInt();// frequency dctFile.read(intBuffer);
+   * buffer[1] = ByteBuffer.wrap(intBuffer).order(
+   * ByteOrder.LITTLE_ENDIAN).getInt();// length dctFile.read(intBuffer); //
+   * buffer[2] = ByteBuffer.wrap(intBuffer).order( //
+   * ByteOrder.LITTLE_ENDIAN).getInt();// handle
+   * 
+   * length = buffer[1]; if (length > 0) { byte[] lchBuffer = new byte[length];
+   * dctFile.read(lchBuffer); tmpword = new String(lchBuffer, "GB2312"); if (i
+   * != 3755 + GB2312_FIRST_CHAR) { tmpword = currentStr + tmpword; } char
+   * carray[] = tmpword.toCharArray(); int index = getBigramItemIndex(carray);
+   * if (index != -1) { // if (!bigramStringTable[index].equals(tmpword)) { //
+   * System.out.println("corrupt: " + tmpword + "<->" // +
+   * bigramStringTable[index]); // corrupt++; // } } else {
+   * System.out.println("not found: " + tmpword); notFound++; } } j++; } }
+   * dctFile.close(); System.out.println("num not found:" + notFound);
+   * System.out.println("num corrupt:" + corrupt);
+   * 
+   * log.info("test dictionary done! " + dctFilePath + " total:" + total); cnt =
+   * 0; for (int j = 0; j < PRIME_BIGRAM_LENGTH; j++) { if (bigramHashTable[j]
+   * != 0) { cnt++; } } System.out.println("total num in bigramTable: " + cnt);
+   * }
+   */
+
+  /**
+   * Double-hash probe for a slot that is either empty (0) or already holds
+   * hashId.  hash1/hash2 are inherited from AbstractDictionary (not visible
+   * here) -- presumably string hashes; confirm in AbstractDictionary.
+   *
+   * @param hashId 64-bit hash of the word pair
+   * @param carray the word pair, used only for the secondary hash
+   * @return a usable slot index, or -1 if the table is full
+   */
+  private int getAvaliableIndex(long hashId, char carray[]) {
+    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+    // Java's % can yield negatives; shift into [0, PRIME_BIGRAM_LENGTH).
+    if (hash1 < 0)
+      hash1 = PRIME_BIGRAM_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_BIGRAM_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+        && i < PRIME_BIGRAM_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+      i++;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_BIGRAM_LENGTH
+        && (bigramHashTable[index] == 0 || bigramHashTable[index] == hashId)) {
+      return index;
+    } else
+      return -1;
+  }
+
+  /**
+   * Looks up the slot holding the hash of the given word pair, updating the
+   * probe-length statistics (repeat/max) as a side effect.
+   *
+   * @param carray the word pair as characters (formWord+'@'+toWord)
+   * @return the slot index, or -1 if the pair is not present
+   */
+  private int getBigramItemIndex(char carray[]) {
+    long hashId = hash1(carray);
+    int hash1 = (int) (hashId % PRIME_BIGRAM_LENGTH);
+    int hash2 = hash2(carray) % PRIME_BIGRAM_LENGTH;
+    // Java's % can yield negatives; shift into [0, PRIME_BIGRAM_LENGTH).
+    if (hash1 < 0)
+      hash1 = PRIME_BIGRAM_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_BIGRAM_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    repeat++;
+    while (bigramHashTable[index] != 0 && bigramHashTable[index] != hashId
+        && i < PRIME_BIGRAM_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_BIGRAM_LENGTH;
+      i++;
+      repeat++;
+      if (i > max)
+        max = i;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_BIGRAM_LENGTH && bigramHashTable[index] == hashId) {
+      return index;
+    } else
+      return -1;
+  }
+
+  /**
+   * Returns the recorded transition frequency for the given word pair, or 0
+   * if the pair is unknown.
+   */
+  public int getFrequency(char[] carray) {
+    int index = getBigramItemIndex(carray);
+    if (index != -1)
+      return frequencyTable[index];
+    return 0;
+  }
+
+  /**
+   * Manual smoke test: loads from a hard-coded Windows path and prints the
+   * probe-length statistics gathered during lookups.
+   */
+  public static void main(String[] args) throws FileNotFoundException,
+      UnsupportedEncodingException, IOException {
+    CopyOfBigramDictionary dic = new CopyOfBigramDictionary();
+    dic.load("D:/analysis-data");
+    // dic.test("D:/analysis-data/BigramDict.dct");
+    System.out.println("max:" + dic.max);
+    System.out.println("average repeat:" + (double) dic.repeat / 328856);
+    System.out.println("end");
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfBigramDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java Thu May 14 10:09:22 2009
@@ -0,0 +1,541 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+import org.apache.lucene.analysis.cn.smart.Utility;
+
+/**
+ * Core word dictionary for the HHMM Chinese segmenter.  Words are grouped by
+ * their leading character; a double-hashed index maps a character to its
+ * group, and each group's word list is kept sorted so lookups can use binary
+ * search.  Data comes from <code>coredict.dct</code> or a serialized cache
+ * (<code>coredict.mem</code>).
+ *
+ * NOTE(review): the "CopyOf" name suggests an accidental IDE duplicate of
+ * WordDictionary -- confirm this file is meant to be committed.
+ */
+public class CopyOfWordDictionary extends AbstractDictionary {
+
+  // Singleton: instances are created only via getInstance() (or main below).
+  private CopyOfWordDictionary() {
+  }
+
+  private static CopyOfWordDictionary singleInstance;
+
+  /**
+   * A fairly large prime, guaranteeing the double-hash probe can visit every
+   * position in the index tables.
+   */
+  public static final int PRIME_INDEX_LENGTH = 12071;
+
+  /**
+   * wordIndexTable hashes every Chinese character in Unicode into an array
+   * of length PRIME_INDEX_LENGTH.  Collisions are of course possible, but
+   * in practice this program handles only the GB2312 range (6768 characters
+   * plus some ASCII), for which the table is adequate.  To keep comparisons
+   * exact, the original character is retained in charIndexTable so lookups
+   * can be verified.
+   */
+  private short[] wordIndexTable;
+
+  private char[] charIndexTable;
+
+  /**
+   * The real storage for all dictionary data.  To avoid excessive memory
+   * use, two separate multi-dimensional arrays hold the words and the
+   * frequencies.  Each word is a char[] (one char per character); each
+   * frequency is an int.  The first two subscripts of the two arrays
+   * correspond one-to-one, so wordItem_charArrayTable[i][j] looks up a word
+   * and wordItem_frequencyTable[i][j] its frequency.
+   */
+  private char[][][] wordItem_charArrayTable;
+
+  private int[][] wordItem_frequencyTable;
+
+  // static Logger log = Logger.getLogger(WordDictionary.class);
+
+  /**
+   * Returns the lazily created shared instance, loading its data from
+   * AnalyzerProfile.ANALYSIS_DATA_DIR on first use.  Synchronized so the
+   * lazy initialization is safe across threads.
+   */
+  public synchronized static CopyOfWordDictionary getInstance() {
+    if (singleInstance == null) {
+      singleInstance = new CopyOfWordDictionary();
+      String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+      singleInstance.load(wordDictRoot);
+    }
+    return singleInstance;
+  }
+
+  /**
+   * Loads the dictionary: prefers the serialized cache (coredict.mem);
+   * otherwise parses coredict.dct, post-processes it (expand punctuation,
+   * merge duplicates, sort), and writes the cache.
+   * 
+   * @param dctFileRoot directory containing the dictionary files
+   */
+  public void load(String dctFileRoot) {
+    String dctFilePath = dctFileRoot + "/coredict.dct";
+    File serialObj = new File(dctFileRoot + "/coredict.mem");
+
+    if (serialObj.exists() && loadFromObj(serialObj)) {
+
+    } else {
+      try {
+        wordIndexTable = new short[PRIME_INDEX_LENGTH];
+        charIndexTable = new char[PRIME_INDEX_LENGTH];
+        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
+          charIndexTable[i] = 0;
+          wordIndexTable[i] = -1;
+        }
+        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
+        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
+        // int total =
+        loadMainDataFromFile(dctFilePath);
+        expandDelimiterData();
+        mergeSameWords();
+        sortEachItems();
+        // log.info("load dictionary: " + dctFilePath + " total:" + total);
+      } catch (IOException e) {
+        throw new RuntimeException(e.getMessage());
+      }
+
+      saveToObj(serialObj);
+    }
+
+  }
+
+  /**
+   * Best-effort load of all four tables from a previously serialized cache
+   * file; returns true on success.  NOTE(review): exceptions are swallowed
+   * so the caller falls back to the .dct parse; the stream is not closed if
+   * readObject throws.
+   */
+  private boolean loadFromObj(File serialObj) {
+    boolean loadFromObject = false;
+    try {
+      ObjectInputStream input = new ObjectInputStream(new FileInputStream(
+          serialObj));
+      wordIndexTable = (short[]) input.readObject();
+      charIndexTable = (char[]) input.readObject();
+      wordItem_charArrayTable = (char[][][]) input.readObject();
+      wordItem_frequencyTable = (int[][]) input.readObject();
+      // log.info("load core dict from serialization.");
+      input.close();
+      loadFromObject = true;
+    } catch (Exception e) {
+      // log.warn(e.getMessage());
+    }
+    return loadFromObject;
+  }
+
+  /**
+   * Serializes all four tables to the cache file so later runs can skip the
+   * expensive .dct parse.  Failures are silently ignored (caching is
+   * best-effort only); the stream is not closed if a write throws.
+   */
+  private void saveToObj(File serialObj) {
+    try {
+      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
+          serialObj));
+      output.writeObject(wordIndexTable);
+      output.writeObject(charIndexTable);
+      output.writeObject(wordItem_charArrayTable);
+      output.writeObject(wordItem_frequencyTable);
+      output.close();
+      // log.info("serialize core dict.");
+    } catch (Exception e) {
+      // log.warn(e.getMessage());
+    }
+  }
+
+  /**
+   * Loads the dictionary file into this dictionary's data structures; this
+   * only loads -- no merging or modification is performed.
+   * 
+   * @param dctFilePath path of the .dct dictionary file
+   * @return total number of word items read
+   * @throws FileNotFoundException
+   * @throws IOException
+   * @throws UnsupportedEncodingException
+   */
+  private int loadMainDataFromFile(String dctFilePath)
+      throws FileNotFoundException, IOException, UnsupportedEncodingException {
+    int i, cnt, length, total = 0;
+    // The file covers only the 6763 GB2312 Chinese characters plus 5 empty
+    // slots 3756~3760, of which slot 3756 stores symbol (punctuation) data.
+    int[] buffer = new int[3];
+    byte[] intBuffer = new byte[4];
+    String tmpword;
+    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+
+    // In the dictionary file the first Chinese character is at position 0
+    // and the last at 6768.
+    // NOTE(review): read() return values are ignored throughout; a short
+    // read would silently corrupt parsing.
+    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      // if (i == 5231)
+      // System.out.println(i);
+
+      dctFile.read(intBuffer);// the original dictionary was built by a C
+      // program, so the file is little endian; Java is big endian -- convert.
+      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+      if (cnt <= 0) {
+        wordItem_charArrayTable[i] = null;
+        wordItem_frequencyTable[i] = null;
+        continue;
+      }
+      wordItem_charArrayTable[i] = new char[cnt][];
+      wordItem_frequencyTable[i] = new int[cnt];
+      total += cnt;
+      int j = 0;
+      while (j < cnt) {
+        // wordItemTable[i][j] = new WordItem();
+        dctFile.read(intBuffer);
+        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// frequency
+        dctFile.read(intBuffer);
+        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// length
+        dctFile.read(intBuffer);
+        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// handle
+
+        // wordItemTable[i][j].frequency = buffer[0];
+        wordItem_frequencyTable[i][j] = buffer[0];
+
+        length = buffer[1];
+        if (length > 0) {
+          byte[] lchBuffer = new byte[length];
+          dctFile.read(lchBuffer);
+          tmpword = new String(lchBuffer, "GB2312");
+          // indexTable[i].wordItems[j].word = tmpword;
+          // wordItemTable[i][j].charArray = tmpword.toCharArray();
+          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
+        } else {
+          // wordItemTable[i][j].charArray = null;
+          wordItem_charArrayTable[i][j] = null;
+        }
+        // System.out.println(indexTable[i].wordItems[j]);
+        j++;
+      }
+
+      // Register this slot's character in the hash index.
+      String str = getCCByGB2312Id(i);
+      setTableIndex(str.charAt(0), i);
+    }
+    dctFile.close();
+    return total;
+  }
+
+  /**
+   * The original dictionary merges all punctuation information into a single
+   * list (at offset 3755, counting from 1).  Expand it here, distributing
+   * the entries into the list of the punctuation character each belongs to.
+   * NOTE(review): if wordItem_charArrayTable[j] is already non-null, i is
+   * never advanced and this loop would spin forever -- this relies on the
+   * file grouping each punctuation char contiguously; confirm.
+   */
+  private void expandDelimiterData() {
+    int i;
+    int cnt;
+    // Punctuation lives at offset 3755 (counting from 1); distribute the raw
+    // punctuation dictionary entries to their own characters' slots.
+    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
+    i = 0;
+    while (i < wordItem_charArrayTable[delimiterIndex].length) {
+      char c = wordItem_charArrayTable[delimiterIndex][i][0];
+      int j = getGB2312Id(c);// the index this punctuation char should occupy
+      if (wordItem_charArrayTable[j] == null) {
+
+        int k = i;
+        // Starting at i, count the word items whose first char is c.
+        while (k < wordItem_charArrayTable[delimiterIndex].length
+            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
+          k++;
+        }
+        // Now k-i is the number of wordItems belonging to punctuation id j.
+        cnt = k - i;
+        if (cnt != 0) {
+          wordItem_charArrayTable[j] = new char[cnt][];
+          wordItem_frequencyTable[j] = new int[cnt];
+        }
+
+        // Populate each wordItem, dropping the leading punctuation char.
+        for (k = 0; k < cnt; k++, i++) {
+          // wordItemTable[j][k] = new WordItem();
+          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
+          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
+          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
+              wordItem_charArrayTable[j][k], 0,
+              wordItem_charArrayTable[j][k].length);
+        }
+        setTableIndex(c, j);
+      }
+    }
+    // Discard the arrays of the original combined symbol slot.
+    wordItem_charArrayTable[delimiterIndex] = null;
+    wordItem_frequencyTable[delimiterIndex] = null;
+  }
+
+  /**
+   * This program does no part-of-speech tagging, so merge the frequencies of
+   * identical words carrying different POS tags into a single entry, to
+   * reduce storage and speed up search.  Only adjacent duplicates are merged
+   * -- assumes duplicates are contiguous in the file order; TODO confirm.
+   */
+  private void mergeSameWords() {
+    int i;
+    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      if (wordItem_charArrayTable[i] == null)
+        continue;
+      int len = 1;
+      // First pass: count distinct adjacent words.
+      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
+        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+            wordItem_charArrayTable[i][j - 1], 0) != 0)
+          len++;
+
+      }
+      if (len < wordItem_charArrayTable[i].length) {
+        // Second pass: compact, summing frequencies of adjacent duplicates.
+        char[][] tempArray = new char[len][];
+        int[] tempFreq = new int[len];
+        int k = 0;
+        tempArray[0] = wordItem_charArrayTable[i][0];
+        tempFreq[0] = wordItem_frequencyTable[i][0];
+        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
+          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+              tempArray[k], 0) != 0) {
+            k++;
+            // temp[k] = wordItemTable[i][j];
+            tempArray[k] = wordItem_charArrayTable[i][j];
+            tempFreq[k] = wordItem_frequencyTable[i][j];
+          } else {
+            // temp[k].frequency += wordItemTable[i][j].frequency;
+            tempFreq[k] += wordItem_frequencyTable[i][j];
+          }
+        }
+        // wordItemTable[i] = temp;
+        wordItem_charArrayTable[i] = tempArray;
+        wordItem_frequencyTable[i] = tempFreq;
+      }
+    }
+  }
+
+  /**
+   * Sorts each character's word list (in-place exchange sort, keeping the
+   * frequency array in step) so findInTable/getPrefixMatch can binary-search.
+   */
+  private void sortEachItems() {
+    char[] tmpArray;
+    int tmpFreq;
+    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
+      if (wordItem_charArrayTable[i] != null
+          && wordItem_charArrayTable[i].length > 1) {
+        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
+          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
+            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+                wordItem_charArrayTable[i][j2], 0) > 0) {
+              tmpArray = wordItem_charArrayTable[i][j];
+              tmpFreq = wordItem_frequencyTable[i][j];
+              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
+              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
+              wordItem_charArrayTable[i][j2] = tmpArray;
+              wordItem_frequencyTable[i][j2] = tmpFreq;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Computes the hash-table position character c should occupy and
+   * initializes that slot in the index tables.
+   * 
+   * @param c the character to index
+   * @param j the group index (into the wordItem_* tables) for c
+   * @return true if a slot was found and written, false if the table is full
+   */
+  private boolean setTableIndex(char c, int j) {
+    int index = getAvaliableTableIndex(c);
+    if (index != -1) {
+      charIndexTable[index] = c;
+      wordIndexTable[index] = (short) j;
+      return true;
+    } else
+      return false;
+  }
+
+  /**
+   * Double-hash probe for an index-table slot that is either empty (0) or
+   * already holds c.  hash1/hash2 are inherited from AbstractDictionary
+   * (not visible here).
+   *
+   * @return a usable slot, or -1 if the table is full
+   */
+  private short getAvaliableTableIndex(char c) {
+    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
+    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
+    // Java's % can yield negatives; shift into [0, PRIME_INDEX_LENGTH).
+    if (hash1 < 0)
+      hash1 = PRIME_INDEX_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_INDEX_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (charIndexTable[index] != 0 && charIndexTable[index] != c
+        && i < PRIME_INDEX_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
+      i++;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_INDEX_LENGTH
+        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
+      return (short) index;
+    } else
+      return -1;
+  }
+
+  /**
+   * Looks up the index-table slot for character c (double-hash probing,
+   * verified against charIndexTable).
+   *
+   * @param c the character to look up
+   * @return the slot index, or -1 if c is not indexed
+   */
+  private short getWordItemTableIndex(char c) {
+    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
+    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
+    // Java's % can yield negatives; shift into [0, PRIME_INDEX_LENGTH).
+    if (hash1 < 0)
+      hash1 = PRIME_INDEX_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_INDEX_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (charIndexTable[index] != 0 && charIndexTable[index] != c
+        && i < PRIME_INDEX_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
+      i++;
+    }
+
+    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
+      return (short) index;
+    } else
+      return -1;
+  }
+
+  /**
+   * Searches the dictionary for the word whose char array is charArray and
+   * returns its position within the word list of its leading character.
+   * 
+   * @param charArray char array of the word to find
+   * @return position in the word array, or -1 if not found
+   */
+  private int findInTable(char[] charArray) {
+    if (charArray == null || charArray.length == 0)
+      return -1;
+    short index = getWordItemTableIndex(charArray[0]);
+    if (index == -1)
+      return -1;
+
+    return findInTable(index, charArray);
+
+  }
+
+  /**
+   * Searches the dictionary for the word whose char array is charArray and
+   * returns its position within its group's word list.
+   * 
+   * @param knownHashIndex hash-table position of the word's first character
+   *        charArray[0]; if not already computed, use int
+   *        findInTable(char[] charArray) instead
+   * @param charArray char array of the word to find
+   * @return position in the word array, or -1 if not found
+   */
+  private int findInTable(short knownHashIndex, char[] charArray) {
+    if (charArray == null || charArray.length == 0)
+      return -1;
+
+    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
+    int start = 0, end = items.length - 1;
+    int mid = (start + end) / 2, cmpResult;
+
+    // Binary search for the index of idArray
+    while (start <= end) {
+      // Comparison starts at charArray offset 1: the leading character is
+      // the group key and is not part of the stored entry.
+      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
+
+      if (cmpResult == 0)
+        return mid;// find it
+      else if (cmpResult < 0)
+        start = mid + 1;
+      else if (cmpResult > 0)
+        end = mid - 1;
+
+      mid = (start + end) / 2;
+    }
+    return -1;
+  }
+
+  /**
+   * Reports whether the word given by charArray appears in the dictionary.
+   * 
+   * @param charArray the word to test
+   * @return true if present, false otherwise
+   */
+  public boolean isExist(char[] charArray) {
+    return findInTable(charArray) != -1;
+  }
+
+  /**
+   * @see #getPrefixMatch(char[], int)
+   * @param charArray the prefix word
+   * @return position of the first word with that prefix, or -1
+   */
+  public int getPrefixMatch(char[] charArray) {
+    return getPrefixMatch(charArray, 0);
+  }
+
+  /**
+   * Searches the dictionary for words having the word in charArray as a
+   * prefix and returns the position of the first match.  To reduce the
+   * search cost, a known starting position may be supplied; if the starting
+   * position is unknown, the default is 0.
+   * 
+   * @see #getPrefixMatch(char[])
+   * @param charArray the prefix word
+   * @param knownStart known starting position
+   * @return position of the first word satisfying the prefix condition
+   */
+  public int getPrefixMatch(char[] charArray, int knownStart) {
+    short index = getWordItemTableIndex(charArray[0]);
+    if (index == -1)
+      return -1;
+    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
+    int start = knownStart, end = items.length - 1;
+
+    int mid = (start + end) / 2, cmpResult;
+
+    // Binary search for the index of idArray
+    while (start <= end) {
+      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
+      if (cmpResult == 0) {
+        // Get the first item which match the current word
+        while (mid >= 0
+            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
+          mid--;
+        mid++;
+        return mid;// first word prefixed with charArray
+      } else if (cmpResult < 0)
+        end = mid - 1;
+      else
+        start = mid + 1;
+      mid = (start + end) / 2;
+    }
+    return -1;
+  }
+
+  /**
+   * Returns the frequency of the word given by charArray, summed over all
+   * part-of-speech tags (POS is not tracked in this structure), or 0 if the
+   * word is unknown.
+   * 
+   * @param charArray char array of the word
+   * @return the word's frequency
+   */
+  public int getFrequency(char[] charArray) {
+    short hashIndex = getWordItemTableIndex(charArray[0]);
+    if (hashIndex == -1)
+      return 0;
+    int itemIndex = findInTable(hashIndex, charArray);
+    if (itemIndex != -1)
+      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
+    return 0;
+
+  }
+
+  /**
+   * Tests whether charArray matches the dictionary entry at itemIndex in the
+   * group of charArray[0] -- i.e. whether looking up charArray would yield
+   * itemIndex.  NOTE(review): if charArray[0] is not indexed,
+   * getWordItemTableIndex returns -1 and the array access below throws.
+   * 
+   * @param charArray the word; element 0 selects the dictionary group
+   * @param itemIndex position within the group
+   * @return true if they are equal
+   */
+  public boolean isEqual(char[] charArray, int itemIndex) {
+    short hashIndex = getWordItemTableIndex(charArray[0]);
+    return Utility.compareArray(charArray, 1,
+        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
+  }
+
+  /**
+   * Manual smoke test: loads from a hard-coded Windows path and probes
+   * Utility.getCharType with assorted characters (results are discarded).
+   */
+  public static void main(String[] args) throws FileNotFoundException,
+      IOException {
+    CopyOfWordDictionary dic = new CopyOfWordDictionary();
+    dic.load("D:/analysis-data");
+    Utility.getCharType('。');
+    Utility.getCharType('汗');
+    Utility.getCharType(' ');// 0020
+    Utility.getCharType(' ');// 3000
+    Utility.getCharType('');// E095
+    Utility.getCharType(' ');// 3000
+    Utility.getCharType('\r');// 000D
+    Utility.getCharType('\n');// 000A
+    Utility.getCharType('\t');// 0009
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/CopyOfWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Thu May 14 10:09:22 2009
@@ -0,0 +1,193 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.util.List;
+
+import org.apache.lucene.analysis.cn.smart.CharType;
+import org.apache.lucene.analysis.cn.smart.Utility;
+import org.apache.lucene.analysis.cn.smart.WordType;
+
+/**
+ * Hidden-Markov-Model based segmenter: builds a graph of all candidate
+ * tokens for a sentence ({@link #createSegGraph(String)}), converts it to a
+ * bigram graph and returns the shortest path through it as the segmentation.
+ */
+public class HHMMSegmenter {
+
+  private static WordDictionary wordDict = WordDictionary.getInstance();
+
+  /**
+   * Find every possible token in the sentence, then append two sentinel
+   * tokens: "始##始" (sentence begin, startOffset -1) and "末##末"
+   * (sentence end, startOffset = sentence length).
+   * 
+   * @param sentence input sentence, without the "始##始" / "末##末" markers
+   * @return graph containing all candidate tokens
+   */
+  private SegGraph createSegGraph(String sentence) {
+    int i = 0, j;
+    int length = sentence.length();
+    int foundIndex;
+    int[] charTypeArray = getCharTypes(sentence);
+    StringBuffer wordBuf = new StringBuffer();
+    SegToken token;
+    int frequency = 0; // occurrence frequency of the current word
+    boolean hasFullWidth;
+    int wordType;
+    char[] charArray;
+
+    SegGraph segGraph = new SegGraph();
+    while (i < length) {
+      hasFullWidth = false;
+      switch (charTypeArray[i]) {
+        case CharType.SPACE_LIKE:
+          i++;
+          break;
+        case CharType.HANZI:
+          j = i + 1;
+          wordBuf.delete(0, wordBuf.length());
+          // Always add the single Hanzi as a token, whether or not it forms
+          // a word by itself; otherwise the segmentation graph could have
+          // gaps that break the path search.
+          wordBuf.append(sentence.charAt(i));
+          charArray = new char[] { sentence.charAt(i) };
+          frequency = wordDict.getFrequency(charArray);
+          token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+              frequency);
+          segGraph.addToken(token);
+
+          // Grow the word one Hanzi at a time while it is still a prefix of
+          // some dictionary entry.
+          foundIndex = wordDict.getPrefixMatch(charArray);
+          while (j <= length && foundIndex != -1) {
+            if (wordDict.isEqual(charArray, foundIndex) && charArray.length > 1) {
+              // Exact dictionary word from i to j (multi-char words only;
+              // the single-char token was already added above).
+              frequency = wordDict.getFrequency(charArray);
+              token = new SegToken(charArray, i, j, WordType.CHINESE_WORD,
+                  frequency);
+              segGraph.addToken(token);
+            }
+
+            while (j < length && charTypeArray[j] == CharType.SPACE_LIKE)
+              j++;
+
+            if (j < length && charTypeArray[j] == CharType.HANZI) {
+              wordBuf.append(sentence.charAt(j));
+              charArray = new char[wordBuf.length()];
+              wordBuf.getChars(0, charArray.length, charArray, 0);
+              // charArray was already found as a prefix (foundIndex != -1),
+              // so the extended charArray can only occur at or after
+              // foundIndex; resume the search from there.
+              foundIndex = wordDict.getPrefixMatch(charArray, foundIndex);
+              j++;
+            } else {
+              break;
+            }
+          }
+          i++;
+          break;
+        case CharType.FULLWIDTH_LETTER:
+          hasFullWidth = true;
+          // intentional fall-through: fullwidth letters are handled as LETTER
+        case CharType.LETTER:
+          j = i + 1;
+          while (j < length
+              && (charTypeArray[j] == CharType.LETTER || charTypeArray[j] == CharType.FULLWIDTH_LETTER)) {
+            if (charTypeArray[j] == CharType.FULLWIDTH_LETTER)
+              hasFullWidth = true;
+            j++;
+          }
+          // Token of type LETTER spanning i..j. Note the token text is the
+          // generic placeholder STRING_CHAR_ARRAY, not the actual letters —
+          // presumably so all letter runs share one dictionary entry.
+          charArray = Utility.STRING_CHAR_ARRAY;
+          frequency = wordDict.getFrequency(charArray);
+          wordType = hasFullWidth ? WordType.FULLWIDTH_STRING : WordType.STRING;
+          token = new SegToken(charArray, i, j, wordType, frequency);
+          segGraph.addToken(token);
+          i = j;
+          break;
+        case CharType.FULLWIDTH_DIGIT:
+          hasFullWidth = true;
+          // intentional fall-through: fullwidth digits are handled as DIGIT
+        case CharType.DIGIT:
+          j = i + 1;
+          while (j < length
+              && (charTypeArray[j] == CharType.DIGIT || charTypeArray[j] == CharType.FULLWIDTH_DIGIT)) {
+            if (charTypeArray[j] == CharType.FULLWIDTH_DIGIT)
+              hasFullWidth = true;
+            j++;
+          }
+          // Token of type NUMBER spanning i..j (placeholder text, see above).
+          charArray = Utility.NUMBER_CHAR_ARRAY;
+          frequency = wordDict.getFrequency(charArray);
+          wordType = hasFullWidth ? WordType.FULLWIDTH_NUMBER : WordType.NUMBER;
+          token = new SegToken(charArray, i, j, wordType, frequency);
+          segGraph.addToken(token);
+          i = j;
+          break;
+        case CharType.DELIMITER:
+          j = i + 1;
+          // No need to look up a weight for punctuation; just use the
+          // maximum frequency.
+          frequency = Utility.MAX_FREQUENCE;
+          charArray = new char[] { sentence.charAt(i) };
+          token = new SegToken(charArray, i, j, WordType.DELIMITER, frequency);
+          segGraph.addToken(token);
+          i = j;
+          break;
+        default:
+          j = i + 1;
+          // Treat unrecognized characters (e.g. outside GB2312) as unknown
+          // strings, one character at a time.
+          charArray = Utility.STRING_CHAR_ARRAY;
+          frequency = wordDict.getFrequency(charArray);
+          token = new SegToken(charArray, i, j, WordType.STRING, frequency);
+          segGraph.addToken(token);
+          i = j;
+          break;
+      }
+    }
+
+    // Add the two sentinel tokens: "始##始" (begin) ...
+    charArray = Utility.START_CHAR_ARRAY;
+    frequency = wordDict.getFrequency(charArray);
+    token = new SegToken(charArray, -1, 0, WordType.SENTENCE_BEGIN, frequency);
+    segGraph.addToken(token);
+
+    // ... and "末##末" (end).
+    charArray = Utility.END_CHAR_ARRAY;
+    frequency = wordDict.getFrequency(charArray);
+    token = new SegToken(charArray, length, length + 1, WordType.SENTENCE_END,
+        frequency);
+    segGraph.addToken(token);
+
+    return segGraph;
+  }
+
+  /**
+   * Determine the character type of every character in the sentence.
+   * 
+   * @see Utility#getCharType(char)
+   * @param sentence complete input sentence
+   * @return per-character type array, same length as the sentence
+   */
+  private static int[] getCharTypes(String sentence) {
+    int length = sentence.length();
+    int[] charTypeArray = new int[length];
+    // One type per character, as assigned by Utility.getCharType.
+    for (int i = 0; i < length; i++) {
+      charTypeArray[i] = Utility.getCharType(sentence.charAt(i));
+    }
+
+    return charTypeArray;
+  }
+
+  /**
+   * Segment a sentence: build the token graph, convert it to a bigram graph
+   * and return the tokens on the shortest path.
+   * 
+   * @param sentence input sentence
+   * @return list of SegToken on the best segmentation path
+   */
+  public List process(String sentence) {
+    SegGraph segGraph = createSegGraph(sentence);
+    BiSegGraph biSegGraph = new BiSegGraph(segGraph);
+    List shortPath = biSegGraph.getShortPath();
+    return shortPath;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Thu May 14 10:09:22 2009
@@ -0,0 +1,33 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+/**
+ * A node on a segmentation path: the accumulated weight and the index of
+ * the predecessor node. Ordered by weight.
+ *
+ * NOTE(review): equals/hashCode are not overridden, so compareTo is not
+ * consistent with equals; callers should not rely on equality semantics.
+ */
+public class PathNode implements Comparable {
+  public double weight;
+
+  public int preNode;
+
+  /**
+   * Order nodes by weight. Double.compare is used instead of chained
+   * comparisons so NaN weights still yield a total, consistent order.
+   *
+   * @param p the PathNode to compare against
+   * @return negative, zero or positive as this weight is less than, equal
+   *         to or greater than p's weight
+   */
+  public int compareTo(Object p) {
+    PathNode pn = (PathNode) p;
+    return Double.compare(weight, pn.weight);
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Thu May 14 10:09:22 2009
@@ -0,0 +1,144 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Graph of candidate tokens for one sentence: tokens are bucketed by their
+ * startOffset, so all tokens beginning at the same position share a list.
+ */
+public class SegGraph {
+
+  /**
+   * Map from startOffset (Integer key) to the ArrayList of tokens that
+   * start at that offset.
+   */
+  private Map tokenListTable = new HashMap();
+
+  // Largest startOffset seen so far; -1 while the graph is empty.
+  private int maxStart = -1;
+
+  /**
+   * Check whether any token starts at offset s; false means no token has
+   * been added there (yet).
+   * 
+   * @param s startOffset
+   * @return true if at least one token starts at s
+   */
+  public boolean isStartExist(int s) {
+    return tokenListTable.get(new Integer(s)) != null;
+  }
+
+  /**
+   * Fetch all tokens starting at offset s.
+   * 
+   * @param s startOffset
+   * @return list of tokens sharing that startOffset, or null if none
+   */
+  public List getStartList(int s) {
+    return (List) tokenListTable.get(new Integer(s));
+  }
+
+  public int getMaxStart() {
+    return maxStart;
+  }
+
+  /**
+   * Assign a global index to every token in the graph: indices start at 0
+   * and increase with startOffset; tokens sharing a startOffset keep their
+   * insertion order.
+   */
+  public List makeIndex() {
+    List result = new ArrayList();
+    int s = -1, count = 0, size = tokenListTable.size();
+    List tokenList;
+    short index = 0;
+    // Walk offsets upward until every bucket (size of the map) is visited;
+    // s starts at -1 because the begin sentinel token starts there.
+    while (count < size) {
+      if (isStartExist(s)) {
+        tokenList = (List) tokenListTable.get(new Integer(s));
+        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
+          SegToken st = (SegToken) iter.next();
+          st.index = index;
+          result.add(st);
+          index++;
+        }
+        count++;
+      }
+      s++;
+    }
+    return result;
+  }
+
+  /**
+   * Add a token to the graph; tokens with the same startOffset are grouped
+   * into the same list.
+   * 
+   * @param token token to add
+   */
+  public void addToken(SegToken token) {
+    int s = token.startOffset;
+    if (!isStartExist(s)) {
+      ArrayList newlist = new ArrayList();
+      newlist.add(token);
+      tokenListTable.put((Object) (new Integer(s)), newlist);
+    } else {
+      List tokenList = (List) tokenListTable.get((Object) (new Integer(s)));
+      tokenList.add(token);
+    }
+    if (s > maxStart)
+      maxStart = s;
+  }
+
+  /**
+   * Number of distinct start positions in the graph. Each position can hold
+   * several tokens, so this is generally smaller than the token count.
+   * 
+   * @return number of start positions
+   */
+  public int getStartCount() {
+    return tokenListTable.size();
+  }
+
+  /**
+   * Flatten the graph into a single list of tokens ordered by startOffset
+   * (ascending), preserving insertion order within each offset.
+   * 
+   * @return all tokens, ordered by start position
+   */
+  public List toTokenList() {
+    List result = new ArrayList();
+    int s = -1, count = 0, size = tokenListTable.size();
+    List tokenList;
+
+    while (count < size) {
+      if (isStartExist(s)) {
+        tokenList = (List) tokenListTable.get(new Integer(s));
+        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
+          SegToken st = (SegToken) iter.next();
+          result.add(st);
+        }
+        count++;
+      }
+      s++;
+    }
+    return result;
+  }
+
+  public String toString() {
+    List tokenList = this.toTokenList();
+    StringBuffer sb = new StringBuffer();
+    for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
+      SegToken t = (SegToken) iter.next();
+      sb.append(t + "\n");
+    }
+    return sb.toString();
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java Thu May 14 10:09:22 2009
@@ -0,0 +1,64 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+/**
+ * One candidate token produced during segmentation: its text, the character
+ * offsets it spans in the sentence, its word type and its weight
+ * (dictionary frequency).
+ */
+public class SegToken {
+  // Token text as characters.
+  public char[] charArray;
+
+  // Offset of the first character of this token in the sentence.
+  public int startOffset;
+
+  // Offset just past the last character of this token.
+  public int endOffset;
+
+  // One of the WordType constants.
+  public int wordType;
+
+  // Weight of the token, typically its dictionary frequency.
+  public int weight;
+
+  // Global index assigned by SegGraph.makeIndex().
+  public int index;
+
+  public SegToken(String word, int start, int end, int wordType, int weight) {
+    this.charArray = word.toCharArray();
+    this.startOffset = start;
+    this.endOffset = end;
+    this.wordType = wordType;
+    this.weight = weight;
+  }
+
+  public SegToken(char[] idArray, int start, int end, int wordType, int weight) {
+    this.charArray = idArray;
+    this.startOffset = start;
+    this.endOffset = end;
+    this.wordType = wordType;
+    this.weight = weight;
+  }
+
+  // public String toString() {
+  // return String.valueOf(charArray) + "/s(" + startOffset + ")e("
+  // + endOffset + ")/w(" + weight + ")t(" + wordType + ")";
+  // }
+
+  /**
+   * Two tokens are equal iff their offsets are equal, since then they cover
+   * the same text of the original sentence; pos and weight can each take
+   * several values from the dictionary (one-to-many), so a single token
+   * suffices. (Kept as commented-out code from the original import.)
+   * 
+   * @param t token to compare with
+   * @return whether the offsets match
+   */
+  // public boolean equals(RawToken t) {
+  // return this.startOffset == t.startOffset
+  // && this.endOffset == t.endOffset;
+  // }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegToken.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java Thu May 14 10:09:22 2009
@@ -0,0 +1,50 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import org.apache.lucene.analysis.cn.smart.Utility;
+import org.apache.lucene.analysis.cn.smart.WordType;
+
+/**
+ * Normalizes tokens in place after segmentation: folds fullwidth digits and
+ * letters to their halfwidth (ASCII) forms, lowercases ASCII letters, and
+ * replaces delimiter text with a common delimiter placeholder.
+ */
+public class SegTokenFilter {
+
+  /**
+   * Normalize the given token according to its word type. The token's
+   * charArray is mutated in place and the same instance is returned.
+   * 
+   * @param token token to normalize
+   * @return the (mutated) token
+   */
+  public SegToken filter(SegToken token) {
+    switch (token.wordType) {
+      case WordType.FULLWIDTH_NUMBER:
+      case WordType.FULLWIDTH_STRING:
+        for (int i = 0; i < token.charArray.length; i++) {
+          // Fullwidth forms (U+FF10 and up) map to ASCII by subtracting
+          // 0xFEE0, e.g. '１' (U+FF11) -> '1' (U+0031).
+          if (token.charArray[i] >= 0xFF10)
+            token.charArray[i] -= 0xFEE0;
+
+          // Lowercase ASCII uppercase letters ('A'..'Z' -> 'a'..'z').
+          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+            token.charArray[i] += 0x0020;
+        }
+        break;
+      case WordType.STRING:
+        for (int i = 0; i < token.charArray.length; i++) {
+          // Lowercase ASCII uppercase letters only.
+          if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A)
+            token.charArray[i] += 0x0020;
+        }
+        break;
+      case WordType.DELIMITER:
+        // All delimiters collapse to one shared placeholder.
+        token.charArray = Utility.COMMON_DELIMITER;
+        break;
+      default:
+        break;
+    }
+    return token;
+  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java Thu May 14 10:09:22 2009
@@ -0,0 +1,48 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+/**
+ * An edge in the bigram graph: a pair of adjacent tokens with the weight of
+ * the transition between them.
+ */
+public class SegTokenPair {
+
+  // Combined text of the token pair.
+  public char[] charArray;
+
+  /**
+   * from and to are the indices of the pair's two tokens, i.e. their
+   * positions in the SegGraph.
+   */
+  public int from;
+
+  public int to;
+
+  // Transition weight between the two tokens.
+  public double weight;
+
+  public SegTokenPair(char[] idArray, int from, int to, double weight) {
+    this.charArray = idArray;
+    this.from = from;
+    this.to = to;
+    this.weight = weight;
+  }
+
+  // public String toString() {
+  // return String.valueOf(charArray) + ":f(" + from + ")t(" + to + "):"
+  // + weight;
+  // }
+
+  // public boolean equals(SegTokenPair tp) {
+  // return this.from == tp.from && this.to == tp.to;
+  // }
+
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegTokenPair.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java (added)
+++ lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java Thu May 14 10:09:22 2009
@@ -0,0 +1,568 @@
+/**
+ * Copyright 2009 www.imdict.net
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart.hhmm;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
+import java.io.RandomAccessFile;
+import java.io.UnsupportedEncodingException;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+
+import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
+import org.apache.lucene.analysis.cn.smart.Utility;
+
+public class WordDictionary extends AbstractDictionary {
+
+  private WordDictionary() {
+  }
+
+  private static WordDictionary singleInstance;
+
+  /**
+   * 一个较大的素数,保证hash查找能够遍历所有位置
+   */
+  public static final int PRIME_INDEX_LENGTH = 12071;
+
+  /**
+   * wordIndexTable保证将Unicode中的所有汉字编码hash到PRIME_INDEX_LENGTH长度的数组中,
+   * 当然会有冲突,但实际上本程序只处理GB2312字符部分,6768个字符加上一些ASCII字符,
+   * 因此对这些字符是有效的,为了保证比较的准确性,保留原来的字符在charIndexTable中以确定查找的准确性
+   */
+  private short[] wordIndexTable;
+
+  private char[] charIndexTable;
+
+  /**
+   * 存储所有词库的真正数据结构,为了避免占用空间太多,用了两个单独的多维数组来存储词组和频率。
+   * 每个词放在一个char[]中,每个char对应一个汉字或其他字符,每个频率放在一个int中,
+   * 这两个数组的前两个下表是一一对应的。因此可以利用wordItem_charArrayTable[i][j]来查词,
+   * 用wordItem_frequencyTable[i][j]来查询对应的频率
+   */
+  private char[][][] wordItem_charArrayTable;
+
+  private int[][] wordItem_frequencyTable;
+
+  // static Logger log = Logger.getLogger(WordDictionary.class);
+
+  /**
+   * Lazily create and return the singleton dictionary. First tries to load
+   * the serialized coredict.mem from the classpath; on IOException it falls
+   * back to loading from the data directory given by
+   * AnalyzerProfile.ANALYSIS_DATA_DIR. A missing serialized class is fatal.
+   * 
+   * @return the shared WordDictionary instance
+   */
+  public synchronized static WordDictionary getInstance() {
+    if (singleInstance == null) {
+      singleInstance = new WordDictionary();
+      try {
+        singleInstance.load();
+      } catch (IOException e) {
+        // Classpath resource unavailable: fall back to the external data dir.
+        String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
+        singleInstance.load(wordDictRoot);
+      } catch (ClassNotFoundException e) {
+        throw new RuntimeException(e);
+      }
+    }
+    return singleInstance;
+  }
+
+  /**
+   * Load the dictionary from the directory dctFileRoot. If a serialized
+   * coredict.mem exists there it is loaded directly; otherwise the raw
+   * dictionary source coredict.dct is parsed, post-processed, and a
+   * coredict.mem cache is written back for the next startup.
+   * 
+   * @param dctFileRoot directory containing coredict.dct / coredict.mem
+   */
+  public void load(String dctFileRoot) {
+    String dctFilePath = dctFileRoot + "/coredict.dct";
+    File serialObj = new File(dctFileRoot + "/coredict.mem");
+
+    // Fast path: the pre-built serialized dictionary loads successfully.
+    if (!(serialObj.exists() && loadFromObj(serialObj))) {
+      try {
+        wordIndexTable = new short[PRIME_INDEX_LENGTH];
+        charIndexTable = new char[PRIME_INDEX_LENGTH];
+        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
+          charIndexTable[i] = 0;
+          wordIndexTable[i] = -1;
+        }
+        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
+        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
+        loadMainDataFromFile(dctFilePath);
+        expandDelimiterData();
+        mergeSameWords();
+        sortEachItems();
+      } catch (IOException e) {
+        // Preserve the underlying exception as the cause rather than
+        // keeping only its message, so the stack trace stays diagnosable.
+        throw new RuntimeException(e);
+      }
+
+      // Cache the parsed dictionary; failures are tolerated (see saveToObj).
+      saveToObj(serialObj);
+    }
+
+  }
+
+  /**
+   * Load the dictionary from inside the jar: requires a coredict.mem
+   * resource next to the WordDictionary class, which is deserialized
+   * directly.
+   * 
+   * @throws ClassNotFoundException if the serialized classes are unavailable
+   * @throws IOException if the resource is missing or unreadable
+   */
+  public void load() throws IOException, ClassNotFoundException {
+    InputStream input = this.getClass().getResourceAsStream("coredict.mem");
+    loadFromObjectInputStream(input);
+  }
+
+  /**
+   * Best-effort load of the serialized dictionary from a file.
+   * 
+   * @param serialObj the coredict.mem file
+   * @return true on success; false if the file could not be read or
+   *         deserialized (the error is only printed, so the caller can
+   *         fall back to parsing the raw dictionary)
+   */
+  private boolean loadFromObj(File serialObj) {
+    try {
+      loadFromObjectInputStream(new FileInputStream(serialObj));
+      return true;
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+    } catch (IOException e) {
+      e.printStackTrace();
+    } catch (ClassNotFoundException e) {
+      e.printStackTrace();
+    }
+    return false;
+  }
+
+  /**
+   * Deserialize the four dictionary arrays (index tables, word table,
+   * frequency table) from the given stream, in that fixed order. The stream
+   * is always closed, even when deserialization fails.
+   * 
+   * @param serialObjectInputStream stream positioned at serialized data
+   * @throws IOException on read failure
+   * @throws ClassNotFoundException if a serialized class is unavailable
+   */
+  private void loadFromObjectInputStream(InputStream serialObjectInputStream)
+      throws IOException, ClassNotFoundException {
+    ObjectInputStream input = new ObjectInputStream(serialObjectInputStream);
+    try {
+      wordIndexTable = (short[]) input.readObject();
+      charIndexTable = (char[]) input.readObject();
+      wordItem_charArrayTable = (char[][][]) input.readObject();
+      wordItem_frequencyTable = (int[][]) input.readObject();
+      // log.info("load core dict from serialization.");
+    } finally {
+      // Close in finally so the stream does not leak if readObject throws.
+      input.close();
+    }
+  }
+
+  /**
+   * Best-effort serialization of the four dictionary arrays to the given
+   * file, in the same order loadFromObjectInputStream reads them. Any
+   * failure is deliberately swallowed: the cache is an optimization and
+   * must not break dictionary loading.
+   * 
+   * @param serialObj destination coredict.mem file
+   */
+  private void saveToObj(File serialObj) {
+    try {
+      ObjectOutputStream output = new ObjectOutputStream(new FileOutputStream(
+          serialObj));
+      output.writeObject(wordIndexTable);
+      output.writeObject(charIndexTable);
+      output.writeObject(wordItem_charArrayTable);
+      output.writeObject(wordItem_frequencyTable);
+      output.close();
+      // log.info("serialize core dict.");
+    } catch (Exception e) {
+      // Intentionally ignored: caching is best-effort only.
+      // log.warn(e.getMessage());
+    }
+  }
+
+  /**
+   * Load the raw dictionary file into WordDictionary's data structures.
+   * This only loads; merging and post-processing happen elsewhere.
+   * 
+   * @param dctFilePath path of the raw coredict.dct file
+   * @return total number of word items loaded
+   * @throws FileNotFoundException if the file does not exist
+   * @throws IOException on read failure
+   * @throws UnsupportedEncodingException if GB2312 decoding is unsupported
+   */
+  private int loadMainDataFromFile(String dctFilePath)
+      throws FileNotFoundException, IOException, UnsupportedEncodingException {
+    int i, cnt, length, total = 0;
+    // The file covers 6763 Hanzi plus 5 empty slots (3756~3760); slot 3756
+    // holds the punctuation information.
+    int[] buffer = new int[3];
+    byte[] intBuffer = new byte[4];
+    String tmpword;
+    // NOTE(review): return values of dctFile.read(...) are ignored below
+    // and the file is not closed in a finally block — assumes a complete,
+    // well-formed dictionary file; confirm before hardening.
+    RandomAccessFile dctFile = new RandomAccessFile(dctFilePath, "r");
+
+    // In the dictionary file the first Hanzi is at position 0, the last at
+    // 6768.
+    for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      // if (i == 5231)
+      // System.out.println(i);
+
+      dctFile.read(intBuffer);// the dictionary was produced by a C program,
+      // so ints are little-endian; Java is big-endian, so decode explicitly
+      cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
+      if (cnt <= 0) {
+        wordItem_charArrayTable[i] = null;
+        wordItem_frequencyTable[i] = null;
+        continue;
+      }
+      wordItem_charArrayTable[i] = new char[cnt][];
+      wordItem_frequencyTable[i] = new int[cnt];
+      total += cnt;
+      int j = 0;
+      while (j < cnt) {
+        // Each record is three little-endian ints (frequency, length,
+        // handle) followed by `length` GB2312-encoded bytes of word text.
+        dctFile.read(intBuffer);
+        buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// frequency
+        dctFile.read(intBuffer);
+        buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// length
+        dctFile.read(intBuffer);
+        buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
+            .getInt();// handle
+
+        wordItem_frequencyTable[i][j] = buffer[0];
+
+        length = buffer[1];
+        if (length > 0) {
+          byte[] lchBuffer = new byte[length];
+          dctFile.read(lchBuffer);
+          tmpword = new String(lchBuffer, "GB2312");
+          wordItem_charArrayTable[i][j] = tmpword.toCharArray();
+        } else {
+          wordItem_charArrayTable[i][j] = null;
+        }
+        j++;
+      }
+
+      // Record the hash-table mapping from the row's head character to i.
+      String str = getCCByGB2312Id(i);
+      setTableIndex(str.charAt(0), i);
+    }
+    dctFile.close();
+    return total;
+  }
+
+  /**
+   * The raw dictionary stores all punctuation information merged into one
+   * list (slot 3755, counting from 1). Expand it here, distributing each
+   * group into the list of its own punctuation character.
+   */
+  private void expandDelimiterData() {
+    int i;
+    int cnt;
+    // Punctuation lives at offset 3755 (from 1); split that combined row
+    // into per-delimiter rows.
+    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
+    i = 0;
+    // NOTE(review): if wordItem_charArrayTable[j] is already non-null, i is
+    // never advanced in that iteration and the loop would not terminate —
+    // presumably this never happens with the shipped dictionary; confirm.
+    while (i < wordItem_charArrayTable[delimiterIndex].length) {
+      char c = wordItem_charArrayTable[delimiterIndex][i][0];
+      int j = getGB2312Id(c);// row index this delimiter belongs to
+      if (wordItem_charArrayTable[j] == null) {
+
+        int k = i;
+        // Starting at i, count the consecutive word items that begin with c.
+        while (k < wordItem_charArrayTable[delimiterIndex].length
+            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
+          k++;
+        }
+        // Now k-i is the number of word items for delimiter id j.
+        cnt = k - i;
+        if (cnt != 0) {
+          wordItem_charArrayTable[j] = new char[cnt][];
+          wordItem_frequencyTable[j] = new int[cnt];
+        }
+
+        // Copy each word item, dropping its leading delimiter character.
+        for (k = 0; k < cnt; k++, i++) {
+          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
+          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
+          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
+              wordItem_charArrayTable[j][k], 0,
+              wordItem_charArrayTable[j][k].length);
+        }
+        setTableIndex(c, j);
+      }
+    }
+    // Drop the original combined punctuation row.
+    wordItem_charArrayTable[delimiterIndex] = null;
+    wordItem_frequencyTable[delimiterIndex] = null;
+  }
+
+  /**
+   * This analyzer does no part-of-speech tagging, so frequencies of the same
+   * word listed under different POS tags are merged into a single entry,
+   * reducing memory use and speeding up lookups. Assumes each row lists
+   * duplicate words adjacently.
+   */
+  private void mergeSameWords() {
+    int i;
+    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
+      if (wordItem_charArrayTable[i] == null)
+        continue;
+      // First pass: count distinct words in this row.
+      int len = 1;
+      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
+        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+            wordItem_charArrayTable[i][j - 1], 0) != 0)
+          len++;
+
+      }
+      if (len < wordItem_charArrayTable[i].length) {
+        // Second pass: copy distinct words, summing frequencies of
+        // adjacent duplicates.
+        char[][] tempArray = new char[len][];
+        int[] tempFreq = new int[len];
+        int k = 0;
+        tempArray[0] = wordItem_charArrayTable[i][0];
+        tempFreq[0] = wordItem_frequencyTable[i][0];
+        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
+          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+              tempArray[k], 0) != 0) {
+            k++;
+            tempArray[k] = wordItem_charArrayTable[i][j];
+            tempFreq[k] = wordItem_frequencyTable[i][j];
+          } else {
+            // Duplicate word: accumulate its frequency.
+            tempFreq[k] += wordItem_frequencyTable[i][j];
+          }
+        }
+        wordItem_charArrayTable[i] = tempArray;
+        wordItem_frequencyTable[i] = tempFreq;
+      }
+    }
+  }
+
+  private void sortEachItems() {
+    char[] tmpArray;
+    int tmpFreq;
+    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
+      if (wordItem_charArrayTable[i] != null
+          && wordItem_charArrayTable[i].length > 1) {
+        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
+          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
+            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
+                wordItem_charArrayTable[i][j2], 0) > 0) {
+              tmpArray = wordItem_charArrayTable[i][j];
+              tmpFreq = wordItem_frequencyTable[i][j];
+              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
+              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
+              wordItem_charArrayTable[i][j2] = tmpArray;
+              wordItem_frequencyTable[i][j2] = tmpFreq;
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * 计算字符c在哈希表中应该在的位置,然后将地址列表中该位置的值初始化
+   * 
+   * @param c
+   * @param j
+   * @return
+   */
+  private boolean setTableIndex(char c, int j) {
+    int index = getAvaliableTableIndex(c);
+    if (index != -1) {
+      charIndexTable[index] = c;
+      wordIndexTable[index] = (short) j;
+      return true;
+    } else
+      return false;
+  }
+
+  private short getAvaliableTableIndex(char c) {
+    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
+    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
+    if (hash1 < 0)
+      hash1 = PRIME_INDEX_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_INDEX_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (charIndexTable[index] != 0 && charIndexTable[index] != c
+        && i < PRIME_INDEX_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
+      i++;
+    }
+    // System.out.println(i - 1);
+
+    if (i < PRIME_INDEX_LENGTH
+        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
+      return (short) index;
+    } else
+      return -1;
+  }
+
+  /**
+   * @param c
+   * @return
+   */
+  private short getWordItemTableIndex(char c) {
+    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
+    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
+    if (hash1 < 0)
+      hash1 = PRIME_INDEX_LENGTH + hash1;
+    if (hash2 < 0)
+      hash2 = PRIME_INDEX_LENGTH + hash2;
+    int index = hash1;
+    int i = 1;
+    while (charIndexTable[index] != 0 && charIndexTable[index] != c
+        && i < PRIME_INDEX_LENGTH) {
+      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
+      i++;
+    }
+
+    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
+      return (short) index;
+    } else
+      return -1;
+  }
+
+  /**
+   * 在字典库中查找单词对应的char数组为charArray的字符串。返回该单词在单词序列中的位置
+   * 
+   * @param charArray 查找单词对应的char数组
+   * @return 单词在单词数组中的位置,如果没找到则返回-1
+   */
+  private int findInTable(char[] charArray) {
+    if (charArray == null || charArray.length == 0)
+      return -1;
+    short index = getWordItemTableIndex(charArray[0]);
+    if (index == -1)
+      return -1;
+
+    return findInTable(index, charArray);
+
+  }
+
+  /**
+   * 在字典库中查找单词对应的char数组为charArray的字符串。返回该单词在单词序列中的位置
+   * 
+   * @param knownHashIndex 已知单词第一个字符charArray[0]在hash表中的位置,如果未计算,可以用函数int
+   *        findInTable(char[] charArray) 代替
+   * @param charArray 查找单词对应的char数组
+   * @return 单词在单词数组中的位置,如果没找到则返回-1
+   */
+  private int findInTable(short knownHashIndex, char[] charArray) {
+    if (charArray == null || charArray.length == 0)
+      return -1;
+
+    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
+    int start = 0, end = items.length - 1;
+    int mid = (start + end) / 2, cmpResult;
+
+    // Binary search for the index of idArray
+    while (start <= end) {
+      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
+
+      if (cmpResult == 0)
+        return mid;// find it
+      else if (cmpResult < 0)
+        start = mid + 1;
+      else if (cmpResult > 0)
+        end = mid - 1;
+
+      mid = (start + end) / 2;
+    }
+    return -1;
+  }
+
+  /**
+   * charArray这个单词对应的词组在不在WordDictionary中出现
+   * 
+   * @param charArray
+   * @return true表示存在,false表示不存在
+   */
+  public boolean isExist(char[] charArray) {
+    return findInTable(charArray) != -1;
+  }
+
  /**
   * Convenience overload of {@link #getPrefixMatch(char[], int)} that starts
   * the search at position 0.
   *
   * @param charArray prefix word (first char is the list selector)
   * @return position of the first word having this prefix, or -1 if none
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }
+
+  /**
+   * 从词典中查找以charArray对应的单词为前缀(prefix)的单词的位置, 并返回第一个满足条件的位置。为了减小搜索代价,
+   * 可以根据已有知识设置起始搜索位置, 如果不知道起始位置,默认是0
+   * 
+   * @see{getPrefixMatch(char[] charArray)}
+   * @param charArray 前缀单词
+   * @param knownStart 已知的起始位置
+   * @return 满足前缀条件的第一个单词的位置
+   */
+  public int getPrefixMatch(char[] charArray, int knownStart) {
+    short index = getWordItemTableIndex(charArray[0]);
+    if (index == -1)
+      return -1;
+    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
+    int start = knownStart, end = items.length - 1;
+
+    int mid = (start + end) / 2, cmpResult;
+
+    // Binary search for the index of idArray
+    while (start <= end) {
+      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
+      if (cmpResult == 0) {
+        // Get the first item which match the current word
+        while (mid >= 0
+            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
+          mid--;
+        mid++;
+        return mid;// 找到第一个以charArray为前缀的单词
+      } else if (cmpResult < 0)
+        end = mid - 1;
+      else
+        start = mid + 1;
+      mid = (start + end) / 2;
+    }
+    return -1;
+  }
+
+  /**
+   * 获取idArray对应的词的词频,若pos为-1则获取所有词性的词频
+   * 
+   * @param charArray 输入的单词对应的charArray
+   * @param pos 词性,-1表示要求求出所有的词性的词频
+   * @return idArray对应的词频
+   */
+  public int getFrequency(char[] charArray) {
+    short hashIndex = getWordItemTableIndex(charArray[0]);
+    if (hashIndex == -1)
+      return 0;
+    int itemIndex = findInTable(hashIndex, charArray);
+    if (itemIndex != -1)
+      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
+    return 0;
+
+  }
+
+  /**
+   * 判断charArray对应的字符串是否跟词典中charArray[0]对应的wordIndex的charArray相等,
+   * 也就是说charArray的位置查找结果是不是就是wordIndex
+   * 
+   * @param charArray 输入的charArray词组,第一个数表示词典中的索引号
+   * @param itemIndex 位置编号
+   * @return 是否相等
+   */
+  public boolean isEqual(char[] charArray, int itemIndex) {
+    short hashIndex = getWordItemTableIndex(charArray[0]);
+    return Utility.compareArray(charArray, 1,
+        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
+  }
+
  /**
   * Ad-hoc manual smoke test: loads the dictionary from a hard-coded local
   * path and probes Utility.getCharType with assorted characters (results are
   * discarded). NOTE(review): depends on a developer-specific Windows path
   * ("D:/analysis-data") and on literal non-ASCII / invisible characters in
   * the source encoding — not suitable for automated builds.
   */
  public static void main(String[] args) throws FileNotFoundException,
      IOException {
    WordDictionary dic = new WordDictionary();
    dic.load("D:/analysis-data");
    Utility.getCharType('。');
    Utility.getCharType('汗');
    Utility.getCharType(' ');// 0020
    Utility.getCharType(' ');// 3000
    Utility.getCharType('');// E095
    Utility.getCharType(' ');// 3000
    Utility.getCharType('\r');// 000D
    Utility.getCharType('\n');// 000A
    Utility.getCharType('\t');// 0009
  }
+}

Propchange: lucene/java/trunk/contrib/analyzers/src/java/org/apache/lucene/analysis/cn/smart/hhmm/WordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/ar/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/bigramdict.mem
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/bigramdict.mem?rev=774718&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/bigramdict.mem
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/coredict.mem
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/coredict.mem?rev=774718&view=auto
==============================================================================
Binary file - no diff available.

Propchange: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/smart/hhmm/coredict.mem
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt?rev=774718&view=auto
==============================================================================
--- lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt (added)
+++ lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt Thu May 14 10:09:22 2009
@@ -0,0 +1,58 @@
+////////// 将标点符号全部去掉 ////////////////
+,
+.
+`
+-
+_
+=
+?
+'
+|
+"
+(
+)
+{
+}
+[
+]
+<
+>
+*
+#
+&
+^
+$
+@
+!
+~
+:
+;
++
+/
+\
+《
+》
+—
+-
+,
+。
+、
+:
+;
+!
+·
+?
+“
+”
+)
+(
+【
+】
+[
+]
+●
+ //中文空格字符
+
+//////////////// 英文停用词 ////////////////
+
+//////////////// 中文停用词 ////////////////

Propchange: lucene/java/trunk/contrib/analyzers/src/resources/org/apache/lucene/analysis/cn/stopwords.txt
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message