lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1230748 [3/5] - in /lucene/dev/trunk: dev-tools/eclipse/ lucene/contrib/ modules/analysis/ modules/analysis/common/src/java/org/apache/lucene/analysis/util/ modules/analysis/common/src/test/org/apache/lucene/analysis/util/ modules/analysis...
Date Thu, 12 Jan 2012 20:10:52 GMT
Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/util/ToStringUtil.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,1023 @@
+package org.apache.lucene.analysis.kuromoji.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+
+/** Static helpers that translate Japanese morphological tags to English and romanize katakana. */
+public class ToStringUtil {
+  // Japanese part-of-speech tag -> English name; only used for reflectWith
+  private static final HashMap<String,String> posTranslations = new HashMap<String,String>();
+  static {
+    posTranslations.put("名詞", "noun");
+    posTranslations.put("名詞-一般", "noun-common");
+    posTranslations.put("名詞-固有名詞", "noun-proper");
+    posTranslations.put("名詞-固有名詞-一般", "noun-proper-misc");
+    posTranslations.put("名詞-固有名詞-人名", "noun-proper-person");
+    posTranslations.put("名詞-固有名詞-人名-一般", "noun-proper-person-misc");
+    posTranslations.put("名詞-固有名詞-人名-姓", "noun-proper-person-surname");
+    posTranslations.put("名詞-固有名詞-人名-名", "noun-proper-person-given_name");
+    posTranslations.put("名詞-固有名詞-組織", "noun-proper-organization");
+    posTranslations.put("名詞-固有名詞-地域", "noun-proper-place");
+    posTranslations.put("名詞-固有名詞-地域-一般", "noun-proper-place-misc");
+    posTranslations.put("名詞-固有名詞-地域-国", "noun-proper-place-country");
+    posTranslations.put("名詞-代名詞", "noun-pronoun");
+    posTranslations.put("名詞-代名詞-一般", "noun-pronoun-misc");
+    posTranslations.put("名詞-代名詞-縮約", "noun-pronoun-contraction");
+    posTranslations.put("名詞-副詞可能", "noun-adverbial");
+    posTranslations.put("名詞-サ変接続", "noun-verbal");
+    posTranslations.put("名詞-形容動詞語幹", "noun-adjective-base");
+    posTranslations.put("名詞-数", "noun-numeric");
+    posTranslations.put("名詞-非自立", "noun-affix");
+    posTranslations.put("名詞-非自立-一般", "noun-affix-misc");
+    posTranslations.put("名詞-非自立-副詞可能", "noun-affix-adverbial");
+    posTranslations.put("名詞-非自立-助動詞語幹", "noun-affix-aux");
+    posTranslations.put("名詞-非自立-形容動詞語幹", "noun-affix-adjective-base");
+    posTranslations.put("名詞-特殊", "noun-special");
+    posTranslations.put("名詞-特殊-助動詞語幹", "noun-special-aux");
+    posTranslations.put("名詞-接尾", "noun-suffix");
+    posTranslations.put("名詞-接尾-一般", "noun-suffix-misc");
+    posTranslations.put("名詞-接尾-人名", "noun-suffix-person");
+    posTranslations.put("名詞-接尾-地域", "noun-suffix-place");
+    posTranslations.put("名詞-接尾-サ変接続", "noun-suffix-verbal");
+    posTranslations.put("名詞-接尾-助動詞語幹", "noun-suffix-aux");
+    posTranslations.put("名詞-接尾-形容動詞語幹", "noun-suffix-adjective-base");
+    posTranslations.put("名詞-接尾-副詞可能", "noun-suffix-adverbial");
+    posTranslations.put("名詞-接尾-助数詞", "noun-suffix-classifier");
+    posTranslations.put("名詞-接尾-特殊", "noun-suffix-special");
+    posTranslations.put("名詞-接続詞的", "noun-suffix-conjunctive");
+    posTranslations.put("名詞-動詞非自立的", "noun-verbal_aux");
+    posTranslations.put("名詞-引用文字列", "noun-quotation");
+    posTranslations.put("名詞-ナイ形容詞語幹", "noun-nai_adjective");
+    posTranslations.put("接頭詞", "prefix");
+    posTranslations.put("接頭詞-名詞接続", "prefix-nominal");
+    posTranslations.put("接頭詞-動詞接続", "prefix-verbal");
+    posTranslations.put("接頭詞-形容詞接続", "prefix-adjectival");
+    posTranslations.put("接頭詞-数接続", "prefix-numerical");
+    posTranslations.put("動詞", "verb");
+    posTranslations.put("動詞-自立", "verb-main");
+    posTranslations.put("動詞-非自立", "verb-auxiliary");
+    posTranslations.put("動詞-接尾", "verb-suffix");
+    posTranslations.put("形容詞", "adjective");
+    posTranslations.put("形容詞-自立", "adjective-main");
+    posTranslations.put("形容詞-非自立", "adjective-auxiliary");
+    posTranslations.put("形容詞-接尾", "adjective-suffix");
+    posTranslations.put("副詞", "adverb");
+    posTranslations.put("副詞-一般", "adverb-misc");
+    posTranslations.put("副詞-助詞類接続", "adverb-particle_conjunction");
+    posTranslations.put("連体詞", "adnominal");
+    posTranslations.put("接続詞", "conjunction");
+    posTranslations.put("助詞", "particle");
+    posTranslations.put("助詞-格助詞", "particle-case");
+    posTranslations.put("助詞-格助詞-一般", "particle-case-misc");
+    posTranslations.put("助詞-格助詞-引用", "particle-case-quote");
+    posTranslations.put("助詞-格助詞-連語", "particle-case-compound");
+    posTranslations.put("助詞-接続助詞", "particle-conjunctive");
+    posTranslations.put("助詞-係助詞", "particle-dependency");
+    posTranslations.put("助詞-副助詞", "particle-adverbial");
+    posTranslations.put("助詞-間投助詞", "particle-interjective");
+    posTranslations.put("助詞-並立助詞", "particle-coordinate");
+    posTranslations.put("助詞-終助詞", "particle-final");
+    posTranslations.put("助詞-副助詞/並立助詞/終助詞", "particle-adverbial/conjunctive/final");
+    posTranslations.put("助詞-連体化", "particle-adnominalizer");
+    posTranslations.put("助詞-副詞化", "particle-adverbializer"); // 副詞化 = adverbializer (was a copy of the 連体化 value)
+    posTranslations.put("助詞-特殊", "particle-special");
+    posTranslations.put("助動詞", "auxiliary-verb");
+    posTranslations.put("感動詞", "interjection");
+    posTranslations.put("記号", "symbol");
+    posTranslations.put("記号-一般", "symbol-misc");
+    posTranslations.put("記号-句点", "symbol-period");
+    posTranslations.put("記号-読点", "symbol-comma");
+    posTranslations.put("記号-空白", "symbol-space");
+    posTranslations.put("記号-括弧開", "symbol-open_bracket");
+    posTranslations.put("記号-括弧閉", "symbol-close_bracket");
+    posTranslations.put("記号-アルファベット", "symbol-alphabetic");
+    posTranslations.put("その他", "other");
+    posTranslations.put("その他-間投", "other-interjection");
+    posTranslations.put("フィラー", "filler");
+    posTranslations.put("非言語音", "non-verbal");
+    posTranslations.put("語断片", "fragment");
+    posTranslations.put("未知語", "unknown");
+  }
+  
+  /** Get the English form of a POS tag.
+   * @param s Japanese part-of-speech tag, e.g. {@code 名詞-一般}
+   * @return the English name, or {@code null} if the tag has no entry in the table */
+  public static String getPOSTranslation(String s) {
+    return posTranslations.get(s);
+  }
+  
+  // Japanese inflection type -> English name; only used for reflectWith
+  private static final HashMap<String,String> inflTypeTranslations = new HashMap<String,String>();
+  static {
+    inflTypeTranslations.put("*", "*");
+    inflTypeTranslations.put("形容詞・アウオ段", "adj-group-a-o-u");
+    inflTypeTranslations.put("形容詞・イ段", "adj-group-i");
+    inflTypeTranslations.put("形容詞・イイ",  "adj-group-ii");
+    inflTypeTranslations.put("不変化型", "non-inflectional");
+    inflTypeTranslations.put("特殊・タ", "special-ta"); // タ is "ta" (values for タ/ダ were swapped)
+    inflTypeTranslations.put("特殊・ダ", "special-da"); // ダ is "da"
+    inflTypeTranslations.put("文語・ゴトシ", "classical-gotoshi");
+    inflTypeTranslations.put("特殊・ジャ", "special-ja");
+    inflTypeTranslations.put("特殊・ナイ", "special-nai");
+    inflTypeTranslations.put("五段・ラ行特殊", "5-row-cons-r-special");
+    inflTypeTranslations.put("特殊・ヌ", "special-nu");
+    inflTypeTranslations.put("文語・キ", "classical-ki");
+    inflTypeTranslations.put("特殊・タイ", "special-tai");
+    inflTypeTranslations.put("文語・ベシ", "classical-beshi");
+    inflTypeTranslations.put("特殊・ヤ", "special-ya");
+    inflTypeTranslations.put("文語・マジ", "classical-maji");
+    inflTypeTranslations.put("下二・タ行", "2-row-lower-cons-t");
+    inflTypeTranslations.put("特殊・デス", "special-desu");
+    inflTypeTranslations.put("特殊・マス", "special-masu");
+    inflTypeTranslations.put("五段・ラ行アル", "5-row-aru");
+    inflTypeTranslations.put("文語・ナリ", "classical-nari");
+    inflTypeTranslations.put("文語・リ", "classical-ri");
+    inflTypeTranslations.put("文語・ケリ", "classical-keri");
+    inflTypeTranslations.put("文語・ル", "classical-ru");
+    inflTypeTranslations.put("五段・カ行イ音便", "5-row-cons-k-i-onbin");
+    inflTypeTranslations.put("五段・サ行", "5-row-cons-s");
+    inflTypeTranslations.put("一段", "1-row");
+    inflTypeTranslations.put("五段・ワ行促音便", "5-row-cons-w-cons-onbin");
+    inflTypeTranslations.put("五段・マ行", "5-row-cons-m");
+    inflTypeTranslations.put("五段・タ行", "5-row-cons-t");
+    inflTypeTranslations.put("五段・ラ行", "5-row-cons-r");
+    inflTypeTranslations.put("サ変・−スル", "irregular-suffix-suru");
+    inflTypeTranslations.put("五段・ガ行", "5-row-cons-g");
+    inflTypeTranslations.put("サ変・−ズル", "irregular-suffix-zuru");
+    inflTypeTranslations.put("五段・バ行", "5-row-cons-b");
+    inflTypeTranslations.put("五段・ワ行ウ音便", "5-row-cons-w-u-onbin");
+    inflTypeTranslations.put("下二・ダ行", "2-row-lower-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便ユク", "5-row-cons-k-cons-onbin-yuku");
+    inflTypeTranslations.put("上二・ダ行", "2-row-upper-cons-d");
+    inflTypeTranslations.put("五段・カ行促音便", "5-row-cons-k-cons-onbin");
+    inflTypeTranslations.put("一段・得ル", "1-row-eru");
+    inflTypeTranslations.put("四段・タ行", "4-row-cons-t");
+    inflTypeTranslations.put("五段・ナ行", "5-row-cons-n");
+    inflTypeTranslations.put("下二・ハ行", "2-row-lower-cons-h");
+    inflTypeTranslations.put("四段・ハ行", "4-row-cons-h");
+    inflTypeTranslations.put("四段・バ行", "4-row-cons-b");
+    inflTypeTranslations.put("サ変・スル", "irregular-suru");
+    inflTypeTranslations.put("上二・ハ行", "2-row-upper-cons-h");
+    inflTypeTranslations.put("下二・マ行", "2-row-lower-cons-m");
+    inflTypeTranslations.put("四段・サ行", "4-row-cons-s");
+    inflTypeTranslations.put("下二・ガ行", "2-row-lower-cons-g");
+    inflTypeTranslations.put("カ変・来ル", "kuru-kanji");
+    inflTypeTranslations.put("一段・クレル", "1-row-kureru");
+    inflTypeTranslations.put("下二・得", "2-row-lower-u");
+    inflTypeTranslations.put("カ変・クル", "kuru-kana");
+    inflTypeTranslations.put("ラ変", "irregular-cons-r");
+    inflTypeTranslations.put("下二・カ行", "2-row-lower-cons-k");
+  }
+  
+  /** Get the English form of an inflection type.
+   * @param s Japanese inflection type, e.g. {@code 一段}
+   * @return the English name, or {@code null} if the type has no entry in the table */
+  public static String getInflectionTypeTranslation(String s) {
+    return inflTypeTranslations.get(s);
+  }
+
+  // Japanese inflection form -> English name; only used for reflectWith
+  private static final HashMap<String,String> inflFormTranslations = new HashMap<String,String>();
+  static {
+    inflFormTranslations.put("*", "*");
+    inflFormTranslations.put("基本形", "base");
+    inflFormTranslations.put("文語基本形", "classical-base");
+    inflFormTranslations.put("未然ヌ接続", "imperfective-nu-connection");
+    inflFormTranslations.put("未然ウ接続", "imperfective-u-connection");
+    inflFormTranslations.put("連用タ接続", "conjunctive-ta-connection");
+    inflFormTranslations.put("連用テ接続", "conjunctive-te-connection");
+    inflFormTranslations.put("連用ゴザイ接続", "conjunctive-gozai-connection");
+    inflFormTranslations.put("体言接続", "uninflected-connection");
+    inflFormTranslations.put("仮定形", "subjunctive");
+    inflFormTranslations.put("命令e", "imperative-e");
+    inflFormTranslations.put("仮定縮約1", "conditional-contracted-1");
+    inflFormTranslations.put("仮定縮約2", "conditional-contracted-2");
+    inflFormTranslations.put("ガル接続", "garu-connection");
+    inflFormTranslations.put("未然形", "imperfective");
+    inflFormTranslations.put("連用形", "conjunctive");
+    inflFormTranslations.put("音便基本形", "onbin-base");
+    inflFormTranslations.put("連用デ接続", "conjunctive-de-connection");
+    inflFormTranslations.put("未然特殊", "imperfective-special");
+    inflFormTranslations.put("命令i", "imperative-i");
+    inflFormTranslations.put("連用ニ接続", "conjunctive-ni-connection");
+    inflFormTranslations.put("命令yo", "imperative-yo");
+    inflFormTranslations.put("体言接続特殊", "adnominal-special");
+    inflFormTranslations.put("命令ro", "imperative-ro");
+    inflFormTranslations.put("体言接続特殊2", "uninflected-special-connection-2");
+    inflFormTranslations.put("未然レル接続", "imperfective-reru-connection");
+    inflFormTranslations.put("現代基本形", "modern-base");
+    inflFormTranslations.put("基本形-促音便", "base-onbin"); // not sure about this
+  }
+  
+  /** Get the English form of an inflected form.
+   * @param s Japanese inflection form, e.g. {@code 基本形}
+   * @return the English name, or {@code null} if the form has no entry in the table */
+  public static String getInflectedFormTranslation(String s) {
+    return inflFormTranslations.get(s);
+  }
+  
+  /** Romanize katakana with modified Hepburn; characters without a case below are copied unchanged.
+   * @param s text to romanize; must not be {@code null}
+   * @return the romanized text (the long-vowel mark ー is dropped) */
+  public static String getRomanization(String s) {
+    StringBuilder builder = new StringBuilder();
+    final int len = s.length();
+    for (int i = 0; i < len; i++) {
+      // window of up to 3 chars: ch plus two chars of lookahead (0 when past the end of the input)
+      char ch = s.charAt(i);
+      char ch2 = (i < len - 1) ? s.charAt(i + 1) : 0;
+      char ch3 = (i < len - 2) ? s.charAt(i + 2) : 0;
+      main: switch (ch) {
+        case 'ッ': // sokuon: emit the doubled consonant only; the next kana is handled on its own iteration
+          switch (ch2) {
+            case 'カ':
+            case 'キ':
+            case 'ク':
+            case 'ケ':
+            case 'コ':
+              builder.append('k');
+              break main;
+            case 'サ':
+            case 'シ':
+            case 'ス':
+            case 'セ':
+            case 'ソ':
+              builder.append('s');
+              break main;
+            case 'タ':
+            case 'チ':
+            case 'ツ':
+            case 'テ':
+            case 'ト':
+              builder.append('t');
+              break main;
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+              builder.append('p');
+              break main;
+          }
+          break; // sokuon before anything else produces no output
+        case 'ア':
+          builder.append('a');
+          break;
+        case 'イ':
+          if (ch2 == 'ィ') {
+            builder.append("yi");
+            i++; // the small kana was consumed as part of this syllable
+          } else if (ch2 == 'ェ') {
+            builder.append("ye");
+            i++;
+          } else {
+            builder.append('i');
+          }
+          break;
+        case 'ウ':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("wa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("wi");
+              i++;
+              break;
+            case 'ゥ':
+              builder.append("wu");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("we");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("wo");
+              i++;
+              break;
+            case 'ュ':
+              builder.append("wyu");
+              i++;
+              break;
+            default:
+              builder.append('u');
+              break;
+          }
+          break;
+        case 'エ':
+          builder.append('e');
+          break;
+        case 'オ':
+          if (ch2 == 'ウ') {
+            builder.append('ō');
+            i++;
+          } else {
+            builder.append('o');
+          }
+          break;
+        case 'カ':
+          builder.append("ka");
+          break;
+        case 'キ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("kyō");
+            i += 2; // both the small kana and the lengthening ウ were consumed
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("kyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("kya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("kyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("kyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("kye");
+            i++;
+          } else {
+            builder.append("ki");
+          }
+          break;
+        case 'ク':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("kwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("kwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("kwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("kwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("kwa");
+              i++;
+              break;
+            default:
+              builder.append("ku");
+              break;
+          }
+          break;
+        case 'ケ':
+          builder.append("ke");
+          break;
+        case 'コ':
+          if (ch2 == 'ウ') {
+            builder.append("kō");
+            i++;
+          } else {
+            builder.append("ko");
+          }
+          break;
+        case 'サ':
+          builder.append("sa");
+          break;
+        case 'シ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("shō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("shū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("sha");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("sho");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("shu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("she");
+            i++;
+          } else {
+            builder.append("shi");
+          }
+          break;
+        case 'ス':
+          if (ch2 == 'ィ') {
+            builder.append("si");
+            i++;
+          } else {
+            builder.append("su");
+          }
+          break;
+        case 'セ':
+          builder.append("se");
+          break;
+        case 'ソ':
+          if (ch2 == 'ウ') {
+            builder.append("sō");
+            i++;
+          } else {
+            builder.append("so");
+          }
+          break;
+        case 'タ':
+          builder.append("ta");
+          break;
+        case 'チ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("chō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("chū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("cha");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("cho");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("chu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("che");
+            i++;
+          } else {
+            builder.append("chi");
+          }
+          break;
+        case 'ツ':
+          if (ch2 == 'ァ') {
+            builder.append("tsa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("tsi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("tse");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("tso");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tsyu");
+            i++;
+          } else {
+            builder.append("tsu");
+          }
+          break;
+        case 'テ':
+          if (ch2 == 'ィ') {
+            builder.append("ti");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("tu");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("tyu");
+            i++;
+          } else {
+            builder.append("te");
+          }
+          break;
+        case 'ト':
+          if (ch2 == 'ウ') {
+            builder.append("tō");
+            i++;
+          } else {
+            builder.append("to");
+          }
+          break;
+        case 'ナ':
+          builder.append("na");
+          break;
+        case 'ニ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("nyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("nyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("nya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("nyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("nyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("nye");
+            i++;
+          } else {
+            builder.append("ni");
+          }
+          break;
+        case 'ヌ':
+          builder.append("nu");
+          break;
+        case 'ネ':
+          builder.append("ne");
+          break;
+        case 'ノ':
+          if (ch2 == 'ウ') {
+            builder.append("nō");
+            i++;
+          } else {
+            builder.append("no");
+          }
+          break;
+        case 'ハ':
+          builder.append("ha");
+          break;
+        case 'ヒ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("hyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("hyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("hya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("hyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("hyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("hye");
+            i++;
+          } else {
+            builder.append("hi");
+          }
+          break;
+        case 'フ':
+          if (ch2 == 'ャ') {
+            builder.append("fya");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("fyu");
+            i++;
+          } else if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("fye");
+            i+=2;
+          } else if (ch2 == 'ョ') {
+            builder.append("fyo");
+            i++;
+          } else if (ch2 == 'ァ') {
+            builder.append("fa");
+            i++;
+          } else if (ch2 == 'ィ') {
+            builder.append("fi");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("fe");
+            i++;
+          } else if (ch2 == 'ォ') {
+            builder.append("fo");
+            i++;
+          } else {
+            builder.append("fu");
+          }
+          break;
+        case 'ヘ':
+          builder.append("he");
+          break;
+        case 'ホ':
+          if (ch2 == 'ウ') {
+            builder.append("hō");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("hu");
+            i++;
+          } else {
+            builder.append("ho");
+          }
+          break;
+        case 'マ':
+          builder.append("ma");
+          break;
+        case 'ミ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("myō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("myū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("mya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("myo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("myu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("mye");
+            i++;
+          } else {
+            builder.append("mi");
+          }
+          break;
+        case 'ム':
+          builder.append("mu");
+          break;
+        case 'メ':
+          builder.append("me"); // was "mi", which collided with the output for ミ
+          break;
+        case 'モ':
+          if (ch2 == 'ウ') {
+            builder.append("mō");
+            i++;
+          } else {
+            builder.append("mo");
+          }
+          break;
+        case 'ヤ':
+          builder.append("ya");
+          break;
+        case 'ユ':
+          builder.append("yu");
+          break;
+        case 'ヨ':
+          if (ch2 == 'ウ') {
+            builder.append("yō");
+            i++;
+          } else {
+            builder.append("yo");
+          }
+          break;
+        case 'ラ':
+          builder.append("ra");
+          break;
+        case 'リ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("ryō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("ryū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("rya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("ryo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("ryu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("rye");
+            i++;
+          } else {
+            builder.append("ri");
+          }
+          break;
+        case 'ル':
+          builder.append("ru");
+          break;
+        case 'レ':
+          builder.append("re");
+          break;
+        case 'ロ':
+          if (ch2 == 'ウ') {
+            builder.append("rō");
+            i++;
+          } else {
+            builder.append("ro");
+          }
+          break;
+        case 'ワ':
+          builder.append("wa");
+          break;
+        case 'ヰ':
+          builder.append("i");
+          break;
+        case 'ヱ':
+          builder.append("e");
+          break;
+        case 'ヲ':
+          builder.append("o");
+          break;
+        case 'ン': // 'm' before b/p/m kana, "n'" before vowels and y kana, plain "n" otherwise
+          switch (ch2) {
+            case 'バ':
+            case 'ビ':
+            case 'ブ':
+            case 'ベ':
+            case 'ボ':
+            case 'パ':
+            case 'ピ':
+            case 'プ':
+            case 'ペ':
+            case 'ポ':
+            case 'マ':
+            case 'ミ':
+            case 'ム':
+            case 'メ':
+            case 'モ':
+              builder.append('m');
+              break main;
+            case 'ヤ':
+            case 'ユ':
+            case 'ヨ':
+            case 'ア':
+            case 'イ':
+            case 'ウ':
+            case 'エ':
+            case 'オ':
+              builder.append("n'");
+              break main;
+            default:
+              builder.append("n");
+              break main;
+          } // every branch above leaves the outer switch, so there is no fall-through into 'ガ'
+        case 'ガ':
+          builder.append("ga");
+          break;
+        case 'ギ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("gyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("gyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("gya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("gyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("gyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("gye");
+            i++;
+          } else {
+            builder.append("gi");
+          }
+          break;
+        case 'グ':
+          switch(ch2) {
+            case 'ァ':
+              builder.append("gwa");
+              i++;
+              break;
+            case 'ィ':
+              builder.append("gwi");
+              i++;
+              break;
+            case 'ェ':
+              builder.append("gwe");
+              i++;
+              break;
+            case 'ォ':
+              builder.append("gwo");
+              i++;
+              break;
+            case 'ヮ':
+              builder.append("gwa");
+              i++;
+              break;
+            default:
+              builder.append("gu");
+              break;
+          }
+          break;
+        case 'ゲ':
+          builder.append("ge");
+          break;
+        case 'ゴ':
+          if (ch2 == 'ウ') {
+            builder.append("gō");
+            i++;
+          } else {
+            builder.append("go");
+          }
+          break;
+        case 'ザ':
+          builder.append("za");
+          break;
+        case 'ジ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("jō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("jū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("ja");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("jo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("ju");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("je");
+            i++;
+          } else {
+            builder.append("ji");
+          }
+          break;
+        case 'ズ':
+          if (ch2 == 'ィ') {
+            builder.append("zi");
+            i++;
+          } else {
+            builder.append("zu");
+          }
+          break;
+        case 'ゼ':
+          builder.append("ze");
+          break;
+        case 'ゾ':
+          if (ch2 == 'ウ') {
+            builder.append("zō");
+            i++;
+          } else {
+            builder.append("zo");
+          }
+          break;
+        case 'ダ':
+          builder.append("da");
+          break;
+        case 'ヂ':
+          builder.append("ji");
+          break;
+        case 'ヅ':
+          builder.append("zu");
+          break;
+        case 'デ':
+          if (ch2 == 'ィ') {
+            builder.append("di");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("dyu");
+            i++;
+          } else {
+            builder.append("de");
+          }
+          break;
+        case 'ド':
+          if (ch2 == 'ウ') {
+            builder.append("dō");
+            i++;
+          } else if (ch2 == 'ゥ') {
+            builder.append("du");
+            i++;
+          } else {
+            builder.append("do");
+          }
+          break;
+        case 'バ':
+          builder.append("ba");
+          break;
+        case 'ビ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("byō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("byū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("bya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("byo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("byu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("bye");
+            i++;
+          } else {
+            builder.append("bi");
+          }
+          break;
+        case 'ブ':
+          builder.append("bu");
+          break;
+        case 'ベ':
+          builder.append("be");
+          break;
+        case 'ボ':
+          if (ch2 == 'ウ') {
+            builder.append("bō");
+            i++;
+          } else {
+            builder.append("bo");
+          }
+          break;
+        case 'パ':
+          builder.append("pa");
+          break;
+        case 'ピ':
+          if (ch2 == 'ョ' && ch3 == 'ウ') {
+            builder.append("pyō");
+            i += 2;
+          } else if (ch2 == 'ュ' && ch3 == 'ウ') {
+            builder.append("pyū");
+            i += 2;
+          } else if (ch2 == 'ャ') {
+            builder.append("pya");
+            i++;
+          } else if (ch2 == 'ョ') {
+            builder.append("pyo");
+            i++;
+          } else if (ch2 == 'ュ') {
+            builder.append("pyu");
+            i++;
+          } else if (ch2 == 'ェ') {
+            builder.append("pye");
+            i++;
+          } else {
+            builder.append("pi");
+          }
+          break;
+        case 'プ':
+          builder.append("pu");
+          break;
+        case 'ペ':
+          builder.append("pe");
+          break;
+        case 'ポ':
+          if (ch2 == 'ウ') {
+            builder.append("pō");
+            i++;
+          } else {
+            builder.append("po");
+          }
+          break;
+        case 'ヴ':
+          if (ch2 == 'ィ' && ch3 == 'ェ') {
+            builder.append("vye");
+            i+= 2;
+          } else {
+            builder.append('v'); // a following small vowel kana supplies the vowel on the next iteration
+          }
+          break;
+        case 'ァ':
+          builder.append('a');
+          break;
+        case 'ィ':
+          builder.append('i');
+          break;
+        case 'ゥ':
+          builder.append('u');
+          break;
+        case 'ェ':
+          builder.append('e');
+          break;
+        case 'ォ':
+          builder.append('o');
+          break;
+        case 'ヮ':
+          builder.append("wa");
+          break;
+        case 'ャ':
+          builder.append("ya");
+          break;
+        case 'ュ':
+          builder.append("yu");
+          break;
+        case 'ョ':
+          builder.append("yo");
+          break;
+        case 'ー': // long-vowel mark: intentionally produces no output
+          break;
+        default:
+          builder.append(ch); // anything without a case above passes through unchanged
+      }
+    }
+    return builder.toString();
+  }
+}

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/GraphvizFormatter.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,226 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+
+public class GraphvizFormatter {
+  
+  private final static String BOS_LABEL = "BOS";
+  
+  private final static String EOS_LABEL = "EOS";
+  
+  private final static String FONT_NAME = "Helvetica";
+  
+  private ConnectionCosts costs;
+  
+  private Map<String, ViterbiNode> nodeMap;
+  
+  private Map<String, String> bestPathMap;
+  
+  private boolean foundBOS;
+  
+  public GraphvizFormatter(ConnectionCosts costs) {
+    this.costs = costs;
+    this.nodeMap = new HashMap<String, ViterbiNode>();
+    this.bestPathMap = new HashMap<String, String>();
+  }
+  
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    initBestPathMap(null);
+    
+    StringBuilder sb = new StringBuilder();
+    sb.append(formatHeader());
+    sb.append(formatNodes(startsArray, endsArray));
+    sb.append(formatTrailer());
+    return sb.toString();
+  }
+  
+  public String format(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray, List<ViterbiNode> bestPath) {
+    
+    //		List<ViterbiNode> bestPathWithBOSAndEOS = new ArrayList<ViterbiNode>(bastPath);
+    initBestPathMap(bestPath);
+    
+    StringBuilder sb = new StringBuilder();
+    sb.append(formatHeader());
+    sb.append(formatNodes(startsArray, endsArray));
+    sb.append(formatTrailer());
+    return sb.toString();
+    
+  }
+  
+  private void initBestPathMap(List<ViterbiNode> bestPath) {
+    this.bestPathMap.clear();
+    
+    if (bestPath == null){
+      return;
+    }
+    for (int i = 0; i < bestPath.size() - 1; i++) {
+      ViterbiNode from = bestPath.get(i);
+      ViterbiNode to = bestPath.get(i + 1);
+      
+      String fromId = getNodeId(from);
+      String toId = getNodeId(to);
+      
+      assert this.bestPathMap.containsKey(fromId) == false;
+      assert this.bestPathMap.containsValue(toId) == false;
+      this.bestPathMap.put(fromId, toId);
+    }
+  }
+  
+  private String formatNodes(ViterbiNode[][] startsArray, ViterbiNode[][] endsArray) {
+    this.nodeMap.clear();
+    this.foundBOS = false;
+    
+    StringBuilder sb = new StringBuilder();
+    for (int i = 1; i < endsArray.length; i++) {
+      if(endsArray[i] == null || startsArray[i] == null) {
+        continue;
+      }
+      for (int j = 0; j < endsArray[i].length; j++) {
+        ViterbiNode from = endsArray[i][j];
+        if(from == null){
+          continue;
+        }
+        sb.append(formatNodeIfNew(from));
+        for (int k = 0; k < startsArray[i].length; k++) {
+          ViterbiNode to = startsArray[i][k];
+          if(to == null){
+            break;
+          }
+          sb.append(formatNodeIfNew(to));
+          sb.append(formatEdge(from, to));
+        }
+      }
+    }
+    return sb.toString();
+  }
+  
+  private String formatNodeIfNew(ViterbiNode node) {
+    String nodeId = getNodeId(node);
+    if (! this.nodeMap.containsKey(nodeId)) {
+      this.nodeMap.put(nodeId, node);
+      return formatNode(node);
+    } else {
+      return "";
+    }
+  }	
+  
+  private String formatHeader() {
+    StringBuilder sb = new StringBuilder();
+    sb.append("digraph viterbi {\n");
+    sb.append("graph [ fontsize=30 labelloc=\"t\" label=\"\" splines=true overlap=false rankdir = \"LR\" ];\n");
+    sb.append("# A2 paper size\n");
+    sb.append("size = \"34.4,16.5\";\n");
+    sb.append("# try to fill paper\n");
+    sb.append("ratio = fill;\n");
+    sb.append("edge [ fontname=\"" + FONT_NAME + "\" fontcolor=\"red\" color=\"#606060\" ]\n");
+    sb.append("node [ style=\"filled\" fillcolor=\"#e8e8f0\" shape=\"Mrecord\" fontname=\"" + FONT_NAME + "\" ]\n");
+    
+    return sb.toString();
+  }
+  
+  private String formatTrailer() {
+    return "}";
+  }
+  
+  
+  private String formatEdge(ViterbiNode from, ViterbiNode to) {
+    if (this.bestPathMap.containsKey(getNodeId(from)) &&
+        this.bestPathMap.get(getNodeId(from)).equals(getNodeId(to))) {
+      return formatEdge(from, to, "color=\"#40e050\" fontcolor=\"#40a050\" penwidth=3 fontsize=20 ");
+      
+    } else {
+      return formatEdge(from, to, "");
+    }
+  }
+  
+  
+  private String formatEdge(ViterbiNode from, ViterbiNode to, String attributes) {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getNodeId(from));
+    sb.append(" -> ");
+    sb.append(getNodeId(to));
+    sb.append(" [ ");
+    sb.append("label=\"");
+    sb.append(getCost(from, to));
+    sb.append("\"");
+    sb.append(" ");
+    sb.append(attributes);
+    sb.append(" ");
+    sb.append(" ]");
+    sb.append("\n");
+    return sb.toString();
+  }
+  
+  private String formatNode(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("\"");
+    sb.append(getNodeId(node));
+    sb.append("\"");
+    sb.append(" [ ");
+    sb.append("label=");
+    sb.append(formatNodeLabel(node));
+    sb.append(" ]");
+    return sb.toString();
+  }
+  
+  private String formatNodeLabel(ViterbiNode node) {
+    StringBuilder sb = new StringBuilder();
+    sb.append("<<table border=\"0\" cellborder=\"0\">");
+    sb.append("<tr><td>");
+    sb.append(getNodeLabel(node));
+    sb.append("</td></tr>");
+    sb.append("<tr><td>");
+    sb.append("<font color=\"blue\">");
+    sb.append(node.getWordCost());
+    sb.append("</font>");
+    sb.append("</td></tr>");
+    //		sb.append("<tr><td>");
+    //		sb.append(this.dictionary.get(node.getWordId()).getPosInfo());
+    //		sb.append("</td></tr>");
+    sb.append("</table>>");
+    return sb.toString();
+  }
+  
+  private String getNodeId(ViterbiNode node) {
+    return String.valueOf(node.hashCode());
+  }
+  
+  private String getNodeLabel(ViterbiNode node) {
+    if (node.getType() == Type.KNOWN && node.getWordId() == 0) {
+      if (this.foundBOS) {
+        return EOS_LABEL;
+      } else {
+        this.foundBOS = true;
+        return BOS_LABEL;
+      }
+    } else {
+      return node.getSurfaceFormString();
+    }
+  }
+  
+  private int getCost(ViterbiNode from, ViterbiNode to) {
+    return this.costs.get(from.getLeftId(), to.getRightId());
+  }
+}

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,361 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.analysis.kuromoji.Segmenter.Mode;
+import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition;
+import org.apache.lucene.analysis.kuromoji.dict.ConnectionCosts;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.TokenInfoFST;
+import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
+import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
+import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.FST;
+
+public class Viterbi {
+  
+  private final TokenInfoFST fst;
+  
+  private final TokenInfoDictionary dictionary;
+  
+  private final UnknownDictionary unkDictionary;
+  
+  private final ConnectionCosts costs;
+  
+  private final UserDictionary userDictionary;
+  
+  private final CharacterDefinition characterDefinition;
+  
+  private final boolean useUserDictionary;
+  
+  private final boolean searchMode;
+  
+  private final boolean extendedMode;
+  
+  private static final int DEFAULT_COST = 10000000;
+  
+  private static final int SEARCH_MODE_LENGTH_KANJI = 3;
+  
+  private static final int SEARCH_MODE_LENGTH = 7;
+  
+  private static final int SEARCH_MODE_PENALTY = 10000;
+  
+  private static final char[] BOS = "BOS".toCharArray();
+  
+  private static final char[] EOS = "EOS".toCharArray();
+  
+  /**
+   * Constructor
+   */
+  public Viterbi(TokenInfoDictionary dictionary,
+      UnknownDictionary unkDictionary,
+      ConnectionCosts costs,
+      UserDictionary userDictionary,
+      Mode mode) {
+    this.dictionary = dictionary;
+    this.fst = dictionary.getFST();
+    this.unkDictionary = unkDictionary;
+    this.costs = costs;
+    this.userDictionary = userDictionary;
+    if(userDictionary == null) {
+      this.useUserDictionary = false;
+    } else {
+      this.useUserDictionary = true;
+    }
+    
+    switch(mode){
+      case SEARCH:
+        searchMode = true;
+        extendedMode = false;
+        break;
+      case EXTENDED:
+        searchMode = true;
+        extendedMode = true;
+        break;
+      default:
+        searchMode = false;
+        extendedMode = false;
+        break;
+    }
+    
+    this.characterDefinition = unkDictionary.getCharacterDefinition();
+  }
+  
+  /**
+   * Find best path from input lattice.
+   * @param lattice the result of build method
+   * @return	List of ViterbiNode which consist best path 
+   */
+  public List<ViterbiNode> search(ViterbiNode[][][] lattice) {
+    ViterbiNode[][] startIndexArr = lattice[0];
+    ViterbiNode[][] endIndexArr = lattice[1];
+    
+    for (int i = 1; i < startIndexArr.length; i++){
+      
+      if (startIndexArr[i] == null || endIndexArr[i] == null){	// continue since no array which contains ViterbiNodes exists. Or no previous node exists.
+        continue;
+      }
+      
+      for (ViterbiNode node : startIndexArr[i]) {
+        if (node == null){	// If array doesn't contain ViterbiNode any more, continue to next index
+          break;
+        }
+        
+        int backwardConnectionId = node.getLeftId();
+        int wordCost = node.getWordCost();
+        int leastPathCost = DEFAULT_COST;
+        for (ViterbiNode leftNode : endIndexArr[i]) {
+          if (leftNode == null){ // If array doesn't contain ViterbiNode any more, continue to next index
+            break;
+          }
+          
+          int pathCost = leftNode.getPathCost() + costs.get(leftNode.getRightId(), backwardConnectionId) + wordCost;	// cost = [total cost from BOS to previous node] + [connection cost between previous node and current node] + [word cost]
+          
+          // "Search mode". Add extra costs if it is long node.
+          if (searchMode) {
+            //						System.out.print(""); // If this line exists, kuromoji runs faster for some reason when searchMode == false.
+            char[] surfaceForm = node.getSurfaceForm();
+            int offset = node.getOffset();
+            int length = node.getLength();
+            if (length > SEARCH_MODE_LENGTH_KANJI) {
+              boolean allKanji = true;
+              // check if node consists of only kanji
+              for (int pos = 0; pos < length; pos++) {
+                if (!characterDefinition.isKanji(surfaceForm[offset+pos])){
+                  allKanji = false;
+                  break;
+                }				
+              }
+              
+              if (allKanji) {	// Process only Kanji keywords
+                pathCost += (length - SEARCH_MODE_LENGTH_KANJI) * SEARCH_MODE_PENALTY;
+              } else if (length > SEARCH_MODE_LENGTH) {
+                pathCost += (length - SEARCH_MODE_LENGTH) * SEARCH_MODE_PENALTY;								
+              }
+            }
+          }
+          
+          if (pathCost < leastPathCost){	// If total cost is lower than before, set current previous node as best left node (previous means left).
+            leastPathCost = pathCost;
+            node.setPathCost(leastPathCost);
+            node.setLeftNode(leftNode);
+          }					
+        }
+      }
+    }
+    
+    // track best path
+    ViterbiNode node = endIndexArr[0][0];	// EOS
+    LinkedList<ViterbiNode> result = new LinkedList<ViterbiNode>();
+    result.add(node);
+    while (true) {
+      ViterbiNode leftNode = node.getLeftNode();
+      if (leftNode == null) {
+        break;
+      }
+      
+      // EXTENDED mode convert unknown word into unigram node
+      if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
+        byte unigramWordId = CharacterDefinition.NGRAM;
+        int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
+        int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
+        int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
+        char[] surfaceForm = leftNode.getSurfaceForm();
+        int offset = leftNode.getOffset();
+        int length = leftNode.getLength();
+        for (int i = length - 1; i >= 0; i--) {
+          int charLen = 1;
+          if (i > 0 && Character.isLowSurrogate(surfaceForm[offset+i])) {
+            i--;
+            charLen = 2;
+          }
+          ViterbiNode uniGramNode = new ViterbiNode(unigramWordId, surfaceForm, offset + i, charLen, unigramLeftId, unigramRightId, unigramWordCost, leftNode.getStartIndex() + i, Type.UNKNOWN);
+          result.addFirst(uniGramNode);
+        }
+      } else {
+        result.addFirst(leftNode);		
+      }
+      node = leftNode;
+    }
+    
+    return result;
+  }
+
+  /**
+   * Build lattice from input text
+   * @param text
+   */
+  public ViterbiNode[][][] build(char text[], int offset, int length) throws IOException {
+    ViterbiNode[][] startIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
+    ViterbiNode[][] endIndexArr = new ViterbiNode[length + 2][];  // text length + BOS and EOS
+    int[] startSizeArr = new int[length + 2]; // array to keep ViterbiNode count in startIndexArr
+    int[] endSizeArr = new int[length + 2];   // array to keep ViterbiNode count in endIndexArr
+    FST.Arc<Long> arc = new FST.Arc<Long>();
+    ViterbiNode bosNode = new ViterbiNode(-1, BOS, 0, BOS.length, 0, 0, 0, -1, Type.KNOWN);
+    addToArrays(bosNode, 0, 1, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    
+    // Process user dictionary;
+    if (useUserDictionary) {
+      processUserDictionary(text, offset, length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    }
+    
+    int unknownWordEndIndex = -1;	// index of the last character of unknown word
+    
+    final IntsRef wordIdRef = new IntsRef();
+    
+    for (int startIndex = 0; startIndex < length; startIndex++) {
+      // If no token ends where current token starts, skip this index
+      if (endSizeArr[startIndex + 1] == 0) {
+        continue;
+      }
+      
+      int suffixStart = offset + startIndex;
+      int suffixLength = length - startIndex;
+      
+      boolean found = false;
+      arc = fst.getFirstArc(arc);
+      int output = 0;
+      for (int endIndex = 1; endIndex < suffixLength + 1; endIndex++) {
+        int ch = text[suffixStart + endIndex - 1];
+        
+        if (fst.findTargetArc(ch, arc, arc, endIndex == 1) == null) {
+          break; // continue to next position
+        }
+        output += arc.output.intValue();
+
+        if (arc.isFinal()) {
+          output += arc.nextFinalOutput.intValue();
+          found = true; // Don't produce unknown word starting from this index
+          dictionary.lookupWordIds(output, wordIdRef);
+          for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+            final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
+            ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, endIndex, dictionary.getLeftId(wordId), dictionary.getRightId(wordId), dictionary.getWordCost(wordId), startIndex, Type.KNOWN);
+            addToArrays(node, startIndex + 1, startIndex + 1 + endIndex, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+          }
+        }
+      }
+      
+      // In the case of normal mode, it doesn't process unknown word greedily.
+      if(!searchMode && unknownWordEndIndex > startIndex){
+        continue;
+      }
+      
+      // Process Unknown Word: hmm what is this isInvoke logic (same no matter what)
+      int unknownWordLength = 0;
+      char firstCharacter = text[suffixStart];
+      boolean isInvoke = characterDefinition.isInvoke(firstCharacter);
+      if (isInvoke){	// Process "invoke"
+        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);
+      } else if (found == false){	// Process not "invoke"
+        unknownWordLength = unkDictionary.lookup(text, suffixStart, suffixLength);				
+      }
+      
+      if (unknownWordLength > 0) {      // found unknown word
+        final int characterId = characterDefinition.getCharacterClass(firstCharacter);
+        unkDictionary.lookupWordIds(characterId, wordIdRef); // characters in input text are supposed to be the same
+        for (int ofs = 0; ofs < wordIdRef.length; ofs++) {
+          final int wordId = wordIdRef.ints[wordIdRef.offset + ofs];
+          ViterbiNode node = new ViterbiNode(wordId, text, suffixStart, unknownWordLength, unkDictionary.getLeftId(wordId), unkDictionary.getRightId(wordId), unkDictionary.getWordCost(wordId), startIndex, Type.UNKNOWN);
+          addToArrays(node, startIndex + 1, startIndex + 1 + unknownWordLength, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+        }
+        unknownWordEndIndex = startIndex + unknownWordLength;
+      }
+    }
+    
+    ViterbiNode eosNode = new ViterbiNode(-1, EOS, 0, EOS.length, 0, 0, 0, length + 1, Type.KNOWN);
+    addToArrays(eosNode, length + 1, 0, startIndexArr, endIndexArr, startSizeArr, endSizeArr); //Add EOS node to endIndexArr at index 0
+    
+    ViterbiNode[][][] result = new ViterbiNode[][][]{startIndexArr, endIndexArr};
+    
+    return result;
+  }
+  
+  /**
+   * Find token(s) in input text and set found token(s) in arrays as normal tokens
+   * @param text	
+   * @param startIndexArr
+   * @param endIndexArr
+   * @param startSizeArr
+   * @param endSizeArr
+   */
+  private void processUserDictionary(char text[], int offset, int len, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr) throws IOException {
+    int[][] result = userDictionary.lookup(text, offset, len);
+    for(int[] segmentation : result) {
+      int wordId = segmentation[0];
+      int index = segmentation[1];
+      int length = segmentation[2];
+      ViterbiNode node = new ViterbiNode(wordId, text, offset + index, length, userDictionary.getLeftId(wordId), userDictionary.getRightId(wordId), userDictionary.getWordCost(wordId), index, Type.USER);
+      addToArrays(node, index + 1, index + 1 + length, startIndexArr, endIndexArr, startSizeArr, endSizeArr);
+    }
+  }
+  
+  /**
+   * Add node to arrays and increment count in size array
+   * @param node
+   * @param startIndex
+   * @param endIndex
+   * @param startIndexArr
+   * @param endIndexArr
+   * @param startSizeArr
+   * @param endSizeArr
+   */
+  private void addToArrays(ViterbiNode node, int startIndex, int endIndex, ViterbiNode[][] startIndexArr, ViterbiNode[][] endIndexArr, int[] startSizeArr, int[] endSizeArr ) {
+    int startNodesCount = startSizeArr[startIndex];
+    int endNodesCount = endSizeArr[endIndex];
+    
+    if (startNodesCount == 0) {
+      startIndexArr[startIndex] = new ViterbiNode[10];
+    }
+    
+    if (endNodesCount == 0) {
+      endIndexArr[endIndex] = new ViterbiNode[10];
+    }
+    
+    if (startIndexArr[startIndex].length <= startNodesCount){
+      startIndexArr[startIndex] = extendArray(startIndexArr[startIndex]);
+    }
+    
+    if (endIndexArr[endIndex].length <= endNodesCount){
+      endIndexArr[endIndex] = extendArray(endIndexArr[endIndex]);
+    }
+    
+    startIndexArr[startIndex][startNodesCount] = node;
+    endIndexArr[endIndex][endNodesCount] = node;
+    
+    startSizeArr[startIndex] = startNodesCount + 1;
+    endSizeArr[endIndex] = endNodesCount + 1;
+  }
+  
+  
+  /**
+   * Return twice as big array which contains value of input array
+   * @param array
+   * @return
+   */
+  private ViterbiNode[] extendArray(ViterbiNode[] array) {
+    //extend array
+    ViterbiNode[] newArray = new ViterbiNode[array.length * 2];
+    System.arraycopy(array, 0, newArray, 0, array.length);
+    return newArray;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/ViterbiNode.java Thu Jan 12 20:10:48 2012
@@ -0,0 +1,147 @@
+package org.apache.lucene.analysis.kuromoji.viterbi;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * One node of the Viterbi lattice: a candidate token described by a slice of
+ * a character buffer, its dictionary ids and costs, and a link to the
+ * preceding node on the cheapest path found so far.
+ */
+public final class ViterbiNode {
+
+  /** Kind of node. */
+  public enum Type {
+    KNOWN,
+    UNKNOWN,
+    USER
+  }
+
+  // Fixed at construction time.
+  private final int wordId;
+  private final char[] surfaceForm;
+  private final int offset;
+  private final int length;
+  private final int leftId;
+  private final int rightId;
+  private final int wordCost;
+  private final int startIndex;
+  private final Type type;
+
+  // Updated through the setters below.
+  private int pathCost;
+  private ViterbiNode leftNode;
+
+  /**
+   * Creates a node. The buffer is stored as given, not copied.
+   *
+   * @param wordId word id of this node
+   * @param surfaceForm buffer containing the node's text
+   * @param offset index in {@code surfaceForm} where the node's text starts
+   * @param length number of characters of the node's text
+   * @param leftId left id of this node
+   * @param rightId right id of this node
+   * @param wordCost word cost of this node
+   * @param startIndex start index of this node
+   * @param type type of this node
+   */
+  public ViterbiNode(int wordId, char[] surfaceForm, int offset, int length, int leftId, int rightId, int wordCost, int startIndex, Type type) {
+    this.type = type;
+    this.startIndex = startIndex;
+    this.wordCost = wordCost;
+    this.rightId = rightId;
+    this.leftId = leftId;
+    this.length = length;
+    this.offset = offset;
+    this.surfaceForm = surfaceForm;
+    this.wordId = wordId;
+  }
+
+  /** @return the word id */
+  public int getWordId() {
+    return this.wordId;
+  }
+
+  /** @return the buffer holding the node's text (the stored array itself, not a copy) */
+  public char[] getSurfaceForm() {
+    return this.surfaceForm;
+  }
+
+  /** @return index in the buffer where the node's text starts */
+  public int getOffset() {
+    return this.offset;
+  }
+
+  /** @return number of characters of the node's text */
+  public int getLength() {
+    return this.length;
+  }
+
+  /** @return the node's text as a new String */
+  public String getSurfaceFormString() {
+    return new String(this.surfaceForm, this.offset, this.length);
+  }
+
+  /** @return the left id */
+  public int getLeftId() {
+    return this.leftId;
+  }
+
+  /** @return the right id */
+  public int getRightId() {
+    return this.rightId;
+  }
+
+  /** @return the word cost of this node */
+  public int getWordCost() {
+    return this.wordCost;
+  }
+
+  /** @return the path cost last stored with {@link #setPathCost(int)}, 0 if none was stored */
+  public int getPathCost() {
+    return this.pathCost;
+  }
+
+  /**
+   * @param pathCost minimum path cost found thus far
+   */
+  public void setPathCost(int pathCost) {
+    this.pathCost = pathCost;
+  }
+
+  /**
+   * @param node the node to record as this node's left node
+   */
+  public void setLeftNode(ViterbiNode node) {
+    this.leftNode = node;
+  }
+
+  /** @return the left node last stored, or {@code null} if none was stored */
+  public ViterbiNode getLeftNode() {
+    return this.leftNode;
+  }
+
+  /** @return the start index */
+  public int getStartIndex() {
+    return this.startIndex;
+  }
+
+  /** @return the type of this node */
+  public Type getType() {
+    return this.type;
+  }
+}

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html?rev=1230748&view=auto
==============================================================================
--- lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html (added)
+++ lucene/dev/trunk/modules/analysis/kuromoji/src/java/overview.html Thu Jan 12 20:10:48 2012
@@ -0,0 +1,26 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<html>
+  <head>
+    <title>
+      analyzers-kuromoji
+    </title>
+  </head>
+  <body>
+  analyzers-kuromoji
+  </body>
+</html>

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/ConnectionCosts.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24buffer.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$fst.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24fst.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24inflDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24posDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/TokenInfoDictionary%24targetMap.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$buffer.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24buffer.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$inflDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24inflDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$posDict.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24posDict.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.

Added: lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary$targetMap.dat
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary%24targetMap.dat?rev=1230748&view=auto
==============================================================================
Binary file - no diff available.



Mime
View raw message