lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From uschind...@apache.org
Subject svn commit: r1227427 - in /lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src: java/org/apache/lucene/analysis/kuromoji/dict/ java/org/apache/lucene/analysis/kuromoji/trie/ java/org/apache/lucene/analysis/kuromoji/viterbi/ resources/org/apach...
Date Thu, 05 Jan 2012 01:37:59 GMT
Author: uschindler
Date: Thu Jan  5 01:37:58 2012
New Revision: 1227427

URL: http://svn.apache.org/viewvc?rev=1227427&view=rev
Log:
LUCENE-3305: Minimize size of CharacterDefinition, remove Java serialization and add codec
header; add codec header to DoubleArrayTrie. All files now have a codec header with version
number == 1

Modified:
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/cd.dat
    lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/trie/dat.dat

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/CharacterDefinition.java Thu Jan  5 01:37:58 2012
@@ -17,57 +17,74 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
-import java.io.Serializable;
-import java.util.EnumMap;
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.Arrays;
 
-public final class CharacterDefinition implements Serializable {
-  private static final long serialVersionUID = -1436753619176638532L;
-  
-  private final CharacterClass[] characterCategoryMap = new CharacterClass[65536];
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.CodecUtil;
+
+public final class CharacterDefinition {
+  public static final String FILENAME = "cd.dat";
+  public static final String HEADER = "kuromoji_cd";
+  public static final int VERSION = 1;
+
+  private static final int CLASS_COUNT = CharacterClass.values().length;
   
-  private final EnumMap<CharacterClass, int[]> invokeDefinitionMap =
-      new EnumMap<CharacterClass, int[]>(CharacterClass.class); // invoke, group, length
-      
-  public enum CharacterClass {
+  // only used internally for lookup:
+  private static enum CharacterClass {
    NGRAM, DEFAULT, SPACE, SYMBOL, NUMERIC, ALPHA, CYRILLIC, GREEK, HIRAGANA, KATAKANA, KANJI, KANJINUMERIC;
-    
-    public int getId() {
-      return ordinal();
-    }
   }
       
+  private final byte[] characterCategoryMap = new byte[0x10000];
+  
+  private final boolean[] invokeMap = new boolean[CLASS_COUNT];
+  private final boolean[] groupMap = new boolean[CLASS_COUNT];
+  
+  // the classes:
+  public static final byte NGRAM = (byte) CharacterClass.NGRAM.ordinal();
+  public static final byte DEFAULT = (byte) CharacterClass.DEFAULT.ordinal();
+  public static final byte SPACE = (byte) CharacterClass.SPACE.ordinal();
+  public static final byte SYMBOL = (byte) CharacterClass.SYMBOL.ordinal();
+  public static final byte NUMERIC = (byte) CharacterClass.NUMERIC.ordinal();
+  public static final byte ALPHA = (byte) CharacterClass.ALPHA.ordinal();
+  public static final byte CYRILLIC = (byte) CharacterClass.CYRILLIC.ordinal();
+  public static final byte GREEK = (byte) CharacterClass.GREEK.ordinal();
+  public static final byte HIRAGANA = (byte) CharacterClass.HIRAGANA.ordinal();
+  public static final byte KATAKANA = (byte) CharacterClass.KATAKANA.ordinal();
+  public static final byte KANJI = (byte) CharacterClass.KANJI.ordinal();
+  public static final byte KANJINUMERIC = (byte) CharacterClass.KANJINUMERIC.ordinal();
+  
   /**
    * Constructor
    */
   public CharacterDefinition() {
-    for (int i = 0; i < characterCategoryMap.length; i++) {
-      characterCategoryMap[i] = CharacterClass.DEFAULT;
-    }
+    Arrays.fill(characterCategoryMap, DEFAULT);
   }
   
-  public int lookup(char c) {
-    return characterCategoryMap[c].getId();
-  }
-  
-  public CharacterClass getCharacterClass(char c) {
+  public byte getCharacterClass(char c) {
     return characterCategoryMap[c];
   }
   
   public boolean isInvoke(char c) {
-    CharacterClass characterClass = characterCategoryMap[c];
-    int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
-    return invokeDefinition[0] == 1;
+    return invokeMap[characterCategoryMap[c]];
   }
   
   public boolean isGroup(char c) {
-    CharacterClass characterClass = characterCategoryMap[c];
-    int[] invokeDefinition = invokeDefinitionMap.get(characterClass);
-    return invokeDefinition[1] == 1;
+    return groupMap[characterCategoryMap[c]];
   }
   
   public boolean isKanji(char c) {
-    return characterCategoryMap[c] == CharacterClass.KANJI ||
-        characterCategoryMap[c] == CharacterClass.KANJINUMERIC;
+    final byte characterClass = characterCategoryMap[c];
+    return characterClass == KANJI || characterClass == KANJINUMERIC;
   }
   
   /**
@@ -86,13 +103,61 @@ public final class CharacterDefinition i
     if (codePoint == 0x30FB) {
       characterClassName = "SYMBOL";
     }
-    characterCategoryMap[codePoint] = CharacterClass.valueOf(characterClassName);
+    characterCategoryMap[codePoint] = lookupCharacterClass(characterClassName);
   }
   
   public void putInvokeDefinition(String characterClassName, int invoke, int group, int length) {
-    CharacterClass characterClass = CharacterClass
-        .valueOf(characterClassName);
-    int[] values = { invoke, group, length };
-    invokeDefinitionMap.put(characterClass, values);
+    final byte characterClass = lookupCharacterClass(characterClassName);
+    invokeMap[characterClass] = invoke == 1;
+    groupMap[characterClass] = group == 1;
+    // TODO: length def ignored
+  }
+  
+  public static byte lookupCharacterClass(String characterClassName) {
+    return (byte) CharacterClass.valueOf(characterClassName).ordinal();
+  }
+
+  public void write(String directoryname) throws IOException {
+    String filename = directoryname + File.separator + FILENAME;
+    OutputStream os = new FileOutputStream(filename);
+    try {
+      os = new BufferedOutputStream(os);
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, HEADER, VERSION);
+      out.writeBytes(characterCategoryMap, 0, characterCategoryMap.length);
+      for (int i = 0; i < CLASS_COUNT; i++) {
+        final byte b = (byte) (
+          (invokeMap[i] ? 0x01 : 0x00) | 
+          (groupMap[i] ? 0x02 : 0x00)
+        );
+        out.writeByte(b);
+      }
+    } finally {
+      os.close();
+    }
   }
+  
+  public static CharacterDefinition getInstance() throws IOException, ClassNotFoundException {
+    InputStream is = CharacterDefinition.class.getResourceAsStream(FILENAME);
+    return read(is);
+  }
+  
+  public static CharacterDefinition read(InputStream is) throws IOException, ClassNotFoundException {
+    is = new BufferedInputStream(is);
+    try {
+      final DataInput in = new InputStreamDataInput(is);
+      CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
+      CharacterDefinition cd = new CharacterDefinition();
+      in.readBytes(cd.characterCategoryMap, 0, cd.characterCategoryMap.length);
+      for (int i = 0; i < CLASS_COUNT; i++) {
+        final byte b = in.readByte();
+        cd.invokeMap[i] = (b & 0x01) != 0;
+        cd.groupMap[i] = (b & 0x02) != 0;
+      }
+      return cd;
+    } finally {
+      is.close();
+    }
+  }
+
 }

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/dict/UnknownDictionary.java Thu Jan  5 01:37:58 2012
@@ -17,16 +17,8 @@ package org.apache.lucene.analysis.kurom
  * limitations under the License.
  */
 
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
 import java.io.File;
-import java.io.FileOutputStream;
 import java.io.IOException;
-import java.io.InputStream;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-
-import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
 
 public class UnknownDictionary extends TokenInfoDictionary {
   
@@ -34,8 +26,6 @@ public class UnknownDictionary extends T
   
   public static final String TARGETMAP_FILENAME = "unk_map.dat";
   
-  public static final String CHARDEF_FILENAME = "cd.dat";
-  
   private CharacterDefinition characterDefinition;
   
   /**
@@ -58,7 +48,7 @@ public class UnknownDictionary extends T
     int result = super.put(entry);
     
     // Put entry in targetMap
-    int characterId = CharacterClass.valueOf(entry[0]).getId();
+    int characterId = CharacterDefinition.lookupCharacterClass(entry[0]);
     addMapping(characterId, wordId);
     return result;
   }
@@ -69,10 +59,10 @@ public class UnknownDictionary extends T
     }
     
     // Extract unknown word. Characters with the same character class are considered to be part of unknown word
-    int characterIdOfFirstCharacter = characterDefinition.lookup(text.charAt(0));
+    byte characterIdOfFirstCharacter = characterDefinition.getCharacterClass(text.charAt(0));
     int length = 1;
     for (int i = 1; i < text.length(); i++) {
-      if (characterIdOfFirstCharacter == characterDefinition.lookup(text.charAt(i))){
+      if (characterIdOfFirstCharacter == characterDefinition.getCharacterClass(text.charAt(i))){
         length++;    			
       } else {
         break;
@@ -110,29 +100,17 @@ public class UnknownDictionary extends T
   public void write(String directoryname) throws IOException {
     writeDictionary(directoryname + File.separator + FILENAME);
     writeTargetMap(directoryname + File.separator + TARGETMAP_FILENAME);
-    writeCharDef(directoryname + File.separator + CHARDEF_FILENAME);
-  }
-  
-  protected void writeCharDef(String filename) throws IOException {
-    ObjectOutputStream oos = new ObjectOutputStream(new BufferedOutputStream(new FileOutputStream(filename)));
	
-    oos.writeObject(characterDefinition);
-    oos.close();
+    characterDefinition.write(directoryname);
   }
   
   public static UnknownDictionary getInstance() throws IOException, ClassNotFoundException {
     UnknownDictionary dictionary = new UnknownDictionary();
+    dictionary.characterDefinition = CharacterDefinition.getInstance();
     dictionary.loadDictionary(UnknownDictionary.class.getResourceAsStream(FILENAME));
     dictionary.loadTargetMap(UnknownDictionary.class.getResourceAsStream(TARGETMAP_FILENAME));
-    dictionary.loadCharDef(UnknownDictionary.class.getResourceAsStream(CHARDEF_FILENAME));
     return dictionary;
   }
   
-  protected void loadCharDef(InputStream is) throws IOException, ClassNotFoundException {
-    ObjectInputStream ois = new ObjectInputStream(new BufferedInputStream(is));
-    characterDefinition = (CharacterDefinition) ois.readObject();
-    ois.close();
-  }
-  
   @Override
   public String getReading(int wordId) {
     return null;

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/trie/DoubleArrayTrie.java Thu Jan  5 01:37:58 2012
@@ -18,23 +18,31 @@ package org.apache.lucene.analysis.kurom
  */
 
 import java.io.BufferedInputStream;
-import java.io.DataInputStream;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
+import java.io.EOFException;
 import java.io.InputStream;
-import java.io.RandomAccessFile;
 import java.nio.ByteBuffer;
 import java.nio.CharBuffer;
 import java.nio.IntBuffer;
 import java.nio.channels.Channels;
-import java.nio.channels.FileChannel;
 import java.nio.channels.ReadableByteChannel;
+import java.nio.channels.WritableByteChannel;
 
 import org.apache.lucene.analysis.kuromoji.trie.Trie.Node;
 
+import org.apache.lucene.store.DataInput;
+import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+import org.apache.lucene.util.CodecUtil;
+
 public class DoubleArrayTrie {
   
   public static final String FILENAME = "dat.dat";
+  public static final String HEADER = "kuromoji_double_arr_trie";
+  public static final int VERSION = 1;
   
   public static final char TERMINATING_CHARACTER = '\u0001';
   
@@ -67,35 +75,37 @@ public class DoubleArrayTrie {
     checkBuffer.rewind();
     tailBuffer.rewind();
     
-    File file = new File(filename);
-    if(file.exists()){
-      file.delete();
-    }
-    
-    RandomAccessFile raf = new RandomAccessFile(filename, "rw");
-    FileChannel channel = raf.getChannel();
-    raf.writeInt(baseBuffer.capacity());
-    raf.writeInt(tailBuffer.capacity());		
-    
-    ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
-    IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
-    tmpIntBuffer.put(baseBuffer);
-    tmpBuffer.rewind();
-    channel.write(tmpBuffer);
-    
-    tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
-    tmpIntBuffer = tmpBuffer.asIntBuffer();
-    tmpIntBuffer.put(checkBuffer);
-    tmpBuffer.rewind();
-    channel.write(tmpBuffer);
-    
-    tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
-    CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
-    tmpCharBuffer.put(tailBuffer);
-    tmpBuffer.rewind();
-    channel.write(tmpBuffer);
-    
-    raf.close();
+    final FileOutputStream os = new FileOutputStream(filename);
+    try {
+      final DataOutput out = new OutputStreamDataOutput(os);
+      CodecUtil.writeHeader(out, HEADER, VERSION);
+      out.writeVInt(baseBuffer.capacity());
+      out.writeVInt(tailBuffer.capacity());
+      final WritableByteChannel channel = Channels.newChannel(os);
+      
+      ByteBuffer tmpBuffer = ByteBuffer.allocate(baseBuffer.capacity() * 4);
+      IntBuffer tmpIntBuffer = tmpBuffer.asIntBuffer();
+      tmpIntBuffer.put(baseBuffer);
+      tmpBuffer.rewind();
+      channel.write(tmpBuffer);
+      assert tmpBuffer.remaining() == 0L;
+      
+      tmpBuffer = ByteBuffer.allocate(checkBuffer.capacity() * 4);
+      tmpIntBuffer = tmpBuffer.asIntBuffer();
+      tmpIntBuffer.put(checkBuffer);
+      tmpBuffer.rewind();
+      channel.write(tmpBuffer);
+      assert tmpBuffer.remaining() == 0L;
+      
+      tmpBuffer = ByteBuffer.allocate(tailBuffer.capacity() * 2);
+      CharBuffer tmpCharBuffer = tmpBuffer.asCharBuffer();
+      tmpCharBuffer.put(tailBuffer);
+      tmpBuffer.rewind();
+      channel.write(tmpBuffer);
+      assert tmpBuffer.remaining() == 0L;
+    } finally {
+      os.close();
+    }
   }
   
   public static DoubleArrayTrie getInstance() throws IOException {
@@ -108,30 +118,46 @@ public class DoubleArrayTrie {
    * @throws IOException
    */
   public static DoubleArrayTrie read(InputStream is) throws IOException {
-    DoubleArrayTrie trie = new DoubleArrayTrie();
-    DataInputStream dis = new DataInputStream(new BufferedInputStream(is));
-    int baseCheckSize = dis.readInt();	// Read size of baseArr and checkArr
-    int tailSize = dis.readInt();		// Read size of tailArr
-    ReadableByteChannel channel = Channels.newChannel(dis);
-    
-    
-    ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);	// The size is 4 times the baseCheckSize since it is the length of array
-    channel.read(tmpBaseBuffer);
-    tmpBaseBuffer.rewind();
-    trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
-    
-    ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(baseCheckSize * 4);
-    channel.read(tmpCheckBuffer);
-    tmpCheckBuffer.rewind();
-    trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
-    
-    ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(tailSize * 2);			// The size is 2 times the tailSize since it is the length of array
-    channel.read(tmpTailBuffer);
-    tmpTailBuffer.rewind();
-    trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
-    
-    is.close();
-    return trie;
+    is = new BufferedInputStream(is);
+    try {
+      final DataInput in = new InputStreamDataInput(is);
+      CodecUtil.checkHeader(in, HEADER, VERSION, VERSION);
+      int baseCheckSize = in.readVInt();	// Read size of baseArr and checkArr
+      int tailSize = in.readVInt();		// Read size of tailArr
+      
+      ReadableByteChannel channel = Channels.newChannel(is);
+      
+      DoubleArrayTrie trie = new DoubleArrayTrie();
+
+      int toRead, read;
+      ByteBuffer tmpBaseBuffer = ByteBuffer.allocateDirect(toRead = baseCheckSize * 4);	// The size is 4 times the baseCheckSize since it is the length of array
+      read = channel.read(tmpBaseBuffer);
+      if (read != toRead) {
+        throw new EOFException("Cannot read DoubleArrayTree");
+      }
+      tmpBaseBuffer.rewind();
+      trie.baseBuffer = tmpBaseBuffer.asIntBuffer().asReadOnlyBuffer();
+      
+      ByteBuffer tmpCheckBuffer = ByteBuffer.allocateDirect(toRead = baseCheckSize * 4);
+      read = channel.read(tmpCheckBuffer);
+      if (read != toRead) {
+        throw new EOFException("Cannot read DoubleArrayTree");
+      }
+      tmpCheckBuffer.rewind();
+      trie.checkBuffer = tmpCheckBuffer.asIntBuffer().asReadOnlyBuffer();
+      
+      ByteBuffer tmpTailBuffer = ByteBuffer.allocateDirect(toRead = tailSize * 2);			// The size is 2 times the tailSize since it is the length of array
+      read = channel.read(tmpTailBuffer);
+      if (read != toRead) {
+        throw new EOFException("Cannot read DoubleArrayTree");
+      }
+      tmpTailBuffer.rewind();
+      trie.tailBuffer = tmpTailBuffer.asCharBuffer().asReadOnlyBuffer();
+      
+      return trie;
+    } finally {
+      is.close();
+    }
   }
   
   /**

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
--- lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java (original)
+++ lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/java/org/apache/lucene/analysis/kuromoji/viterbi/Viterbi.java Thu Jan  5 01:37:58 2012
@@ -26,7 +26,6 @@ import org.apache.lucene.analysis.kuromo
 import org.apache.lucene.analysis.kuromoji.dict.TokenInfoDictionary;
 import org.apache.lucene.analysis.kuromoji.dict.UnknownDictionary;
 import org.apache.lucene.analysis.kuromoji.dict.UserDictionary;
-import org.apache.lucene.analysis.kuromoji.dict.CharacterDefinition.CharacterClass;
 import org.apache.lucene.analysis.kuromoji.trie.DoubleArrayTrie;
 import org.apache.lucene.analysis.kuromoji.viterbi.ViterbiNode.Type;
 
@@ -174,7 +173,7 @@ public class Viterbi {
       
       // EXTENDED mode convert unknown word into unigram node
       if (extendedMode && leftNode.getType() == Type.UNKNOWN) {
-        int unigramWordId = CharacterClass.NGRAM.getId();
+        byte unigramWordId = CharacterDefinition.NGRAM;
         int unigramLeftId = unkDictionary.getLeftId(unigramWordId); // isn't required
         int unigramRightId = unkDictionary.getLeftId(unigramWordId); // isn't required
         int unigramWordCost = unkDictionary.getWordCost(unigramWordId); // isn't required
@@ -256,7 +255,7 @@ public class Viterbi {
       
       if (unknownWordLength > 0) {      // found unknown word
         String unkWord = suffix.substring(0, unknownWordLength);
-        int characterId = characterDefinition.lookup(firstCharacter);
+        int characterId = characterDefinition.getCharacterClass(firstCharacter);
         int[] wordIds = unkDictionary.lookupWordIds(characterId); // characters in input text are supposed to be the same
         
         for (int wordId : wordIds) {

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/cd.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/dict/cd.dat?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
Binary files - no diff available.

Modified: lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/trie/dat.dat
URL: http://svn.apache.org/viewvc/lucene/dev/branches/lucene3305/modules/analysis/kuromoji/src/resources/org/apache/lucene/analysis/kuromoji/trie/dat.dat?rev=1227427&r1=1227426&r2=1227427&view=diff
==============================================================================
Binary files - no diff available.



Mime
View raw message