lucene-java-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r831121 - in /lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart: ./ hhmm/
Date Thu, 29 Oct 2009 22:29:51 GMT
Author: rmuir
Date: Thu Oct 29 22:29:50 2009
New Revision: 831121

URL: http://svn.apache.org/viewvc?rev=831121&view=rev
Log:
LUCENE-1257: port smartchineseanalyzer to java 5

Modified:
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
    lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SmartChineseAnalyzer.java Thu Oct 29 22:29:50 2009
@@ -60,7 +60,7 @@
  */
 public class SmartChineseAnalyzer extends Analyzer {
 
-  private final Set stopWords;
+  private final Set<?> stopWords;
   
   private static final String DEFAULT_STOPWORD_FILE = "stopwords.txt";
   

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java Thu Oct 29 22:29:50 2009
@@ -17,7 +17,7 @@
 
 package org.apache.lucene.analysis.cn.smart;
 
-import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
@@ -45,18 +45,19 @@
    * @param startOffset start offset of sentence
    * @return {@link List} of {@link SegToken}
    */
-  public List segmentSentence(String sentence, int startOffset) {
-
-    List segTokenList = hhmmSegmenter.process(sentence);
-
-    List result = new ArrayList();
+  public List<SegToken> segmentSentence(String sentence, int startOffset) {
 
+    List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
     // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
-    for (int i = 1; i < segTokenList.size() - 1; i++) {
-      result.add(convertSegToken((SegToken) segTokenList.get(i), sentence, startOffset));
-    }
+    List<SegToken> result = Collections.emptyList();
+    
+    if (segTokenList.size() > 2) // if its not an empty sentence
+      result = segTokenList.subList(1, segTokenList.size() - 1);
+    
+    for (SegToken st : result)
+      convertSegToken(st, sentence, startOffset);
+    
     return result;
-
   }
 
   /**

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java Thu Oct 29 22:29:50 2009
@@ -40,9 +40,9 @@
 
   private WordSegmenter wordSegmenter;
 
-  private Iterator tokenIter;
+  private Iterator<SegToken> tokenIter;
 
-  private List tokenBuffer;
+  private List<SegToken> tokenBuffer;
   
   private TermAttribute termAtt;
   private OffsetAttribute offsetAtt;
@@ -81,7 +81,7 @@
     // WordTokenFilter must clear attributes, as it is creating new tokens.
     clearAttributes();
     // There are remaining tokens from the current sentence, return the next one. 
-    SegToken nextWord = (SegToken) tokenIter.next();
+    SegToken nextWord = tokenIter.next();
     termAtt.setTermBuffer(nextWord.charArray, 0, nextWord.charArray.length);
     offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
     typeAtt.setType("word");

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/BiSegGraph.java Thu Oct 29 22:29:50 2009
@@ -20,7 +20,6 @@
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -39,9 +38,9 @@
  */
 class BiSegGraph {
 
-  private Map tokenPairListTable = new HashMap();
+  private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<Integer,ArrayList<SegTokenPair>>();
 
-  private List segTokenList;
+  private List<SegToken> segTokenList;
 
   private static BigramDictionary bigramDict = BigramDictionary.getInstance();
 
@@ -65,15 +64,14 @@
     segTokenList = segGraph.makeIndex();
     // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
     int key = -1;
-    List nextTokens = null;
+    List<SegToken> nextTokens = null;
     while (key < maxStart) {
       if (segGraph.isStartExist(key)) {
 
-        List tokenList = segGraph.getStartList(key);
+        List<SegToken> tokenList = segGraph.getStartList(key);
 
         // Calculate all tokens for a given key.
-        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
-          SegToken t1 = (SegToken) iter.next();
+        for (SegToken t1 : tokenList) {
           oneWordFreq = t1.weight;
           next = t1.endOffset;
           nextTokens = null;
@@ -91,8 +89,7 @@
           if (nextTokens == null) {
             break;
           }
-          for (Iterator iter2 = nextTokens.iterator(); iter2.hasNext();) {
-            SegToken t2 = (SegToken) iter2.next();
+          for (SegToken t2 : nextTokens) {
             idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
             System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
             idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
@@ -139,8 +136,8 @@
    * @param to index of the second token in the token pair
    * @return {@link List} of token pairs.
    */
-  public List getToList(int to) {
-    return (List) tokenPairListTable.get(Integer.valueOf(to));
+  public List<SegTokenPair> getToList(int to) {
+    return tokenPairListTable.get(to);
   }
 
   /**
@@ -151,11 +148,11 @@
   public void addSegTokenPair(SegTokenPair tokenPair) {
     int to = tokenPair.to;
     if (!isToExist(to)) {
-      ArrayList newlist = new ArrayList();
+      ArrayList<SegTokenPair> newlist = new ArrayList<SegTokenPair>();
       newlist.add(tokenPair);
-      tokenPairListTable.put(Integer.valueOf(to), newlist);
+      tokenPairListTable.put(to, newlist);
     } else {
-      List tokenPairList = (List) tokenPairListTable.get(Integer.valueOf(to));
+      List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
       tokenPairList.add(tokenPair);
     }
   }
@@ -172,24 +169,23 @@
    * Find the shortest path with the Viterbi algorithm.
    * @return {@link List}
    */
-  public List getShortPath() {
+  public List<SegToken> getShortPath() {
     int current;
     int nodeCount = getToCount();
-    List path = new ArrayList();
+    List<PathNode> path = new ArrayList<PathNode>();
     PathNode zeroPath = new PathNode();
     zeroPath.weight = 0;
     zeroPath.preNode = 0;
     path.add(zeroPath);
     for (current = 1; current <= nodeCount; current++) {
       double weight;
-      List edges = getToList(current);
+      List<SegTokenPair> edges = getToList(current);
 
       double minWeight = Double.MAX_VALUE;
       SegTokenPair minEdge = null;
-      for (Iterator iter1 = edges.iterator(); iter1.hasNext();) {
-        SegTokenPair edge = (SegTokenPair) iter1.next();
+      for (SegTokenPair edge : edges) {
         weight = edge.weight;
-        PathNode preNode = (PathNode) path.get(edge.from);
+        PathNode preNode = path.get(edge.from);
         if (preNode.weight + weight < minWeight) {
           minWeight = preNode.weight + weight;
           minEdge = edge;
@@ -205,10 +201,10 @@
     int preNode, lastNode;
     lastNode = path.size() - 1;
     current = lastNode;
-    List rpath = new ArrayList();
-    List resultPath = new ArrayList();
+    List<Integer> rpath = new ArrayList<Integer>();
+    List<SegToken> resultPath = new ArrayList<SegToken>();
 
-    rpath.add(Integer.valueOf(current));
+    rpath.add(current);
     while (current != 0) {
       PathNode currentPathNode = (PathNode) path.get(current);
       preNode = currentPathNode.preNode;
@@ -218,7 +214,7 @@
     for (int j = rpath.size() - 1; j >= 0; j--) {
       Integer idInteger = (Integer) rpath.get(j);
       int id = idInteger.intValue();
-      SegToken t = (SegToken) segTokenList.get(id);
+      SegToken t = segTokenList.get(id);
       resultPath.add(t);
     }
     return resultPath;
@@ -227,11 +223,9 @@
 
   public String toString() {
     StringBuilder sb = new StringBuilder();
-    Collection values = tokenPairListTable.values();
-    for (Iterator iter1 = values.iterator(); iter1.hasNext();) {
-      List segList = (List) iter1.next();
-      for (Iterator iter2 = segList.iterator(); iter2.hasNext();) {
-        SegTokenPair pair = (SegTokenPair) iter2.next();
+    Collection<ArrayList<SegTokenPair>>  values = tokenPairListTable.values();
+    for (ArrayList<SegTokenPair> segList : values) {
+      for (SegTokenPair pair : segList) {
         sb.append(pair + "\n");
       }
     }

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/HHMMSegmenter.java Thu Oct 29 22:29:50 2009
@@ -22,7 +22,7 @@
 import org.apache.lucene.analysis.cn.smart.CharType;
 import org.apache.lucene.analysis.cn.smart.Utility;
 import org.apache.lucene.analysis.cn.smart.WordType;
-import org.apache.lucene.analysis.cn.smart.hhmm.PathNode;//javadoc @link
+import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;//javadoc @link
 
 /**
  * Finds the optimal segmentation of a sentence into Chinese words
@@ -196,14 +196,14 @@
   }
 
   /**
-   * Return a list of {@link PathNode} representing the best segmentation of a sentence
+   * Return a list of {@link SegToken} representing the best segmentation of a sentence
    * @param sentence input sentence
    * @return best segmentation as a {@link List}
    */
-  public List process(String sentence) {
+  public List<SegToken> process(String sentence) {
     SegGraph segGraph = createSegGraph(sentence);
     BiSegGraph biSegGraph = new BiSegGraph(segGraph);
-    List shortPath = biSegGraph.getShortPath();
+    List<SegToken> shortPath = biSegGraph.getShortPath();
     return shortPath;
   }
 }

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/PathNode.java Thu Oct 29 22:29:50 2009
@@ -28,13 +28,12 @@
  * supported anymore in such a case.</font>
  * </p>
  */
-class PathNode implements Comparable {
+class PathNode implements Comparable<PathNode> {
   public double weight;
 
   public int preNode;
 
-  public int compareTo(Object p) {
-    PathNode pn = (PathNode) p;
+  public int compareTo(PathNode pn) {
     if (weight < pn.weight)
       return -1;
     else if (weight == pn.weight)

Modified: lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java
URL: http://svn.apache.org/viewvc/lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java?rev=831121&r1=831120&r2=831121&view=diff
==============================================================================
--- lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java (original)
+++ lucene/java/trunk/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/hhmm/SegGraph.java Thu Oct 29 22:29:50 2009
@@ -19,7 +19,6 @@
 
 import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 
@@ -39,7 +38,7 @@
   /**
    * Map of start offsets to ArrayList of tokens at that position
    */
-  private Map /* <Integer, ArrayList<SegToken>> */ tokenListTable = new HashMap();
+  private Map<Integer,ArrayList<SegToken>> tokenListTable = new HashMap<Integer,ArrayList<SegToken>>();
 
   private int maxStart = -1;
 
@@ -50,7 +49,7 @@
    * @return true if there are tokens for the startOffset
    */
   public boolean isStartExist(int s) {
-    return tokenListTable.get(Integer.valueOf(s)) != null;
+    return tokenListTable.get(s) != null;
   }
 
   /**
@@ -59,8 +58,8 @@
    * @param s startOffset
    * @return List of tokens at the specified start offset.
    */
-  public List getStartList(int s) {
-    return (List) tokenListTable.get(Integer.valueOf(s));
+  public List<SegToken> getStartList(int s) {
+    return tokenListTable.get(s);
   }
 
   /**
@@ -76,16 +75,15 @@
    * Set the {@link SegToken#index} for each token, based upon its order by startOffset.
    * @return a {@link List} of these ordered tokens.
    */
-  public List makeIndex() {
-    List result = new ArrayList();
+  public List<SegToken> makeIndex() {
+    List<SegToken> result = new ArrayList<SegToken>();
     int s = -1, count = 0, size = tokenListTable.size();
-    List tokenList;
+    List<SegToken> tokenList;
     short index = 0;
     while (count < size) {
       if (isStartExist(s)) {
-        tokenList = (List) tokenListTable.get(Integer.valueOf(s));
-        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
-          SegToken st = (SegToken) iter.next();
+        tokenList = tokenListTable.get(s);
+        for (SegToken st : tokenList) {
           st.index = index;
           result.add(st);
           index++;
@@ -104,11 +102,11 @@
   public void addToken(SegToken token) {
     int s = token.startOffset;
     if (!isStartExist(s)) {
-      ArrayList newlist = new ArrayList();
+      ArrayList<SegToken> newlist = new ArrayList<SegToken>();
       newlist.add(token);
-      tokenListTable.put((Object) (Integer.valueOf(s)), newlist);
+      tokenListTable.put(s, newlist);
     } else {
-      List tokenList = (List) tokenListTable.get((Object) (Integer.valueOf(s)));
+      List<SegToken> tokenList = tokenListTable.get(s);
       tokenList.add(token);
     }
     if (s > maxStart)
@@ -120,16 +118,15 @@
    * 
    * @return {@link List} of all tokens in the map.
    */
-  public List toTokenList() {
-    List result = new ArrayList();
+  public List<SegToken> toTokenList() {
+    List<SegToken> result = new ArrayList<SegToken>();
     int s = -1, count = 0, size = tokenListTable.size();
-    List tokenList;
+    List<SegToken> tokenList;
 
     while (count < size) {
       if (isStartExist(s)) {
-        tokenList = (List) tokenListTable.get(Integer.valueOf(s));
-        for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
-          SegToken st = (SegToken) iter.next();
+        tokenList = tokenListTable.get(s);
+        for (SegToken st : tokenList) {
           result.add(st);
         }
         count++;
@@ -140,10 +137,9 @@
   }
 
   public String toString() {
-    List tokenList = this.toTokenList();
+    List<SegToken> tokenList = this.toTokenList();
     StringBuilder sb = new StringBuilder();
-    for (Iterator iter = tokenList.iterator(); iter.hasNext();) {
-      SegToken t = (SegToken) iter.next();
+    for (SegToken t : tokenList) {
       sb.append(t + "\n");
     }
     return sb.toString();



Mime
View raw message