ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1463284 - in /ctakes/sandbox/ctakes-spelling-corrector: ./ src/org/apache/ctakes/spelling/ src/org/apache/ctakes/spelling/mistakes/ src/org/apache/ctakes/spelling/mistakes/edit/ src/org/apache/ctakes/spelling/priors/ src/org/apache/ctakes/...
Date Mon, 01 Apr 2013 20:17:45 GMT
Author: tmill
Date: Mon Apr  1 20:17:44 2013
New Revision: 1463284

URL: http://svn.apache.org/r1463284
Log:
ctakes-58: Adds code for building term neighborhoods and context triples. Uses JaspellTernarySearchTrie
in lucene, so new dependency in pom.

Added:
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
Modified:
    ctakes/sandbox/ctakes-spelling-corrector/pom.xml
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java

Modified: ctakes/sandbox/ctakes-spelling-corrector/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/pom.xml?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/pom.xml (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/pom.xml Mon Apr  1 20:17:44 2013
@@ -38,5 +38,10 @@
   		<groupId>org.apache.ctakes</groupId>
   		<artifactId>ctakes-clinical-pipeline</artifactId>
   	</dependency>
+  	<dependency>
+  		<groupId>org.apache.lucene</groupId>
+  		<artifactId>lucene-spellchecker</artifactId>
+  		<version>3.6.2</version>
+  	</dependency>
   </dependencies>
 </project>
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
Mon Apr  1 20:17:44 2013
@@ -1,12 +1,24 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling;
 
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Scanner;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.ctakes.spelling.mistakes.SpellingDistanceModel;
 import org.apache.ctakes.spelling.priors.WordPriorModel;

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
(added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateContextTriples.java
Mon Apr  1 20:17:44 2013
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+public class GenerateContextTriples {
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		if(args.length < 3){
+			System.err.println("Required arguments: <neighborhood file> <input files>
<output dir>");
+			System.exit(-1);
+		}
+		
+		HashMap<String,String[]> dict = new HashMap<String,String[]>();
+//		HashMap<String,ClusterNode> dict = new HashMap<String,ClusterNode>();
+		HashMap<String,CounterMap<String>> contexts = new HashMap<String,CounterMap<String>>();
+//		
+		Scanner scanner = null;
+		try {
+			scanner = new Scanner(new File(args[0]));
+		} catch (FileNotFoundException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.err.println("Error reading dictinoary file");
+			System.exit(-1);
+		}
+		while(scanner.hasNextLine()){
+			String[] parts = scanner.nextLine().trim().split(" ");
+			if(parts.length == 3){
+				String[] neighbors = parts[2].split(":");
+				dict.put(parts[0], neighbors);
+			}else{
+				dict.put(parts[0], new String[]{});
+			}
+//			if(!dict.contains(o))
+		}
+		
+		CollectionReader reader = null; 
+		AnalysisEngine ae = null;
+		try {
+			ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../ctakes-core/desc/analysis_engine/AggregateAE.xml");
+			reader = CollectionReaderFactory.createCollectionReader(FilesInDirectoryCollectionReader.class
+					,FilesInDirectoryCollectionReader.PARAM_INPUTDIR
+					,args[1]
+					,FilesInDirectoryCollectionReader.PARAM_RECURSE
+					,true
+					,FilesInDirectoryCollectionReader.PARAM_EXTENSIONS
+					,new String[]{"txt"}
+			);
+		} catch (ResourceInitializationException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(-1);
+		} catch (UIMAException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(-1);
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(-1);
+		}
+		
+		JCasIterable casIter = null;
+		try {
+			casIter = new JCasIterable(reader, ae);
+		} catch (UIMAException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(-1);
+		} catch (IOException e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+			System.exit(-1);
+		}
+		String prev = null;
+		String cur = null;
+		String next = null;
+		while(casIter.hasNext()){
+			JCas jcas = casIter.next();
+			Collection<Sentence> sents = JCasUtil.select(jcas, Sentence.class);
+			for(Sentence sent : sents){
+				List<BaseToken> tokens = JCasUtil.selectCovered(jcas, BaseToken.class, sent);
+				for(int i = 0; i < tokens.size(); i++){
+					cur = tokens.get(i).getCoveredText();
+					if(!dict.containsKey(cur)) continue;
+					if(i == 0){
+						prev = "<Start>";
+					}else{
+						prev = tokens.get(i-1).getCoveredText();
+					}
+					if(i == tokens.size()-1){
+						next = "<End>";
+					}else{
+						next = tokens.get(i+1).getCoveredText();
+					}
+					if(!contexts.containsKey(cur)){
+						contexts.put(cur, new CounterMap<String>());
+					}
+					contexts.get(cur).add(prev + "-" + next);
+				}
+			}
+		}
+		
+		// for each word write the contexts and their values
+		for(String word : contexts.keySet()){
+			PrintWriter out;
+			try {
+				out = new PrintWriter(new File(args[2], word));
+				for(String context : contexts.get(word).keySet()){
+					out.print(context);
+					out.print(" : ");
+					out.println(contexts.get(word).get(context));
+				}
+				out.close();
+			} catch (FileNotFoundException e) {
+				e.printStackTrace();
+				System.err.println("Error creating context file!");
+				System.exit(-1);
+			}
+		}
+	}
+	
+	class ClusterNode{
+		public String word;
+//		private List<ClusterNode> parents = new ArrayList<ClusterNode>();
+		
+		@Override
+		public boolean equals(Object n){
+			return word.equals(((ClusterNode)n).word);
+		}
+	}
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
(added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/GenerateTermNeighborhoods.java
Mon Apr  1 20:17:44 2013
@@ -0,0 +1,102 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Scanner;
+
+import org.apache.commons.lang.StringUtils;
+import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+
+public class GenerateTermNeighborhoods {
+
+	/**
+	 * @param args
+	 */
+	public static void main(String[] args) {
+		if(args.length < 2){
+			System.err.println("Args: <dictionary file> <output file>");
+			System.exit(-1);
+		}
+		
+		JaspellTernarySearchTrie trie=null;
+		try {
+			trie = new JaspellTernarySearchTrie(new File(args[0]));
+		} catch (IOException e) {
+			e.printStackTrace();
+			System.err.println("Could not read dictionary file!");
+			System.exit(-1);
+		}
+		System.err.println("Trie loaded...");
+
+		// look through terms file
+		Scanner scanner = null; 
+		PrintWriter out = null;
+		try {
+			scanner = new Scanner(new File(args[0]));
+			out = new PrintWriter(args[1]);
+		} catch (FileNotFoundException e) {
+			e.printStackTrace();
+			System.err.println("Could not open input/output file!");
+			System.exit(-1);
+		}
+		
+		while(scanner.hasNextLine()){
+			String word = scanner.next();
+			int maxDiff;
+			if(word.length() <= 4){
+				maxDiff = 1;
+			}else if(word.length() <= 12){
+				maxDiff = 2;
+			}else{
+				maxDiff = 3;
+			}
+			int count = (Integer) trie.get(word);
+			HashSet<String> neighbors = new HashSet<String>();
+			for(int diff = 0; diff <= maxDiff; diff++){
+				trie.setMatchAlmostDiff(diff);
+				neighbors.addAll(trie.matchAlmost(word));				
+			}
+			
+			HashSet<String> toRemove = new HashSet<String>();
+			for(String neighbor : neighbors){
+				int nCount = (Integer) trie.get(neighbor);
+				if(count / nCount < 10){
+					toRemove.add(neighbor);
+				}
+			}
+			for(String rm : toRemove){
+				neighbors.remove(rm);				
+			}
+			
+			out.print(word);
+			out.print(" : ");
+			String joinedNeighbors = StringUtils.join(neighbors, ":");
+			out.println(joinedNeighbors);
+			out.flush();
+			
+			scanner.nextLine(); // go to next line
+		}
+	}
+
+}

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.mistakes;
 
 import java.util.List;

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.mistakes.edit;
 
 import java.util.ArrayList;

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.priors;
 
 public interface WordPriorModel {

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.priors.unigram;
 
 import java.io.File;

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
Mon Apr  1 20:17:44 2013
@@ -1,12 +1,27 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.priors.unigram;
 
-import java.io.File;
-import java.io.FilenameFilter;
 import java.io.IOException;
 import java.io.PrintStream;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.Scanner;
 
 import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
@@ -16,11 +31,10 @@ import org.apache.uima.analysis_engine.A
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.CollectionReaderFactory;
-import org.uimafit.factory.JCasFactory;
 import org.uimafit.pipeline.JCasIterable;
-import org.uimafit.pipeline.SimplePipeline;
 
 public class UnigramPriorGenerator {
 	static HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
@@ -55,7 +69,7 @@ public class UnigramPriorGenerator {
 		JCasIterable casIter = new JCasIterable(reader, ae);
 		while(casIter.hasNext()){
 			JCas jcas = casIter.next();
-			FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
+			FSIterator<Annotation> iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
 			while(iter.hasNext()){
 				BaseToken tok = (BaseToken) iter.next();
 				if(tok instanceof WordToken){
@@ -68,21 +82,21 @@ public class UnigramPriorGenerator {
 		writePriorModel(new PrintStream(args[0]));
 	}
 
-	private static boolean startsWith(String word, String[] prefixes){
+	public static boolean startsWith(String word, String[] prefixes){
 		for(String prefix : prefixes){
 			if(word.startsWith(prefix)) return true;
 		}
 		return false;
 	}
 	
-	private static boolean endsWith(String word, String[] prefixes){
+	public static boolean endsWith(String word, String[] prefixes){
 		for(String prefix : prefixes){
 			if(word.endsWith(prefix)) return true;
 		}
 		return false;
 	}
 	
-	private static String removeStarting(String word, String[] prefixes){
+	public static String removeStarting(String word, String[] prefixes){
 		for(String prefix : prefixes){
 //			if(word.startsWith(prefix)){d
 				word = word.replaceFirst("^"+prefix, "");
@@ -91,7 +105,7 @@ public class UnigramPriorGenerator {
 		return word;
 	}
 	
-	private static String removeEnding(String word, String[] suffixes){
+	public static String removeEnding(String word, String[] suffixes){
 		for(String suffix : suffixes){
 //			if(word.endsWith(suffix)){
 				word = word.replaceFirst(suffix+"$", "");
@@ -100,7 +114,7 @@ public class UnigramPriorGenerator {
 		return word;
 	}
 	
-	private static void handleWord(String w){
+	public static void handleWord(String w){
 		String word = w.toLowerCase();
 //		if(word.matches("^.*\\p{Digit}.*$")) return; // ignore words with numbers
 		if(word.length() == 0) return;
@@ -111,7 +125,7 @@ public class UnigramPriorGenerator {
 		numWords++;
 	}
 	
-	private static void writePriorModel(PrintStream out){
+	public static void writePriorModel(PrintStream out){
 		for(Map.Entry<String,Integer> entry : wordCounts.entrySet()){
 			out.print(entry.getKey());
 			out.print(" : ");

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.test;
 
 import java.io.File;

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java?rev=1463284&r1=1463283&r2=1463284&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
(original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
Mon Apr  1 20:17:44 2013
@@ -1,3 +1,21 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
 package org.apache.ctakes.spelling.test;
 
 import java.io.FileNotFoundException;

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java?rev=1463284&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
(added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestTrie.java
Mon Apr  1 20:17:44 2013
@@ -0,0 +1,72 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.spelling.test;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Scanner;
+
+import org.apache.lucene.search.suggest.jaspell.JaspellTernarySearchTrie;
+
+public class TestTrie {
+
+	/**
+	 * Interactive probe for a dictionary trie. Loads the trie from the file named
+	 * by the first argument, then for each line read from standard input prints
+	 * the trimmed line with its trie value, followed by one tab-indented line per
+	 * near match with that match's trie value.
+	 *
+	 * @param args <code>args[0]</code> is the dictionary file to load
+	 */
+	public static void main(String[] args) {
+		if(args.length < 1){
+			System.err.println("Args: <dictionary file>");
+			System.exit(-1);
+		}
+
+		JaspellTernarySearchTrie dictionary = null;
+		try {
+			dictionary = new JaspellTernarySearchTrie(new File(args[0]));
+		} catch (IOException ioe) {
+			ioe.printStackTrace();
+			System.err.println("Could not read dictionary file!");
+			System.exit(-1);
+		}
+		System.err.println("Trie loaded...");
+
+		Scanner stdin = new Scanner(System.in);
+		while(stdin.hasNextLine()){
+			String query = stdin.nextLine().trim();
+
+			// longer queries are searched with a larger mismatch setting
+			int length = query.length();
+			int widestDiff = (length <= 4) ? 1 : ((length <= 12) ? 2 : 3);
+
+			// gather the matches for every setting from 0 up to the widest one
+			List<String> matches = new ArrayList<String>();
+			int diff = 0;
+			while(diff <= widestDiff){
+				dictionary.setMatchAlmostDiff(diff);
+				matches.addAll(dictionary.matchAlmost(query));
+				diff++;
+			}
+
+			System.out.println(query + "\t" + dictionary.get(query));
+			for(int i = 0; i < matches.size(); i++){
+				String match = matches.get(i);
+				System.out.println("\t" + match + "\t" + dictionary.get(match));
+			}
+		}
+	}
+}



Mime
View raw message