incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1387218 - in /incubator/ctakes/sandbox/ctakes-spelling-corrector: ./ .settings/ lib/ src/ src/org/ src/org/apache/ src/org/apache/ctakes/ src/org/apache/ctakes/spelling/ src/org/apache/ctakes/spelling/test/
Date Tue, 18 Sep 2012 15:36:22 GMT
Author: tmill
Date: Tue Sep 18 15:36:21 2012
New Revision: 1387218

URL: http://svn.apache.org/viewvc?rev=1387218&view=rev
Log:
Initial checkin of spelling corrector.  Not ready for prime time.

Added:
    incubator/ctakes/sandbox/ctakes-spelling-corrector/.classpath
    incubator/ctakes/sandbox/ctakes-spelling-corrector/.project
    incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
    incubator/ctakes/sandbox/ctakes-spelling-corrector/lib/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar   (with props)
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/.classpath
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/.classpath?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/.classpath (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/.classpath Tue Sep 18 15:36:21 2012
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" path="src"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/clinical documents pipeline"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/core"/>
+	<classpathentry combineaccessrules="false" kind="src" path="/common-type-system"/>
+	<classpathentry kind="lib" path="lib/uimafit-1.2.0.jar"/>
+	<classpathentry kind="con" path="org.eclipse.jdt.USER_LIBRARY/UIMA"/>
+	<classpathentry kind="output" path="bin"/>
+</classpath>

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/.project
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/.project?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/.project (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/.project Tue Sep 18 15:36:21 2012
@@ -0,0 +1,17 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ctakes-spelling-corrector</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+	</natures>
+</projectDescription>

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/.settings/org.eclipse.jdt.core.prefs Tue Sep 18 15:36:21 2012
@@ -0,0 +1,11 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.debug.lineNumber=generate
+org.eclipse.jdt.core.compiler.debug.localVariable=generate
+org.eclipse.jdt.core.compiler.debug.sourceFile=generate
+org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
+org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
+org.eclipse.jdt.core.compiler.source=1.6

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar?rev=1387218&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/ctakes/sandbox/ctakes-spelling-corrector/lib/uimafit-1.2.0.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,92 @@
+package org.apache.ctakes.spelling;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * A deliberately simple channel model for spelling correction: every string
+ * that can be reached from the typed word by a single edit (deletion,
+ * insertion, substitution, adjacent swap, or appending a letter) is treated as
+ * an equally likely intended word.
+ * <p>
+ * Edits are generated over the lowercase ASCII alphabet {@code a-z} only.
+ */
+public class SimpleEditDistanceModel implements SpellingDistanceModel{
+	
+	/** Weight returned when the typed word already equals the candidate (5% typo prior per word). */
+	private static final double NO_TYPO_WEIGHT = 0.95;
+	/** Weight shared evenly between all edit-distance-1 candidates. */
+	private static final double TYPO_WEIGHT = 0.05;
+	
+	/**
+	 * Scores how plausible it is that {@code expected} was intended when
+	 * {@code actual} was typed. Despite the method name, a larger value means a
+	 * more plausible candidate.
+	 *
+	 * @param actual the word as typed
+	 * @param expected the candidate intended word
+	 * @return 0.95 if the two words are equal ignoring case; 0.05 divided by
+	 *         the number of edit-distance-1 candidates if {@code expected} is
+	 *         one of them; 0.0 otherwise
+	 */
+	@Override
+	public double getDistance(String actual, String expected) {
+		if(actual.equalsIgnoreCase(expected)){
+			return NO_TYPO_WEIGHT;
+		}
+		// Only enumerate the neighbourhood when the cheap equality test failed.
+		HashSet<String> ed1words = getEditDistanceWords(actual, 1);
+		if(ed1words.contains(expected)){
+			return TYPO_WEIGHT * (1.0 / ed1words.size());
+		}
+		return 0.0;
+	}
+
+	/**
+	 * Lists the candidate corrections for {@code actual}: all strings within
+	 * one edit of it. No ranking is applied; the list follows the iteration
+	 * order of the underlying {@link HashSet}.
+	 *
+	 * @param actual the word as typed
+	 * @return a new, modifiable list of candidates
+	 */
+	@Override
+	public List<String> getOrderedWords(String actual){
+		return new ArrayList<String>(getEditDistanceWords(actual, 1));
+	}
+	
+	/**
+	 * Generates every string within {@code d} edits of {@code word}.
+	 * <p>
+	 * Substituting a letter with itself is not filtered out, so the unmodified
+	 * word is part of the result whenever it contains a lowercase ASCII letter.
+	 * {@link SpellingCorrector#getMostLikelyWord(String)} relies on this so
+	 * that "leave the word alone" competes with the real edits.
+	 *
+	 * @param word the starting word
+	 * @param d maximum number of edits; values below 1 are treated as 1
+	 * @return the set of generated strings
+	 */
+	public HashSet<String> getEditDistanceWords(String word, int d) {
+		HashSet<String> ed1words = new HashSet<String>();
+		StringBuilder buff = new StringBuilder(word);
+		
+		for(int i = 0; i < word.length(); i++){
+			char curChar = word.charAt(i);
+			
+			// deletion of the character at position i
+			buff.deleteCharAt(i);
+			ed1words.add(buff.toString().trim());
+			buff.insert(i, curChar); // restore
+			
+			// inclusive upper bound so that 'z' is tried as well
+			for(char c = 'a'; c <= 'z'; c++){
+				// insertion in front of position i
+				buff.insert(i, c);
+				ed1words.add(buff.toString().trim());
+				buff.deleteCharAt(i); // restore
+				
+				// substitution at position i (c == curChar yields the word itself)
+				buff.setCharAt(i, c);
+				ed1words.add(buff.toString().trim());
+				buff.setCharAt(i, curChar); // restore
+			}
+			
+			if(i+1 < word.length()){
+				// swap positions i and i+1
+				char nextChar = word.charAt(i+1);
+				buff.setCharAt(i, nextChar);
+				buff.setCharAt(i+1, curChar);
+				ed1words.add(buff.toString().trim());
+				// restore
+				buff.setCharAt(i+1, nextChar);
+				buff.setCharAt(i, curChar);
+			}
+		}
+		
+		// insertion after the last character: add a placeholder and overwrite it
+		buff.append(' ');
+		int endInd = buff.length()-1;
+		for(char c = 'a'; c <= 'z'; c++){
+			buff.setCharAt(endInd, c);
+			ed1words.add(buff.toString());
+		}
+		
+		// d <= 1 also stops the recursion for zero or negative distances
+		if(d <= 1){
+			return ed1words;
+		}
+		
+		HashSet<String> allWords = new HashSet<String>(ed1words);
+		for(String w : ed1words){
+			allWords.addAll(getEditDistanceWords(w, d-1));
+		}
+		return allWords;
+	}
+
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,59 @@
+package org.apache.ctakes.spelling;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Picks the most likely intended word for a typed word by multiplying the
+ * score of a {@link SpellingDistanceModel} with a unigram prior loaded from a
+ * file.
+ */
+public class SpellingCorrector {
+
+	// One entry per line of the priors file: "<word> : <probability>"
+	Pattern priorPatt = Pattern.compile("^([^ ]+) : ([^ ]+)$");
+	// word -> prior probability; words that are absent have prior 0
+	HashMap<String,Double> prior = new HashMap<String,Double>();
+	SpellingDistanceModel distModel = null;
+	
+	/**
+	 * @param chanModel model used both to propose candidates and to score them
+	 */
+	public SpellingCorrector(SpellingDistanceModel chanModel) {
+		distModel = chanModel;
+	}
+
+	/**
+	 * Loads word priors from a text file, adding to (and overwriting) any
+	 * priors already loaded. Lines that do not look like
+	 * {@code <word> : <number>} are skipped.
+	 *
+	 * @param priorFilename path of the priors file
+	 * @throws FileNotFoundException if the file cannot be opened
+	 * @throws NumberFormatException if a matching line carries a value that is
+	 *         not a parsable double
+	 */
+	public void usePriorsFile(String priorFilename) throws FileNotFoundException {
+		Scanner scanner = new Scanner(new File(priorFilename));
+		try{
+			while(scanner.hasNextLine()){
+				String wordProb = scanner.nextLine().trim();
+				Matcher m = priorPatt.matcher(wordProb);
+				if(m.matches()){
+					prior.put(m.group(1), Double.parseDouble(m.group(2)));
+				}
+			}
+		}finally{
+			// release the file handle even if a line fails to parse
+			scanner.close();
+		}
+	}
+
+	/**
+	 * @param word the word to look up
+	 * @return the loaded prior for {@code word}, or 0.0 if none is known
+	 */
+	public double getPrior(String word){
+		if(prior == null){
+			return 0.0;
+		}
+		Double p = prior.get(word);
+		return p == null ? 0.0 : p.doubleValue();
+	}
+	
+	/**
+	 * @param actual the word as typed
+	 * @param expected the candidate intended word
+	 * @return the channel-model score for the pair multiplied by the prior of
+	 *         {@code expected}
+	 */
+	public double getSingleWordProbability(String actual, String expected){
+		return distModel.getDistance(actual, expected) * getPrior(expected);
+	}
+	
+	/**
+	 * Returns the candidate with the highest combined score.
+	 *
+	 * @param actual the word as typed
+	 * @return the best-scoring candidate, or {@code actual} itself when no
+	 *         candidate scores above zero
+	 */
+	public String getMostLikelyWord(String actual){
+		double bestProb = 0;
+		String bestWord = actual;
+		
+		List<String> words = distModel.getOrderedWords(actual);
+		for(String cand : words){
+			double prob = getSingleWordProbability(actual, cand);
+			// strict comparison: the first candidate wins ties
+			if(prob > bestProb){
+				bestProb = prob;
+				bestWord = cand;
+			}
+		}
+		return bestWord;
+	}
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,9 @@
+package org.apache.ctakes.spelling;
+
+import java.util.List;
+
+/**
+ * Channel model used by {@link SpellingCorrector}: proposes candidate words
+ * for a typed word and scores a (typed, candidate) pair.
+ */
+public interface SpellingDistanceModel {
+	/**
+	 * Scores the pair of words. Note that {@link SimpleEditDistanceModel}
+	 * returns a weight where a larger value means a more plausible pair, and
+	 * {@link SpellingCorrector} multiplies the result with a prior.
+	 *
+	 * @param w the word as typed
+	 * @param w2 the candidate intended word
+	 * @return the score for the pair
+	 */
+	double getDistance(String w, String w2);
+
+	/**
+	 * @param actual the word as typed
+	 * @return the candidate intended words for {@code actual}
+	 */
+	List<String> getOrderedWords(String actual);
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,136 @@
+package org.apache.ctakes.spelling;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.JCasFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+
+import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
+import edu.mayo.bmi.uima.core.type.syntax.WordToken;
+
+/**
+ * Command-line tool that builds a unigram prior for the spelling corrector.
+ * It tokenizes every {@code .txt} file in the given directories with the
+ * aggregate analysis engine, counts the word tokens, and writes one
+ * {@code <word> : <relative frequency>} line per distinct word.
+ */
+public class UnigramPriorGenerator {
+	// lower-cased word -> number of occurrences seen so far
+	static HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
+	// total number of counted word tokens (the denominator of every prior)
+	static long numWords = 0;
+
+	/**
+	 * @param args {@code args[0]} is the output file; every further argument
+	 *        is a directory containing {@code .txt} files
+	 * @throws IOException if an argument is not a readable directory or a file
+	 *         cannot be read or written
+	 * @throws UIMAException if the pipeline cannot be created or run
+	 */
+	public static void main(String[] args) throws IOException, UIMAException {
+		
+		// Both the output file and at least one input directory are required.
+		if(args.length < 2){
+			System.err.println("Args: <output file> <dir1> [dir_i]* where each dir contains txt files.");
+			System.exit(-1);
+		}
+		
+		JCas jcas = null;
+		AnalysisEngine ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../core/desc/analysis_engine/AggregateAE.xml");
+		TypeSystemDescription types = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("../common-type-system/desc/common_type_system.xml");
+		
+		// iterate over directories passed in at command line (args[0] is the output file)
+		for(int i = 1; i < args.length; i++){
+			String dirName = args[i];
+			File curDir = new File(dirName);
+			if(!curDir.exists() || !curDir.isDirectory()){
+				throw new IOException("Erroneous directory passed in!");
+			}
+			
+			// get all the text files in this directory
+			File[] txtFiles = curDir.listFiles(new FilenameFilter() {
+				@Override
+				public boolean accept(File dir, String name) {
+					return name.endsWith(".txt");
+				}
+			});
+			// listFiles returns null when the directory cannot be read
+			if(txtFiles == null){
+				throw new IOException("Could not list directory: " + dirName);
+			}
+			
+			// now tokenize each file
+			for(File f : txtFiles){
+				jcas = JCasFactory.createJCas(types);
+				jcas.setDocumentText(readDocument(f));
+				SimplePipeline.runPipeline(jcas, ae);
+				FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
+				while(iter.hasNext()){
+					BaseToken tok = (BaseToken) iter.next();
+					if(tok instanceof WordToken){
+						handleWord(tok.getCoveredText());
+					}
+				}
+			}
+		}
+		
+		PrintStream out = new PrintStream(args[0]);
+		try{
+			writePriorModel(out);
+		}finally{
+			out.close();
+		}
+	}
+
+	/**
+	 * Reads a whole text file into a string. Line breaks are kept as
+	 * {@code '\n'} so that the last word of one line is not glued to the first
+	 * word of the next.
+	 */
+	private static String readDocument(File f) throws IOException {
+		Scanner scanner = new Scanner(f);
+		try{
+			StringBuilder buff = new StringBuilder();
+			while(scanner.hasNextLine()){
+				buff.append(scanner.nextLine()).append('\n');
+			}
+			return buff.toString();
+		}finally{
+			scanner.close();
+		}
+	}
+
+	/** @return true if {@code word} starts with any of {@code prefixes} */
+	private static boolean startsWith(String word, String[] prefixes){
+		for(String prefix : prefixes){
+			if(word.startsWith(prefix)) return true;
+		}
+		return false;
+	}
+	
+	/** @return true if {@code word} ends with any of the given suffixes */
+	private static boolean endsWith(String word, String[] prefixes){
+		for(String prefix : prefixes){
+			if(word.endsWith(prefix)) return true;
+		}
+		return false;
+	}
+	
+	/**
+	 * Strips each prefix, in array order, from the front of {@code word} when
+	 * present. Prefixes are matched literally, not as regular expressions.
+	 */
+	private static String removeStarting(String word, String[] prefixes){
+		for(String prefix : prefixes){
+			if(word.startsWith(prefix)){
+				word = word.substring(prefix.length());
+			}
+		}
+		return word;
+	}
+	
+	/**
+	 * Strips each suffix, in array order, from the end of {@code word} when
+	 * present. Suffixes are matched literally, not as regular expressions.
+	 */
+	private static String removeEnding(String word, String[] suffixes){
+		for(String suffix : suffixes){
+			if(word.endsWith(suffix)){
+				word = word.substring(0, word.length() - suffix.length());
+			}
+		}
+		return word;
+	}
+	
+	/**
+	 * Counts one occurrence of {@code w}, lower-cased. Empty words and words
+	 * containing a digit are ignored.
+	 */
+	private static void handleWord(String w){
+		String word = w.toLowerCase();
+		if(word.matches("^.*\\p{Digit}.*$")) return; // ignore words with numbers
+		if(word.length() == 0) return;
+		Integer count = wordCounts.get(word);
+		wordCounts.put(word, count == null ? 1 : count + 1);
+		numWords++;
+	}
+	
+	/**
+	 * Writes one {@code <word> : <count / total>} line per counted word, in
+	 * the format read back by {@link SpellingCorrector#usePriorsFile(String)}.
+	 */
+	private static void writePriorModel(PrintStream out){
+		for(Map.Entry<String,Integer> entry : wordCounts.entrySet()){
+			double prob = (double) entry.getValue() / numWords;
+			out.print(entry.getKey());
+			out.print(" : ");
+			out.println(prob);
+		}
+	}
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,87 @@
+package org.apache.ctakes.spelling.test;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FilenameFilter;
+import java.io.IOException;
+import java.util.Scanner;
+
+import org.apache.ctakes.spelling.SimpleEditDistanceModel;
+import org.apache.ctakes.spelling.SpellingCorrector;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.JCasFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+
+import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
+import edu.mayo.bmi.uima.core.type.syntax.WordToken;
+
+/**
+ * Runs the spelling corrector over every word token of every {@code .txt}
+ * file in a directory and reports how many words it would have replaced.
+ */
+public class TestOnCorpus {
+
+	/**
+	 * @param args {@code args[0]} is the priors file, {@code args[1]} the
+	 *        corpus directory
+	 * @throws IOException if a corpus file cannot be read
+	 * @throws UIMAException if the pipeline cannot be created or run
+	 */
+	public static void main(String[] args) throws UIMAException, IOException {
+		if(args.length < 2){
+			System.err.println("Required arguments: <priors file> <corpus directory>");
+			System.exit(-1);
+		}
+		SimpleEditDistanceModel chanModel = new SimpleEditDistanceModel();
+		SpellingCorrector speller = new SpellingCorrector(chanModel);
+		try{
+			speller.usePriorsFile(args[0]);
+		}catch(FileNotFoundException e){
+			System.err.println("Error using priors file, could not find!");
+			System.exit(-1);
+		}
+
+		JCas jcas = null;
+		AnalysisEngine ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../core/desc/analysis_engine/AggregateAE.xml");
+		TypeSystemDescription types = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("../common-type-system/desc/common_type_system.xml");
+
+		File dir = new File(args[1]);
+		File[] txtFiles = dir.listFiles(new FilenameFilter() {
+			@Override
+			public boolean accept(File parent, String name) {
+				return name.endsWith(".txt");
+			}
+		});
+		// listFiles returns null when args[1] is not a readable directory
+		if(txtFiles == null){
+			throw new IOException("Could not list corpus directory: " + args[1]);
+		}
+
+		int correct = 0; // words the corrector left unchanged
+		int wrong = 0;   // words the corrector wanted to replace
+		
+		for(File f : txtFiles){
+			jcas = JCasFactory.createJCas(types);
+			jcas.setDocumentText(readDocument(f));
+			SimplePipeline.runPipeline(jcas, ae);
+			FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
+			while(iter.hasNext()){
+				BaseToken tok = (BaseToken) iter.next();
+				if(tok instanceof WordToken){
+					String typed = tok.getCoveredText().toLowerCase();
+					String fix = speller.getMostLikelyWord(typed);
+					if(!typed.equalsIgnoreCase(fix)){
+						System.out.println("Replaced " + typed + " with " + fix);
+						wrong++;
+					}else{
+						correct++;
+					}
+				}
+			}
+		}
+		System.out.println("Attempted to correct " + wrong + " words. Ignored " + correct + " words.");
+		System.out.println("Estimated error rate = " + ((double)wrong/(wrong+correct)));
+	}
+
+	/**
+	 * Reads a whole text file into a string. Line breaks are kept as
+	 * {@code '\n'} so that the last word of one line is not glued to the first
+	 * word of the next.
+	 */
+	private static String readDocument(File f) throws IOException {
+		Scanner scanner = new Scanner(f);
+		try{
+			StringBuilder buff = new StringBuilder();
+			while(scanner.hasNextLine()){
+				buff.append(scanner.nextLine()).append('\n');
+			}
+			return buff.toString();
+		}finally{
+			scanner.close();
+		}
+	}
+
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java?rev=1387218&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java (added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java Tue Sep 18 15:36:21 2012
@@ -0,0 +1,54 @@
+package org.apache.ctakes.spelling.test;
+
+import java.io.FileNotFoundException;
+import java.util.HashSet;
+import java.util.Scanner;
+
+import org.apache.ctakes.spelling.SimpleEditDistanceModel;
+import org.apache.ctakes.spelling.SpellingCorrector;
+
+/**
+ * Interactive check of the spelling corrector: reads one word per line from
+ * standard input and prints its prior and the most likely intended word.
+ */
+public class TestSpellingCorrector {
+
+	/**
+	 * @param args {@code args[0]} is the priors file
+	 */
+	public static void main(String[] args) {
+		if(args.length < 1){
+			System.err.println("Error: Arg0 must be the priors file.");
+			System.exit(-1);
+		}
+		SimpleEditDistanceModel chanModel = new SimpleEditDistanceModel();
+		SpellingCorrector speller = new SpellingCorrector(chanModel);
+		try{
+			speller.usePriorsFile(args[0]);
+		}catch(FileNotFoundException e){
+			System.err.println("Error using priors file, could not find!");
+			System.exit(-1);
+		}
+
+		Scanner scanner = new Scanner(System.in);
+
+		while(scanner.hasNextLine()){
+			String word = scanner.nextLine().trim();
+
+			System.out.println("Prior probability is: " + speller.getPrior(word));
+
+			String mostLikely = speller.getMostLikelyWord(word);
+			System.out.println("Most likely intended word: " + mostLikely);
+		}
+	}
+
+}



Mime
View raw message