incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1387584 - in /incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling: ./ mistakes/ mistakes/edit/ priors/ priors/unigram/ test/
Date Wed, 19 Sep 2012 13:27:15 GMT
Author: tmill
Date: Wed Sep 19 13:27:14 2012
New Revision: 1387584

URL: http://svn.apache.org/viewvc?rev=1387584&view=rev
Log:
Did some significant refactoring to organize code into priors & mistake model.  Interface
could be improved.
Performance is still sketchy as well but may be a data issue.


Added:
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
      - copied, changed from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
      - copied, changed from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
      - copied, changed from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java
Removed:
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java
Modified:
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
    incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java

Modified: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java?rev=1387584&r1=1387583&r2=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingCorrector.java
Wed Sep 19 13:27:14 2012
@@ -8,38 +8,22 @@ import java.util.Scanner;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.ctakes.spelling.mistakes.SpellingDistanceModel;
+import org.apache.ctakes.spelling.priors.WordPriorModel;
+
 public class SpellingCorrector {
 
-	Pattern priorPatt = Pattern.compile("^([^ ]+) : ([^ ]+)$");
-	HashMap<String,Double> prior = new HashMap<String,Double>();
+	WordPriorModel prior = null;
 	SpellingDistanceModel distModel = null;
 	
-	public SpellingCorrector(SpellingDistanceModel chanModel) {
-		distModel = chanModel;
+	public SpellingCorrector(SpellingDistanceModel chanModel, WordPriorModel priorModel) {
+		this.distModel = chanModel;
+		this.prior = priorModel;
 	}
 
-	public void usePriorsFile(String priorFilename) throws FileNotFoundException {
-		Scanner scanner = new Scanner(new File(priorFilename));
-		Matcher m = null;
-		while(scanner.hasNextLine()){
-			String wordProb = scanner.nextLine().trim();
-			m = priorPatt.matcher(wordProb);
-			if(m.matches()){
-				prior.put(m.group(1), Double.parseDouble(m.group(2)));
-			}
-		}
-	}
-
-	public double getPrior(String word){
-		if(prior != null && prior.containsKey(word)){
-			return prior.get(word);
-		}else{
-			return 0.0;
-		}
-	}
 	
 	public double getSingleWordProbability(String actual, String expected){
-		return distModel.getDistance(actual, expected) * getPrior(expected);
+		return distModel.getDistance(actual, expected) * prior.getPrior(new String[]{expected},0);
 	}
 	
 	public String getMostLikelyWord(String actual){

Copied: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
(from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java?p2=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java&p1=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java&r1=1387218&r2=1387584&rev=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SpellingDistanceModel.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/SpellingDistanceModel.java
Wed Sep 19 13:27:14 2012
@@ -1,4 +1,4 @@
-package org.apache.ctakes.spelling;
+package org.apache.ctakes.spelling.mistakes;
 
 import java.util.List;
 

Copied: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
(from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java?p2=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java&p1=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java&r1=1387218&r2=1387584&rev=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/SimpleEditDistanceModel.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/edit/SimpleEditDistanceModel.java
Wed Sep 19 13:27:14 2012
@@ -1,9 +1,11 @@
-package org.apache.ctakes.spelling;
+package org.apache.ctakes.spelling.mistakes.edit;
 
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 
+import org.apache.ctakes.spelling.mistakes.SpellingDistanceModel;
+
 public class SimpleEditDistanceModel implements SpellingDistanceModel{
 	
 	@Override

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java?rev=1387584&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
(added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/WordPriorModel.java
Wed Sep 19 13:27:14 2012
@@ -0,0 +1,7 @@
+package org.apache.ctakes.spelling.priors;
+
+public interface WordPriorModel {
+	// interface: given an array of words as context, and an index pointing to the word
+	// we want the probability of, return a probability of that word given the context.
+	public double getPrior(String[] context, int index);
+}

Added: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java?rev=1387584&view=auto
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
(added)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/SimpleUnigramPrior.java
Wed Sep 19 13:27:14 2012
@@ -0,0 +1,36 @@
+package org.apache.ctakes.spelling.priors.unigram;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.HashMap;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.spelling.priors.WordPriorModel;
+
+public class SimpleUnigramPrior implements WordPriorModel {
+	Pattern priorPatt = Pattern.compile("^([^ ]+) : ([^ ]+)$");
+	HashMap<String,Double> prior = new HashMap<String,Double>();
+
+	public SimpleUnigramPrior(String priorFilename) throws FileNotFoundException {
+		Scanner scanner = new Scanner(new File(priorFilename));
+		Matcher m = null;
+		while(scanner.hasNextLine()){
+			String wordProb = scanner.nextLine().trim();
+			m = priorPatt.matcher(wordProb);
+			if(m.matches()){
+				prior.put(m.group(1), Double.parseDouble(m.group(2)));
+			}
+		}
+	}
+
+	public double getPrior(String[] word, int i){
+		if(prior != null && prior.containsKey(word[i])){
+			return prior.get(word[i]);
+		}else{
+			return 0.0;
+		}
+	}
+
+}

Copied: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
(from r1387218, incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java?p2=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java&p1=incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java&r1=1387218&r2=1387584&rev=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/UnigramPriorGenerator.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
Wed Sep 19 13:27:14 2012
@@ -1,4 +1,4 @@
-package org.apache.ctakes.spelling;
+package org.apache.ctakes.spelling.priors.unigram;
 
 import java.io.File;
 import java.io.FilenameFilter;
@@ -48,15 +48,15 @@ public class UnigramPriorGenerator {
 			String dirName = args[i];
 			File curDir = new File(dirName);
 			if(!curDir.exists() || !curDir.isDirectory()){
-				throw new IOException("Erroneous directory passed in!");
+				System.err.println("Error reading directory: " + dirName);
+				continue;
 			}
 			
 			// get all the text files in this directory
 			File[] txtFiles = curDir.listFiles(new FilenameFilter() {
 				@Override
 				public boolean accept(File arg0, String arg1) {
-					// TODO Auto-generated method stub
-					return arg1.endsWith(".txt");
+					return arg1.endsWith(".txt") || !arg1.contains(".");
 				}
 			});
 			
@@ -116,7 +116,7 @@ public class UnigramPriorGenerator {
 	
 	private static void handleWord(String w){
 		String word = w.toLowerCase();
-		if(word.matches("^.*\\p{Digit}.*$")) return; // ignore words with numbers
+//		if(word.matches("^.*\\p{Digit}.*$")) return; // ignore words with numbers
 		if(word.length() == 0) return;
 		if(!wordCounts.containsKey(word)){
 			wordCounts.put(word, 0);

Modified: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java?rev=1387584&r1=1387583&r2=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestOnCorpus.java
Wed Sep 19 13:27:14 2012
@@ -6,8 +6,9 @@ import java.io.FilenameFilter;
 import java.io.IOException;
 import java.util.Scanner;
 
-import org.apache.ctakes.spelling.SimpleEditDistanceModel;
 import org.apache.ctakes.spelling.SpellingCorrector;
+import org.apache.ctakes.spelling.mistakes.edit.SimpleEditDistanceModel;
+import org.apache.ctakes.spelling.priors.unigram.SimpleUnigramPrior;
 import org.apache.uima.UIMAException;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.cas.FSIterator;
@@ -33,14 +34,9 @@ public class TestOnCorpus {
 			System.err.println("Required arguments: <priors file> <corpus directory>");
 			System.exit(-1);
 		}
+		SimpleUnigramPrior priorModel = new SimpleUnigramPrior(args[0]);
 		SimpleEditDistanceModel chanModel = new SimpleEditDistanceModel();
-		SpellingCorrector speller = new SpellingCorrector(chanModel);
-		try{
-			speller.usePriorsFile(args[0]);
-		}catch(FileNotFoundException e){
-			System.err.println("Error using priors file, could not find!");
-			System.exit(-1);
-		}
+		SpellingCorrector speller = new SpellingCorrector(chanModel, priorModel);
 
 		JCas jcas = null;
 		AnalysisEngine ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../core/desc/analysis_engine/AggregateAE.xml");

Modified: incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java?rev=1387584&r1=1387583&r2=1387584&view=diff
==============================================================================
--- incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
(original)
+++ incubator/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestSpellingCorrector.java
Wed Sep 19 13:27:14 2012
@@ -4,34 +4,31 @@ import java.io.FileNotFoundException;
 import java.util.HashSet;
 import java.util.Scanner;
 
-import org.apache.ctakes.spelling.SimpleEditDistanceModel;
 import org.apache.ctakes.spelling.SpellingCorrector;
+import org.apache.ctakes.spelling.mistakes.edit.SimpleEditDistanceModel;
+import org.apache.ctakes.spelling.priors.unigram.SimpleUnigramPrior;
 
 public class TestSpellingCorrector {
 
 	/**
 	 * @param args
+	 * @throws FileNotFoundException 
 	 */
-	public static void main(String[] args) {
+	public static void main(String[] args) throws FileNotFoundException {
 		if(args.length < 1){
 			System.err.println("Error: Arg0 must be the priors file.");
 			System.exit(-1);
 		}
 		SimpleEditDistanceModel chanModel = new SimpleEditDistanceModel();
-		SpellingCorrector speller = new SpellingCorrector(chanModel);
-		try{
-			speller.usePriorsFile(args[0]);
-		}catch(FileNotFoundException e){
-			System.err.println("Error using priors file, could not find!");
-			System.exit(-1);
-		}
-
+		SimpleUnigramPrior priorModel = new SimpleUnigramPrior(args[0]);
+		SpellingCorrector speller = new SpellingCorrector(chanModel, priorModel);
+		
 		Scanner scanner = new Scanner(System.in);
 
 		while(scanner.hasNextLine()){
 			String word = scanner.nextLine().trim();
 
-			System.out.println("Prior probability is: " + speller.getPrior(word));
+			System.out.println("Prior probability is: " + priorModel.getPrior(new String[]{word},0));
 
 			HashSet<String> ed1words = chanModel.getEditDistanceWords(word, 1);
 			



Mime
View raw message