ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1495809 - in /ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling: mistakes/ priors/stupid/ test/ util/
Date Sun, 23 Jun 2013 11:34:29 GMT
Author: tmill
Date: Sun Jun 23 11:34:28 2013
New Revision: 1495809

URL: http://svn.apache.org/r1495809
Log:
Working versions of all components of mistake model. Working version of backoff model that requires a lot of memory.

Added:
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/BuildPriorModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/LearnPartitionPriors.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/TrainBrillModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/StupidBackoffModel.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestAlignment.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestBrillTraining.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/Alignment.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/JaspellTernarySearchTrie.java
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/StringAlignment.java

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/BuildPriorModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/BuildPriorModel.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/BuildPriorModel.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/BuildPriorModel.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,80 @@
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+public class BuildPriorModel {
+
+	/**
+	 * @param args
+	 * @throws FileNotFoundException 
+	 */
+	public static void main(String[] args) throws FileNotFoundException {
+		if(args.length < 3){
+			System.err.println("Required argumnets : <rule count file> <context count file> <output file>");
+			System.exit(-1);
+		}
+		
+		Map<String,Double> probs = BuildPriorModel.computeProbabilities(args[0], args[1]);
+		List<String> sortedRules = new ArrayList<String>(probs.keySet());
+		Collections.sort(sortedRules);
+		PrintWriter out = new PrintWriter(args[2]);
+		for(String key : sortedRules){
+			out.print(key);
+			out.print(" -> ");
+			out.println(probs.get(key));
+		}
+		out.close();
+	}
+
+	private static Pattern contextPatt = Pattern.compile("^([^ ]+) : (\\d+)");
+	private static Pattern rulePatt = Pattern.compile("^(([^ ]*) -> ([^ ]+)) : (\\d+)");
+	public static Map<String, Double> computeProbabilities(String ruleCountFilename,
+			String ruleContextFilename) throws FileNotFoundException {
+		Map<String,Double> probs = new HashMap<String,Double>();
+		Map<String,Integer> contextCounts = new HashMap<String,Integer>();
+		
+		Matcher m = null;
+		Scanner scanner = new Scanner(new File(ruleContextFilename));
+		while(scanner.hasNextLine()){
+			String line = scanner.nextLine().trim();
+			m = contextPatt.matcher(line);
+			if(m.matches()){
+				String context = m.group(1);
+				int count = Integer.parseInt(m.group(2));
+				contextCounts.put(context, count);
+			}
+		}
+		scanner.close();
+		scanner = new Scanner(new File(ruleCountFilename));
+		while(scanner.hasNextLine()){
+			String line = scanner.nextLine().trim();
+			m = rulePatt.matcher(line);
+			if(m.matches()){
+				String rule = m.group(1);
+				String lhs = m.group(2);
+				String rhs = m.group(3);
+				int count = Integer.parseInt(m.group(4));
+				double prob = 0.0;
+				if(contextCounts.containsKey(lhs)){
+					prob = (double) count / contextCounts.get(lhs);
+				}else{
+					prob = 1.0 / 26;
+				}
+				probs.put(rule, prob);
+			}
+		}
+		scanner.close();
+		return probs;
+	}
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/LearnPartitionPriors.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/LearnPartitionPriors.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/LearnPartitionPriors.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/LearnPartitionPriors.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,86 @@
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashSet;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.commons.io.DirectoryWalker;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.util.FileUtils;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+
+public class LearnPartitionPriors extends DirectoryWalker<String> {
+
+	HashSet<String> lhs = null;
+	CounterMap<String> counts = new CounterMap<String>();
+	Pattern rulePatt = Pattern.compile("^([^ ]+)->.*");
+	
+	public LearnPartitionPriors(String filename) throws FileNotFoundException {
+		lhs = new HashSet<String>();
+		Matcher m = null;
+		Scanner scanner = new Scanner(new File(filename));
+		while(scanner.hasNextLine()){
+			String line = scanner.nextLine().trim();
+			m = rulePatt.matcher(line);
+			if(m.matches()){
+				lhs.add(m.group(1));				
+			}
+		}
+	}
+
+	public CounterMap<String> getCountsForCorpus(String inputDir) throws UIMAException, IOException {
+		counts.clear();
+		walk(new File(inputDir), null);
+		return counts;
+	}
+
+	@Override
+	protected void handleFile(File file, int depth, java.util.Collection<String> results) throws IOException {
+		String text = FileUtils.file2String(file).toLowerCase();
+		for(String lhs : this.lhs){
+			int count = 0;
+			int startInd = 0;
+			while((startInd = (text.indexOf(lhs, startInd)+1)) > 0){
+				count++;
+			}
+			if(count > 0){
+				counts.add(lhs, count);
+			}
+		}
+	}
+	
+	/**
+	 * @param args
+	 * @throws IOException 
+	 * @throws UIMAException 
+	 */
+	public static void main(String[] args) throws UIMAException, IOException {
+		if(args.length < 3){
+			System.err.println("Required arguments: <rule counts file> <original corpus> <output file>");
+			System.exit(-1);
+		}
+		LearnPartitionPriors learner = new LearnPartitionPriors(args[0]);
+		CounterMap<String> counts = learner.getCountsForCorpus(args[1]);
+		PrintWriter out = new PrintWriter(args[2]);
+		for(String key : counts.keySet()){
+			out.print(key);
+			out.print(" : ");
+			out.print(counts.get(key));
+			out.println();
+			out.flush();
+		}
+		out.close();
+	}
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/TrainBrillModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/TrainBrillModel.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/TrainBrillModel.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/mistakes/TrainBrillModel.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,99 @@
+package org.apache.ctakes.spelling.mistakes;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.PrintWriter;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.spelling.util.Alignment;
+import org.apache.ctakes.spelling.util.StringAlignment;
+import org.apache.ctakes.utils.struct.CounterMap;
+
+public class TrainBrillModel {
+	Pattern examplePatt = Pattern.compile("^(\\S+)\\t(\\S+) : (\\d+)$");
+	CounterMap<String> rules = new CounterMap<String>();
+	
+	File trainingFile = null;
+	public TrainBrillModel(String filename) {
+		this.trainingFile = new File(filename);
+	}
+
+	public void train() throws FileNotFoundException {
+		Scanner scanner = new Scanner(trainingFile);
+		while(scanner.hasNextLine()){
+			String example = scanner.nextLine().trim();
+			Matcher m = examplePatt.matcher(example);
+			if(m.matches()){
+				int numCounts = Integer.parseInt(m.group(3));
+				String actual = m.group(1);
+				String intended = m.group(2);
+				collectExamples(actual, intended, rules);
+			}
+		}
+		scanner.close();
+	}
+	
+	public static void collectExamples(String actual, String intended, CounterMap<String> rules){
+		Alignment align = StringAlignment.getOptimalAlignment(actual, intended);
+		if(align.getScore() < 10){
+			StringBuffer s1 = align.getS1();
+			StringBuffer s2 = align.getS2();
+			for(int i = 0; i < s1.length(); i++){
+				// generate rule for alignment
+				rules.add(s1.substring(i,i+1).trim() + " -> " + s2.substring(i,i+1).trim());
+				
+				if(s1.charAt(i) == ' ' || s2.charAt(i) == ' ' || s1.charAt(i) != s2.charAt(i)){
+					if(i > 0){
+						rules.add((s1.substring(i-1, i+1).replaceAll(" ","") + " -> " + s2.substring(i-1, i+1).replaceAll(" ", "")));
+					}
+					if(i > 1){
+						rules.add((s1.substring(i-2, i+1).replaceAll(" ","") + " -> " + s2.substring(i-2, i+1).replaceAll(" ", "")));
+					}
+					if(i < s1.length()-1){
+						rules.add((s1.substring(i, i+2).replaceAll(" ","") + " -> " + s2.substring(i, i+2).replaceAll(" ", "")));
+					}
+					if(i < s1.length()-2){
+						rules.add((s1.substring(i, i+3).replaceAll(" ","") + " -> " + s2.substring(i, i+3).replaceAll(" ", "")));
+					}
+					if(i > 0 && i < s1.length()-1){
+						rules.add((s1.substring(i-1, i+2).replaceAll(" ","") + " -> " + s2.substring(i-1, i+2).replaceAll(" ", "")));
+					}
+				}
+			}
+		}
+	}
+	
+	public CounterMap<String> getRuleCounts(){
+		return rules;
+	}
+	
+	/**
+	 * @param args
+	 * @throws FileNotFoundException 
+	 */
+	public static void main(String[] args) throws FileNotFoundException {
+		if(args.length < 2){
+			System.err.println("Required arguments: <training examples file> <counts file>");
+			System.exit(-1);
+		}
+		
+		TrainBrillModel model = new TrainBrillModel(args[0]);
+		model.train();
+		Map<String,Integer> map = model.getRuleCounts();
+		List<String> keys = new ArrayList<String>(map.keySet());
+		Collections.sort(keys);
+		PrintWriter out = new PrintWriter(args[1]);
+		for(String key : keys){
+			out.printf("%s : %d\n", key, map.get(key));
+		}
+		out.close();
+	}
+
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/StupidBackoffModel.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/StupidBackoffModel.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/StupidBackoffModel.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/stupid/StupidBackoffModel.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,132 @@
+package org.apache.ctakes.spelling.priors.stupid;
+
+import java.io.IOException;
+import java.io.PrintWriter;
+import java.util.HashMap;
+import java.util.List;
+
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.spelling.priors.WordPriorModel;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.util.JCasUtil;
+
+public class StupidBackoffModel implements WordPriorModel {
+
+	public static final String START_TOKEN = "<START>";
+	public static final String END_TOKEN = "<END>";
+	
+	@Override
+	public double getPrior(String[] context, int index) {
+		// TODO Auto-generated method stub
+		return 0;
+	}
+
+	public static void trainModel(String inputDirname, String outputFilename) throws UIMAException, IOException{
+		HashMap<String, HashMap<String,CounterMap<String>>> triFreqs = new HashMap<String, HashMap<String,CounterMap<String>>>();
+		HashMap<String, CounterMap<String>> biFreqs = new HashMap<String, CounterMap<String>>();
+		CounterMap<String> uniFreqs = new CounterMap<String>();
+		
+		CollectionReader reader = null; 
+		AnalysisEngine ae = null;
+		reader = CollectionReaderFactory.createCollectionReader(FilesInDirectoryCollectionReader.class
+				,FilesInDirectoryCollectionReader.PARAM_INPUTDIR
+				,inputDirname
+				,FilesInDirectoryCollectionReader.PARAM_RECURSE
+				,true
+				,FilesInDirectoryCollectionReader.PARAM_EXTENSIONS
+				,new String[]{"txt"}
+				);
+		ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../ctakes-core/desc/analysis_engine/AggregateAE.xml");
+		
+		JCasIterable casIter = new JCasIterable(reader, ae);
+		while(casIter.hasNext()){
+			String prefix = null;
+			String word = null;
+			String postfix = null;
+			JCas jcas = casIter.next();
+			for(Sentence sent : JCasUtil.select(jcas, Sentence.class)){
+				List<WordToken> tokens = JCasUtil.selectCovered(WordToken.class, sent);
+				uniFreqs.add(START_TOKEN);
+				uniFreqs.add(END_TOKEN);
+				for(int i = 0; i < tokens.size(); i++){
+					if(i == 0) prefix = START_TOKEN;
+					else prefix = word; //tokens.get(i-1).getCoveredText().toLowerCase();
+					
+					if(i == 0) word = tokens.get(0).getCoveredText().toLowerCase();
+					else word = postfix;
+					
+					if(i == tokens.size()-1) postfix = END_TOKEN;
+					else postfix = tokens.get(i+1).getCoveredText().toLowerCase();
+					
+					uniFreqs.add(word);
+					if(!biFreqs.containsKey(prefix)){
+						biFreqs.put(prefix, new CounterMap<String>());
+					}
+					biFreqs.get(prefix).add(word);
+					
+					if(!triFreqs.containsKey(prefix)){
+						triFreqs.put(prefix, new HashMap<String, CounterMap<String>>());
+					}
+					if(!triFreqs.get(prefix).containsKey(postfix)){
+						triFreqs.get(prefix).put(postfix, new CounterMap<String>());
+					}
+					triFreqs.get(prefix).get(postfix).add(word);
+				}
+			}
+		}
+
+		// write out model:
+		PrintWriter out = new PrintWriter(outputFilename);
+		for(String word : uniFreqs.keySet()){
+			out.print("UNI ");
+			out.print(word);
+			out.print(" = ");
+			out.print(uniFreqs.get(word));
+			out.println();
+			CounterMap<String> bigramMap = biFreqs.get(word);
+			if(bigramMap != null){
+				for(String word2 : bigramMap.keySet()){
+					out.print("BI ");
+					out.print(word);
+					out.print(" : ");
+					out.print(word2);
+					out.print(" = ");
+					out.print(bigramMap.get(word2));
+					out.println();
+					CounterMap<String> trigramMap = triFreqs.get(word).get(word2);
+					if(trigramMap != null){
+						for(String word3 : trigramMap.keySet()){
+							out.print("TRI ");
+							out.print(word);
+							out.print(" ");
+							out.print(word2);
+							out.print(" : ");
+							out.print(word3);
+							out.print(" = ");
+							out.print(trigramMap.get(word3));
+							out.println();
+						}
+					}
+				}
+			}
+		}
+	}
+	
+	public static void main(String[] args) throws UIMAException, IOException{
+		if(args.length < 2){
+			System.err.println("Required arguments: <input directory> <output file>");
+			System.exit(-1);
+		}
+		
+		trainModel(args[0], args[1]);
+	}
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestAlignment.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestAlignment.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestAlignment.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestAlignment.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,50 @@
+package org.apache.ctakes.spelling.test;
+
+import org.apache.ctakes.spelling.util.Alignment;
+import org.apache.ctakes.spelling.util.StringAlignment;
+import org.junit.Test;
+
+public class TestAlignment {
+
+	@Test
+	public void testGetOptimalAlignment() {
+		String s1 = "asdf";
+		String s2 = "addf";
+		Alignment align = StringAlignment.getOptimalAlignment(s1, s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 1;
+		
+		s2 = "asdf";
+		align = StringAlignment.getOptimalAlignment(s1,  s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 0;
+		
+		s2 = "addg";
+		align = StringAlignment.getOptimalAlignment(s1, s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 2;
+		
+		s2 = "adf";
+		align = StringAlignment.getOptimalAlignment(s1, s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 1;
+		assert align.getS2().toString().equals("a df");
+		
+		s2 = "qsdf";
+		align = StringAlignment.getOptimalAlignment(s1, s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 1;
+		
+		s2 = "asdq";
+		align = StringAlignment.getOptimalAlignment(s1, s2);
+		System.out.println(align.getS1());
+		System.out.println(align.getS2());
+		assert align.getScore() == 1;
+	}
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestBrillTraining.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestBrillTraining.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestBrillTraining.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/test/TestBrillTraining.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,20 @@
+package org.apache.ctakes.spelling.test;
+
+import org.apache.ctakes.spelling.mistakes.TrainBrillModel;
+import org.apache.ctakes.utils.struct.CounterMap;
+import org.junit.Test;
+
+public class TestBrillTraining {
+
+	@Test
+	public void testTrain() {
+		CounterMap<String> map = new CounterMap<String>();
+		String s1 = "actual";
+		String s2 = "akgsual";
+		TrainBrillModel.collectExamples(s1, s2, map);
+		for(String key : map.keySet()){
+			System.out.println(key + " => " + map.get(key));
+		}
+	}
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/Alignment.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/Alignment.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/Alignment.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/Alignment.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,25 @@
+package org.apache.ctakes.spelling.util;
+
+public class Alignment {
+	int score;
+	StringBuffer s1;
+	StringBuffer s2;
+	
+	public Alignment(StringBuffer s1, StringBuffer s2, int score){
+		this.s1 = s1;
+		this.s2 = s2;
+		this.score = score;
+	}
+	
+	public StringBuffer getS1(){
+		return s1;
+	}
+	
+	public StringBuffer getS2(){
+		return s2;
+	}
+	
+	public int getScore(){
+		return score;
+	}
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/JaspellTernarySearchTrie.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/JaspellTernarySearchTrie.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/JaspellTernarySearchTrie.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/JaspellTernarySearchTrie.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,890 @@
+package org.apache.ctakes.spelling.util;
+
+/** 
+ * Copyright (c) 2005 Bruno Martins
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions 
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright 
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the organization nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ * 
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+ * THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.List;
+import java.util.Locale;
+import java.util.Vector;
+import java.util.zip.GZIPInputStream;
+
+import org.apache.lucene.util.IOUtils;
+
+/**
+ * Implementation of a Ternary Search Trie, a data structure for storing
+ * <code>String</code> objects that combines the compact size of a binary search
+ * tree with the speed of a digital search trie, and is therefore ideal for
+ * practical use in sorting and searching data.</p>
+ * <p>
+ * 
+ * This data structure is faster than hashing for many typical search problems,
+ * and supports a broader range of useful problems and operations. Ternary
+ * searches are faster than hashing and more powerful, too.
+ * </p>
+ * <p>
+ * 
+ * The theory of ternary search trees was described at a symposium in 1997 (see
+ * "Fast Algorithms for Sorting and Searching Strings," by J.L. Bentley and R.
+ * Sedgewick, Proceedings of the 8th Annual ACM-SIAM Symposium on Discrete
+ * Algorithms, January 1997). Algorithms in C, Third Edition, by Robert
+ * Sedgewick (Addison-Wesley, 1998) provides yet another view of ternary search
+ * trees.
+ */
+public class JaspellTernarySearchTrie {
+
+  /**
+   * An inner class of Ternary Search Trie that represents a node in the trie.
+   */
+  protected final class TSTNode {
+
+    /** Index values for accessing relatives array. */
+    protected final static int PARENT = 0, LOKID = 1, EQKID = 2, HIKID = 3;
+
+    /** The key to the node. */
+    protected Object data;
+
+    /** The relative nodes. */
+    protected TSTNode[] relatives = new TSTNode[4];
+
+    /** The char used in the split. */
+    protected char splitchar;
+
+    /**
+     * Constructor method.
+     * 
+     *@param splitchar
+     *          The char used in the split.
+     *@param parent
+     *          The parent node.
+     */
+    protected TSTNode(char splitchar, TSTNode parent) {
+      this.splitchar = splitchar;
+      relatives[PARENT] = parent;
+    }
+  }
+
+  /**
+   * Compares characters by alfabetical order.
+   * 
+   *@param cCompare2
+   *          The first char in the comparison.
+   *@param cRef
+   *          The second char in the comparison.
+   *@return A negative number, 0 or a positive number if the second char is
+   *         less, equal or greater.
+   */
+  private static int compareCharsAlphabetically(char cCompare2, char cRef) {
+    return Character.toLowerCase(cCompare2) - Character.toLowerCase(cRef);
+  }
+  
+  /* what follows is the original Jaspell code. 
+  private static int compareCharsAlphabetically(int cCompare2, int cRef) {
+    int cCompare = 0;
+    if (cCompare2 >= 65) {
+      if (cCompare2 < 89) {
+        cCompare = (2 * cCompare2) - 65;
+      } else if (cCompare2 < 97) {
+        cCompare = cCompare2 + 24;
+      } else if (cCompare2 < 121) {
+        cCompare = (2 * cCompare2) - 128;
+      } else cCompare = cCompare2;
+    } else cCompare = cCompare2;
+    if (cRef < 65) {
+      return cCompare - cRef;
+    }
+    if (cRef < 89) {
+      return cCompare - ((2 * cRef) - 65);
+    }
+    if (cRef < 97) {
+      return cCompare - (cRef + 24);
+    }
+    if (cRef < 121) {
+      return cCompare - ((2 * cRef) - 128);
+    }
+    return cCompare - cRef;
+  }
+  */
+
+  /**
+   * The default number of values returned by the <code>matchAlmost</code>
+   * method.
+   */
+  private int defaultNumReturnValues = -1;
+
+  /**
+   * the number of differences allowed in a call to the
+   * <code>matchAlmostKey</code> method.
+   */
+  private int matchAlmostDiff;
+
+  /** The base node in the trie. */
+  private TSTNode rootNode;
+  
+  private final Locale locale;
+
+  /**
+   * Constructs an empty Ternary Search Trie.
+   */
+  public JaspellTernarySearchTrie() {
+    this(Locale.ROOT);
+  }
+  
+  /**
+   * Constructs an empty Ternary Search Trie,
+   * specifying the Locale used for lowercasing.
+   */
+  public JaspellTernarySearchTrie(Locale locale) {
+    this.locale = locale;
+  }
+  
+  // for loading
+  void setRoot(TSTNode newRoot) {
+    rootNode = newRoot;
+  }
+  
+  // for saving
+  TSTNode getRoot() {
+    return rootNode;
+  }
+
+  /**
+   * Constructs a Ternary Search Trie and loads data from a <code>File</code>
+   * into the Trie. The file is a normal text document, where each line is of
+   * the form word TAB float.
+   * 
+   *@param file
+   *          The <code>File</code> with the data to load into the Trie.
+   *@exception IOException
+   *              A problem occured while reading the data.
+   */
+  public JaspellTernarySearchTrie(File file) throws IOException {
+    this(file, false);
+  }
+
+  /**
+   * Constructs a Ternary Search Trie and loads data from a <code>File</code>
+   * into the Trie. The file is a normal text document, where each line is of
+   * the form "word TAB float".
+   * 
+   *@param file
+   *          The <code>File</code> with the data to load into the Trie.
+   *@param compression
+   *          If true, the file is compressed with the GZIP algorithm, and if
+   *          false, the file is a normal text document.
+   *@exception IOException
+   *              A problem occured while reading the data.
+   */
+  public JaspellTernarySearchTrie(File file, boolean compression)
+          throws IOException {
+    this();
+    BufferedReader in;
+    if (compression)
+      in = new BufferedReader(IOUtils.getDecodingReader(new GZIPInputStream(
+              new FileInputStream(file)), IOUtils.CHARSET_UTF_8));
+    else in = new BufferedReader(IOUtils.getDecodingReader((new FileInputStream(
+            file)), IOUtils.CHARSET_UTF_8));
+    String word;
+    int pos;
+    Float occur, one = new Float(1);
+    int numWords = 0;
+    while ((word = in.readLine()) != null) {
+      numWords++;
+      pos = word.indexOf("\t");
+      occur = one;
+      if (pos != -1) {
+        occur = Float.parseFloat(word.substring(pos + 1).trim());
+        word = word.substring(0, pos);
+      }
+      String key = word.toLowerCase(locale);
+      if (rootNode == null) {
+        rootNode = new TSTNode(key.charAt(0), null);
+      }
+      TSTNode node = null;
+      if (key.length() > 0 && rootNode != null) {
+        TSTNode currentNode = rootNode;
+        int charIndex = 0;
+        while (true) {
+          if (currentNode == null) break;
+          int charComp = compareCharsAlphabetically(key.charAt(charIndex),
+                  currentNode.splitchar);
+          if (charComp == 0) {
+            charIndex++;
+            if (charIndex == key.length()) {
+              node = currentNode;
+              break;
+            }
+            currentNode = currentNode.relatives[TSTNode.EQKID];
+          } else if (charComp < 0) {
+            currentNode = currentNode.relatives[TSTNode.LOKID];
+          } else {
+            currentNode = currentNode.relatives[TSTNode.HIKID];
+          }
+        }
+        Float occur2 = null;
+        if (node != null) occur2 = ((Float) (node.data));
+        if (occur2 != null) {
+          occur += occur2.floatValue();
+        }
+        currentNode = getOrCreateNode(word.trim().toLowerCase(locale));
+        currentNode.data = occur;
+      }
+    }
+    in.close();
+  }
+
+  /**
+   * Deletes the node passed in as an argument. If this node has non-null data,
+   * then both the node and the data will be deleted. It also deletes any other
+   * nodes in the trie that are no longer needed after the deletion of the node.
+   * 
+   *@param nodeToDelete
+   *          The node to delete.
+   */
+  private void deleteNode(TSTNode nodeToDelete) {
+    if (nodeToDelete == null) {
+      return;
+    }
+    nodeToDelete.data = null;
+    while (nodeToDelete != null) {
+      nodeToDelete = deleteNodeRecursion(nodeToDelete);
+      // deleteNodeRecursion(nodeToDelete);
+    }
+  }
+
+  /**
+   * Recursively visits each node to be deleted.
+   * 
+   * To delete a node, first set its data to null, then pass it into this
+   * method, then pass the node returned by this method into this method (make
+   * sure you don't delete the data of any of the nodes returned from this
+   * method!) and continue in this fashion until the node returned by this
+   * method is <code>null</code>.
+   * 
+   * The TSTNode instance returned by this method will be next node to be
+   * operated on by <code>deleteNodeRecursion</code> (This emulates recursive
+   * method call while avoiding the JVM overhead normally associated with a
+   * recursive method.)
+   * 
+   *@param currentNode
+   *          The node to delete.
+   *@return The next node to be called in deleteNodeRecursion.
+   */
+  private TSTNode deleteNodeRecursion(TSTNode currentNode) {
+    if (currentNode == null) {
+      return null;
+    }
+    if (currentNode.relatives[TSTNode.EQKID] != null
+            || currentNode.data != null) {
+      return null;
+    }
+    // can't delete this node if it has a non-null eq kid or data
+    TSTNode currentParent = currentNode.relatives[TSTNode.PARENT];
+    boolean lokidNull = currentNode.relatives[TSTNode.LOKID] == null;
+    boolean hikidNull = currentNode.relatives[TSTNode.HIKID] == null;
+    int childType;
+    if (currentParent.relatives[TSTNode.LOKID] == currentNode) {
+      childType = TSTNode.LOKID;
+    } else if (currentParent.relatives[TSTNode.EQKID] == currentNode) {
+      childType = TSTNode.EQKID;
+    } else if (currentParent.relatives[TSTNode.HIKID] == currentNode) {
+      childType = TSTNode.HIKID;
+    } else {
+      rootNode = null;
+      return null;
+    }
+    if (lokidNull && hikidNull) {
+      currentParent.relatives[childType] = null;
+      return currentParent;
+    }
+    if (lokidNull) {
+      currentParent.relatives[childType] = currentNode.relatives[TSTNode.HIKID];
+      currentNode.relatives[TSTNode.HIKID].relatives[TSTNode.PARENT] = currentParent;
+      return currentParent;
+    }
+    if (hikidNull) {
+      currentParent.relatives[childType] = currentNode.relatives[TSTNode.LOKID];
+      currentNode.relatives[TSTNode.LOKID].relatives[TSTNode.PARENT] = currentParent;
+      return currentParent;
+    }
+    int deltaHi = currentNode.relatives[TSTNode.HIKID].splitchar
+            - currentNode.splitchar;
+    int deltaLo = currentNode.splitchar
+            - currentNode.relatives[TSTNode.LOKID].splitchar;
+    int movingKid;
+    TSTNode targetNode;
+    if (deltaHi == deltaLo) {
+      if (Math.random() < 0.5) {
+        deltaHi++;
+      } else {
+        deltaLo++;
+      }
+    }
+    if (deltaHi > deltaLo) {
+      movingKid = TSTNode.HIKID;
+      targetNode = currentNode.relatives[TSTNode.LOKID];
+    } else {
+      movingKid = TSTNode.LOKID;
+      targetNode = currentNode.relatives[TSTNode.HIKID];
+    }
+    while (targetNode.relatives[movingKid] != null) {
+      targetNode = targetNode.relatives[movingKid];
+    }
+    targetNode.relatives[movingKid] = currentNode.relatives[movingKid];
+    currentParent.relatives[childType] = targetNode;
+    targetNode.relatives[TSTNode.PARENT] = currentParent;
+    if (!lokidNull) {
+      currentNode.relatives[TSTNode.LOKID] = null;
+    }
+    if (!hikidNull) {
+      currentNode.relatives[TSTNode.HIKID] = null;
+    }
+    return currentParent;
+  }
+
+  /**
+   * Retrieve the object indexed by a key.
+   * 
+   *@param key
+   *          A <code>String</code> index.
+   *@return The object retrieved from the Ternary Search Trie.
+   */
+  public Object get(CharSequence key) {
+    TSTNode node = getNode(key);
+    if (node == null) {
+      return null;
+    }
+    return node.data;
+  }
+
+  /**
+   * Retrieve the <code>Float</code> indexed by key, increment it by one unit
+   * and store the new <code>Float</code>.
+   * 
+   *@param key
+   *          A <code>String</code> index.
+   *@return The <code>Float</code> retrieved from the Ternary Search Trie.
+   */
+  public Float getAndIncrement(String key) {
+    String key2 = key.trim().toLowerCase(locale);
+    TSTNode node = getNode(key2);
+    if (node == null) {
+      return null;
+    }
+    Float aux = (Float) (node.data);
+    if (aux == null) {
+      aux = new Float(1);
+    } else {
+      aux = new Float(aux.intValue() + 1);
+    }
+    put(key2, aux);
+    return aux;
+  }
+
+  /**
+   * Returns the key that indexes the node argument.
+   * 
+   *@param node
+   *          The node whose index is to be calculated.
+   *@return The <code>String</code> that indexes the node argument.
+   */
+  protected String getKey(TSTNode node) {
+    StringBuffer getKeyBuffer = new StringBuffer();
+    getKeyBuffer.setLength(0);
+    getKeyBuffer.append("" + node.splitchar);
+    TSTNode currentNode;
+    TSTNode lastNode;
+    currentNode = node.relatives[TSTNode.PARENT];
+    lastNode = node;
+    while (currentNode != null) {
+      if (currentNode.relatives[TSTNode.EQKID] == lastNode) {
+        getKeyBuffer.append("" + currentNode.splitchar);
+      }
+      lastNode = currentNode;
+      currentNode = currentNode.relatives[TSTNode.PARENT];
+    }
+    getKeyBuffer.reverse();
+    return getKeyBuffer.toString();
+  }
+
+  /**
+   * Returns the node indexed by key, or <code>null</code> if that node doesn't
+   * exist. Search begins at root node.
+   * 
+   *@param key
+   *          A <code>String</code> that indexes the node that is returned.
+   *@return The node object indexed by key. This object is an instance of an
+   *         inner class named <code>TernarySearchTrie.TSTNode</code>.
+   */
+  public TSTNode getNode(CharSequence key) {
+    return getNode(key, rootNode);
+  }
+
+  /**
+   * Returns the node indexed by key, or <code>null</code> if that node doesn't
+   * exist. The search begins at root node.
+   * 
+   *@param key
+   *          A <code>String</code> that indexes the node that is returned.
+   *@param startNode
+   *          The top node defining the subtrie to be searched.
+   *@return The node object indexed by key. This object is an instance of an
+   *         inner class named <code>TernarySearchTrie.TSTNode</code>.
+   */
+  protected TSTNode getNode(CharSequence key, TSTNode startNode) {
+    if (key == null || startNode == null || key.length() == 0) {
+      return null;
+    }
+    TSTNode currentNode = startNode;
+    int charIndex = 0;
+    while (true) {
+      if (currentNode == null) {
+        return null;
+      }
+      int charComp = compareCharsAlphabetically(key.charAt(charIndex),
+              currentNode.splitchar);
+      if (charComp == 0) {
+        charIndex++;
+        if (charIndex == key.length()) {
+          return currentNode;
+        }
+        currentNode = currentNode.relatives[TSTNode.EQKID];
+      } else if (charComp < 0) {
+        currentNode = currentNode.relatives[TSTNode.LOKID];
+      } else {
+        currentNode = currentNode.relatives[TSTNode.HIKID];
+      }
+    }
+  }
+
+  /**
+   * Returns the node indexed by key, creating that node if it doesn't exist,
+   * and creating any required intermediate nodes if they don't exist.
+   * 
+   *@param key
+   *          A <code>String</code> that indexes the node that is returned.
+   *@return The node object indexed by key. This object is an instance of an
+   *         inner class named <code>TernarySearchTrie.TSTNode</code>.
+   *@exception NullPointerException
+   *              If the key is <code>null</code>.
+   *@exception IllegalArgumentException
+   *              If the key is an empty <code>String</code>.
+   */
+  protected TSTNode getOrCreateNode(CharSequence key) throws NullPointerException,
+          IllegalArgumentException {
+    if (key == null) {
+      throw new NullPointerException(
+              "attempt to get or create node with null key");
+    }
+    if (key.length() == 0) {
+      throw new IllegalArgumentException(
+              "attempt to get or create node with key of zero length");
+    }
+    if (rootNode == null) {
+      rootNode = new TSTNode(key.charAt(0), null);
+    }
+    TSTNode currentNode = rootNode;
+    int charIndex = 0;
+    while (true) {
+      int charComp = compareCharsAlphabetically(key.charAt(charIndex),
+              currentNode.splitchar);
+      if (charComp == 0) {
+        charIndex++;
+        if (charIndex == key.length()) {
+          return currentNode;
+        }
+        if (currentNode.relatives[TSTNode.EQKID] == null) {
+          currentNode.relatives[TSTNode.EQKID] = new TSTNode(key
+                  .charAt(charIndex), currentNode);
+        }
+        currentNode = currentNode.relatives[TSTNode.EQKID];
+      } else if (charComp < 0) {
+        if (currentNode.relatives[TSTNode.LOKID] == null) {
+          currentNode.relatives[TSTNode.LOKID] = new TSTNode(key
+                  .charAt(charIndex), currentNode);
+        }
+        currentNode = currentNode.relatives[TSTNode.LOKID];
+      } else {
+        if (currentNode.relatives[TSTNode.HIKID] == null) {
+          currentNode.relatives[TSTNode.HIKID] = new TSTNode(key
+                  .charAt(charIndex), currentNode);
+        }
+        currentNode = currentNode.relatives[TSTNode.HIKID];
+      }
+    }
+  }
+
+  /**
+   * Returns a <code>List</code> of keys that almost match the argument key.
+   * Keys returned will have exactly diff characters that do not match the
+   * target key, where diff is equal to the last value passed in as an argument
+   * to the <code>setMatchAlmostDiff</code> method.
+   * <p>
+   * If the <code>matchAlmost</code> method is called before the
+   * <code>setMatchAlmostDiff</code> method has been called for the first time,
+   * then diff = 0.
+   * 
+   *@param key
+   *          The target key.
+   *@return A <code>List</code> with the results.
+   */
+  public List<String> matchAlmost(String key) {
+    return matchAlmost(key, defaultNumReturnValues);
+  }
+
+  /**
+   * Returns a <code>List</code> of keys that almost match the argument key.
+   * Keys returned will have exactly diff characters that do not match the
+   * target key, where diff is equal to the last value passed in as an argument
+   * to the <code>setMatchAlmostDiff</code> method.
+   * <p>
+   * If the <code>matchAlmost</code> method is called before the
+   * <code>setMatchAlmostDiff</code> method has been called for the first time,
+   * then diff = 0.
+   * 
+   *@param key
+   *          The target key.
+   *@param numReturnValues
+   *          The maximum number of values returned by this method.
+   *@return A <code>List</code> with the results
+   */
+  public List<String> matchAlmost(CharSequence key, int numReturnValues) {
+    return matchAlmostRecursion(rootNode, 0, matchAlmostDiff, key,
+            ((numReturnValues < 0) ? -1 : numReturnValues), new Vector<String>(), false);
+  }
+
+  /**
+   * Recursivelly vists the nodes in order to find the ones that almost match a
+   * given key.
+   * 
+   *@param currentNode
+   *          The current node.
+   *@param charIndex
+   *          The current char.
+   *@param d
+   *          The number of differences so far.
+   *@param matchAlmostNumReturnValues
+   *          The maximum number of values in the result <code>List</code>.
+   *@param matchAlmostResult2
+   *          The results so far.
+   *@param upTo
+   *          If true all keys having up to and including matchAlmostDiff
+   *          mismatched letters will be included in the result (including a key
+   *          that is exactly the same as the target string) otherwise keys will
+   *          be included in the result only if they have exactly
+   *          matchAlmostDiff number of mismatched letters.
+   *@param matchAlmostKey
+   *          The key being searched.
+   *@return A <code>List</code> with the results.
+   */
+  private List<String> matchAlmostRecursion(TSTNode currentNode, int charIndex,
+          int d, CharSequence matchAlmostKey, int matchAlmostNumReturnValues,
+          List<String> matchAlmostResult2, boolean upTo) {
+    if ((currentNode == null)
+            || (matchAlmostNumReturnValues != -1 && matchAlmostResult2.size() >= matchAlmostNumReturnValues)
+            || (d < 0) || (charIndex >= matchAlmostKey.length())) {
+      return matchAlmostResult2;
+    }
+    List<String> matchAlmostResult = matchAlmostResult2;
+
+    
+    int charComp = compareCharsAlphabetically(matchAlmostKey.charAt(charIndex),
+            currentNode.splitchar);
+
+    
+    if(charIndex + 1 < matchAlmostKey.length() && 
+        compareCharsAlphabetically(matchAlmostKey.charAt(charIndex+1), currentNode.splitchar) == 0
+/*        compareCharsAlphabetically(matchAlmostKey )*/){
+      // delete the current character:
+      matchAlmostResult = matchAlmostRecursion(
+          currentNode, charIndex+1, d-1, 
+          matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo);
+    }
+    
+    
+    if ((d > 0) || (charComp < 0)) {
+      matchAlmostResult = matchAlmostRecursion(
+              currentNode.relatives[TSTNode.LOKID], charIndex, d,
+              matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
+              upTo);
+    }
+    int nextD = (charComp == 0) ? d : d - 1;
+    boolean cond = (upTo) ? (nextD >= 0) : (nextD == 0);
+    if ((matchAlmostKey.length() == charIndex + 1) && cond
+            && (currentNode.data != null)) {
+      matchAlmostResult.add(getKey(currentNode));
+    }
+    matchAlmostResult = matchAlmostRecursion(
+            currentNode.relatives[TSTNode.EQKID], charIndex + 1, nextD,
+            matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult, upTo);
+    if ((d > 0) || (charComp > 0)) {
+      matchAlmostResult = matchAlmostRecursion(
+              currentNode.relatives[TSTNode.HIKID], charIndex, d,
+              matchAlmostKey, matchAlmostNumReturnValues, matchAlmostResult,
+              upTo);
+    }
+    return matchAlmostResult;
+  }
+
+  /**
+   * Returns an alphabetical <code>List</code> of all keys in the trie that
+   * begin with a given prefix. Only keys for nodes having non-null data are
+   * included in the <code>List</code>.
+   * 
+   *@param prefix
+   *          Each key returned from this method will begin with the characters
+   *          in prefix.
+   *@return A <code>List</code> with the results.
+   */
+  public List<String> matchPrefix(String prefix) {
+    return matchPrefix(prefix, defaultNumReturnValues);
+  }
+
+  /**
+   * Returns an alphabetical <code>List</code> of all keys in the trie that
+   * begin with a given prefix. Only keys for nodes having non-null data are
+   * included in the <code>List</code>.
+   * 
+   *@param prefix
+   *          Each key returned from this method will begin with the characters
+   *          in prefix.
+   *@param numReturnValues
+   *          The maximum number of values returned from this method.
+   *@return A <code>List</code> with the results
+   */
+  public List<String> matchPrefix(CharSequence prefix, int numReturnValues) {
+    Vector<String> sortKeysResult = new Vector<String>();
+    TSTNode startNode = getNode(prefix);
+    if (startNode == null) {
+      return sortKeysResult;
+    }
+    if (startNode.data != null) {
+      sortKeysResult.addElement(getKey(startNode));
+    }
+    return sortKeysRecursion(startNode.relatives[TSTNode.EQKID],
+            ((numReturnValues < 0) ? -1 : numReturnValues), sortKeysResult);
+  }
+
+  /**
+   * Returns the number of nodes in the trie that have non-null data.
+   * 
+   *@return The number of nodes in the trie that have non-null data.
+   */
+  public int numDataNodes() {
+    return numDataNodes(rootNode);
+  }
+
+  /**
+   * Returns the number of nodes in the subtrie below and including the starting
+   * node. The method counts only nodes that have non-null data.
+   * 
+   *@param startingNode
+   *          The top node of the subtrie. the node that defines the subtrie.
+   *@return The total number of nodes in the subtrie.
+   */
+  protected int numDataNodes(TSTNode startingNode) {
+    return recursiveNodeCalculator(startingNode, true, 0);
+  }
+
+  /**
+   * Returns the total number of nodes in the trie. The method counts nodes
+   * whether or not they have data.
+   * 
+   *@return The total number of nodes in the trie.
+   */
+  public int numNodes() {
+    return numNodes(rootNode);
+  }
+
+  /**
+   * Returns the total number of nodes in the subtrie below and including the
+   * starting Node. The method counts nodes whether or not they have data.
+   * 
+   *@param startingNode
+   *          The top node of the subtrie. The node that defines the subtrie.
+   *@return The total number of nodes in the subtrie.
+   */
+  protected int numNodes(TSTNode startingNode) {
+    return recursiveNodeCalculator(startingNode, false, 0);
+  }
+
+  /**
+   * Stores a value in the trie. The value may be retrieved using the key.
+   * 
+   *@param key
+   *          A <code>String</code> that indexes the object to be stored.
+   *@param value
+   *          The object to be stored in the Trie.
+   */
+  public void put(CharSequence key, Object value) {
+    getOrCreateNode(key).data = value;
+  }
+
+  /**
+   * Recursivelly visists each node to calculate the number of nodes.
+   * 
+   *@param currentNode
+   *          The current node.
+   *@param checkData
+   *          If true we check the data to be different of <code>null</code>.
+   *@param numNodes2
+   *          The number of nodes so far.
+   *@return The number of nodes accounted.
+   */
+  private int recursiveNodeCalculator(TSTNode currentNode, boolean checkData,
+          int numNodes2) {
+    if (currentNode == null) {
+      return numNodes2;
+    }
+    int numNodes = recursiveNodeCalculator(
+            currentNode.relatives[TSTNode.LOKID], checkData, numNodes2);
+    numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.EQKID],
+            checkData, numNodes);
+    numNodes = recursiveNodeCalculator(currentNode.relatives[TSTNode.HIKID],
+            checkData, numNodes);
+    if (checkData) {
+      if (currentNode.data != null) {
+        numNodes++;
+      }
+    } else {
+      numNodes++;
+    }
+    return numNodes;
+  }
+
+  /**
+   * Removes the value indexed by key. Also removes all nodes that are rendered
+   * unnecessary by the removal of this data.
+   * 
+   *@param key
+   *          A <code>string</code> that indexes the object to be removed from
+   *          the Trie.
+   */
+  public void remove(String key) {
+    deleteNode(getNode(key.trim().toLowerCase(locale)));
+  }
+
+  /**
+   * Sets the number of characters by which words can differ from target word
+   * when calling the <code>matchAlmost</code> method.
+   * <p>
+   * Arguments less than 0 will set the char difference to 0, and arguments
+   * greater than 3 will set the char difference to 3.
+   * 
+   *@param diff
+   *          The number of characters by which words can differ from target
+   *          word.
+   */
+  public void setMatchAlmostDiff(int diff) {
+    if (diff < 0) {
+      matchAlmostDiff = 0;
+    } else if (diff > 3) {
+      matchAlmostDiff = 3;
+    } else {
+      matchAlmostDiff = diff;
+    }
+  }
+
+  /**
+   * Sets the default maximum number of values returned from the
+   * <code>matchPrefix</code> and <code>matchAlmost</code> methods.
+   * <p>
+   * The value should be set this to -1 to get an unlimited number of return
+   * values. note that the methods mentioned above provide overloaded versions
+   * that allow you to specify the maximum number of return values, in which
+   * case this value is temporarily overridden.
+   * 
+   **@param num
+   *          The number of values that will be returned when calling the
+   *          methods above.
+   */
+  public void setNumReturnValues(int num) {
+    defaultNumReturnValues = (num < 0) ? -1 : num;
+  }
+
+  /**
+   * Returns keys sorted in alphabetical order. This includes the start Node and
+   * all nodes connected to the start Node.
+   * <p>
+   * The number of keys returned is limited to numReturnValues. To get a list
+   * that isn't limited in size, set numReturnValues to -1.
+   * 
+   *@param startNode
+   *          The top node defining the subtrie to be searched.
+   *@param numReturnValues
+   *          The maximum number of values returned from this method.
+   *@return A <code>List</code> with the results.
+   */
+  protected List<String> sortKeys(TSTNode startNode, int numReturnValues) {
+    return sortKeysRecursion(startNode, ((numReturnValues < 0) ? -1
+            : numReturnValues), new Vector<String>());
+  }
+
+  /**
+   * Returns keys sorted in alphabetical order. This includes the current Node
+   * and all nodes connected to the current Node.
+   * <p>
+   * Sorted keys will be appended to the end of the resulting <code>List</code>.
+   * The result may be empty when this method is invoked, but may not be
+   * <code>null</code>.
+   * 
+   *@param currentNode
+   *          The current node.
+   *@param sortKeysNumReturnValues
+   *          The maximum number of values in the result.
+   *@param sortKeysResult2
+   *          The results so far.
+   *@return A <code>List</code> with the results.
+   */
+  private List<String> sortKeysRecursion(TSTNode currentNode,
+          int sortKeysNumReturnValues, List<String> sortKeysResult2) {
+    if (currentNode == null) {
+      return sortKeysResult2;
+    }
+    List<String> sortKeysResult = sortKeysRecursion(
+            currentNode.relatives[TSTNode.LOKID], sortKeysNumReturnValues,
+            sortKeysResult2);
+    if (sortKeysNumReturnValues != -1
+            && sortKeysResult.size() >= sortKeysNumReturnValues) {
+      return sortKeysResult;
+    }
+    if (currentNode.data != null) {
+      sortKeysResult.add(getKey(currentNode));
+    }
+    sortKeysResult = sortKeysRecursion(currentNode.relatives[TSTNode.EQKID],
+            sortKeysNumReturnValues, sortKeysResult);
+    return sortKeysRecursion(currentNode.relatives[TSTNode.HIKID],
+            sortKeysNumReturnValues, sortKeysResult);
+  }
+
+}

Added: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/StringAlignment.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/StringAlignment.java?rev=1495809&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/StringAlignment.java (added)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/util/StringAlignment.java Sun Jun 23 11:34:28 2013
@@ -0,0 +1,75 @@
+package org.apache.ctakes.spelling.util;
+
+public class StringAlignment {
+	public static Alignment getOptimalAlignment(String s1, String s2){
+		s1 = s1.toLowerCase();
+		s2 = s2.toLowerCase();
+		Alignment align = null; //new Alignment();
+		StringBuffer buff1 = new StringBuffer();
+		StringBuffer buff2 = new StringBuffer();
+		
+		Cell[][] table = new Cell[s1.length()+1][s2.length()+1];
+		int i, j=0;
+		for(i = 0; i < s1.length()+1; i++){
+			for(j = 0; j < s2.length()+1; j++){
+				if(i == 0 && j == 0){
+					table[i][j] = new Cell(0, null);
+				}else if(j == 0){
+					table[i][j] = new Cell(i, table[i-1][j]);
+				}else if(i == 0){
+					table[i][j] = new Cell(j, table[i][j-1]);					
+				}else{
+					int alignScore = table[i-1][j-1].getValue() +
+							(s1.charAt(i-1) == s2.charAt(j-1) ? 0 : 1);
+					int insertScore = table[i-1][j].getValue() + 1;
+					int deleteScore = table[i][j-1].getValue() + 1;
+					if(alignScore <= insertScore && alignScore <= deleteScore){
+						table[i][j] = new Cell(alignScore, table[i-1][j-1]);
+					}else if(insertScore <= deleteScore){
+						table[i][j] = new Cell(insertScore, table[i-1][j]);
+					}else{
+						table[i][j] = new Cell(deleteScore, table[i][j-1]);
+					}
+				}
+			}
+		}
+		// undo last increment
+		i--; j--;
+		int score = table[i][j].value;
+		
+		// work backwards from last cell
+//		Cell currentCell = table[i][j];
+		while(i > 0 && j > 0){
+			if(i == 0 || table[i][j].back == table[i][j-1]){
+				buff1.append(' ');
+				buff2.append(s2.charAt(j-1));
+				j--;
+			}else if(j == 0 || table[i][j].back == table[i-1][j]){
+				buff1.append(s1.charAt(i-1));
+				buff2.append(' ');
+				i--;
+			}else{
+				buff1.append(s1.charAt(i-1));
+				buff2.append(s2.charAt(j-1));
+				i--;
+				j--;
+			}
+		}
+		align = new Alignment(buff1.reverse(), buff2.reverse(), score);
+		return align;
+	}
+}
+
+class Cell{
+	Cell back = null;
+	int value = 0;
+	
+	public Cell(int value, Cell backPointer){
+		this.value = value;
+		this.back = backPointer;
+	}
+	
+	public int getValue(){
+		return value;
+	}
+}



Mime
View raw message