ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1463246 - /ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
Date Mon, 01 Apr 2013 18:40:03 GMT
Author: tmill
Date: Mon Apr  1 18:40:02 2013
New Revision: 1463246

URL: http://svn.apache.org/r1463246
Log:
ctakes-58: Changed to use uimafit JCasIterable.

Modified:
    ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java

Modified: ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java?rev=1463246&r1=1463245&r2=1463246&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java (original)
+++ ctakes/sandbox/ctakes-spelling-corrector/src/org/apache/ctakes/spelling/priors/unigram/UnigramPriorGenerator.java Mon Apr  1 18:40:02 2013
@@ -3,28 +3,30 @@ package org.apache.ctakes.spelling.prior
 import java.io.File;
 import java.io.FilenameFilter;
 import java.io.IOException;
-import java.io.OutputStream;
 import java.io.PrintStream;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Scanner;
 
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.uima.UIMAException;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.cas.FSIterator;
+import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.factory.JCasFactory;
-import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.JCasIterable;
 import org.uimafit.pipeline.SimplePipeline;
 
 public class UnigramPriorGenerator {
 	static HashMap<String,Integer> wordCounts = new HashMap<String,Integer>();
 	static long numWords = 0;
-
+	static boolean printProbs = false;
+	
 	/**
 	 * @param args
 	 * @throws IOException 
@@ -37,44 +39,29 @@ public class UnigramPriorGenerator {
 			System.exit(-1);
 		}
 		
-		JCas jcas = null;
-		AnalysisEngine ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../core/desc/analysis_engine/AggregateAE.xml");
-		TypeSystemDescription types = TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("../common-type-system/desc/common_type_system.xml");
+//		JCas jcas = null;
+		AnalysisEngine ae = AnalysisEngineFactory.createAnalysisEngineFromPath("../ctakes-core/desc/analysis_engine/AggregateAE.xml");
+		CollectionReader reader = CollectionReaderFactory.createCollectionReader(FilesInDirectoryCollectionReader.class
+				, FilesInDirectoryCollectionReader.PARAM_INPUTDIR
+				, args[1]
+				, FilesInDirectoryCollectionReader.PARAM_RECURSE
+				, true
+				, FilesInDirectoryCollectionReader.PARAM_EXTENSIONS
+				, new String[]{"txt"}
+				);
 		
 		// iterate over directories passed in at command line
-//		for(String dirName : args){
-		for(int i = 1; i < args.length; i++){
-			String dirName = args[i];
-			File curDir = new File(dirName);
-			if(!curDir.exists() || !curDir.isDirectory()){
-				System.err.println("Error reading directory: " + dirName);
-				continue;
-			}
-			
-			// get all the text files in this directory
-			File[] txtFiles = curDir.listFiles(new FilenameFilter() {
-				@Override
-				public boolean accept(File arg0, String arg1) {
-					return arg1.endsWith(".txt") || !arg1.contains(".");
-				}
-			});
-			
-			// now tokenize each file
-			for(File f : txtFiles){
-				Scanner scanner = new Scanner(f);
-				StringBuilder buff = new StringBuilder();
-				while(scanner.hasNextLine()){
-					buff.append(scanner.nextLine());
-				}
-				jcas = JCasFactory.createJCas(types);
-				jcas.setDocumentText(buff.toString());
-				SimplePipeline.runPipeline(jcas, ae);
-				FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
-				while(iter.hasNext()){
-					BaseToken tok = (BaseToken) iter.next();
-					if(tok instanceof WordToken){
-						handleWord(tok.getCoveredText());
-					}
+		int numTokens = 0;
+		JCasIterable casIter = new JCasIterable(reader, ae);
+		while(casIter.hasNext()){
+			JCas jcas = casIter.next();
+			FSIterator iter = jcas.getAnnotationIndex(BaseToken.type).iterator();
+			while(iter.hasNext()){
+				BaseToken tok = (BaseToken) iter.next();
+				if(tok instanceof WordToken){
+					numTokens++;
+					handleWord(tok.getCoveredText());
+					if(numTokens % 1000000 == 0) writePriorModel(new PrintStream(args[0]));
 				}
 			}
 		}
@@ -126,10 +113,15 @@ public class UnigramPriorGenerator {
 	
 	private static void writePriorModel(PrintStream out){
 		for(Map.Entry<String,Integer> entry : wordCounts.entrySet()){
-			double prob = (double) entry.getValue() / numWords;
 			out.print(entry.getKey());
 			out.print(" : ");
-			out.println(prob);
+			if(printProbs){
+				double prob = (double) entry.getValue() / numWords;
+				out.println(prob);
+			}else{
+				out.println(entry.getValue());
+			}
 		}
+		out.close();
 	}
 }



Mime
View raw message