ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From james-mas...@apache.org
Subject svn commit: r1553321 - in /ctakes/trunk/ctakes-core/scripts: ./ groovy/ groovy/cTAKES-clinical-pipeline.groovy groovy/run_cTAKES.groovy
Date Tue, 24 Dec 2013 19:36:33 GMT
Author: james-masanz
Date: Tue Dec 24 19:36:33 2013
New Revision: 1553321

URL: http://svn.apache.org/r1553321
Log:
CTAKES-273 - these scripts checked in with this revision do not do automatic downloads; instead
they rely on a cTAKES convenience binary to have been already installed

Added:
    ctakes/trunk/ctakes-core/scripts/
    ctakes/trunk/ctakes-core/scripts/groovy/
    ctakes/trunk/ctakes-core/scripts/groovy/cTAKES-clinical-pipeline.groovy
    ctakes/trunk/ctakes-core/scripts/groovy/run_cTAKES.groovy

Added: ctakes/trunk/ctakes-core/scripts/groovy/cTAKES-clinical-pipeline.groovy
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/scripts/groovy/cTAKES-clinical-pipeline.groovy?rev=1553321&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/scripts/groovy/cTAKES-clinical-pipeline.groovy (added)
+++ ctakes/trunk/ctakes-core/scripts/groovy/cTAKES-clinical-pipeline.groovy Tue Dec 24 19:36:33 2013
@@ -0,0 +1,351 @@
+#!/usr/bin/env groovy
+
+/**
+** 	This script was not written to be run directly, although it could be if you 
+**  set up your classpath to include all the things needed (see below).
+**  The intent is to have something else set up a number of things first, 
+**  such as the classpath for running cTAKES, and then this script actually uses cTAKES.
+** 	This script assumes that 
+**   - you have installed Apache cTAKES. 
+**   - you have everything on your classpath needed to run cTAKES (see run_cTAKES.groovy)
+**   - you have installed Groovy 
+**   - the command groovy is available in your path.
+**  Arguments:
+**   - The name or full path of the directory containing the files to be processed by cTAKES
+**   - The name or full path of the directory where cTAKES is installed 
+** 	On Debian/Ubuntu systems, installing Groovy should be as easy as apt-get install groovy.
+** 	You can download groovy from http://groovy.codehaus.org/
+**/
+
+import java.io.File;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.factory.TypePrioritiesFactory;
+import static org.uimafit.util.JCasUtil.*;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.util.CtakesFileNamer;
+
+import org.cleartk.util.cr.FilesCollectionReader;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+
+
+def OUTPUT_DIR = "output-dir";
+def DEBUG = false;
+
+
+println("Starting " + this.class.getName());
+
+// this.class.classLoader.rootLoader.URLs.each{ println it } // Print out the classpath entries for debugging
+
+
+if (DEBUG) println("args.length: " + args.length);
+if (DEBUG) println("args = " + args);
+
+if (args.length!=2) {
+	println "Expect exactly 2 arguments - the input dir and the path to cTAKES"
+    println("args.length: " + args.length);
+    println("args = " + args);
+	System.exit(-1);
+}
+
+def inputDir = args[0];
+def cTAKES_HOME = args[1];
+
+
+println("Using cTAKES in " + cTAKES_HOME);
+
+println("Instantiating collection reader");
+CollectionReader collectionReader = FilesCollectionReader.getCollectionReader(inputDir);
+
+
+//Build the pipeline to run
+// we assume cTAKES' desc directory is on classpath for those 
+// such as ctakes-assertion/desc/AssertionMiniPipelineAnalysisEngine that we 
+// reference by descriptor (XML file) name
+println("Building pipeline aggregate builder object");
+
+AggregateBuilder aggregateBuilder = new AggregateBuilder();
+
+// Here is the flow from AggregatePlaintextUMLSProcessor.xml in 3.1.1
+/*
+    <node>SimpleSegmentAnnotator</node>
+    <node>SentenceDetectorAnnotator</node>
+    <node>TokenizerAnnotator</node>
+    <node>LvgAnnotator</node>
+    <node>ContextDependentTokenizerAnnotator</node>
+    <node>POSTagger</node>
+    <node>Chunker</node>
+    <node>AdjustNounPhraseToIncludeFollowingNP</node>
+    <node>AdjustNounPhraseToIncludeFollowingPPNP</node>
+    <node>LookupWindowAnnotator</node>
+    <node>DictionaryLookupAnnotatorDB</node>
+    <node>DependencyParser</node>
+    <node>SemanticRoleLabeler</node>        
+    <node>AssertionAnnotator</node>
+    <node>ExtractionPrepAnnotator</node>
+*/
+
+
+println(" Adding segment aka section annotator");
+def segmentAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.core.ae.SimpleSegmentAnnotator.class);
+// ConfigurationParameterFactory.addConfigurationParameters(
+//			segmentAnnotator,
+//			"SegmentID", // TODO update SimpleSegmentAnnotator.java to have a constant for this
+//			"20104" // 20104 is Current Medications. "SIMPLE_SEGMENT" is default. @see SimpleSegmentAnnotator.java
 
+//	);
+aggregateBuilder.add(segmentAnnotator);
+
+println(" Adding sentence annotator");
+def sentenceAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.core.ae.SentenceDetector.class);
+ConfigurationParameterFactory.addConfigurationParameters(
+			sentenceAnnotator,
+			org.apache.ctakes.core.ae.SentenceDetector.SD_MODEL_FILE_PARAM, 
+			"org/apache/ctakes/core/sentdetect/sd-med-model.zip" 
+	);
+// This is where you could add SegmentsToSkip parameter values @see org.apache.ctakes.core.ae.SentenceDetector.PARAM_SEGMENTS_TO_SKIP
+aggregateBuilder.add(sentenceAnnotator);
+
+println(" Adding tokenizer annotator");
+def tokenizerAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.core.ae.TokenizerAnnotatorPTB.class);
+// This is where you could add SegmentsToSkip parameter values. @see TokenizerAnnotatorPTB.PARAM_SEGMENTS_TO_SKIP
+aggregateBuilder.add(tokenizerAnnotator);
+
+println(" Adding LVG annotator (word normalizer)");
+// If you choose to add the lvg annotator by class name, the following function provides a start, but it has a TODO
+//def lvgAnnotator = lvgAnnotatorByClassName();
+//aggregateBuilder.add(lvgAnnotator);
+// The lvg annotator has many parameters, so use the xml descriptor to easily pick up the default parameters
+def lvgDescriptorLocation = "ctakes-lvg/desc/analysis_engine/LvgAnnotator"; // Note createAnalysisEngineDescription
expects name to not end in .xml even though filename actually does
+AnalysisEngineDescription lvgDescriptor = AnalysisEngineFactory.createAnalysisEngineDescription(lvgDescriptorLocation);
// Note, do not include .xml in the name here as createAnalysisEngineDescription will append
.xml
+aggregateBuilder.add(lvgDescriptor);
+
+println(" Adding context dependent tokenizer annotator");
+def cdtAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator.class);
+aggregateBuilder.add(cdtAnnotator);
+
+println(" Adding part of speech (POS) annotator");
+def posAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.postagger.POSTagger.class);
+ConfigurationParameterFactory.addConfigurationParameters(
+			posAnnotator,
+			org.apache.ctakes.postagger.POSTagger.POS_MODEL_FILE_PARAM, 
+			"org/apache/ctakes/postagger/models/mayo-pos.zip" 
+	);
+aggregateBuilder.add(posAnnotator);
+
+println(" Adding chunker annotator");
+def chunkerAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.chunker.ae.Chunker.class);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkerAnnotator,
+			org.apache.ctakes.chunker.ae.Chunker.CHUNKER_MODEL_FILE_PARAM, 
+			"org/apache/ctakes/chunker/models/chunker-model.zip" 
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkerAnnotator,
+			org.apache.ctakes.chunker.ae.Chunker.CHUNKER_CREATOR_CLASS_PARAM, 
+			"org.apache.ctakes.chunker.ae.PhraseTypeChunkCreator" 
+	);
+aggregateBuilder.add(chunkerAnnotator);
+	
+// First chunk adjuster
+println(" Adding chunker adjuster annotator - NounPhraseToIncludeFollowingNP");
+def chunkAdjusterNPNPAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.class);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkAdjusterNPNPAnnotator,
+			org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.PARAM_CHUNK_PATTERN, 
+			(String []) ["NP", "NP"]
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkAdjusterNPNPAnnotator,
+			org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, 
+			1
+	);
+aggregateBuilder.add(chunkAdjusterNPNPAnnotator);
+
+// Second chunk adjuster
+println(" Adding chunker adjuster annotator - NounPhraseToIncludeFollowingPPNP");
+def chunkAdjusterNPPPNPAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.class);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkAdjusterNPPPNPAnnotator,
+			org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.PARAM_CHUNK_PATTERN, 
+			(String []) ["NP", "PP", "NP"]
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			chunkAdjusterNPPPNPAnnotator,
+			org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, 
+			2
+	);
+aggregateBuilder.add(chunkAdjusterNPPPNPAnnotator);
+
+println(" Adding lookup window annotator");
+def lwAnnotatorDescriptorLocation = "ctakes-clinical-pipeline/desc/analysis_engine/LookupWindowAnnotator";

+AnalysisEngineDescription lookupWindowAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(lwAnnotatorDescriptorLocation);

+aggregateBuilder.add(lookupWindowAnnotator);
+
+
+// TODO - this is a longer range TODO item: it would be nice to be able to set values here that would be used instead of what's in the LookupDesc*.xml files
+
+
+// DictionaryLookupAnnotatorUMLS - org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator
+println(" Adding dictionary lookup (UMLS) annotator");
+def dictLookupAnnotatorDescriptorLocation = "ctakes-dictionary-lookup/desc/analysis_engine/DictionaryLookupAnnotatorUMLS";

+AnalysisEngineDescription dictionaryLookupAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(dictLookupAnnotatorDescriptorLocation);

+//UmlsDictionaryLookupAnnotator will look for system properties before looking at these values
+ConfigurationParameterFactory.addConfigurationParameters(
+			dictionaryLookupAnnotator,
+			org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator.UMLSADDR_PARAM, 
+			"https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser"
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			dictionaryLookupAnnotator,
+			org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator.UMLSVENDOR_PARAM,

+			"NLM-6515182895"
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			dictionaryLookupAnnotator,
+			org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator.UMLSUSER_PARAM, 
+			"" // put your UMLS user ID here or set JAVA_OPTS ctakes.umlsuser or see user or install
guide
+	);
+// Commenting out the setting of UMLSPW_PARAM as you probably don't want to put your password
+// in this script so that if you share the script you don't share your password accidentally
+//ConfigurationParameterFactory.addConfigurationParameters(
+//			dictionaryLookupAnnotator,
+//			org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator.UMLSPW_PARAM, 
+//			""
+//	);
+aggregateBuilder.add(dictionaryLookupAnnotator);
+
+// DependencyParser
+println(" TODO YET *** Adding dependency parser annotator"); // TODO 
+
+// SemanticRoleLabeler        
+println(" TODO YET *** Adding semantic role labeler annotator"); // TODO 
+		
+println(" Adding assertion annotators");
+def assertionDescriptorLocation = "ctakes-assertion/desc/AssertionMiniPipelineAnalysisEngine";
// Note createAnalysisEngineDescription expects name to not end in .xml even though filename
actually does
+AnalysisEngineDescription assertionDescriptor = AnalysisEngineFactory.createAnalysisEngineDescription(assertionDescriptorLocation);
// Note, do not include .xml in the name here as createAnalysisEngineDescription will append
.xml
+aggregateBuilder.add(assertionDescriptor);
+
+println(" Adding extraction prep annotator");
+def extractionPrepDescriptorLocation = "ctakes-clinical-pipeline/desc/analysis_engine/ExtractionPrepAnnotator";

+AnalysisEngineDescription extractionPrepDescriptor = AnalysisEngineFactory.createAnalysisEngineDescription(extractionPrepDescriptorLocation);

+aggregateBuilder.add(extractionPrepDescriptor);
+
+
+
+TypeSystemDescription typeSystemDescription = TypeSystemDescriptionFactory.createTypeSystemDescription("org.apache.ctakes.typesystem.types.TypeSystem");
+		
+		
+// generic XMI writer
+// TODO When generalizing this script to run specific components instead of the whole pipeline,
+// consider a separate writer for each engine or a different writer that produces more friendly output
+println(" Adding XMI writer");
+AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
+			  XWriter.class,
+			  typeSystemDescription,
+			  XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+			  OUTPUT_DIR,
+			  XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+			  CtakesFileNamer.class.getName()
+			  );
+aggregateBuilder.add(xWriter);
+
+//println("About to run pipeline using SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate())");
+SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+
+println("Done: " + this.class.getName());
+
+
+
+
+def lvgAnnotatorByClassName() {
+def lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(org.apache.ctakes.lvg.ae.LvgAnnotator.class);
+// This is where you could add parameter values. 
+//PARAM_POST_LEMMAS defaults to false
+//PARAM_USE_LEMMA_CACHE defaults to false
+//PARAM_LEMMA_CACHE_FILE_LOCATION (required iff PARAM_USE_LEMMA_CACHE true)
+//PARAM_LEMMA_CACHE_FREQUENCY_CUTOFF // code defaults to 20 which is same as LvgAnnotator.xml value
+//"XeroxTreebankMap"
+//"ExclusionSet"
+// "UseSegments"
+// "SegmentsToSkip"
+def XeroxTreebankMapValues = (String []) [
+            "adj|JJ",
+            "adv|RB",
+            "aux|AUX",
+            "compl|CS",
+            "conj|CC",
+            "det|DET",
+            "modal|MD",
+            "noun|NN",
+            "prep|IN",
+            "pron|PRP",
+            "verb|VB" ];
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"XeroxTreebankMap", 
+			XeroxTreebankMapValues 
+	);
+def ExclusionSetValues = (String[]) [
+            "and",
+            "And",
+            "by",
+            "By",
+            "for",
+            "For",
+            "in",
+            "In",
+            "of",
+            "Of",
+            "on",
+            "On",
+            "the",
+            "The",
+            "to",
+            "To",
+            "with",
+            "With" ];
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"ExclusionSet", 
+			ExclusionSetValues 
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"UseSegments", 
+			false
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"SegmentsToSkip", 
+			(String [])[]
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"UseCmdCache", 
+			false
+	);
+ConfigurationParameterFactory.addConfigurationParameters(
+			lvgAnnotator,
+			"CmdCacheFrequencyCutoff", 
+			20
+	);
+// TODO handle LvgCmdApi here 
+return lvgAnnotator
+}	

Added: ctakes/trunk/ctakes-core/scripts/groovy/run_cTAKES.groovy
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/scripts/groovy/run_cTAKES.groovy?rev=1553321&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/scripts/groovy/run_cTAKES.groovy (added)
+++ ctakes/trunk/ctakes-core/scripts/groovy/run_cTAKES.groovy Tue Dec 24 19:36:33 2013
@@ -0,0 +1,105 @@
+#!/usr/bin/env groovy
+
+/**
+** Sets the classpath before invoking the script that actually uses cTAKES.
+** This includes (almost) all jars used by cTAKES as well as the resources
+** and descriptors.
+**
+** 	This script assumes that you have installed Groovy and 
+** 	that you have the command groovy available in your path.
+**  This also assumes you have installed Apache cTAKES
+**  from the convenience binary. 
+**
+**  Set the following below 
+**    cTAKES_HOME (where you extracted cTAKES. This should be the parent of bin, desc, resources, etc.)
+**    EXTERNAL_RESOURCE_PATH (the parent of ctakes-resources-3.1.0, available from SF.net)
+**
+** 	On Debian/Ubuntu systems, installing Groovy should be as easy as apt-get install groovy.
+** 	You can download groovy from http://groovy.codehaus.org/
+**  Usage: $groovy run_cTAKES.groovy <inputDir>
+**/
+
+import java.io.File;
+
+println("Starting " + this.class.getName());
+
+// print out the classpath entries for debug purposes
+//this.class.classLoader.rootLoader.URLs.each{ println it }
+
+// TODO improve handling of whether user enters a trailing slash for these two constants:
+def cTAKES_HOME = "/C:/Apache-cTAKES/apache-ctakes-3.1.1/";
+def EXTERNAL_RESOURCE_PATH = "/C:/parent-of-ctakes-resources";
+
+println("Using cTAKES in " + cTAKES_HOME);
+
+
+// Add everything under cTAKES lib directory to classpath
+
+File jarsDir = new File(cTAKES_HOME, "lib");
+File[] files = jarsDir.listFiles();
+//for (int i=files.length-1; i>=0; i--) {
+for (int i=0; i<files.length; i++) {
+	File f = files[i];
+	if (f.getName().toLowerCase().endsWith(".jar")) {
+		def path = f.getCanonicalPath();
+		//println("this.class.classLoader = " + this.class.classLoader); //e.g.  this.class.classLoader
= groovy.lang.GroovyClassLoader$InnerLoader@67ecd78
+		//println("rootLoader = " + this.class.classLoader.rootLoader); // e.g.  rootLoader = org.codehaus.groovy.tools.RootLoader@58fe64b9
+		
+		// This is a total HACK: skipping the jars that start with "x".
+		// Having a problem with groovy playing nice with some xml-processing jars/classes
+		if (f.getName().startsWith("x")) {
+			// HACK: Skip the "x" jars for now to avoid problem with xalan
+		} else {
+			if (path.startsWith("C:")) { // TODO generalize
+				this.class.classLoader.rootLoader.addURL( new URL("file:///" + path));
+			} else {
+				this.class.classLoader.rootLoader.addURL( new URL("file://" + path));
+			}
+		}
+	} else {
+		println("Ignoring " + f.getName());
+	}
+}
+
+// Add cTAKES' resources directory to classpath
+def subdir = "resources/";
+println("Adding cTAKES subdir called " + subdir + " to classpath");
+this.class.classLoader.rootLoader.addURL( new URL("file://" + cTAKES_HOME + subdir));
+
+// Add cTAKES' desc directory to classpath
+// Note: the URL MUST end with a trailing slash; otherwise the class loader does not seem to
+// treat it as a directory, and AnalysisEngineFactory.createAnalysisEngineDescription won't be
+// able to find things under this directory
+subdir = "desc/";
+println("Adding cTAKES subdir called " + subdir + " to classpath");
+this.class.classLoader.rootLoader.addURL( new URL("file://" + cTAKES_HOME + subdir));
+
+
+//println("TODO -- consider having script download and unzip ctakes-resources-3.1.0.zip to
lib");
+// Add the ctakesresources (UMLS dictionary, LVG database) to the classpath
+//println("TODO -- download and unzip ctakes-resources-3.1.0.zip to lib");
+println("Adding ctakes-resources-3.1.0/resources to classpath");
+// from ctakes-resources-3.1.0.zip
+this.class.classLoader.rootLoader.addURL( new URL("file://" + EXTERNAL_RESOURCE_PATH + "/ctakes-resources-3.1.0/resources/")
);
+
+
+if (args.length < 1) {
+	println("Please specify input directory");
+	System.exit(1);
+}
+println("Input parm: " + args[0]);
+
+// Run cTAKES now that the classpath has been set so that imports and resources are found
+// Prepare to pass variable values to script
+Binding binding = new Binding();
+//binding.setVariable("cTAKES_HOME", cTAKES_HOME);
+String arg0 = args[0];
+String [] arguments = new String[2];
+arguments[0] = arg0;
+arguments[1] = cTAKES_HOME;
+binding.setVariable("args", arguments);
+GroovyShell shell = new GroovyShell(binding);
+
+Object value = shell.evaluate(new File('cTAKES-clinical-pipeline.groovy'));
+
+println("Done " + this.class.getName());



Mime
View raw message