incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s..@apache.org
Subject svn commit: r1443625 - in /incubator/ctakes/trunk/ctakes-assertion: ./ desc/analysis_engine/ resources/launch/ src/main/java/org/apache/ctakes/assertion/attributes/subject/ src/main/java/org/apache/ctakes/assertion/eval/ src/main/java/org/apache/ctakes...
Date Thu, 07 Feb 2013 17:47:56 GMT
Author: swu
Date: Thu Feb  7 17:47:55 2013
New Revision: 1443625

URL: http://svn.apache.org/r1443625
Log:
making assertion module easier to train -- all in the AssertionEvaluation class, use options
to preprocess, train, test

Added:
    incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/
    incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/AttributeDiscoveryPreprocessor.xml
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
      - copied, changed from r1441179, incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java
Removed:
    incubator/ctakes/trunk/ctakes-assertion/resources/launch/GoldEntityAndAttributeReaderPipelineForSeedCorpus.launch
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java
Modified:
    incubator/ctakes/trunk/ctakes-assertion/pom.xml
    incubator/ctakes/trunk/ctakes-assertion/resources/launch/ctakes__assertion_eval.launch
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/subject/SubjectAttributeClassifier.java
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/CreateAssertionDescriptor.java
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAssertionModel.java
    incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java

Added: incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/AttributeDiscoveryPreprocessor.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/AttributeDiscoveryPreprocessor.xml?rev=1443625&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/AttributeDiscoveryPreprocessor.xml (added)
+++ incubator/ctakes/trunk/ctakes-assertion/desc/analysis_engine/AttributeDiscoveryPreprocessor.xml Thu Feb  7 17:47:55 2013
@@ -0,0 +1,158 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+  <primitive>false</primitive>
+  <delegateAnalysisEngineSpecifiers>
+    <delegateAnalysisEngine key="DependencyParser">
+      <import location="../../../ctakes-dependency-parser/desc/analysis_engine/ClearParserDependencyParserAE.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="Chunker">
+      <import location="../../../ctakes-chunker/desc/Chunker.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="TokenizerAnnotator">
+      <import location="../../../ctakes-core/desc/analysis_engine/TokenizerAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="ContextDependentTokenizerAnnotator">
+      <import location="../../../ctakes-context-tokenizer/desc/analysis_engine/ContextDependentTokenizerAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="SentenceDetectorAnnotator">
+      <import location="../../../ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="LookupWindowAnnotator">
+      <import location="../../../ctakes-clinical-pipeline/desc/analysis_engine/LookupWindowAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="AdjustNounPhraseToIncludeFollowingNP">
+      <import location="../../../ctakes-chunker/desc/AdjustNounPhraseToIncludeFollowingNP.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="SimpleSegmentAnnotator">
+      <import location="../../../ctakes-clinical-pipeline/desc/analysis_engine/SimpleSegmentAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="AdjustNounPhraseToIncludeFollowingPPNP">
+      <import location="../../../ctakes-chunker/desc/AdjustNounPhraseToIncludeFollowingPPNP.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="LvgAnnotator">
+      <import location="../../../ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="POSTagger">
+      <import location="../../../ctakes-pos-tagger/desc/POSTagger.xml"/>
+    </delegateAnalysisEngine>
+  </delegateAnalysisEngineSpecifiers>
+  <analysisEngineMetaData>
+    <name>AggregatePlaintextUMLSProcessor</name>
+    <description>Runs the complete pipeline for annotating clinical documents in plain text format using the built in UMLS (SNOMEDCT and RxNORM) dictionaries.  This uses the dictionary lookup/desc/DictionaryLookupAnnotatorUMLS.xml
+and requires an UMLS license.  Please update DictionaryLookupAnnotatorUMLS.xml file with your UMLS username and password.</description>
+    <version/>
+    <vendor/>
+    <configurationParameters searchStrategy="language_fallback">
+      <configurationParameter>
+        <name>SegmentID</name>
+        <description/>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>false</mandatory>
+        <overrides>
+          <parameter>SimpleSegmentAnnotator/SegmentID</parameter>
+        </overrides>
+      </configurationParameter>
+      <configurationParameter>
+        <name>ChunkCreatorClass</name>
+        <type>String</type>
+        <multiValued>false</multiValued>
+        <mandatory>true</mandatory>
+        <overrides>
+          <parameter>Chunker/ChunkCreatorClass</parameter>
+        </overrides>
+      </configurationParameter>
+    </configurationParameters>
+    <configurationParameterSettings>
+      <nameValuePair>
+        <name>ChunkCreatorClass</name>
+        <value>
+          <string>org.apache.ctakes.chunker.ae.PhraseTypeChunkCreator</string>
+        </value>
+      </nameValuePair>
+    </configurationParameterSettings>
+    <flowConstraints>
+      <fixedFlow>
+        <node>SimpleSegmentAnnotator</node>
+        <node>SentenceDetectorAnnotator</node>
+        <node>TokenizerAnnotator</node>
+        <node>LvgAnnotator</node>
+        <node>ContextDependentTokenizerAnnotator</node>
+        <node>POSTagger</node>
+        <node>DependencyParser</node>
+      </fixedFlow>
+    </flowConstraints>
+    <typePriorities>
+      <name>Ordering</name>
+      <description>For subiterator</description>
+      <version>1.0</version>
+      <priorityList>
+        <type>org.apache.ctakes.typesystem.type.textspan.Segment</type>
+        <type>org.apache.ctakes.typesystem.type.textspan.Sentence</type>
+        <type>org.apache.ctakes.typesystem.type.syntax.BaseToken</type>
+      </priorityList>
+      <priorityList>
+        <type>org.apache.ctakes.typesystem.type.textspan.Sentence</type>
+        <type>org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation</type>
+      </priorityList>
+    </typePriorities>
+    <fsIndexCollection/>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.NewlineToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.WordToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.VP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.refsem.UmlsConcept</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.UCP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.TimeAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.SymbolToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textspan.Sentence</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textspan.Segment</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.SBAR</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.RomanNumeralAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.RangeAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.PunctuationToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.Property</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.Properties</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.PersonTitleAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.PRT</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.PP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.OntologyConcept</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.NumToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.NP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.MeasurementAnnotation</type>
+          <type allAnnotatorFeatures="true">edu.mayo.bmi.uima.lookup.type.LookupWindowAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.Lemma</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.LST</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.INTJ</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.FractionAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.structured.DocumentID</type>
+          <type allAnnotatorFeatures="true">uima.tcas.DocumentAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.DateAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.CopySrcAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.CopyDestAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.ContractionToken</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.textsem.ContextAnnotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.Chunk</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.CONJP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.BaseToken</type>
+          <type allAnnotatorFeatures="true">uima.cas.AnnotationBase</type>
+          <type allAnnotatorFeatures="true">uima.tcas.Annotation</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.ADVP</type>
+          <type allAnnotatorFeatures="true">org.apache.ctakes.typesystem.type.syntax.ADJP</type>
+        </outputs>
+        <languagesSupported/>
+      </capability>
+    </capabilities>
+    <operationalProperties>
+      <modifiesCas>true</modifiesCas>
+      <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+      <outputsNewCASes>false</outputsNewCASes>
+    </operationalProperties>
+  </analysisEngineMetaData>
+  <resourceManagerConfiguration/>
+</analysisEngineDescription>

Modified: incubator/ctakes/trunk/ctakes-assertion/pom.xml
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/pom.xml?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/pom.xml (original)
+++ incubator/ctakes/trunk/ctakes-assertion/pom.xml Thu Feb  7 17:47:55 2013
@@ -221,6 +221,18 @@
 			<scope>system</scope>
 			<systemPath>${project.basedir}/lib/jcarafe-ext_2.9.1-0.9.8.3.RC4.jar</systemPath>
 		</dependency>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-lvg</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-context-tokenizer</artifactId>
+		</dependency>
+		<dependency>
+			<groupId>org.apache.ctakes</groupId>
+			<artifactId>ctakes-chunker</artifactId>
+		</dependency>
 	</dependencies>
 	<build>
 		<plugins>

Modified: incubator/ctakes/trunk/ctakes-assertion/resources/launch/ctakes__assertion_eval.launch
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/resources/launch/ctakes__assertion_eval.launch?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/resources/launch/ctakes__assertion_eval.launch (original)
+++ incubator/ctakes/trunk/ctakes-assertion/resources/launch/ctakes__assertion_eval.launch Thu Feb  7 17:47:55 2013
@@ -2,14 +2,14 @@
 <launchConfiguration type="org.eclipse.jdt.launching.localJavaApplication">
 <stringAttribute key="bad_container_name" value="/ctakes-assertion/resour"/>
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_PATHS">
-<listEntry value="/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java"/>
+<listEntry value="/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java"/>
 </listAttribute>
 <listAttribute key="org.eclipse.debug.core.MAPPED_RESOURCE_TYPES">
 <listEntry value="1"/>
 </listAttribute>
 <stringAttribute key="org.eclipse.debug.ui.ATTR_CAPTURE_IN_FILE" value="/tmp/assertion.log"/>
 <stringAttribute key="org.eclipse.jdt.launching.CLASSPATH_PROVIDER" value="org.eclipse.m2e.launchconfig.classpathProvider"/>
-<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier"/>
+<stringAttribute key="org.eclipse.jdt.launching.MAIN_TYPE" value="org.apache.ctakes.assertion.eval.AssertionEvaluation"/>
 <stringAttribute key="org.eclipse.jdt.launching.PROGRAM_ARGUMENTS" value="--train-dir /sharp_data/train --test-dir /sharp_data/test --models-dir /sharp_data/model/eval.model"/>
 <stringAttribute key="org.eclipse.jdt.launching.PROJECT_ATTR" value="ctakes-assertion"/>
 <stringAttribute key="org.eclipse.jdt.launching.SOURCE_PATH_PROVIDER" value="org.eclipse.m2e.launchconfig.sourcepathProvider"/>

Modified: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/subject/SubjectAttributeClassifier.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/subject/SubjectAttributeClassifier.java?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/subject/SubjectAttributeClassifier.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/attributes/subject/SubjectAttributeClassifier.java Thu Feb  7 17:47:55 2013
@@ -28,7 +28,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
 import org.apache.ctakes.dependency.parser.util.DependencyPath;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.typesystem.type.constants.CONST;

Copied: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java
(from r1441179, incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java?p2=incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java&p1=incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java&r1=1441179&r2=1443625&rev=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvalBasedOnModifier.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/eval/AssertionEvaluation.java Thu Feb  7 17:47:55 2013
@@ -69,6 +69,7 @@ import org.apache.ctakes.assertion.medfa
 import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
 import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
 import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.pipelines.GoldEntityAndAttributeReaderPipelineForSeedCorpus;
 import org.apache.ctakes.core.ae.DocumentIdPrinterAnalysisEngine;
 import org.apache.ctakes.core.util.CtakesFileNamer;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
@@ -101,24 +102,30 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.Modifier;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
 
-public class AssertionEvalBasedOnModifier extends Evaluation_ImplBase<File, Map<String, AnnotationStatistics>> {
+public class AssertionEvaluation extends Evaluation_ImplBase<File, Map<String, AnnotationStatistics>> {
   
-  private static Logger logger = Logger.getLogger(AssertionEvalBasedOnModifier.class); 
+  private static Logger logger = Logger.getLogger(AssertionEvaluation.class); 
 
   public static class Options extends Options_ImplBase {
     @Option(
         name = "--train-dir",
-        usage = "specify the directory contraining the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
+        usage = "specify the directory containing the XMI training files (for example, /NLP/Corpus/Relations/mipacq/xmi/train)",
         required = true)
     public File trainDirectory;
     
     @Option(
         name = "--test-dir",
-        usage = "specify the directory contraining the XMI testing files (for example, /NLP/Corpus/Relations/mipacq/xmi/test)",
+        usage = "specify the directory containing the XMI testing files (for example, /NLP/Corpus/Relations/mipacq/xmi/test)",
         required = false)
     public File testDirectory;
     
     @Option(
+            name = "--dev-dir",
+            usage = "if running --preprocess, store the XMI development files here",
+            required = false)
+        public File devDirectory;
+
+    @Option(
         name = "--models-dir",
         usage = "specify the directory where the models will be placed",
         required = true)
@@ -168,12 +175,24 @@ public class AssertionEvalBasedOnModifie
     public Integer crossValidationFolds;
     
     @Option(
+            name = "--train-only",
+            usage = "do not test a model, build one from xmi output and store in --models-dir",
+            required = false)
+    public boolean trainOnly = false;
+
+    @Option(
             name = "--test-only",
             usage = "do not train a model, use the one specified in --models-dir",
             required = false)
     public boolean testOnly = false;
 
     @Option(
+            name = "--preprocess-only",
+            usage = "run preprocessing pipeline on a SHARP-style corpus, specify root directory",
+            required = false)
+    public File preprocessDir;
+
+    @Option(
             name = "--no-cleartk",
             usage = "run the version of the assertion module released with cTAKES 2.5",
             required = false)
@@ -188,8 +207,9 @@ public class AssertionEvalBasedOnModifie
   
   private File evaluationOutputDirectory;
 
-  
-  protected static Options options = new Options();
+  private String sharpCorpusDirectory;
+
+protected static Options options = new Options();
   
   public static void main(String[] args) throws Exception {
     //Options options = new Options();
@@ -232,7 +252,7 @@ public class AssertionEvalBasedOnModifie
     if (options.runSubject) { annotationTypes.add("subject"); }
     if (options.runGeneric) { annotationTypes.add("generic"); }
     
-    AssertionEvalBasedOnModifier evaluation = new AssertionEvalBasedOnModifier(
+    AssertionEvaluation evaluation = new AssertionEvaluation(
         modelsDir,
         evaluationOutputDirectory,
         annotationTypes,
@@ -241,29 +261,14 @@ public class AssertionEvalBasedOnModifie
         "100",
         "2"
         );
-    /*
-        ,
-        "-t",
-        "0",
-        "-c",
-        "1000");
-        */
-
-//    List<AnnotationStatistics> foldStats = evaluation.crossValidation(trainFiles, 2);
-//    //AnnotationStatistics overallStats = AnnotationStatistics.addAll(foldStats);
-//    //AnnotationStatistics overallStats = new AnnotationStatistics();
-//    //overallStats.addAll(foldStats);
-//    AnnotationStatistics overallStats = new AnnotationStatistics();
-//    for (AnnotationStatistics singleFoldStats : foldStats)
-//    {
-//    	overallStats.addAll(singleFoldStats);
-//    }
-//    System.err.println("Overall:");
-//    System.err.println(overallStats);
-    
     
+    // if preprocessing, don't do anything else
+    if(options.preprocessDir!=null ) {
+    	preprocess(options.preprocessDir);
+    }
     
-    if(options.testDirectory == null || options.crossValidationFolds != null) {
+    // run cross-validation
+    else if(options.testDirectory == null || options.crossValidationFolds != null) {
       // run n-fold cross-validation
       List<Map<String, AnnotationStatistics>> foldStats = evaluation.crossValidation(trainFiles, options.crossValidationFolds);
       //AnnotationStatistics overallStats = AnnotationStatistics.addAll(foldStats);
@@ -283,9 +288,12 @@ public class AssertionEvalBasedOnModifie
     	  }
       }
       
-      AssertionEvalBasedOnModifier.printScore(overallStats,  "CROSS FOLD OVERALL");
+      AssertionEvaluation.printScore(overallStats,  "CROSS FOLD OVERALL");
       
-    } else {
+    } 
+    
+    // run on test set
+    else {
       // train on the entire training set and evaluate on the test set
       List<File> testFiles = Arrays.asList(options.testDirectory.listFiles());
       
@@ -296,7 +304,7 @@ public class AssertionEvalBasedOnModifie
       CollectionReader testCollectionReader = evaluation.getCollectionReader(testFiles);
       Map<String, AnnotationStatistics> stats = evaluation.test(testCollectionReader, modelsDir);
       
-      AssertionEvalBasedOnModifier.printScore(stats,  modelsDir.getAbsolutePath());
+      AssertionEvaluation.printScore(stats,  modelsDir.getAbsolutePath());
     }
     
     System.out.println("Finished assertion module.");
@@ -345,7 +353,7 @@ public static void printScore(Map<String
 
   private String[] trainingArguments;
 
-  public AssertionEvalBasedOnModifier(
+  public AssertionEvaluation(
       File modelDirectory,
       File evaluationOutputDirectory,
       ArrayList<String> annotationTypes,
@@ -378,6 +386,12 @@ public static void printScore(Map<String
         paths);
   }
 
+  public static void preprocess(File preprocessDir ) throws ResourceInitializationException, UIMAException, IOException {
+//	  File devDirectory = new File(options.trainDirectory.getParentFile() + File.separator + "dev");
+	  GoldEntityAndAttributeReaderPipelineForSeedCorpus.readSharpUmlsCem(
+			  preprocessDir, options.trainDirectory, options.testDirectory, options.devDirectory);
+  }
+  
   @Override
   public void train(CollectionReader collectionReader, File directory) throws Exception {
     AggregateBuilder builder = new AggregateBuilder();
@@ -438,7 +452,7 @@ public static void printScore(Map<String
 	    ConfigurationParameterFactory.addConfigurationParameters(
 	        polarityAnnotator,
 	        AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-	        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+	        AssertionEvaluation.GOLD_VIEW_NAME,
 	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 	        this.dataWriterFactoryClass.getName(),
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -453,7 +467,7 @@ public static void printScore(Map<String
 	    ConfigurationParameterFactory.addConfigurationParameters(
 	        conditionalAnnotator,
 	        AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-	        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+	        AssertionEvaluation.GOLD_VIEW_NAME,
 	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 	        this.dataWriterFactoryClass.getName(),
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -468,7 +482,7 @@ public static void printScore(Map<String
 	    ConfigurationParameterFactory.addConfigurationParameters(
 	        uncertaintyAnnotator,
 	        AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-	        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+	        AssertionEvaluation.GOLD_VIEW_NAME,
 	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 	        this.dataWriterFactoryClass.getName(),
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -483,7 +497,7 @@ public static void printScore(Map<String
 	    ConfigurationParameterFactory.addConfigurationParameters(
 	        subjectAnnotator,
 	        AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-	        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+	        AssertionEvaluation.GOLD_VIEW_NAME,
 	        CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 	        this.dataWriterFactoryClass.getName(),
 	        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -498,7 +512,7 @@ public static void printScore(Map<String
 		ConfigurationParameterFactory.addConfigurationParameters(
 		    genericAnnotator,
 		    AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-		    AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+		    AssertionEvaluation.GOLD_VIEW_NAME,
 		    CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
 		    this.dataWriterFactoryClass.getName(),
 		    DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -677,7 +691,7 @@ private void addExternalAttributeAnnotat
 	ConfigurationParameterFactory.addConfigurationParameters(
 			oldAssertionAnnotator,
 			AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-			AssertionEvalBasedOnModifier.GOLD_VIEW_NAME
+			AssertionEvaluation.GOLD_VIEW_NAME
 	);
 	builder.add(oldAssertionAnnotator);
 
@@ -685,7 +699,7 @@ private void addExternalAttributeAnnotat
 	ConfigurationParameterFactory.addConfigurationParameters(
 			oldConversionAnnotator,
 			AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-			AssertionEvalBasedOnModifier.GOLD_VIEW_NAME
+			AssertionEvaluation.GOLD_VIEW_NAME
 	);
 	builder.add(oldConversionAnnotator);
 
@@ -701,7 +715,7 @@ private void addExternalAttributeAnnotat
 	ConfigurationParameterFactory.addConfigurationParameters(
 			oldGenericAnnotator,
 			AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-			AssertionEvalBasedOnModifier.GOLD_VIEW_NAME
+			AssertionEvaluation.GOLD_VIEW_NAME
 	);
 	builder.add(oldGenericAnnotator);
 }
@@ -738,7 +752,7 @@ private void addCleartkAttributeAnnotato
 		ConfigurationParameterFactory.addConfigurationParameters(
 				polarityAnnotator,
 				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-				AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+				AssertionEvaluation.GOLD_VIEW_NAME,
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(new File(directory, "polarity"), "model.jar").getPath()
 		);
@@ -751,7 +765,7 @@ private void addCleartkAttributeAnnotato
 		ConfigurationParameterFactory.addConfigurationParameters(
 				conditionalAnnotator,
 				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-				AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+				AssertionEvaluation.GOLD_VIEW_NAME,
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(new File(directory, "conditional"), "model.jar").getPath()
 		);
@@ -764,7 +778,7 @@ private void addCleartkAttributeAnnotato
 		ConfigurationParameterFactory.addConfigurationParameters(
 				uncertaintyAnnotator,
 				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-				AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+				AssertionEvaluation.GOLD_VIEW_NAME,
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(new File(directory, "uncertainty"), "model.jar").getPath()
 		);
@@ -777,7 +791,7 @@ private void addCleartkAttributeAnnotato
 		ConfigurationParameterFactory.addConfigurationParameters(
 				subjectAnnotator,
 				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-				AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+				AssertionEvaluation.GOLD_VIEW_NAME,
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(new File(directory, "subject"), "model.jar").getPath()
 		);
@@ -790,7 +804,7 @@ private void addCleartkAttributeAnnotato
 		ConfigurationParameterFactory.addConfigurationParameters(
 				genericAnnotator,
 				AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-				AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+				AssertionEvaluation.GOLD_VIEW_NAME,
 				GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
 				new File(new File(directory, "generic"), "model.jar").getPath()
 		);

Modified: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/CreateAssertionDescriptor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/CreateAssertionDescriptor.java?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/CreateAssertionDescriptor.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/CreateAssertionDescriptor.java Thu Feb  7 17:47:55 2013
@@ -7,10 +7,10 @@ import java.io.IOException;
 import java.net.URI;
 import java.net.URISyntaxException;
 
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier.ReferenceAnnotationsSystemAssertionClearer;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier.ReferenceIdentifiedAnnotationsSystemToGoldCopier;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier.ReferenceSupportingAnnotationsSystemToGoldCopier;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation.ReferenceAnnotationsSystemAssertionClearer;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation.ReferenceIdentifiedAnnotationsSystemToGoldCopier;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation.ReferenceSupportingAnnotationsSystemToGoldCopier;
 import org.apache.ctakes.core.ae.DocumentIdPrinterAnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.resource.ResourceInitializationException;
@@ -91,7 +91,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         polarityAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         this.dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -103,7 +103,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         conditionalAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         this.dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -115,7 +115,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         uncertaintyAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         this.dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -127,7 +127,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         subjectAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         this.dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -139,7 +139,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         genericAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         this.dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -192,7 +192,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         polarityAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(new File(directory, "polarity"), "model.jar").getPath()
         );
@@ -202,7 +202,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         conditionalAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(new File(directory, "conditional"), "model.jar").getPath()
         );
@@ -212,7 +212,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         uncertaintyAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(new File(directory, "uncertainty"), "model.jar").getPath()
         );
@@ -222,7 +222,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         subjectAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(new File(directory, "subject"), "model.jar").getPath()
         );
@@ -232,7 +232,7 @@ public class CreateAssertionDescriptor
     ConfigurationParameterFactory.addConfigurationParameters(
         genericAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(new File(directory, "generic"), "model.jar").getPath()
         );

Modified: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAssertionModel.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAssertionModel.java?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAssertionModel.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/medfacts/cleartk/TrainAssertionModel.java Thu Feb  7 17:47:55 2013
@@ -29,9 +29,9 @@ import org.apache.commons.cli.Option;
 import org.apache.commons.cli.OptionBuilder;
 import org.apache.commons.cli.Options;
 import org.apache.commons.cli.ParseException;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier.ReferenceAnnotationsSystemAssertionClearer;
-import org.apache.ctakes.assertion.eval.AssertionEvalBasedOnModifier.ReferenceIdentifiedAnnotationsSystemToGoldCopier;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation.ReferenceAnnotationsSystemAssertionClearer;
+import org.apache.ctakes.assertion.eval.AssertionEvaluation.ReferenceIdentifiedAnnotationsSystemToGoldCopier;
 import org.apache.log4j.Logger;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 
@@ -298,7 +298,7 @@ public class TrainAssertionModel {
     ConfigurationParameterFactory.addConfigurationParameters(
         trainingAssertionAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         CleartkAnnotator.PARAM_DATA_WRITER_FACTORY_CLASS_NAME,
         dataWriterFactoryClass.getName(),
         DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
@@ -357,7 +357,7 @@ public class TrainAssertionModel {
     ConfigurationParameterFactory.addConfigurationParameters(
         decodingAssertionAnnotator,
         AssertionCleartkAnalysisEngine.PARAM_GOLD_VIEW_NAME,
-        AssertionEvalBasedOnModifier.GOLD_VIEW_NAME,
+        AssertionEvaluation.GOLD_VIEW_NAME,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
         new File(modelOutputDirectory, "model.jar").getPath()
         );

Modified: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java?rev=1443625&r1=1443624&r2=1443625&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java (original)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/GoldEntityAndAttributeReaderPipelineForSeedCorpus.java Thu Feb  7 17:47:55 2013
@@ -27,8 +27,10 @@ import org.apache.log4j.Logger;
 import org.apache.uima.UIMAException;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.metadata.TypeSystemDescription;
 import org.uimafit.component.xwriter.XWriter;
+import org.uimafit.factory.AggregateBuilder;
 import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.factory.TypeSystemDescriptionFactory;
@@ -63,8 +65,20 @@ public class GoldEntityAndAttributeReade
 		
 		String parentDirectoryString = args[0];
 		//String parentDirectoryString = "/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/Seattle Group Health/UMLS_CEM";
-		logger.info("parent directory: " + parentDirectoryString);
+
 		File parentDirectory = new File(parentDirectoryString);
+		readSharpUmlsCem(parentDirectory);
+		
+	}
+
+	public static void readSharpUmlsCem(File parentDirectory) throws ResourceInitializationException, UIMAException, IOException {
+		readSharpUmlsCem(parentDirectory, null, null, null);
+	}
+	
+	public static void readSharpUmlsCem(File parentDirectory, File trainDirectory, File testDirectory, File devDirectory)
+			throws ResourceInitializationException, UIMAException, IOException {
+//		logger.info("parent directory: " + parentDirectoryString);
+//		File parentDirectory = new File(parentDirectoryString);
 		if (!parentDirectory.exists())
 		{
 			logger.fatal("parent directory does not exist! exiting!");
@@ -121,22 +135,16 @@ public class GoldEntityAndAttributeReade
 					// (found in ctakes-type-system/src/main/resources)
 				TypeSystemDescriptionFactory.createTypeSystemDescription();
 			
+			AggregateBuilder aggregate = new AggregateBuilder();
+			
 			CollectionReaderDescription collectionReader = CollectionReaderFactory.createDescription(
 					FilesInDirectoryCollectionReader.class,
 					typeSystemDescription,
 					"InputDirectory",
-					//"/Users/m081914/work/data/sharp/Seed Corpus/Mayo/UMLS_CEM/ss1_batch04/Knowtator/text"
-					//"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/text"
 					textDirectory.toString()
 					);
 			
-//			AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
-//					GoldEntityAndAttributeReader.class,
-//					typeSystemDescription,
-//					"InputDirectory",
-//					//"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/knowtator/"
-//					xmlDirectory.toString() + "/"
-//					);
+			// read the UMLS_CEM data from Knowtator
 			AnalysisEngineDescription goldAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
 					SHARPKnowtatorXMLReader.class,
 					typeSystemDescription,
@@ -144,28 +152,59 @@ public class GoldEntityAndAttributeReade
 					//"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/knowtator/"
 					textDirectory.toString() + "/"
 			);
+			aggregate.add(goldAnnotator);
+
+			// write just the XMI version of what's in Knowtator UMLS_CEM
+			AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
+					XWriter.class,
+					typeSystemDescription,
+					XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+					xmiDirectory.toString(),
+					XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+					CtakesFileNamer.class.getName()
+			);
+			aggregate.add(xWriter);
+
+			// fill in other values that are necessary for preprocessing
+			AnalysisEngineDescription preprocessAnnotator = AnalysisEngineFactory.createAnalysisEngineDescription(
+					"desc/analysis_engine/AttributeDiscoveryPreprocessor"
+					);
+			aggregate.add(preprocessAnnotator);
 			
-//			AnalysisEngineDescription sysAnnotator = (AnalysisEngineDescription) AnalysisEngineFactory.createAnalysisEngineFromPath(
-//					"/Users/m081914/work/sharpattr/ctakes/ctakes-clinical-pipeline" +
-//					"/desc/analysis_engine/AttributeClassifierPreprocessor.xml"
-//					);
-			
-	    AnalysisEngineDescription xWriter = AnalysisEngineFactory.createPrimitiveDescription(
-	        XWriter.class,
-	        typeSystemDescription,
-	        XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
-//	        "/Users/m081914/work/sharpattr/ctakes/ctakes-assertion/data/output",
-	        // "/work/medfacts/sharp/data/2012-10-09_full_data_set/batch02"
-	        //"/work/medfacts/sharp/data/2012-10-16_full_data_set_updated/Seed_Corpus/sandbox/batch02_mayo/xmi",
-	        xmiDirectory.toString(),
-	        XWriter.PARAM_FILE_NAMER_CLASS_NAME,
-	        CtakesFileNamer.class.getName()
-	        );
-	    
-			SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter);
+			if (trainDirectory!=null && testDirectory!=null && devDirectory!=null) {
+				File subcorpusDirectory;
+				switch (SharpCorpusSplit.split(currentBatchDirectory)) {
+				case TRAIN: 
+					subcorpusDirectory = trainDirectory;
+					break;
+				case TEST:
+					subcorpusDirectory = testDirectory;
+					break;
+				case DEV:
+					subcorpusDirectory = devDirectory;
+					break;
+				case CROSSVAL:
+					subcorpusDirectory = trainDirectory;
+					break;
+				default:
+					subcorpusDirectory = trainDirectory;
+					break;
+				}
+				AnalysisEngineDescription xWriter2 = AnalysisEngineFactory.createPrimitiveDescription(
+						XWriter.class,
+						typeSystemDescription,
+						XWriter.PARAM_OUTPUT_DIRECTORY_NAME,
+						subcorpusDirectory,
+						XWriter.PARAM_FILE_NAMER_CLASS_NAME,
+						CtakesFileNamer.class.getName()
+				);
+				aggregate.add(xWriter2);
+//				SimplePipeline.runPipeline(collectionReader, goldAnnotator, xWriter, xWriter2);
+			}
+
+			SimplePipeline.runPipeline(collectionReader, aggregate.createAggregateDescription());
 		}
-		
+
 		logger.info("Finished!");
-		
 	}
 }

Added: incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java?rev=1443625&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java (added)
+++ incubator/ctakes/trunk/ctakes-assertion/src/main/java/org/apache/ctakes/assertion/pipelines/SharpCorpusSplit.java Thu Feb  7 17:47:55 2013
@@ -0,0 +1,40 @@
+package org.apache.ctakes.assertion.pipelines;
+
+import java.io.File;
+import java.util.HashMap;
+import java.util.Map;
+
+public class SharpCorpusSplit {
+	public enum Subcorpus { TRAIN, TEST, DEV, CROSSVAL }
+	
+	private static Map<String,Subcorpus> map = new HashMap<String,Subcorpus>();
+	static {
+		map.put("ss1_batch02", Subcorpus.TRAIN); 
+		map.put("ss1_batch03", Subcorpus.TRAIN); 
+		map.put("ss1_batch04", Subcorpus.TRAIN); 
+		map.put("ss1_batch05", Subcorpus.TRAIN); 
+		map.put("ss1_batch06", Subcorpus.TRAIN); 
+		map.put("ss1_batch07", Subcorpus.TRAIN); 
+		map.put("ss1_batch08", Subcorpus.TRAIN); 
+		map.put("ss1_batch09", Subcorpus.TRAIN); 
+		map.put("ss1_batch10", Subcorpus.DEV); 
+		map.put("ss1_batch11", Subcorpus.TEST); 
+		map.put("ss1_batch12", Subcorpus.TEST); 
+		map.put("ss1_batch13", Subcorpus.TRAIN); 
+		map.put("ss1_batch14", Subcorpus.TRAIN); 
+		map.put("ss1_batch15", Subcorpus.TRAIN); 
+		map.put("ss1_batch16", Subcorpus.TRAIN); 
+		map.put("ss1_batch17", Subcorpus.DEV); 
+		map.put("ss1_batch18", Subcorpus.TRAIN); 
+		map.put("ss1_batch19", Subcorpus.TRAIN); 
+	}
+	
+	public static Subcorpus split( File directory ) {
+		if (map.containsKey(directory.getName())) {
+//			System.out.println(directory.toString());
+			return map.get(directory.getName());
+		} else {
+			return Subcorpus.TRAIN;
+		}
+	}
+}



Mime
View raw message