incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1417934 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/ ae/feature/selection/ eval/
Date Thu, 06 Dec 2012 15:20:09 GMT
Author: clin
Date: Thu Dec  6 15:20:07 2012
New Revision: 1417934

URL: http://svn.apache.org/viewvc?rev=1417934&view=rev
Log:
add chi-square feature selection and downsampling to event detection.

The current chi-square feature selection method is only designed for categorical features. Numeric features do not need to go through the feature selection step.

The downsampling ratio can be controlled by a command-line argument, for example:
--downratio 0.6
It will write 60% of the negative instances into the training file.

The feature selection threshold can be controlled by a command-line argument:
--featureSelect 5
It will keep features whose chi2 values are bigger than 5.
If --featureSelect 0 is given,
then the feature selection step is skipped.

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java   (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java   (with props)
Removed:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1417934&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Dec  6 15:20:07 2012
@@ -0,0 +1,379 @@
+package org.apache.ctakes.temporal.ae;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+//import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+//import java.util.logging.Logger;
+
+//import org.apache.ctakes.temporal.ae.feature.CoveredTextToValuesExtractor;
+import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
+import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
+import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.CleartkAnnotator;
+//import org.cleartk.classifier.DataWriter;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+//import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.chunking.BIOChunking;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CharacterCategoryPatternExtractor.PatternType;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.CoveredTextExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.simple.TypePathExtractor;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.jar.JarClassifierFactory;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.util.JCasUtil;
+
+//import com.google.common.base.Charsets;
+import com.google.common.base.Predicate;
+import com.google.common.collect.Iterables;
+import com.google.common.collect.Lists;
+//import com.google.common.io.LineProcessor;
+//import com.google.common.io.Resources;
+
+public class EventAnnotator extends CleartkAnnotator<String> {
+
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE = "ProbabilityOfKeepingANegativeExample";
+
+  @ConfigurationParameter(
+			name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+			mandatory = false,
+			description = "probability that a negative example should be retained for training")
+  protected Float probabilityOfKeepingANegativeExample = 0.8f;
+  
+  public static final String PARAM_FEATURE_TRIM_ORNOT = "WhetherToDoFeatureSelection";
+
+  @ConfigurationParameter(
+			name = PARAM_FEATURE_TRIM_ORNOT,
+			mandatory = false,
+			description = "set whether feature selection is used or not")
+  public static Float featureTrim = 0f;
+  
+  public static AnalysisEngineDescription createDataWriterDescription(
+      String dataWriterName,
+      File outputDirectory, float downratio, float featureSelect) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        EventAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterName,
+        DefaultDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory,
+        EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        downratio,
+        EventAnnotator.PARAM_FEATURE_TRIM_ORNOT,
+        featureSelect);
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
+      throws ResourceInitializationException {
+	 AnalysisEngineDescription fsEventAnnotator =AnalysisEngineFactory.createPrimitiveDescription(
+        EventAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        JarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        new File(modelDirectory, "model.jar"));
+	 ConfigurationParameterFactory.addConfigurationParameter(
+		fsEventAnnotator,	
+		EventAnnotator.PARAM_NB_FS_URI,
+		EventAnnotator.createNbFSURI(modelDirectory) );
+	 
+     return(fsEventAnnotator);
+  }
+
+  protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
+
+  protected List<CleartkExtractor> contextFeatureExtractors;
+
+  private BIOChunking<BaseToken, EntityMention> entityChunking;
+
+  private BIOChunking<BaseToken, EventMention> eventChunking;
+  
+  public static final String PARAM_NB_FS_URI = ConfigurationParameterFactory.createConfigurationParameterName(
+		      EventAnnotator.class,
+		      "neighborFsUri");
+
+	  @ConfigurationParameter(
+		  mandatory = false,
+		  description = "provides a URI where the neighbor annotation's feature selection data will be written")
+	  protected URI neighborFsUri;
+		    
+  public static final String FS_NEIGHBOR_EXTRACTOR_KEY = "SelectNeighborFeatures";
+
+  private Chi2NeighborFSExtractor<String> chi2NeighborFsExtractor;
+  
+  
+  public static URI createNbFSURI(File outputDirectoryName) {
+	    File f = new File(outputDirectoryName, FS_NEIGHBOR_EXTRACTOR_KEY + "_Chi2_extractor.dat");
+	    return f.toURI();
+	  }
+  //*****feature selection related parameters
+
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+
+    // define chunkings
+    this.entityChunking = new BIOChunking<BaseToken, EntityMention>(
+        BaseToken.class,
+        EntityMention.class,
+        "typeID");
+    this.eventChunking = new BIOChunking<BaseToken, EventMention>(
+        BaseToken.class,
+        EventMention.class);
+      
+    //configure FS extractor:
+    if (featureTrim > 0){//if feature selection
+        CombinedExtractor forneighbors    = new CombinedExtractor(
+    			new CoveredTextExtractor(),
+    			new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+    			new TypePathExtractor(BaseToken.class, "partOfSpeech"),
+    			new SurfaceFormFeatureExtractor(),
+    	      	new PhraseExtractor(),
+    	      	new SRLExtractor());
+        
+        try {
+    		this.chi2NeighborFsExtractor = initNbFSExtractor(forneighbors);
+    	} catch (IOException e) {
+    		e.printStackTrace();
+    	}
+    }else{//if no feature selection
+        // add features: word, stem, pos and more
+        this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
+        // try {
+        this.tokenFeatureExtractors.addAll(Arrays.asList(
+        		new CoveredTextExtractor(),
+//            	new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+//            	new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
+//            	new SurfaceFormFeatureExtractor(),
+//            	new PhraseExtractor(),
+            	new SRLExtractor()));
+        		// new CoveredTextToValuesExtractor("ACF", StringToDoublesProcessor.parse("/word_freq.lst")),
+        		// new CoveredTextToValuesExtractor("PCA", StringToDoublesProcessor.parse("/word_pca.lst")),
+        		// new CoveredTextToValuesExtractor("TimPCA", StringToDoublesProcessor.parse("/tim_word_pca.txt"))));
+
+        //add window of features before and after
+        CombinedExtractor subExtractor = new CombinedExtractor(
+        	new CoveredTextExtractor(),
+        	new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
+        	new TypePathExtractor(BaseToken.class, "partOfSpeech"),//);
+        	new SurfaceFormFeatureExtractor(),
+        	new SRLExtractor());
+
+        this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
+        this.contextFeatureExtractors.add(new CleartkExtractor(
+        	BaseToken.class,
+        	subExtractor,
+        	new Preceding(3),
+        	new Following(3)));
+    }
+
+
+  }
+
+
+private Chi2NeighborFSExtractor<String> initNbFSExtractor(
+		CombinedExtractor subextractor) throws IOException{
+
+	Chi2NeighborFSExtractor<String> chi2NbFSExtractor = new  Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, BaseToken.class, subextractor, featureTrim, new Preceding(4),
+	    	new Following(4)); //the 3rd last parameter is used to control chi2 threshold, the last two are used to control window size
+	
+	if (this.neighborFsUri != null) {
+		chi2NbFSExtractor.load(this.neighborFsUri);
+	    }
+	return chi2NbFSExtractor;
+}
+
+
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+	  
+	Random rand = new Random();
+    // classify tokens within each sentence
+    for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
+      List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
+
+      // during training, the list of all outcomes for the tokens
+      List<String> outcomes;
+      if (this.isTraining()) {
+        List<EventMention> events = JCasUtil.selectCovered(jCas, EventMention.class, sentence);
+        outcomes = this.eventChunking.createOutcomes(jCas, tokens, events);
+      }
+      // during prediction, the list of outcomes predicted so far
+      else {
+        outcomes = new ArrayList<String>();
+      }
+
+      // get BIO entity tags for each entity type
+      int[] entityTypeIDs = new int[] {
+          CONST.NE_TYPE_ID_ANATOMICAL_SITE,
+          CONST.NE_TYPE_ID_DISORDER,
+          CONST.NE_TYPE_ID_DRUG,
+          CONST.NE_TYPE_ID_FINDING,
+          CONST.NE_TYPE_ID_PROCEDURE,
+          CONST.NE_TYPE_ID_UNKNOWN };
+      List<EntityMention> entities = JCasUtil.selectCovered(jCas, EntityMention.class, sentence);
+      Map<Integer, List<String>> entityTagsByType = new HashMap<Integer, List<String>>();
+      for (int typeID : entityTypeIDs) {
+        Predicate<EntityMention> hasTypeID = hasEntityType(typeID);
+        List<EntityMention> subEntities = Lists.newArrayList(Iterables.filter(entities, hasTypeID));
+        entityTagsByType.put(typeID, this.entityChunking.createOutcomes(jCas, tokens, subEntities));
+      }
+
+      // extract features for all tokens
+      int tokenIndex = -1;
+      int window = 2;
+      int nPreviousClassifications = 2;
+      
+      for (BaseToken token : tokens) {
+        ++tokenIndex;
+
+        List<Feature> features = new ArrayList<Feature>();
+        
+        if (featureTrim >0 ){//if feature selection
+        	features.addAll(this.chi2NeighborFsExtractor.extract(jCas, token)); //base features
+        	features.addAll(this.chi2NeighborFsExtractor.extractWithin(jCas, token, sentence)); //neighbor features
+        	features.addAll(this.chi2NeighborFsExtractor.extract(entityTypeIDs, entityTagsByType,tokenIndex, window)); // features from surrounding entities
+        	features.addAll(this.chi2NeighborFsExtractor.extract(nPreviousClassifications, tokenIndex, outcomes)); //features from previous classifications
+        }else{ //if no feature selection
+        	// features from token attributes
+            for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
+              features.addAll(extractor.extract(jCas, token));
+            }
+            // features from surrounding tokens
+            for (CleartkExtractor extractor : this.contextFeatureExtractors) {
+              features.addAll(extractor.extractWithin(jCas, token, sentence));
+            }
+            // features from surrounding entities
+            for (int typeID : entityTypeIDs) {
+              List<String> tokenEntityTags = entityTagsByType.get(typeID);
+              int begin = Math.max(tokenIndex - window, 0);
+              int end = Math.min(tokenIndex + window, tokenEntityTags.size());
+              for (int i = begin; i < end; ++i) {
+                String name = String.format("EntityTag_%d_%d", typeID, i - begin);
+                features.add(new Feature(name, tokenEntityTags.get(i)));
+              }
+            }
+            // features from previous classifications
+            for (int i = nPreviousClassifications; i > 0; --i) {
+              int index = tokenIndex - i;
+              String previousOutcome = index < 0 ? "O" : outcomes.get(index);
+              features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+            }
+        }
+        
+        // if training, write to data file
+        if (this.isTraining()) {
+            String outcome = outcomes.get(tokenIndex);
+            if(outcome.equals("O")){ //if it is an "O". downsample it
+          	  if (rand.nextDouble()<=probabilityOfKeepingANegativeExample)
+          		  this.dataWriter.write(new Instance<String>(outcome, features));
+            }else {
+          	  this.dataWriter.write(new Instance<String>(outcome, features));
+            }
+          }
+
+        // if predicting, add prediction to outcomes
+        else {
+          outcomes.add(this.classifier.classify(features));
+        }
+      }
+
+      // during prediction, convert chunk labels to events and add them to the CAS
+      if (!this.isTraining()) {
+        this.eventChunking.createChunks(jCas, tokens, outcomes);
+      }
+    }
+  }
+
+  private static Predicate<EntityMention> hasEntityType(final int typeID) {
+    return new Predicate<EntityMention>() {
+      public boolean apply(EntityMention mention) {
+        return mention.getTypeID() == typeID;
+      }
+    };
+  }
+
+//  private static class StringToDoublesProcessor implements LineProcessor<Map<String, double[]>> {
+//    private Logger logger = Logger.getLogger(this.getClass().getName());
+//
+//    private Map<String, double[]> result = new HashMap<String, double[]>();
+//
+//    private int length = -1;
+//
+//    @Override
+//    public Map<String, double[]> getResult() {
+//      return this.result;
+//    }
+//
+//    @Override
+//    public boolean processLine(String line) throws IOException {
+//      String[] parts = line.trim().split(",");
+//      String key = parts[0];
+//      int partsOffset = 0;
+//      if (this.length == -1) {
+//        this.length = parts.length;
+//      } else if (parts.length != this.length) {
+//        String message = "expected %d parts, found %d, skipping line '%s'";
+//        this.logger.warning(String.format(message, this.length, parts.length, line));
+//        return true;
+//      }
+//      double[] values = new double[parts.length - 1];
+//      for (int i = 0; i < values.length; ++i) {
+//        values[i] = Double.parseDouble(parts[i + 1 + partsOffset]);
+//      }
+//      this.result.put(key, values);
+//      return true;
+//    }
+//  }
+
+
+public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
+	return this.chi2NeighborFsExtractor;
+}
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java?rev=1417934&r1=1417933&r2=1417934&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/SRLExtractor.java Thu Dec  6 15:20:07 2012
@@ -19,6 +19,7 @@
 package org.apache.ctakes.temporal.ae.feature;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
@@ -38,23 +39,23 @@ public class SRLExtractor implements Sim
   public List<Feature> extract(JCas jCas, Annotation focusAnnotation)
       throws CleartkExtractorException {
     // and cache the results so that we only do this once per CAS
-	//String jCasText = jCas.getDocumentText();
+	String jCasText = jCas.getDocumentText();
 	String roleFeat = "SemanticRole";
-	//String roleVerbFeat = "RoleAndVerb";
-	//String verb = "noVerb";
+	String roleVerbFeat = "RoleAndVerb";
+	String verb = "noVerb";
     Feature role = new Feature(roleFeat, "NoRole");
-    //Feature roleVerb = new Feature(roleVerbFeat, "NoRole"+verb);
+    Feature roleVerb = new Feature(roleVerbFeat, "NoRole"+verb);
     ArrayList<Feature> features = new ArrayList<Feature>();
     for (Predicate predicate : JCasUtil.select(jCas, Predicate.class)) {
 
       for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, predicate)) {
         if (token.equals(focusAnnotation)) {// token.getBegin()==focusAnnotation.getBegin()){
           role = new Feature(roleFeat,"Predicate");
-          //verb = jCasText.substring(predicate.getBegin(), predicate.getEnd());
-          //roleVerb = new Feature(roleVerbFeat, "Predicate::"+verb);
+          verb = jCasText.substring(predicate.getBegin(), predicate.getEnd());
+          roleVerb = new Feature(roleVerbFeat, "Predicate::"+verb);
           
           features.add(role);
-          //features.add(roleVerb);
+          features.add(roleVerb);
           return features;
         }
       }
@@ -67,13 +68,13 @@ public class SRLExtractor implements Sim
         for (BaseToken token : JCasUtil.selectCovered(jCas, BaseToken.class, arg)) {
           if (token.equals(focusAnnotation)) {// token.getBegin()==focusAnnotation.getBegin()){
             String label = arg.getLabel();
-            //Predicate currentPred = relation.getPredicate();
-            //verb = jCasText.substring(currentPred.getBegin(), currentPred.getEnd());
+            Predicate currentPred = relation.getPredicate();
+            verb = jCasText.substring(currentPred.getBegin(), currentPred.getEnd());
             role = new Feature(roleFeat, label);
-            //roleVerb = new Feature(roleVerbFeat, label+"::"+verb);
+            roleVerb = new Feature(roleVerbFeat, label+"::"+verb);
             
             features.add(role);
-            //features.add(roleVerb);
+            features.add(roleVerb);
             return features;
           }
         }
@@ -81,7 +82,7 @@ public class SRLExtractor implements Sim
     }
 
     features.add(role);
-    //features.add(roleVerb);
+    features.add(roleVerb);
     return features;
   }
 

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java?rev=1417934&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java Thu Dec  6 15:20:07 2012
@@ -0,0 +1,471 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ * 
+ * Selects features via Chi-squared statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Chen Lin
+ * 
+ */
+public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
+		implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
+	
+			/**
+			   * A Bounds implementation that puts no restrictions on the context.
+			   */
+			  private static class NoBounds implements Bounds {
+
+			    public NoBounds() {
+			    }
+
+			    @Override
+			    public boolean contains(Annotation annotation) {
+			      return true;
+			    }
+
+			  }
+
+			/**
+			   * A Bounds implementation that restricts the context to annotations within a given span.
+			   */
+		private static class SpanBounds implements Bounds {
+
+		private int begin;
+
+		private int end;
+
+		public SpanBounds(int begin, int end) {
+		    this.begin = begin;
+		    this.end = end;
+		}
+
+		@Override
+		public boolean contains(Annotation annotation) {
+		    return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
+		}
+
+	}
+
+		/**
+		   * Helper class for aggregating and computing mutual Chi2 statistics
+		*/
+		public static class Chi2Evaluator<OUTCOME_T> {
+			 protected Multiset<OUTCOME_T> classCounts;
+
+			 protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+			 public Chi2Evaluator() {
+			      this.classCounts = HashMultiset.<OUTCOME_T> create();
+			      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+			 }
+
+			 public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+			      Integer count = this.featValueClassCount.get(featureName, outcome);
+			      if (count == null) {
+			        count = 0;
+			      }
+			      this.featValueClassCount.put(featureName, outcome, count + occurrences);
+			      this.classCounts.add(outcome, occurrences);
+			 }
+
+			 public double Chi2Cal(String featureName) {
+			      // notation index of 0 means false, 1 mean true
+				  //Contingency Table:
+				  //    | class1 | class2 | class3 | sum
+				  //posi| 		 |        |        | posiFeatCount
+				  //nega|        |        |        | negaFeatCount
+				  //    | outcnt1| outcnt2| outcnt3| n
+				  
+				  int numOfClass = this.classCounts.elementSet().size();
+			      int[] posiOutcomeCounts = new int[numOfClass];
+			      int[] outcomeCounts = new int[numOfClass];
+			      int classId = 0;
+			      int posiFeatCount = 0;
+			      for ( OUTCOME_T clas: this.classCounts.elementSet()){
+			    	  posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)? 
+			    			  this.featValueClassCount.get(featureName, clas)
+					          : 0;
+			    	  posiFeatCount += posiOutcomeCounts[classId];
+			    	  outcomeCounts[classId] = this.classCounts.count(clas);
+			    	  classId ++;
+			      }
+			      
+			      int n = this.classCounts.size();
+			      int negaFeatCount = n - posiFeatCount;
+			      
+			      double chi2val = 0.0;
+			      
+			      if (posiFeatCount == 0 || posiFeatCount == n){ //all instances have same value on this feature, degree of freedom = 0
+			    	  return chi2val;			    	  
+			      }
+			      
+			      boolean yates = true;
+			      for (int lbl =0; lbl < numOfClass; lbl++){
+			    	  	//for positive part of feature:
+				    	  double expected = (outcomeCounts[lbl]/(double)n)*(posiFeatCount);
+				    	  if (expected > 0){
+				    		  double diff = Math.abs(posiOutcomeCounts[lbl]-expected);
+				    		  if (yates){ // apply Yate's correction
+				    			  diff -= 0.5;
+				    		  }
+				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
+				    	  }
+				    		  
+				    	  //for negative part of feature:
+				    	  expected = (outcomeCounts[lbl]/(double)n)*(negaFeatCount);
+				    	  double observ = outcomeCounts[lbl]-posiOutcomeCounts[lbl];
+				    	  if (expected > 0){
+				    		  double diff = Math.abs(observ-expected);
+				    		  if (yates){ // apply Yate's correction
+				    			  diff -= 0.5;
+				    		  }
+				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
+				    	  }
+			      }
+
+			      return chi2val;
+			    }
+
+			    
+			 public void save(URI outputURI) throws IOException {
+			      File out = new File(outputURI);
+			      BufferedWriter writer = null;
+			      writer = new BufferedWriter(new FileWriter(out));
+
+			      // Write out header
+			      writer.append("Chi2 FS Neighbor Data\n");
+			      writer.append("Feature\t");
+			      writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
+			      writer.append("\n");
+
+			      // Write out Chi2 values for all features
+			      for (String featureName : this.featValueClassCount.rowKeySet()) {
+			        writer.append(featureName);
+			        writer.append("\t");
+			        writer.append(String.format("%f", this.Chi2Cal(featureName)));
+			        writer.append("\n");
+			      }
+			      writer.append("\n");
+			      writer.append(this.featValueClassCount.toString());
+			      writer.close();
+			    }
+			 
+			 public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
+			      return new ComputeFeatureScore<OUTCOME_T>(this);
+			    }
+
+			    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+			      private Chi2Evaluator<OUTCOME_T> stats;
+
+			      public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
+			        this.stats = stats;
+			      }
+
+			      @Override
+			      public Double apply(String featureName) {
+			        Double featureChi2 = stats.Chi2Cal(featureName);
+			        return featureChi2;
+			      }
+
+			    }
+	}
+			
+			
+	protected boolean isTrained;
+	private CombinedExtractor subExtractor;
+	private List<String> selectedFeatures;
+	private double chi2Threshold;
+	private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
+	private Context[] contexts;
+	private Class<? extends Annotation> annotationClass;
+
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
+		super(name);
+		this.annotationClass = annotationClass;
+		this.init(featureExtractor, 0.0);
+		this.contexts = contexts;
+	}
+	
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
+		super(name);
+		this.annotationClass = annotationClass;
+		this.init(featureExtractor, thres);
+		this.contexts = contexts;
+	}
+
+	public Chi2NeighborFSExtractor(String fsNeighborExtractorKey, Float thres) {
+		super(fsNeighborExtractorKey);
+		this.isTrained=false;
+		this.chi2Threshold = thres;
+	}
+
+	// Shared constructor logic: record the wrapped extractor and the threshold.
+	private void init(CombinedExtractor featureExtractor, double thres) {
+		this.subExtractor= featureExtractor;
+		this.chi2Threshold = thres;
+	}
+
+	@Override
+	public List<Feature> extract(JCas view, Annotation focusAnnotation)
+			throws CleartkExtractorException {
+		// Delegate the actual feature extraction to the wrapped extractor.
+		List<Feature> raw = this.subExtractor.extract(view, focusAnnotation);
+	    List<Feature> kept = new ArrayList<Feature>();
+	    if (!this.isTrained) {
+	      // Not trained yet: bundle everything into one transformable container
+	      // so the selection pass can revisit these features later.
+	      kept.add(new TransformableFeature(this.name, raw));
+	    } else {
+	      // Trained: keep only the features that passed chi-square selection.
+	      kept.addAll(Collections2.filter(raw, this));
+	    }
+	    return kept;
+	}
+	
+	public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
+		      throws CleartkExtractorException {
+		    // Gather context features from every configured neighborhood.
+		    List<Feature> gathered = new ArrayList<Feature>();
+		    for (Context context : this.contexts) {
+		      gathered.addAll(context.extract(
+		          view,
+		          focusAnnotation,
+		          bounds,
+		          this.annotationClass,
+		          this.subExtractor));
+		    }
+		    List<Feature> kept = new ArrayList<Feature>();
+		    if (!this.isTrained) {
+		      // Pre-training: wrap in one uber-container for later transformation.
+		      kept.add(new TransformableFeature(this.name, gathered));
+		    } else {
+		      // Post-training: retain only chi-square-selected features.
+		      kept.addAll(Collections2.filter(gathered, this));
+		    }
+		    return kept;
+		  }
+
+	/**
+	  * Extract features from the annotations around the focus annotation and within the given bounds.
+	   * 
+	   * @param view
+	   *          The JCas containing the focus annotation.
+	   * @param focusAnnotation
+	   *          The annotation whose context is to be searched.
+	   * @param boundsAnnotation
+	   *          The boundary within which context annotations may be identified.
+	   * @return The features extracted in the context of the focus annotation.
+	   * @throws CleartkExtractorException
+	   *          if the wrapped extractor fails on any context annotation.
+	   */
+	public List<Feature> extractWithin(
+	      JCas view,
+	      Annotation focusAnnotation,
+	      Annotation boundsAnnotation) throws CleartkExtractorException {
+	    // Convert the bounding annotation's span into explicit Bounds.
+	    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
+	    return this.extract(view, focusAnnotation, bounds);
+	}
+	  
+	/**
+	 * Predicate used with Collections2.filter: a feature survives only if its
+	 * name (or name:value form) was selected during training.
+	 * NOTE(review): selectedFeatures is a List, so this is a linear scan per
+	 * feature — a HashSet would make each lookup O(1).
+	 */
+	@Override
+	public boolean apply(Feature feature) {
+		return this.selectedFeatures.contains(this.nameFeature(feature));
+	}
+	
+	/**
+	 * Builds the identifier under which a feature is counted: numeric features
+	 * are identified by name alone, categorical ones by "name:value" so each
+	 * distinct value is scored separately.
+	 */
+	public String nameFeature(Feature feature) {
+	    if (feature.getValue() instanceof Number) {
+	      return feature.getName();
+	    }
+	    return feature.getName() + ":" + feature.getValue();
+	  }
+
+	/**
+	 * Aggregates feature/outcome counts over all training instances, drops
+	 * features whose chi-square score does not exceed the threshold, and stores
+	 * the survivors in descending score order.
+	 */
+	@Override
+	public void train(Iterable<Instance<OUTCOME_T>> instances) {
+		// Step 1: aggregate statistics for all features.
+	    this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
+
+	    for (Instance<OUTCOME_T> instance : instances) {
+	      OUTCOME_T outcome = instance.getOutcome();
+	      for (Feature feature : instance.getFeatures()) {
+	        if (this.isTransformable(feature)) {
+	          // Unpack the uber-container written during the pre-training pass.
+	          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+	        	  chi2Evaluator.update(this.nameFeature(untransformedFeature), outcome, 1);
+	          }
+	        }else{
+	        	chi2Evaluator.update(this.nameFeature(feature), outcome, 1);
+	        }
+	      }
+	    }
+
+	    // Step 2: remove features with chi-square scores at or below the
+	    // threshold. Removing from the row key set also removes the rows from
+	    // the backing count table.
+	    Set<String> featureNames = chi2Evaluator.featValueClassCount.rowKeySet();
+	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+	    Iterator<String> iter = featureNames.iterator();
+	    while (iter.hasNext()){
+	    	String feat = iter.next();
+	    	Double chi2 = computeScore.apply(feat);
+	    	if(chi2 <= this.chi2Threshold){
+	    		iter.remove();
+	    	}
+	    }
+
+	    // Step 3: keep the surviving features, ordered by descending chi-square score.
+	    this.selectedFeatures = Ordering.natural().onResultOf(
+        this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
+        featureNames);
+
+		this.isTrained = true;
+	}
+
+	/**
+	 * Persists the selected features (one "name&lt;TAB&gt;score" line each) so
+	 * that {@link #load(URI)} can restore the selection without re-training.
+	 *
+	 * @throws IOException if called before training, or if writing fails
+	 */
+	@Override
+	public void save(URI uri) throws IOException {
+		if (!this.isTrained) {
+		      throw new IOException("Chi2NeighborFSExtractor: Cannot save before training.");
+		}
+		File out = new File(uri);
+	    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+	    try {
+	      ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+	      for (String feature : this.selectedFeatures) {
+	        writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
+	      }
+	    } finally {
+	      // Close even if a write fails, so the file handle is never leaked.
+	      writer.close();
+	    }
+	}
+
+	/**
+	 * Restores the feature selection produced by {@link #save(URI)}: each line
+	 * holds a feature name and its score separated by a tab; only the name is
+	 * needed at extraction time. Marks the extractor as trained on success.
+	 *
+	 * @throws IOException if the file cannot be read
+	 */
+	@Override
+	public void load(URI uri) throws IOException {
+		this.selectedFeatures = Lists.newArrayList();
+	    File in = new File(uri);
+	    BufferedReader reader = new BufferedReader(new FileReader(in));
+	    try {
+	      String line;
+	      while ((line = reader.readLine()) != null) {
+	        String[] featureValuePair = line.split("\\t");
+	        this.selectedFeatures.add(featureValuePair[0]);
+	      }
+	    } finally {
+	      // Close even if reading fails, so the file handle is never leaked.
+	      reader.close();
+	    }
+	    this.isTrained = true;
+	}
+
+	/**
+	 * Extracts features from the text span between two annotations by building
+	 * a temporary annotation covering that gap.
+	 * NOTE(review): if annotation1 ends after annotation2 begins, the resulting
+	 * span has begin &gt; end — confirm callers guarantee ordering.
+	 */
+	@Override
+	public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
+			Annotation annotation2) throws CleartkExtractorException {
+		int begin = annotation1.getEnd();
+	    int end = annotation2.getBegin();
+	    // FIXME: creating a new annotation may leak memory - is there a better approach?
+	    Annotation focusAnnotation = new Annotation(jCas, begin, end);
+	    return this.extract(jCas, focusAnnotation, new NoBounds());
+	}
+
+	/**
+	 * Builds entity-tag window features for each requested entity type: one
+	 * feature per position in the window [tokenIndex - window, tokenIndex + window),
+	 * clipped to the tag list's range.
+	 */
+	public Collection<? extends Feature> extract(int[] entityTypeIDs, Map<Integer, List<String>> entityTagsByType, int tokenIndex, int window) {
+		List<Feature> gathered = new ArrayList<Feature>();
+	    for (int typeID : entityTypeIDs) {
+	      List<String> tokenEntityTags = entityTagsByType.get(typeID);
+	      int first = Math.max(tokenIndex - window, 0);
+	      int last = Math.min(tokenIndex + window, tokenEntityTags.size());
+	      for (int position = first; position < last; ++position) {
+	        String name = String.format("EntityTag_%d_%d", typeID, position - first);
+	        gathered.add(new Feature(name, tokenEntityTags.get(position)));
+	      }
+	    }
+	    List<Feature> kept = new ArrayList<Feature>();
+	    if (!this.isTrained) {
+	      // Pre-training: wrap in one uber-container for later transformation.
+	      kept.add(new TransformableFeature(this.name, gathered));
+	    } else {
+	      // Post-training: retain only chi-square-selected features.
+	      kept.addAll(Collections2.filter(gathered, this));
+	    }
+	    return kept;
+	}
+
+	/**
+	 * Builds features from the n most recent classifier decisions; positions
+	 * before the start of the sequence default to the "O" (outside) outcome.
+	 */
+	public Collection<? extends Feature> extract(int nPreviousClassifications,
+			int tokenIndex, List<String> outcomes) {
+		List<Feature> gathered = new ArrayList<Feature>();
+		for (int offset = nPreviousClassifications; offset > 0; --offset) {
+		  int index = tokenIndex - offset;
+		  String previousOutcome = (index < 0) ? "O" : outcomes.get(index);
+		  gathered.add(new Feature("PreviousOutcome_" + offset, previousOutcome));
+		}
+		List<Feature> kept = new ArrayList<Feature>();
+		if (!this.isTrained) {
+		  // Pre-training: wrap in one uber-container for later transformation.
+		  kept.add(new TransformableFeature(this.name, gathered));
+		} else {
+		  // Post-training: retain only chi-square-selected features.
+		  kept.addAll(Collections2.filter(gathered, this));
+		}
+		return kept;
+	}
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1417934&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Thu Dec  6 15:20:07 2012
@@ -0,0 +1,182 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.logging.FileHandler;
+import java.util.logging.Formatter;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+import java.util.logging.Logger;
+
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceStream;
+import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
+import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.util.ViewURIUtil;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.pipeline.SimplePipeline;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Ordering;
+
+/**
+ * Base class for span-based evaluations: trains an annotator (optionally with
+ * chi-square feature selection), then scores system spans against gold spans
+ * and logs dropped/added spans with surrounding context.
+ */
+public abstract class EvaluationOfAnnotationSpans_ImplBase extends
+    Evaluation_ImplBase<AnnotationStatistics<String>> {
+
+  // Per-instance logger; span errors found by test(...) are written here.
+  private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+  /**
+   * Routes this evaluation's log records to the given file at the given level,
+   * creating the parent directory if necessary.
+   * NOTE(review): each call adds another FileHandler without removing or
+   * closing earlier ones — confirm this is only called once per run.
+   */
+  public void setLogging(Level level, File outputFile) throws IOException {
+    if (!outputFile.getParentFile().exists()) {
+      outputFile.getParentFile().mkdirs();
+    }
+    this.logger.setLevel(level);
+    FileHandler handler = new FileHandler(outputFile.getPath());
+    handler.setFormatter(new Formatter() {
+      @Override
+      public String format(LogRecord record) {
+        // Bare message per line; no timestamps or level prefixes.
+        return record.getMessage() + '\n';
+      }
+    });
+    this.logger.addHandler(handler);
+  }
+
+  public EvaluationOfAnnotationSpans_ImplBase(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      Set<AnnotatorType> annotatorFlags) {
+    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, patientSets, annotatorFlags);
+  }
+
+  /** Description of the data writer used during the training pass. */
+  protected abstract AnalysisEngineDescription getDataWriterDescription(File directory)
+      throws ResourceInitializationException;
+
+  /** Trains the classifier on the written instances and packages the model. */
+  protected abstract void trainAndPackage(File directory) throws Exception;
+
+  /**
+   * Runs preprocessing plus the data writer over the training documents; if
+   * feature selection is enabled (EventAnnotator.featureTrim &gt; 0), re-reads
+   * the written instances, trains and saves the chi-square selector, and
+   * rewrites the instances in LIBSVM format. Finally trains and packages.
+   */
+  @Override
+  protected void train(CollectionReader collectionReader, File directory) throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(this.getPreprocessorTrainDescription());
+    aggregateBuilder.add(this.getDataWriterDescription(directory));
+    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+    
+    if( EventAnnotator.featureTrim > 0 ){
+    	//Extracting features and writing instances
+        Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+        // Train the chi-square selector on one pass over the instances, then
+        // save it so the annotator can load it at test time.
+        URI chi2NbFsURI = EventAnnotator.createNbFSURI(directory);
+        Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, EventAnnotator.featureTrim);
+        chi2NbFsExtractor.train(instances);
+        chi2NbFsExtractor.save(chi2NbFsURI);
+        //now write in the libsvm format (second pass over the instances)
+        this.logger.info("Write out model training data");
+        LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
+        for (Instance<String> instance : instances) {
+          instance = chi2NbFsExtractor.transform(instance);
+          dataWriter.write(instance);
+        }
+        dataWriter.finish();
+    }
+    
+    this.trainAndPackage(directory);
+  }
+
+  /** Description of the trained annotator to evaluate. */
+  protected abstract AnalysisEngineDescription getAnnotatorDescription(File directory)
+      throws ResourceInitializationException;
+
+  /** Gold-standard annotations to score against, taken from the gold view. */
+  protected abstract Collection<? extends Annotation> getGoldAnnotations(JCas jCas);
+
+  /** System annotations to score, taken from the default view. */
+  protected abstract Collection<? extends Annotation> getSystemAnnotations(JCas jCas);
+
+  /**
+   * Runs the annotator over the test documents, accumulates span statistics,
+   * and logs every dropped (gold-only) or added (system-only) span with up to
+   * 50 characters of context on each side.
+   */
+  @Override
+  protected AnnotationStatistics<String> test(CollectionReader collectionReader, File directory)
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(this.getPreprocessorTestDescription());
+    aggregateBuilder.add(this.getAnnotatorDescription(directory));
+
+    AnnotationStatistics<String> stats = new AnnotationStatistics<String>();
+    // Order annotations lexicographically by (begin, end) so span sets compare
+    // purely on offsets, not object identity.
+    Ordering<Annotation> bySpans = Ordering.<Integer> natural().lexicographical().onResultOf(
+        new Function<Annotation, List<Integer>>() {
+          @Override
+          public List<Integer> apply(Annotation annotation) {
+            return Arrays.asList(annotation.getBegin(), annotation.getEnd());
+          }
+        });
+    for (JCas jCas : new JCasIterable(collectionReader, aggregateBuilder.createAggregate())) {
+      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      Collection<? extends Annotation> goldAnnotations = this.getGoldAnnotations(goldView);
+      Collection<? extends Annotation> systemAnnotations = this.getSystemAnnotations(systemView);
+      stats.add(goldAnnotations, systemAnnotations);
+
+      Set<Annotation> goldSet = new TreeSet<Annotation>(bySpans);
+      goldSet.addAll(goldAnnotations);
+      Set<Annotation> systemSet = new TreeSet<Annotation>(bySpans);
+      systemSet.addAll(systemAnnotations);
+
+      // Spans present in gold but missed by the system.
+      Set<Annotation> goldOnly = new TreeSet<Annotation>(bySpans);
+      goldOnly.addAll(goldSet);
+      goldOnly.removeAll(systemSet);
+
+      // Spans produced by the system but absent from gold.
+      Set<Annotation> systemOnly = new TreeSet<Annotation>(bySpans);
+      systemOnly.addAll(systemSet);
+      systemOnly.removeAll(goldSet);
+
+      // Flatten newlines so each logged error fits on one line.
+      String text = jCas.getDocumentText().replaceAll("[\r\n]", " ");
+      if (!goldOnly.isEmpty() || !systemOnly.isEmpty()) {
+        this.logger.fine("Errors in : " + ViewURIUtil.getURI(jCas).toString());
+        Set<Annotation> errors = new TreeSet<Annotation>(bySpans);
+        errors.addAll(goldOnly);
+        errors.addAll(systemOnly);
+        for (Annotation annotation : errors) {
+          int begin = annotation.getBegin();
+          int end = annotation.getEnd();
+          int windowBegin = Math.max(0, begin - 50);
+          int windowEnd = Math.min(text.length(), end + 50);
+          String label = goldOnly.contains(annotation) ? "DROPPED:" : "ADDED:  ";
+          this.logger.fine(String.format(
+              "%s  ...%s[!%s!]%s...",
+              label,
+              text.substring(windowBegin, begin),
+              text.substring(begin, end),
+              text.substring(end, windowEnd)));
+        }
+      }
+    }
+    return stats;
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1417934&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Thu Dec  6 15:20:07 2012
@@ -0,0 +1,132 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.logging.Level;
+
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.feature.transform.InstanceDataWriter;
+import org.cleartk.classifier.jar.JarClassifierBuilder;
+import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
+import org.cleartk.eval.AnnotationStatistics;
+import org.uimafit.util.JCasUtil;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+
+/**
+ * Cross-validation evaluation of event span detection, with optional
+ * downsampling of negative instances and chi-square feature selection.
+ */
+public class EvaluationOfEventSpans extends EvaluationOfAnnotationSpans_ImplBase {
+
+  /**
+   * Runs a 4-fold cross-validation and prints per-fold and overall span
+   * statistics to stderr. See the Options interface for the accepted flags.
+   */
+  public static void main(String[] args) throws Exception {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
+        new File("target/eval"),
+        options.getRawTextDirectory(),
+        options.getKnowtatorXMLDirectory(),
+        options.getPatients().getList(),
+        options.getDownSampleRatio(),
+    	options.getFeatureSelect()); //control apply feature selection or not
+    evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
+    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(4);
+    for (AnnotationStatistics<String> stats : foldStats) {
+      System.err.println(stats);
+    }
+    System.err.println("OVERALL");
+    System.err.println(AnnotationStatistics.addAll(foldStats));
+  }
+  
+  // Fraction of negative training instances to keep (downsampling ratio).
+  private float downratio;
+
+  // Chi-square cut-off for feature selection; <= 0 disables selection.
+  private float featureTrim;
+
+  public EvaluationOfEventSpans(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      float downratio, float featureSelect) {
+    super(
+        baseDirectory,
+        rawTextDirectory,
+        knowtatorXMLDirectory,
+        patientSets,
+        EnumSet.of(AnnotatorType.PART_OF_SPEECH_TAGS,
+        //AnnotatorType.UMLS_NAMED_ENTITIES,
+//        AnnotatorType.LEXICAL_VARIANTS,
+        AnnotatorType.DEPENDENCIES,
+        AnnotatorType.SEMANTIC_ROLES));
+    this.downratio = downratio;
+    this.featureTrim = featureSelect;
+  }
+
+  /**
+   * When feature selection is enabled (threshold &gt; 0) the instances are
+   * first serialized generically so chi-square statistics can be gathered;
+   * otherwise they go straight to the LIBSVM training format.
+   */
+  @Override
+  protected AnalysisEngineDescription getDataWriterDescription(File directory)
+      throws ResourceInitializationException {
+    String dataWriterClassName = this.featureTrim > 0
+        ? InstanceDataWriter.class.getName()
+        : LIBSVMStringOutcomeDataWriter.class.getName();
+    return EventAnnotator.createDataWriterDescription(
+        dataWriterClassName,
+        directory,
+        this.downratio,
+        this.featureTrim);
+  }
+
+  /** Trains LIBSVM with cost parameter C = 10000 and packages the model. */
+  @Override
+  protected void trainAndPackage(File directory) throws Exception {
+    JarClassifierBuilder.trainAndPackage(directory, "-c", "10000");
+  }
+
+  /** Entity mentions come from the gold view at test time. */
+  @Override
+  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
+    List<Class<? extends TOP>> result = super.getAnnotationClassesThatShouldBeGoldAtTestTime();
+    result.add(EntityMention.class);
+    return result;
+  }
+
+  @Override
+  protected AnalysisEngineDescription getAnnotatorDescription(File directory)
+      throws ResourceInitializationException {
+    return EventAnnotator.createAnnotatorDescription(directory);
+  }
+
+  @Override
+  protected Collection<? extends Annotation> getGoldAnnotations(JCas jCas) {
+    return JCasUtil.select(jCas, EventMention.class);
+  }
+
+  @Override
+  protected Collection<? extends Annotation> getSystemAnnotations(JCas jCas) {
+    return JCasUtil.select(jCas, EventMention.class);
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1417934&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Thu Dec  6 15:20:07 2012
@@ -0,0 +1,399 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.core.resource.SuffixMaxentModelResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearParserDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearParserSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.temporal.ae.THYMEKnowtatorXMLReader;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.TOP;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.component.ViewTextCopierAnnotator;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ExternalResourceFactory;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.collect.Lists;
+import com.lexicalscope.jewel.cli.Option;
+
+public abstract class Evaluation_ImplBase<STATISTICS_TYPE> extends
+    org.cleartk.eval.Evaluation_ImplBase<Integer, STATISTICS_TYPE> {
+
+  public enum AnnotatorType {
+    PART_OF_SPEECH_TAGS, UMLS_NAMED_ENTITIES, LEXICAL_VARIANTS, DEPENDENCIES, SEMANTIC_ROLES
+  };
+
+  protected final String GOLD_VIEW_NAME = "GoldView";
+
+  /** Command-line options shared by the evaluation entry points. */
+  static interface Options {
+
+    // Root directory of the raw text documents (doc<set> subdirectories).
+    @Option(longName = "text")
+    public File getRawTextDirectory();
+
+    // Root directory of the Knowtator XML gold annotations.
+    @Option(longName = "xml")
+    public File getKnowtatorXMLDirectory();
+
+    // Patient set numbers to include, e.g. "--patients 1-4".
+    @Option(longName = "patients")
+    public CommandLine.IntegerRanges getPatients();
+    
+    // Fraction of negative instances written to the training file.
+    @Option(longName = "downratio")
+	public float getDownSampleRatio();
+
+    @Option(longName = "featureSelect")
+    public float getFeatureSelect(); //get feature selection cut-off threshold if it is > 0; apply no FS if featureSelect == 0 
+  }
+
+  // Root of the raw text documents; documents live in "doc<set>" subdirectories.
+  protected File rawTextDirectory;
+
+  // Root of the Knowtator XML gold annotations.
+  protected File knowtatorXMLDirectory;
+
+  // Patient set numbers used as cross-validation items.
+  protected List<Integer> patientSets;
+
+  // Which optional preprocessing annotators to include in the pipeline.
+  private Set<AnnotatorType> annotatorFlags;
+
+  /**
+   * @param baseDirectory working directory for models and evaluation output
+   * @param rawTextDirectory root of the raw text documents
+   * @param knowtatorXMLDirectory root of the Knowtator XML gold annotations
+   * @param patientSets patient set numbers used as cross-validation items
+   * @param annotatorFlags optional preprocessing annotators to enable
+   */
+  public Evaluation_ImplBase(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      Set<AnnotatorType> annotatorFlags) {
+    super(baseDirectory);
+    this.rawTextDirectory = rawTextDirectory;
+    this.knowtatorXMLDirectory = knowtatorXMLDirectory;
+    this.patientSets = patientSets;
+    this.annotatorFlags = annotatorFlags;
+  }
+
+  /** Cross-validates over all configured patient sets with the given fold count. */
+  public List<STATISTICS_TYPE> crossValidation(int nFolds) throws Exception {
+    return this.crossValidation(this.patientSets, nFolds);
+  }
+
+  /**
+   * Builds a reader over every file in the "doc&lt;set&gt;" subdirectory of
+   * each requested patient set.
+   *
+   * @throws IllegalArgumentException if a set's directory is missing or unreadable
+   */
+  @Override
+  protected CollectionReader getCollectionReader(List<Integer> patientSets) throws Exception {
+    List<File> files = new ArrayList<File>();
+    for (Integer set : patientSets) {
+      File setTextDirectory = new File(this.rawTextDirectory, "doc" + set);
+      File[] setFiles = setTextDirectory.listFiles();
+      if (setFiles == null) {
+        // listFiles() returns null for a missing or unreadable directory;
+        // fail fast with a clear message instead of a NullPointerException.
+        throw new IllegalArgumentException("Not a readable directory: " + setTextDirectory);
+      }
+      for (File file : setFiles) {
+        files.add(file);
+      }
+    }
+    return UriCollectionReader.getCollectionReaderFromFiles(files);
+  }
+
+  /** Preprocessing pipeline for training (gold annotations in the default view). */
+  protected AnalysisEngineDescription getPreprocessorTrainDescription() throws Exception {
+    return this.getPreprocessorDescription(PipelineType.TRAIN);
+  }
+
+  /** Preprocessing pipeline for testing (gold annotations in a separate view). */
+  protected AnalysisEngineDescription getPreprocessorTestDescription() throws Exception {
+    return this.getPreprocessorDescription(PipelineType.TEST);
+  }
+
+  /**
+   * Annotation types to copy from the gold view into the system view at test
+   * time; empty by default, extended by subclasses.
+   */
+  protected List<Class<? extends TOP>> getAnnotationClassesThatShouldBeGoldAtTestTime() {
+    return new ArrayList<Class<? extends TOP>>();
+  }
+
+  // Distinguishes the training pipeline from the testing pipeline when
+  // building the shared preprocessing description.
+  private static enum PipelineType {
+    TRAIN, TEST
+  };
+
+  /**
+   * Builds the preprocessing pipeline that runs before training or testing: document text
+   * loading, gold-annotation loading, segmentation, sentence/token detection, and the
+   * optional annotators (POS tags, UMLS entities, lexical variants, dependencies, semantic
+   * roles) selected via {@code this.annotatorFlags}.
+   *
+   * @param pipelineType whether the pipeline is being built for TRAIN or TEST
+   * @return an aggregate analysis engine description for the configured preprocessors
+   * @throws Exception if any component description cannot be created
+   */
+  private AnalysisEngineDescription getPreprocessorDescription(PipelineType pipelineType)
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+    switch (pipelineType) {
+      case TRAIN:
+        // at training time, gold annotations are read directly into the default view
+        aggregateBuilder.add(THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory));
+        break;
+      case TEST:
+        // at test time, gold annotations go into a separate gold view so that system
+        // output in the default view can later be compared against them
+        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+            ViewCreatorAnnotator.class,
+            ViewCreatorAnnotator.PARAM_VIEW_NAME,
+            GOLD_VIEW_NAME));
+        aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+            ViewTextCopierAnnotator.class,
+            ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+            CAS.NAME_DEFAULT_SOFA,
+            ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+            GOLD_VIEW_NAME));
+        aggregateBuilder.add(
+            THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory),
+            CAS.NAME_DEFAULT_SOFA,
+            GOLD_VIEW_NAME);
+        // copy back into the default view any annotation types that should be
+        // gold-standard inputs at test time
+        for (Class<? extends TOP> annotationClass : this.getAnnotationClassesThatShouldBeGoldAtTestTime()) {
+          aggregateBuilder.add(AnnotationCopier.getDescription(
+              GOLD_VIEW_NAME,
+              CAS.NAME_DEFAULT_SOFA,
+              annotationClass));
+        }
+        break;
+    }
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        SentenceDetector.class,
+        "MaxentModel",
+        ExternalResourceFactory.createExternalResourceDescription(
+            SuffixMaxentModelResourceImpl.class,
+            SentenceDetector.class.getResource("../sentdetect/sdmed.mod"))));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags if requested
+    if (this.annotatorFlags.contains(AnnotatorType.PART_OF_SPEECH_TAGS)) {
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          POSTagger.class,
+          POSTagger.POS_MODEL_FILE_PARAM,
+          "org/apache/ctakes/postagger/models/mayo-pos.zip",
+          POSTagger.TAG_DICTIONARY_PARAM,
+          "org/apache/ctakes/postagger/models/tag.dictionary.txt",
+          POSTagger.CASE_SENSITIVE_PARAM,
+          true));
+    }
+
+    // identify UMLS named entities if requested
+    if (this.annotatorFlags.contains(AnnotatorType.UMLS_NAMED_ENTITIES)) {
+      // remove gold mentions if they're there (we'll add cTAKES mentions later instead)
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(EntityMentionRemover.class));
+      // identify chunks
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          Chunker.class,
+          Chunker.CHUNKER_MODEL_FILE_PARAM,
+          Chunker.class.getResource("../models/chunk-model.claims-1.5.zip").toURI().getPath(),
+          Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+          DefaultChunkCreator.class));
+      // adjust NP in NP NP to span both
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          ChunkAdjuster.class,
+          ChunkAdjuster.PARAM_CHUNK_PATTERN,
+          new String[] { "NP", "NP" },
+          ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+          1));
+      // adjust NP in NP PP NP to span all three
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          ChunkAdjuster.class,
+          ChunkAdjuster.PARAM_CHUNK_PATTERN,
+          new String[] { "NP", "PP", "NP" },
+          ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+          2));
+      // add lookup windows for each NP
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
+      // maximize lookup windows
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          OverlapAnnotator.class,
+          "A_ObjectClass",
+          LookupWindowAnnotation.class,
+          "B_ObjectClass",
+          LookupWindowAnnotation.class,
+          "OverlapType",
+          "A_ENV_B",
+          "ActionType",
+          "DELETE",
+          "DeleteAction",
+          new String[] { "selector=B" }));
+      // add UMLS on top of lookup windows
+      String umlsUser = System.getProperty("umls.user");
+      String umlsPassword = System.getProperty("umls.password");
+      if (umlsUser == null || umlsPassword == null) {
+        throw new IllegalArgumentException(
+            "The properties umls.user and umls.password must be set to use the "
+                + "UmlsDictionaryLookupAnnotator. You can set them by providing java with the "
+                + "arguments -Dumls.user=... and -Dumls.password=...");
+      }
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+          UmlsDictionaryLookupAnnotator.class,
+          "UMLSAddr",
+          "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
+          "UMLSVendor",
+          "NLM-6515182895",
+          "UMLSUser",
+          umlsUser,
+          "UMLSPW",
+          umlsPassword,
+          "LookupDescriptor",
+          ExternalResourceFactory.createExternalResourceDescription(
+              FileResourceImpl.class,
+              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../LookupDesc_Db.xml")),
+          "DbConnection",
+          ExternalResourceFactory.createExternalResourceDescription(
+              JdbcConnectionResourceImpl.class,
+              "",
+              JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
+              "org.hsqldb.jdbcDriver",
+              JdbcConnectionResourceImpl.PARAM_URL,
+              "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+          "RxnormIndexReader",
+          ExternalResourceFactory.createExternalResourceDescription(
+              LuceneIndexReaderResourceImpl.class,
+              "",
+              "UseMemoryIndex",
+              true,
+              "IndexDirectory",
+              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../rxnorm_index")),
+          "OrangeBookIndexReader",
+          ExternalResourceFactory.createExternalResourceDescription(
+              LuceneIndexReaderResourceImpl.class,
+              "",
+              "UseMemoryIndex",
+              true,
+              "IndexDirectory",
+              getResourceAsFile(UmlsDictionaryLookupAnnotator.class, "../OrangeBook"))));
+    }
+
+    // add lvg annotator
+    if (this.annotatorFlags.contains(AnnotatorType.LEXICAL_VARIANTS)) {
+      String[] XeroxTreebankMap = {
+          "adj|JJ",
+          "adv|RB",
+          "aux|AUX",
+          "compl|CS",
+          "conj|CC",
+          "det|DET",
+          "modal|MD",
+          "noun|NN",
+          "prep|IN",
+          "pron|PRP",
+          "verb|VB" };
+      String[] ExclusionSet = {
+          "and",
+          "And",
+          "by",
+          "By",
+          "for",
+          "For",
+          "in",
+          "In",
+          "of",
+          "Of",
+          "on",
+          "On",
+          "the",
+          "The",
+          "to",
+          "To",
+          "with",
+          "With" };
+      AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+          LvgAnnotator.class,
+          "UseSegments",
+          false,
+          "SegmentsToSkip",
+          new String[0],
+          "UseCmdCache",
+          false,
+          "CmdCacheFileLocation",
+          "/org/apache/ctakes/lvg/2005_norm.voc",
+          "CmdCacheFrequencyCutoff",
+          20,
+          "ExclusionSet",
+          ExclusionSet,
+          "XeroxTreebankMap",
+          XeroxTreebankMap,
+          "LemmaCacheFileLocation",
+          "/org/apache/ctakes/lvg/2005_lemma.voc",
+          "UseLemmaCache",
+          false,
+          "LemmaCacheFrequencyCutoff",
+          20,
+          "PostLemmas",
+          true,
+          "LvgCmdApi",
+          ExternalResourceFactory.createExternalResourceDescription(
+              LvgCmdApiResourceImpl.class,
+              getResourceAsFile(LvgAnnotator.class, "../data/config/lvg.properties")));
+      aggregateBuilder.add(lvgAnnotator);
+    }
+
+    // add dependency parser
+    if (this.annotatorFlags.contains(AnnotatorType.DEPENDENCIES)) {
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserDependencyParserAE.class));
+    }
+
+    // add semantic role labeler
+    if (this.annotatorFlags.contains(AnnotatorType.SEMANTIC_ROLES)) {
+      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearParserSemanticRoleLabelerAE.class));
+    }
+    return aggregateBuilder.createAggregateDescription();
+  }
+
+  /**
+   * Resolves a classpath resource to a {@link File}. This is a hack to deal with classes
+   * that don't handle resources correctly: it only works when the resource is an actual
+   * file on disk (not packed inside a jar), but the UMLS code assumes that anyway.
+   *
+   * @param cls the class relative to which the resource path is resolved
+   * @param path the resource path, relative to {@code cls}
+   * @return the resource as a {@code File}
+   * @throws URISyntaxException if the resource URL cannot be converted to a URI
+   * @throws IllegalArgumentException if no such resource exists on the classpath
+   */
+  private static File getResourceAsFile(Class<?> cls, String path) throws URISyntaxException {
+    // fail with a descriptive message instead of a bare NullPointerException
+    java.net.URL resourceUrl = cls.getResource(path);
+    if (resourceUrl == null) {
+      throw new IllegalArgumentException("no such resource: " + path + " relative to " + cls);
+    }
+    // this will fail if the resource is not a real File, but the UMLS code assumes that
+    return new File(resourceUrl.toURI());
+  }
+
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        if (chunk.getChunkType().equals("NP")) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
+  /**
+   * Deletes every {@link EntityMention} from the CAS indexes (e.g. to clear gold mentions
+   * before cTAKES adds its own).
+   */
+  public static class EntityMentionRemover extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      // snapshot the mentions first: removing from the indexes while iterating over the
+      // live index collection would invalidate the iteration
+      Iterable<EntityMention> mentions = JCasUtil.select(jCas, EntityMention.class);
+      for (EntityMention mention : Lists.newArrayList(mentions)) {
+        mention.removeFromIndexes();
+      }
+    }
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message