incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From stevenbeth...@apache.org
Subject svn commit: r1424631 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/selection/ eval/
Date Thu, 20 Dec 2012 18:46:11 GMT
Author: stevenbethard
Date: Thu Dec 20 18:46:11 2012
New Revision: 1424631

URL: http://svn.apache.org/viewvc?rev=1424631&view=rev
Log:
Updates temporal evaluation for new SHARP data. Refactors some awful duplication in feature selection code.

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java
      - copied, changed from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelection.java
      - copied, changed from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelection.java
      - copied, changed from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java   (with props)
Removed:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKEventSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKTimeSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventProperties.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java Thu Dec 20 18:46:11 2012
@@ -31,7 +31,8 @@ import java.util.Random;
 import org.apache.ctakes.temporal.ae.feature.PhraseExtractor;
 import org.apache.ctakes.temporal.ae.feature.SRLExtractor;
 import org.apache.ctakes.temporal.ae.feature.SurfaceFormFeatureExtractor;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
+import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
@@ -60,7 +61,6 @@ import org.cleartk.classifier.jar.Direct
 import org.cleartk.classifier.jar.GenericJarClassifierFactory;
 import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.factory.ConfigurationParameterFactory;
 import org.uimafit.util.JCasUtil;
 
 import com.google.common.base.Predicate;
@@ -114,36 +114,37 @@ public class EventAnnotator extends Clea
 
   public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
       throws ResourceInitializationException {
-    AnalysisEngineDescription fsEventAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+    return AnalysisEngineFactory.createPrimitiveDescription(
         EventAnnotator.class,
         CleartkAnnotator.PARAM_IS_TRAINING,
         false,
         GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
-        new File(modelDirectory, "model.jar"));
-    ConfigurationParameterFactory.addConfigurationParameter(
-        fsEventAnnotator,
+        new File(modelDirectory, "model.jar"),
         EventAnnotator.PARAM_FEATURE_SELECTION_URI,
         EventAnnotator.createFeatureSelectionURI(modelDirectory));
-
-    return (fsEventAnnotator);
   }
 
-  protected List<SimpleFeatureExtractor> tokenFeatureExtractors;
+  protected SimpleFeatureExtractor tokenFeatureExtractor;
 
-  protected List<CleartkExtractor> contextFeatureExtractors;
+  protected CleartkExtractor contextFeatureExtractor;
 
   private BIOChunking<BaseToken, EntityMention> entityChunking;
 
   private BIOChunking<BaseToken, EventMention> eventChunking;
 
-  public static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
+  private FeatureSelection<String> featureSelection;
 
-  private Chi2NeighborFSExtractor<String> featureSelectionExtractor;
+  private static final String FEATURE_SELECTION_NAME = "SelectNeighborFeatures";
 
+  public static FeatureSelection<String> createFeatureSelection(double threshold) {
+    return new Chi2FeatureSelection<String>(EventAnnotator.FEATURE_SELECTION_NAME, threshold);
+  }
+  
   public static URI createFeatureSelectionURI(File outputDirectoryName) {
     return new File(outputDirectoryName, FEATURE_SELECTION_NAME + "_Chi2_extractor.dat").toURI();
   }
 
+
   // *****feature selection related parameters
 
   @Override
@@ -159,39 +160,31 @@ public class EventAnnotator extends Clea
         BaseToken.class,
         EventMention.class);
 
-    CombinedExtractor subExtractor = new CombinedExtractor(
+    this.tokenFeatureExtractor = new CombinedExtractor(
         new CoveredTextExtractor(),
         new CharacterCategoryPatternExtractor(PatternType.ONE_PER_CHAR),
         new TypePathExtractor(BaseToken.class, "partOfSpeech"),
         new SurfaceFormFeatureExtractor(),
         new PhraseExtractor(),
         new SRLExtractor());
+    this.contextFeatureExtractor = new CleartkExtractor(
+        BaseToken.class,
+        this.tokenFeatureExtractor,
+        new Preceding(3),
+        new Following(3));
 
-    if (featureSelectionThreshold > 0) {
-      this.featureSelectionExtractor = new Chi2NeighborFSExtractor<String>(
-          EventAnnotator.FEATURE_SELECTION_NAME,
-          BaseToken.class,
-          subExtractor,
-          this.featureSelectionThreshold,
-          new Preceding(4),
-          new Following(4));
+    if (featureSelectionThreshold == 0) {
+      this.featureSelection = null;
+    } else {
+      this.featureSelection = EventAnnotator.createFeatureSelection(this.featureSelectionThreshold);
 
       if (this.featureSelectionURI != null) {
         try {
-          this.featureSelectionExtractor.load(this.featureSelectionURI);
+          this.featureSelection.load(this.featureSelectionURI);
         } catch (IOException e) {
           throw new ResourceInitializationException(e);
         }
       }
-    } else {
-      this.tokenFeatureExtractors = new ArrayList<SimpleFeatureExtractor>();
-      this.tokenFeatureExtractors.add(subExtractor);
-      this.contextFeatureExtractors = new ArrayList<CleartkExtractor>();
-      this.contextFeatureExtractors.add(new CleartkExtractor(
-          BaseToken.class,
-          subExtractor,
-          new Preceding(3),
-          new Following(3)));
     }
   }
 
@@ -240,53 +233,39 @@ public class EventAnnotator extends Clea
 
         List<Feature> features = new ArrayList<Feature>();
 
-        if (featureSelectionThreshold > 0) {// if feature selection
-          features.addAll(this.featureSelectionExtractor.extract(jCas, token)); // base features
-          features.addAll(this.featureSelectionExtractor.extractWithin(jCas, token, sentence)); // neighbor
-          // features
-          features.addAll(this.featureSelectionExtractor.extract(
-              entityTypeIDs,
-              entityTagsByType,
-              tokenIndex,
-              window)); // features from surrounding entities
-          features.addAll(this.featureSelectionExtractor.extract(
-              nPreviousClassifications,
-              tokenIndex,
-              outcomes)); // features from previous classifications
-        } else { // if no feature selection
-          // features from token attributes
-          for (SimpleFeatureExtractor extractor : this.tokenFeatureExtractors) {
-            features.addAll(extractor.extract(jCas, token));
-          }
-          // features from surrounding tokens
-          for (CleartkExtractor extractor : this.contextFeatureExtractors) {
-            features.addAll(extractor.extractWithin(jCas, token, sentence));
-          }
-          // features from surrounding entities
-          for (int typeID : entityTypeIDs) {
-            List<String> tokenEntityTags = entityTagsByType.get(typeID);
-            int begin = Math.max(tokenIndex - window, 0);
-            int end = Math.min(tokenIndex + window, tokenEntityTags.size());
-            for (int i = begin; i < end; ++i) {
-              String name = String.format("EntityTag_%d_%d", typeID, i - begin);
-              features.add(new Feature(name, tokenEntityTags.get(i)));
-            }
-          }
-          // features from previous classifications
-          for (int i = nPreviousClassifications; i > 0; --i) {
-            int index = tokenIndex - i;
-            String previousOutcome = index < 0 ? "O" : outcomes.get(index);
-            features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+        // features from token attributes
+        features.addAll(this.tokenFeatureExtractor.extract(jCas, token));
+
+        // features from surrounding tokens
+        features.addAll(this.contextFeatureExtractor.extractWithin(jCas, token, sentence));
+
+        // features from surrounding entities
+        for (int typeID : entityTypeIDs) {
+          List<String> tokenEntityTags = entityTagsByType.get(typeID);
+          int begin = Math.max(tokenIndex - window, 0);
+          int end = Math.min(tokenIndex + window, tokenEntityTags.size());
+          for (int i = begin; i < end; ++i) {
+            String name = String.format("EntityTag_%d_%d", typeID, i - begin);
+            features.add(new Feature(name, tokenEntityTags.get(i)));
           }
         }
+        // features from previous classifications
+        for (int i = nPreviousClassifications; i > 0; --i) {
+          int index = tokenIndex - i;
+          String previousOutcome = index < 0 ? "O" : outcomes.get(index);
+          features.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+        }
+
+        // apply feature selection, if necessary
+        if (this.featureSelection != null) {
+          features = this.featureSelection.transform(features);
+        }
 
         // if training, write to data file
         if (this.isTraining()) {
           String outcome = outcomes.get(tokenIndex);
-          if (outcome.equals("O")) { // if it is an "O". downsample it
-            if (rand.nextDouble() <= probabilityOfKeepingANegativeExample)
-              this.dataWriter.write(new Instance<String>(outcome, features));
-          } else {
+          // if it is an "O" down-sample it
+          if (!outcome.equals("O") || rand.nextDouble() <= this.probabilityOfKeepingANegativeExample) {
             this.dataWriter.write(new Instance<String>(outcome, features));
           }
         }
@@ -312,8 +291,4 @@ public class EventAnnotator extends Clea
       }
     };
   }
-
-  public Chi2NeighborFSExtractor<String> getChi2NbSubExtractor() {
-    return this.featureSelectionExtractor;
-  }
 }

Copied: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java (from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java?p2=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java&p1=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java&r1=1424215&r2=1424631&rev=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FeatureSelection.java Thu Dec 20 18:46:11 2012
@@ -7,464 +7,184 @@ import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.net.URI;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
+
 import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
-import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
 import org.cleartk.classifier.feature.transform.TransformableFeature;
 
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Collections2;
 import com.google.common.collect.HashBasedTable;
 import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Lists;
 import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
 import com.google.common.collect.Table;
 
 /**
  * 
- * Selects features via Chi-squared statistics between the features extracted from its
- * sub-extractor and the outcome values they are paired with in classification instances.
+ * Selects features via Chi-squared statistics between the features extracted from its sub-extractor
+ * and the outcome values they are paired with in classification instances.
  * 
  * @author Chen Lin
  * 
  */
-public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
-		implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
-	
-			/**
-			   * A Bounds implementation that puts no restrictions on the context.
-			   */
-			  private static class NoBounds implements Bounds {
-
-			    public NoBounds() {
-			    }
-
-			    @Override
-			    public boolean contains(Annotation annotation) {
-			      return true;
-			    }
-
-			  }
-
-			/**
-			   * A Bounds implementation that restricts the context to annotations within a given span.
-			   */
-		private static class SpanBounds implements Bounds {
-
-		private int begin;
-
-		private int end;
-
-		public SpanBounds(int begin, int end) {
-		    this.begin = begin;
-		    this.end = end;
-		}
-
-		@Override
-		public boolean contains(Annotation annotation) {
-		    return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
-		}
-
-	}
-
-		/**
-		   * Helper class for aggregating and computing mutual Chi2 statistics
-		*/
-		public static class Chi2Evaluator<OUTCOME_T> {
-			 protected Multiset<OUTCOME_T> classCounts;
-
-			 protected Table<String, OUTCOME_T, Integer> featValueClassCount;
-
-			 public Chi2Evaluator() {
-			      this.classCounts = HashMultiset.<OUTCOME_T> create();
-			      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
-			 }
-
-			 public void update(String featureName, OUTCOME_T outcome, int occurrences) {
-			      Integer count = this.featValueClassCount.get(featureName, outcome);
-			      if (count == null) {
-			        count = 0;
-			      }
-			      this.featValueClassCount.put(featureName, outcome, count + occurrences);
-			      this.classCounts.add(outcome, occurrences);
-			 }
-
-			 public double Chi2Cal(String featureName) {
-			      // notation index of 0 means false, 1 mean true
-				  //Contingency Table:
-				  //    | class1 | class2 | class3 | sum
-				  //posi| 		 |        |        | posiFeatCount
-				  //nega|        |        |        | negaFeatCount
-				  //    | outcnt1| outcnt2| outcnt3| n
-				  
-				  int numOfClass = this.classCounts.elementSet().size();
-			      int[] posiOutcomeCounts = new int[numOfClass];
-			      int[] outcomeCounts = new int[numOfClass];
-			      int classId = 0;
-			      int posiFeatCount = 0;
-			      for ( OUTCOME_T clas: this.classCounts.elementSet()){
-			    	  posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)? 
-			    			  this.featValueClassCount.get(featureName, clas)
-					          : 0;
-			    	  posiFeatCount += posiOutcomeCounts[classId];
-			    	  outcomeCounts[classId] = this.classCounts.count(clas);
-			    	  classId ++;
-			      }
-			      
-			      int n = this.classCounts.size();
-			      int negaFeatCount = n - posiFeatCount;
-			      
-			      double chi2val = 0.0;
-			      
-			      if (posiFeatCount == 0 || posiFeatCount == n){ //all instances have same value on this feature, degree of freedom = 0
-			    	  return chi2val;			    	  
-			      }
-			      
-			      boolean yates = true;
-			      for (int lbl =0; lbl < numOfClass; lbl++){
-			    	  	//for positive part of feature:
-				    	  double expected = (outcomeCounts[lbl]/(double)n)*(posiFeatCount);
-				    	  if (expected > 0){
-				    		  double diff = Math.abs(posiOutcomeCounts[lbl]-expected);
-				    		  if (yates){ // apply Yate's correction
-				    			  diff -= 0.5;
-				    		  }
-				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
-				    	  }
-				    		  
-				    	  //for negative part of feature:
-				    	  expected = (outcomeCounts[lbl]/(double)n)*(negaFeatCount);
-				    	  double observ = outcomeCounts[lbl]-posiOutcomeCounts[lbl];
-				    	  if (expected > 0){
-				    		  double diff = Math.abs(observ-expected);
-				    		  if (yates){ // apply Yate's correction
-				    			  diff -= 0.5;
-				    		  }
-				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
-				    	  }
-			      }
-
-			      return chi2val;
-			    }
-
-			    
-			 public void save(URI outputURI) throws IOException {
-			      File out = new File(outputURI);
-			      BufferedWriter writer = null;
-			      writer = new BufferedWriter(new FileWriter(out));
-
-			      // Write out header
-			      writer.append("Chi2 FS Neighbor Data\n");
-			      writer.append("Feature\t");
-			      writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
-			      writer.append("\n");
-
-			      // Write out Chi2 values for all features
-			      for (String featureName : this.featValueClassCount.rowKeySet()) {
-			        writer.append(featureName);
-			        writer.append("\t");
-			        writer.append(String.format("%f", this.Chi2Cal(featureName)));
-			        writer.append("\n");
-			      }
-			      writer.append("\n");
-			      writer.append(this.featValueClassCount.toString());
-			      writer.close();
-			    }
-			 
-			 public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
-			      return new ComputeFeatureScore<OUTCOME_T>(this);
-			    }
-
-			    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
-
-			      private Chi2Evaluator<OUTCOME_T> stats;
-
-			      public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
-			        this.stats = stats;
-			      }
-
-			      @Override
-			      public Double apply(String featureName) {
-			        Double featureChi2 = stats.Chi2Cal(featureName);
-			        return featureChi2;
-			      }
-
-			    }
-	}
-			
-			
-	protected boolean isTrained;
-	private CombinedExtractor subExtractor;
-	private List<String> selectedFeatures;
-	private double chi2Threshold;
-	private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
-	private Context[] contexts;
-	private Class<? extends Annotation> annotationClass;
-
-	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
-		super(name);
-		this.annotationClass = annotationClass;
-		this.init(featureExtractor, 0.0);
-		this.contexts = contexts;
-	}
-	
-	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
-		super(name);
-		this.annotationClass = annotationClass;
-		this.init(featureExtractor, thres);
-		this.contexts = contexts;
-	}
-
-	public Chi2NeighborFSExtractor(String fsNeighborExtractorKey, Float thres) {
-		super(fsNeighborExtractorKey);
-		this.isTrained=false;
-		this.chi2Threshold = thres;
-	}
-
-	private void init(CombinedExtractor featureExtractor, double thres) {
-		this.subExtractor= featureExtractor;
-		this.chi2Threshold = thres;
-	}
-
-	@Override
-	public List<Feature> extract(JCas view, Annotation focusAnnotation)
-			throws CleartkExtractorException {
-		List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
-	    List<Feature> result = new ArrayList<Feature>();
-	    if (this.isTrained) {
-	      // Filter out selected features
-	      result.addAll(Collections2.filter(extracted, this));
-	    } else {
-	      // We haven't trained this extractor yet, so just mark the existing features
-	      // for future modification, by creating one uber-container feature
-	      result.add(new TransformableFeature(this.name, extracted));
-	    }
-
-	    return result;
-	}
-	
-	public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
-		      throws CleartkExtractorException {
-		    List<Feature> extracted = new ArrayList<Feature>();
-		    for (Context context : this.contexts) {
-			      extracted.addAll(context.extract(
-			          view,
-			          focusAnnotation,
-			          bounds,
-			          this.annotationClass,
-			          this.subExtractor));
-			    }
-		    List<Feature> result = new ArrayList<Feature>();
-		    if (this.isTrained){
-		    	// Filter out selected features
-			    result.addAll(Collections2.filter(extracted, this));
-		    }else{
-		    	// We haven't trained this extractor yet, so just mark the existing features
-			    // for future modification, by creating one uber-container feature
-			    result.add(new TransformableFeature(this.name, extracted));
-		    }
-		    
-		    return result;
-		  }
-
-	/**
-	  * Extract features from the annotations around the focus annotation and within the given bounds.
-	   * 
-	   * @param view
-	   *          The JCas containing the focus annotation.
-	   * @param focusAnnotation
-	   *          The annotation whose context is to be searched.
-	   * @param boundsAnnotation
-	   *          The boundary within which context annotations may be identified.
-	   * @return The features extracted in the context of the focus annotation.
-	   */
-	public List<Feature> extractWithin(
-	      JCas view,
-	      Annotation focusAnnotation,
-	      Annotation boundsAnnotation) throws CleartkExtractorException {
-	    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
-	    return this.extract(view, focusAnnotation, bounds);
-	}
-	  
-	@Override
-	public boolean apply(Feature feature) {
-		return this.selectedFeatures.contains(this.nameFeature(feature));
-	}
-	
-	public String nameFeature(Feature feature) {
-	    return (feature.getValue() instanceof Number) ? feature.getName() : feature.getName() + ":"
-	        + feature.getValue();
-	  }
-
-	@Override
-	public void train(Iterable<Instance<OUTCOME_T>> instances) {
-		// aggregate statistics for all features
-	    this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
-
-	    for (Instance<OUTCOME_T> instance : instances) {
-	      OUTCOME_T outcome = instance.getOutcome();
-	      for (Feature feature : instance.getFeatures()) {
-	        if (this.isTransformable(feature)) {
-	          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
-	        	  chi2Evaluator.update(this.nameFeature(untransformedFeature), outcome, 1);
-	          }
-	        }else{
-	        	chi2Evaluator.update(this.nameFeature(feature), outcome, 1);
-	        }
-	      }
-	    }
-	    // Compute mutual information score for each feature
-	    Set<String> featureNames = chi2Evaluator.featValueClassCount.rowKeySet();
-
-	
-		//step3: remove small chi2 valued features
-	    Iterator<String> iter = featureNames.iterator();
-	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-	    while (iter.hasNext()){
-	    	String feat = iter.next();
-	    	Double chi2 = computeScore.apply(feat);
-	    	if(chi2 <= this.chi2Threshold){
-	    		iter.remove();
-	    	}
-	    }
-	    
-//	    this.selectedFeatures = new ArrayList<String>();
-//	    for (String feature : featureNames){
-//	    	this.selectedFeatures.add(feature);
-//	    }
-//	    
-	    //step4:get selected features
-	    this.selectedFeatures = Ordering.natural().onResultOf(
-        this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
-        featureNames);
-	    
-//	    Iterator<String> iter = featureNames.iterator();
-//	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-//	    this.selectedFeatures = new ArrayList<String>();
-//	    while (iter.hasNext()){
-//	    	String feat = iter.next();
-//	    	Double chi2 = computeScore.apply(feat);
-//	    	if(chi2 > this.chi2Threshold){
-//	    		this.selectedFeatures.add(feat);
-//	    	}
-//	    }
-//		//order the list 
-//	    this.selectedFeatures = Ordering.natural().onResultOf(
-//	          this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
-//	        		  this.selectedFeatures);
-	    
-		this.isTrained = true;
-		
-	}
-
-	@Override
-	public void save(URI uri) throws IOException {
-		if (!this.isTrained) {
-		      throw new IOException("Chi2FSExtractor: Cannot save before training.");
-		}
-		File out = new File(uri);
-	    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
-
-	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-	    for (String feature : this.selectedFeatures) {
-	      writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
-	    }
-
-	    writer.close();
-	}
-
-	@Override
-	public void load(URI uri) throws IOException {
-		this.selectedFeatures = Lists.newArrayList();
-	    File in = new File(uri);
-	    BufferedReader reader = new BufferedReader(new FileReader(in));
-
-	    // The rest of the lines are feature + selection scores
-	    String line = null;
-	    //int n = 0;
-	    while ((line = reader.readLine()) != null ){//&& n < this.numFeatures) {
-	      String[] featureValuePair = line.split("\\t");
-	      this.selectedFeatures.add(featureValuePair[0]);
-	      //n++;
-	    }
-
-	    reader.close();
-	    this.isTrained = true;
-		
-	}
-
-	@Override
-	public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
-			Annotation annotation2) throws CleartkExtractorException {
-		int begin = annotation1.getEnd();
-	    int end = annotation2.getBegin();
-	    // FIXME: creating a new annotation may leak memory - is there a better approach?
-	    Annotation focusAnnotation = new Annotation(jCas, begin, end);
-	    return this.extract(jCas, focusAnnotation, new NoBounds());
-	}
-
-	public Collection<? extends Feature> extract(int[] entityTypeIDs, Map<Integer, List<String>> entityTagsByType, int tokenIndex, int window) {
-		List<Feature> extracted = new ArrayList<Feature>();
-	    List<Feature> result = new ArrayList<Feature>();
-	    for (int typeID : entityTypeIDs) {
-            List<String> tokenEntityTags = entityTagsByType.get(typeID);
-            int begin = Math.max(tokenIndex - window, 0);
-            int end = Math.min(tokenIndex + window, tokenEntityTags.size());
-            for (int i = begin; i < end; ++i) {
-              String featureName = String.format("EntityTag_%d_%d", typeID, i - begin);
-              extracted.add(new Feature(featureName, tokenEntityTags.get(i)));
-            }
+public class Chi2FeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
+
+  /**
+   * Helper class for aggregating feature/outcome counts and computing Chi2 statistics
+   */
+  private static class Chi2Scorer<OUTCOME_T> {
+    protected Multiset<OUTCOME_T> classCounts;
+
+    protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+    public Chi2Scorer() {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+    }
+
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.featValueClassCount.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.featValueClassCount.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    public double score(String featureName) {
+      // notation index of 0 means false, 1 means true
+      // Contingency Table:
+      //      | class1  | class2  | class3  | sum
+      // posi |         |         |         | posiFeatCount
+      // nega |         |         |         | negaFeatCount
+      //      | outcnt1 | outcnt2 | outcnt3 | n
+
+      int numOfClass = this.classCounts.elementSet().size();
+      int[] posiOutcomeCounts = new int[numOfClass];
+      int[] outcomeCounts = new int[numOfClass];
+      int classId = 0;
+      int posiFeatCount = 0;
+      for (OUTCOME_T clas : this.classCounts.elementSet()) {
+        posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)
+            ? this.featValueClassCount.get(featureName, clas)
+            : 0;
+        posiFeatCount += posiOutcomeCounts[classId];
+        outcomeCounts[classId] = this.classCounts.count(clas);
+        classId++;
+      }
+
+      int n = this.classCounts.size();
+      int negaFeatCount = n - posiFeatCount;
+
+      double chi2val = 0.0;
+
+      if (posiFeatCount == 0 || posiFeatCount == n) { // all instances have same value on this
+                                                      // feature, degree of freedom = 0
+        return chi2val;
+      }
+
+      boolean yates = true;
+      for (int lbl = 0; lbl < numOfClass; lbl++) {
+        // for positive part of feature:
+        double expected = (outcomeCounts[lbl] / (double) n) * (posiFeatCount);
+        if (expected > 0) {
+          double diff = Math.abs(posiOutcomeCounts[lbl] - expected);
+          if (yates) { // apply Yates' correction
+            diff -= 0.5;
+          }
+          if (diff > 0)
+            chi2val += Math.pow(diff, 2) / expected;
+        }
+
+        // for negative part of feature:
+        expected = (outcomeCounts[lbl] / (double) n) * (negaFeatCount);
+        double observ = outcomeCounts[lbl] - posiOutcomeCounts[lbl];
+        if (expected > 0) {
+          double diff = Math.abs(observ - expected);
+          if (yates) { // apply Yates' correction
+            diff -= 0.5;
           }
-		if (this.isTrained){
-	    	// Filter out selected features
-		    result.addAll(Collections2.filter(extracted, this));
-	    }else{
-	    	// We haven't trained this extractor yet, so just mark the existing features
-		    // for future modification, by creating one uber-container feature
-		    result.add(new TransformableFeature(this.name, extracted));
-	    }
-	    
-	    return result;
-	}
-
-	public Collection<? extends Feature> extract(int nPreviousClassifications,
-			int tokenIndex, List<String> outcomes) {
-		List<Feature> extracted = new ArrayList<Feature>();
-	    List<Feature> result = new ArrayList<Feature>();
-		// features from previous classifications
-        for (int i = nPreviousClassifications; i > 0; --i) {
-          int index = tokenIndex - i;
-          String previousOutcome = index < 0 ? "O" : outcomes.get(index);
-          extracted.add(new Feature("PreviousOutcome_" + i, previousOutcome));
+          if (diff > 0)
+            chi2val += Math.pow(diff, 2) / expected;
         }
-        
-        if (this.isTrained){
-	    	// Filter out selected features
-		    result.addAll(Collections2.filter(extracted, this));
-	    }else{
-	    	// We haven't trained this extractor yet, so just mark the existing features
-		    // for future modification, by creating one uber-container feature
-		    result.add(new TransformableFeature(this.name, extracted));
-	    }
-	    
-	    return result;
-	}
+      }
+
+      return chi2val;
+    }
+  }
+
+  private double chi2Threshold;
+
+  private Chi2Scorer<OUTCOME_T> chi2Function;
+
+  public Chi2FeatureSelection(String name) {
+    this(name, 0.0);
+  }
+
+  public Chi2FeatureSelection(String name, double threshold) {
+    super(name);
+    this.chi2Threshold = threshold;
+  }
+
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+  }
+
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // aggregate statistics for all features
+    this.chi2Function = new Chi2Scorer<OUTCOME_T>();
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T outcome = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (this.isTransformable(feature)) {
+          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+            this.chi2Function.update(this.getFeatureName(untransformedFeature), outcome, 1);
+          }
+        }
+      }
+    }
+    // keep only large chi2 valued features
+    this.selectedFeatureNames = Sets.newHashSet();
+    for (String featureName : this.chi2Function.featValueClassCount.rowKeySet()) {
+      if (this.chi2Function.score(featureName) > this.chi2Threshold) {
+        this.selectedFeatureNames.add(featureName);
+      }
+    }
+
+    this.isTrained = true;
+  }
+
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IllegalStateException("Cannot save before training");
+    }
+    File out = new File(uri);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+
+    for (String feature : this.selectedFeatureNames) {
+      writer.append(String.format("%s\t%f\n", feature, this.chi2Function.score(feature)));
+    }
+
+    writer.close();
+  }
+
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatureNames = Sets.newLinkedHashSet();
+    File in = new File(uri);
+    BufferedReader reader = new BufferedReader(new FileReader(in));
+
+    // The lines are <feature-name>\t<feature-score>
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      String[] featureValuePair = line.split("\t");
+      this.selectedFeatureNames.add(featureValuePair[0]);
+    }
+
+    reader.close();
+    this.isTrained = true;
 
+  }
 }

Copied: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelection.java (from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelection.java?p2=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelection.java&p1=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java&r1=1424215&r2=1424631&rev=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelection.java Thu Dec 20 18:46:11 2012
@@ -2,6 +2,7 @@ package org.apache.ctakes.temporal.ae.fe
 
 import java.util.ArrayList;
 import java.util.List;
+import java.util.Set;
 
 import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
 import org.cleartk.classifier.feature.transform.TransformableFeature;
@@ -10,26 +11,54 @@ import org.cleartk.classifier.Instance;
 
 import com.google.common.base.Predicate;
 import com.google.common.collect.Collections2;
+import com.google.common.collect.Lists;
+
+public abstract class FeatureSelection<OUTCOME_T> extends
+    TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
+
+  protected boolean isTrained;
+  
+  protected Set<String> selectedFeatureNames;
+
+  public FeatureSelection(String name) {
+    super(name);
+    this.isTrained = false;
+  }
+
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatureNames.contains(this.getFeatureName(feature));
+  }
+
+  @Override
+  public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
+    List<Feature> features = new ArrayList<Feature>();
+    for (Feature feature : instance.getFeatures()) {
+      if (this.isTransformable(feature)) {
+        // Filter down to selected features
+        features.addAll(Collections2.filter(((TransformableFeature) feature).getFeatures(), this));
+      } else {
+        // Pass non-relevant features through w/o filtering
+        features.add(feature);
+      }
+    }
+    return new Instance<OUTCOME_T>(instance.getOutcome(), features);
+  }
+
+  public List<Feature> transform(List<Feature> features) {
+    List<Feature> results = Lists.newArrayList();
+    if (this.isTrained) {
+      results.addAll(Collections2.filter(features, this));
+    } else {
+      results.add(new TransformableFeature(this.name, features));
+    }
+    return results;
+  }
+
+  protected String getFeatureName(Feature feature) {
+    String featureName = feature.getName();
+    Object featureValue = feature.getValue();
+    return featureValue instanceof Number ? featureName : featureName + ":" + featureValue;
+  }
 
-public abstract class FeatureSelectionExtractor<OUTCOME_T> extends
-		TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
-			
-		public FeatureSelectionExtractor(String name) {
-		    super(name);
-		}
-
-		@Override
-		public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
-		    List<Feature> features = new ArrayList<Feature>();
-		    for (Feature feature : instance.getFeatures()) {
-		    	if (this.isTransformable(feature)) {
-			        // Filter down to selected features
-			        features.addAll(Collections2.filter(((TransformableFeature) feature).getFeatures(), this));
-			    } else {
-			        // Pass non-relevant features through w/o filtering
-			        features.add(feature);
-			    }
-			}
-			return new Instance<OUTCOME_T>(instance.getOutcome(), features);
-		}
 }

Copied: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelection.java (from r1424215, incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java)
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelection.java?p2=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelection.java&p1=incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java&r1=1424215&r2=1424631&rev=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelection.java Thu Dec 20 18:46:11 2012
@@ -7,30 +7,23 @@ import java.io.FileReader;
 import java.io.FileWriter;
 import java.io.IOException;
 import java.net.URI;
-import java.util.ArrayList;
 import java.util.Collection;
-import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.ctakes.temporal.ae.feature.selection.MutualInformationFeatureSelection.MutualInformationStats.ComputeFeatureScore;
 import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
 import org.cleartk.classifier.feature.transform.TransformableFeature;
 
 import com.google.common.base.Function;
 import com.google.common.base.Joiner;
-import com.google.common.collect.Collections2;
 import com.google.common.collect.HashBasedTable;
 import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Lists;
 import com.google.common.collect.Maps;
 import com.google.common.collect.Multiset;
 import com.google.common.collect.Ordering;
+import com.google.common.collect.Sets;
 import com.google.common.collect.Table;
 
 /**
@@ -45,8 +38,7 @@ import com.google.common.collect.Table;
  * @author Lee Becker
  * 
  */
-public class MutualInformationFeatureSelectionExtractor<OUTCOME_T> extends
-    FeatureSelectionExtractor<OUTCOME_T> implements SimpleFeatureExtractor {
+public class MutualInformationFeatureSelection<OUTCOME_T> extends FeatureSelection<OUTCOME_T> {
 
   /**
    * Specifies how scores for each outcome should be combined/aggregated into a single score
@@ -58,8 +50,8 @@ public class MutualInformationFeatureSel
     // MERGE, // Take k-largest mutual information values for each class and merge into a single
     // collection - currently omitted because it requires a different extraction flow
 
-    public static class AverageScores<OUTCOME_T>
-    implements Function<Map<OUTCOME_T, Double>, Double> {
+    public static class AverageScores<OUTCOME_T> implements
+        Function<Map<OUTCOME_T, Double>, Double> {
       @Override
       public Double apply(Map<OUTCOME_T, Double> input) {
         Collection<Double> scores = input.values();
@@ -73,8 +65,7 @@ public class MutualInformationFeatureSel
       }
     }
 
-    public static class MaxScores<OUTCOME_T>
-    implements Function<Map<OUTCOME_T, Double>, Double> {
+    public static class MaxScores<OUTCOME_T> implements Function<Map<OUTCOME_T, Double>, Double> {
       @Override
       public Double apply(Map<OUTCOME_T, Double> input) {
         return Ordering.natural().max(input.values());
@@ -211,102 +202,55 @@ public class MutualInformationFeatureSel
 
   }
 
-  public String nameFeature(Feature feature) {
-    return (feature.getValue() instanceof Number) ? feature.getName() : feature.getName() + ":"
-        + feature.getValue();
-  }
-
-  protected boolean isTrained;
-
   private MutualInformationStats<OUTCOME_T> mutualInfoStats;
 
-  private CombinedExtractor subExtractor;
-
   private int numFeatures;
 
   private CombineScoreMethod combineScoreMethod;
 
-  private List<String> selectedFeatures;
-
   private double smoothingCount;
 
-  public MutualInformationFeatureSelectionExtractor(String name, CombinedExtractor  extractor) {
-    super(name);
-    this.init(extractor, CombineScoreMethod.MAX, 1.0, 10);
+  public MutualInformationFeatureSelection(String name) {
+    this(name, CombineScoreMethod.MAX, 1.0, 10);
   }
 
-  public MutualInformationFeatureSelectionExtractor(
-      String name,
-      CombinedExtractor  extractor,
-      int numFeatures) {
-    super(name);
-    this.init(extractor, CombineScoreMethod.MAX, 1.0, numFeatures);
+  public MutualInformationFeatureSelection(String name, int numFeatures) {
+    this(name, CombineScoreMethod.MAX, 1.0, numFeatures);
   }
 
-  public MutualInformationFeatureSelectionExtractor(
+  public MutualInformationFeatureSelection(
       String name,
-      CombinedExtractor  extractor,
-      CombineScoreMethod combineMeasureType,
+      CombineScoreMethod combineScoreMethod,
       double smoothingCount,
       int numFeatures) {
     super(name);
-    this.init(extractor, combineMeasureType, smoothingCount, numFeatures);
-  }
-
-  private void init(
-	  CombinedExtractor  extractor,
-      CombineScoreMethod method,
-      double smoothCount,
-      int n) {
-    this.subExtractor = extractor;
-    this.combineScoreMethod = method;
-    this.smoothingCount = smoothCount;
-    this.numFeatures = n;
-  }
-
-  @Override
-  public List<Feature> extract(JCas view, Annotation focusAnnotation)
-      throws CleartkExtractorException {
-
-    List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
-    List<Feature> result = new ArrayList<Feature>();
-    if (this.isTrained) {
-      // Filter out selected features
-      result.addAll(Collections2.filter(extracted, this));
-    } else {
-      // We haven't trained this extractor yet, so just mark the existing features
-      // for future modification, by creating one uber-container feature
-//      List<TransformableFeature> transExtracted = new ArrayList<TransformableFeature>();
-//      for (Feature feat: extracted){
-//    	  transExtracted.add(new TransformableFeature(feat.getName(), feat));
-//      }
-      result.add(new TransformableFeature(this.name, extracted));
-    }
-
-    return result;
+    this.combineScoreMethod = combineScoreMethod;
+    this.smoothingCount = smoothingCount;
+    this.numFeatures = numFeatures;
   }
 
   @Override
   public void train(Iterable<Instance<OUTCOME_T>> instances) {
     // aggregate statistics for all features and classes
     this.mutualInfoStats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
-
     for (Instance<OUTCOME_T> instance : instances) {
       OUTCOME_T outcome = instance.getOutcome();
       for (Feature feature : instance.getFeatures()) {
         if (this.isTransformable(feature)) {
           for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
-            mutualInfoStats.update(this.nameFeature(untransformedFeature), outcome, 1);
+            mutualInfoStats.update(this.getFeatureName(untransformedFeature), outcome, 1);
           }
         }
       }
     }
-    // Compute mutual information score for each feature
+    
+    // sort features by mutual information score
     Set<String> featureNames = mutualInfoStats.classConditionalCounts.rowKeySet();
-
-    this.selectedFeatures = Ordering.natural().onResultOf(
-        this.mutualInfoStats.getScoreFunction(this.combineScoreMethod)).reverse().immutableSortedCopy(
-        featureNames);
+    ComputeFeatureScore<OUTCOME_T> scoreFunction = this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
+    Ordering<String> ordering = Ordering.natural().onResultOf(scoreFunction).reverse();
+    
+    // keep only the top N features
+    this.selectedFeatureNames = Sets.newLinkedHashSet(ordering.immutableSortedCopy(featureNames).subList(0, this.numFeatures));
     this.isTrained = true;
   }
 
@@ -319,46 +263,35 @@ public class MutualInformationFeatureSel
     BufferedWriter writer = new BufferedWriter(new FileWriter(out));
     writer.append("CombineScoreType\t");
     writer.append(this.combineScoreMethod.toString());
-    writer.append("\n");
+    writer.append('\n');
 
-//    ComputeFeatureScore<OUTCOME_T> computeScore = this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
-    for (String feature : this.selectedFeatures) {
-      writer.append(String.format("%s\n", feature));//, computeScore.apply(feature)));
+    for (String featureName : this.selectedFeatureNames) {
+      writer.append(featureName);
+      writer.append('\n');
     }
 
     writer.close();
-
   }
 
   @Override
   public void load(URI uri) throws IOException {
-    this.selectedFeatures = Lists.newArrayList();
+    this.selectedFeatureNames = Sets.newLinkedHashSet();
     File in = new File(uri);
     BufferedReader reader = new BufferedReader(new FileReader(in));
 
     // First line specifies the combine utility type
-    this.combineScoreMethod = CombineScoreMethod.valueOf(reader.readLine().split("\\t")[1]);
+    this.combineScoreMethod = CombineScoreMethod.valueOf(reader.readLine().split("\t")[1]);
 
     // The rest of the lines are feature + selection scores
     String line = null;
     int n = 0;
     while ((line = reader.readLine()) != null && n < this.numFeatures) {
-      String featureValue = line.trim();
-      this.selectedFeatures.add(featureValue);
+      String featureName = line.trim();
+      this.selectedFeatureNames.add(featureName);
       n++;
     }
 
     reader.close();
     this.isTrained = true;
   }
-
-  @Override
-  public boolean apply(Feature feature) {
-    return this.selectedFeatures.contains(this.nameFeature(feature));
-  }
-
-  public final List<String> getSelectedFeatures() {
-    return this.selectedFeatures;
-  }
-
 }

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKEventSpans.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKEventSpans.java Thu Dec 20 18:46:11 2012
@@ -49,7 +49,7 @@ public class EvaluationOfClearTKEventSpa
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
     EvaluationOfClearTKEventSpans evaluation = new EvaluationOfClearTKEventSpans(
-        new File("target/eval"),
+        new File("target/eval/cleartk-event-spans"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory());
     evaluation.setLogging(Level.FINE, new File("target/eval/cleartk-event-errors.log"));

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKTimeSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKTimeSpans.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKTimeSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfClearTKTimeSpans.java Thu Dec 20 18:46:11 2012
@@ -48,7 +48,7 @@ public class EvaluationOfClearTKTimeSpan
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
     EvaluationOfClearTKTimeSpans evaluation = new EvaluationOfClearTKTimeSpans(
-        new File("target/eval"),
+        new File("target/eval/cleartk-time-spans"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory());
     evaluation.setLogging(Level.FINE, new File("target/eval/cleartk-time-errors.log"));

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventProperties.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventProperties.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventProperties.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventProperties.java Thu Dec 20 18:46:11 2012
@@ -58,7 +58,7 @@ public class EvaluationOfEventProperties
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
     EvaluationOfEventProperties evaluation = new EvaluationOfEventProperties(
-        new File("target/eval"),
+        new File("target/eval/event-properties"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory());
     List<Map<String, AnnotationStatistics<String>>> foldStats = evaluation.crossValidation(

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java Thu Dec 20 18:46:11 2012
@@ -19,14 +19,13 @@
 package org.apache.ctakes.temporal.eval;
 
 import java.io.File;
-import java.net.URI;
 import java.util.Collection;
 import java.util.EnumSet;
 import java.util.List;
 import java.util.logging.Level;
 
 import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -61,21 +60,18 @@ public class EvaluationOfEventSpans exte
 
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
+    List<Integer> patientSets = options.getPatients().getList();
+    List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
+    List<Integer> devItems = THYMEData.getDevPatientSets(patientSets);
     EvaluationOfEventSpans evaluation = new EvaluationOfEventSpans(
-        new File("target/eval"),
+        new File("target/eval/event-spans"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory(),
         options.getProbabilityOfKeepingANegativeExample(),
         options.getFeatureSelectionThreshold());
     evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
-    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(
-        options.getPatients().getList(),
-        2);
-    for (AnnotationStatistics<String> stats : foldStats) {
-      System.err.println(stats);
-    }
-    System.err.println("OVERALL");
-    System.err.println(AnnotationStatistics.addAll(foldStats));
+    AnnotationStatistics<String> stats = evaluation.trainAndTest(trainItems, devItems);
+    System.err.println(stats);
   }
 
   private float probabilityOfKeepingANegativeExample;
@@ -122,17 +118,13 @@ public class EvaluationOfEventSpans exte
       // Extracting features and writing instances
       Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
       // Collect MinMax stats for feature normalization
-      URI chi2NbFsURI = EventAnnotator.createFeatureSelectionURI(directory);
-      Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(
-          EventAnnotator.FEATURE_SELECTION_NAME,
-          this.featureSelectionThreshold);
-      chi2NbFsExtractor.train(instances);
-      chi2NbFsExtractor.save(chi2NbFsURI);
+      FeatureSelection<String> featureSelection = EventAnnotator.createFeatureSelection(this.featureSelectionThreshold);
+      featureSelection.train(instances);
+      featureSelection.save(EventAnnotator.createFeatureSelectionURI(directory));
       // now write in the libsvm format
       LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
       for (Instance<String> instance : instances) {
-        instance = chi2NbFsExtractor.transform(instance);
-        dataWriter.write(instance);
+        dataWriter.write(featureSelection.transform(instance));
       }
       dataWriter.finish();
     }

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java?rev=1424631&r1=1424630&r2=1424631&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfTimeSpans.java Thu Dec 20 18:46:11 2012
@@ -41,19 +41,16 @@ public class EvaluationOfTimeSpans exten
 
   public static void main(String[] args) throws Exception {
     Options options = CliFactory.parseArguments(Options.class, args);
+    List<Integer> patientSets = options.getPatients().getList();
+    List<Integer> trainItems = THYMEData.getTrainPatientSets(patientSets);
+    List<Integer> devItems = THYMEData.getDevPatientSets(patientSets);
     EvaluationOfTimeSpans evaluation = new EvaluationOfTimeSpans(
-        new File("target/eval"),
+        new File("target/eval/time-spans"),
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory());
     evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-time-errors.log"));
-    List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(
-        options.getPatients().getList(),
-        4);
-    for (AnnotationStatistics<String> stats : foldStats) {
-      System.err.println(stats);
-    }
-    System.err.println("OVERALL");
-    System.err.println(AnnotationStatistics.addAll(foldStats));
+    AnnotationStatistics<String> stats = evaluation.trainAndTest(trainItems, devItems);
+    System.err.println(stats);
   }
 
   public EvaluationOfTimeSpans(

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java?rev=1424631&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java (added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java Thu Dec 20 18:46:11 2012
@@ -0,0 +1,43 @@
+package org.apache.ctakes.temporal.eval;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Splits THYME patient sets for evaluation by (id % 8): remainders 0-3 are train, 4-5 are dev, 6-7 are test.
+ */
+public class THYMEData {
+
+  public static List<Integer> getTrainPatientSets(List<Integer> patientSets) {
+    List<Integer> items = new ArrayList<Integer>();
+    for (Integer i : patientSets) {
+      int remainder = i % 8;
+      if (remainder < 4) {
+        items.add(i);
+      }
+    }
+    return items;
+  }
+
+  public static List<Integer> getDevPatientSets(List<Integer> patientSets) {
+    List<Integer> items = new ArrayList<Integer>();
+    for (Integer i : patientSets) {
+      int remainder = i % 8;
+      if (4 <= remainder && remainder < 6) {
+        items.add(i);
+      }
+    }
+    return items;
+  }
+
+  public static List<Integer> getTestPatientSets(List<Integer> patientSets) {
+    List<Integer> items = new ArrayList<Integer>();
+    for (Integer i : patientSets) {
+      int remainder = i % 8;
+      if (6 <= remainder) {
+        items.add(i);
+      }
+    }
+    return items;
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/THYMEData.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message