incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From stevenbeth...@apache.org
Subject svn commit: r1424157 [2/3] - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/ ae/feature/ ae/feature/selection/ eval/
Date Wed, 19 Dec 2012 21:49:47 GMT
Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,470 +1,470 @@
-package org.apache.ctakes.temporal.ae.feature.selection;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
-import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
-import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-import org.cleartk.classifier.feature.transform.TransformableFeature;
-
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Collections2;
-import com.google.common.collect.HashBasedTable;
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import com.google.common.collect.Table;
-
-/**
- * 
- * Selects features via Chi-squared statistics between the features extracted from its
- * sub-extractor and the outcome values they are paired with in classification instances.
- * 
- * @author Chen Lin
- * 
- */
-public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
-		implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
-	
-			/**
-			   * A Bounds implementation that puts no restrictions on the context.
-			   */
-			  private static class NoBounds implements Bounds {
-
-			    public NoBounds() {
-			    }
-
-			    @Override
-			    public boolean contains(Annotation annotation) {
-			      return true;
-			    }
-
-			  }
-
-			/**
-			   * A Bounds implementation that restricts the context to annotations within a given span.
-			   */
-		private static class SpanBounds implements Bounds {
-
-		private int begin;
-
-		private int end;
-
-		public SpanBounds(int begin, int end) {
-		    this.begin = begin;
-		    this.end = end;
-		}
-
-		@Override
-		public boolean contains(Annotation annotation) {
-		    return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
-		}
-
-	}
-
-		/**
-		   * Helper class for aggregating and computing mutual Chi2 statistics
-		*/
-		public static class Chi2Evaluator<OUTCOME_T> {
-			 protected Multiset<OUTCOME_T> classCounts;
-
-			 protected Table<String, OUTCOME_T, Integer> featValueClassCount;
-
-			 public Chi2Evaluator() {
-			      this.classCounts = HashMultiset.<OUTCOME_T> create();
-			      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
-			 }
-
-			 public void update(String featureName, OUTCOME_T outcome, int occurrences) {
-			      Integer count = this.featValueClassCount.get(featureName, outcome);
-			      if (count == null) {
-			        count = 0;
-			      }
-			      this.featValueClassCount.put(featureName, outcome, count + occurrences);
-			      this.classCounts.add(outcome, occurrences);
-			 }
-
-			 public double Chi2Cal(String featureName) {
-			      // notation index of 0 means false, 1 mean true
-				  //Contingency Table:
-				  //    | class1 | class2 | class3 | sum
-				  //posi| 		 |        |        | posiFeatCount
-				  //nega|        |        |        | negaFeatCount
-				  //    | outcnt1| outcnt2| outcnt3| n
-				  
-				  int numOfClass = this.classCounts.elementSet().size();
-			      int[] posiOutcomeCounts = new int[numOfClass];
-			      int[] outcomeCounts = new int[numOfClass];
-			      int classId = 0;
-			      int posiFeatCount = 0;
-			      for ( OUTCOME_T clas: this.classCounts.elementSet()){
-			    	  posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)? 
-			    			  this.featValueClassCount.get(featureName, clas)
-					          : 0;
-			    	  posiFeatCount += posiOutcomeCounts[classId];
-			    	  outcomeCounts[classId] = this.classCounts.count(clas);
-			    	  classId ++;
-			      }
-			      
-			      int n = this.classCounts.size();
-			      int negaFeatCount = n - posiFeatCount;
-			      
-			      double chi2val = 0.0;
-			      
-			      if (posiFeatCount == 0 || posiFeatCount == n){ //all instances have same value on this feature, degree of freedom = 0
-			    	  return chi2val;			    	  
-			      }
-			      
-			      boolean yates = true;
-			      for (int lbl =0; lbl < numOfClass; lbl++){
-			    	  	//for positive part of feature:
-				    	  double expected = (outcomeCounts[lbl]/(double)n)*(posiFeatCount);
-				    	  if (expected > 0){
-				    		  double diff = Math.abs(posiOutcomeCounts[lbl]-expected);
-				    		  if (yates){ // apply Yate's correction
-				    			  diff -= 0.5;
-				    		  }
-				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
-				    	  }
-				    		  
-				    	  //for negative part of feature:
-				    	  expected = (outcomeCounts[lbl]/(double)n)*(negaFeatCount);
-				    	  double observ = outcomeCounts[lbl]-posiOutcomeCounts[lbl];
-				    	  if (expected > 0){
-				    		  double diff = Math.abs(observ-expected);
-				    		  if (yates){ // apply Yate's correction
-				    			  diff -= 0.5;
-				    		  }
-				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
-				    	  }
-			      }
-
-			      return chi2val;
-			    }
-
-			    
-			 public void save(URI outputURI) throws IOException {
-			      File out = new File(outputURI);
-			      BufferedWriter writer = null;
-			      writer = new BufferedWriter(new FileWriter(out));
-
-			      // Write out header
-			      writer.append("Chi2 FS Neighbor Data\n");
-			      writer.append("Feature\t");
-			      writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
-			      writer.append("\n");
-
-			      // Write out Chi2 values for all features
-			      for (String featureName : this.featValueClassCount.rowKeySet()) {
-			        writer.append(featureName);
-			        writer.append("\t");
-			        writer.append(String.format("%f", this.Chi2Cal(featureName)));
-			        writer.append("\n");
-			      }
-			      writer.append("\n");
-			      writer.append(this.featValueClassCount.toString());
-			      writer.close();
-			    }
-			 
-			 public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
-			      return new ComputeFeatureScore<OUTCOME_T>(this);
-			    }
-
-			    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
-
-			      private Chi2Evaluator<OUTCOME_T> stats;
-
-			      public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
-			        this.stats = stats;
-			      }
-
-			      @Override
-			      public Double apply(String featureName) {
-			        Double featureChi2 = stats.Chi2Cal(featureName);
-			        return featureChi2;
-			      }
-
-			    }
-	}
-			
-			
-	protected boolean isTrained;
-	private CombinedExtractor subExtractor;
-	private List<String> selectedFeatures;
-	private double chi2Threshold;
-	private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
-	private Context[] contexts;
-	private Class<? extends Annotation> annotationClass;
-
-	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
-		super(name);
-		this.annotationClass = annotationClass;
-		this.init(featureExtractor, 0.0);
-		this.contexts = contexts;
-	}
-	
-	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
-		super(name);
-		this.annotationClass = annotationClass;
-		this.init(featureExtractor, thres);
-		this.contexts = contexts;
-	}
-
-	public Chi2NeighborFSExtractor(String fsNeighborExtractorKey, Float thres) {
-		super(fsNeighborExtractorKey);
-		this.isTrained=false;
-		this.chi2Threshold = thres;
-	}
-
-	private void init(CombinedExtractor featureExtractor, double thres) {
-		this.subExtractor= featureExtractor;
-		this.chi2Threshold = thres;
-	}
-
-	@Override
-	public List<Feature> extract(JCas view, Annotation focusAnnotation)
-			throws CleartkExtractorException {
-		List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
-	    List<Feature> result = new ArrayList<Feature>();
-	    if (this.isTrained) {
-	      // Filter out selected features
-	      result.addAll(Collections2.filter(extracted, this));
-	    } else {
-	      // We haven't trained this extractor yet, so just mark the existing features
-	      // for future modification, by creating one uber-container feature
-	      result.add(new TransformableFeature(this.name, extracted));
-	    }
-
-	    return result;
-	}
-	
-	public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
-		      throws CleartkExtractorException {
-		    List<Feature> extracted = new ArrayList<Feature>();
-		    for (Context context : this.contexts) {
-			      extracted.addAll(context.extract(
-			          view,
-			          focusAnnotation,
-			          bounds,
-			          this.annotationClass,
-			          this.subExtractor));
-			    }
-		    List<Feature> result = new ArrayList<Feature>();
-		    if (this.isTrained){
-		    	// Filter out selected features
-			    result.addAll(Collections2.filter(extracted, this));
-		    }else{
-		    	// We haven't trained this extractor yet, so just mark the existing features
-			    // for future modification, by creating one uber-container feature
-			    result.add(new TransformableFeature(this.name, extracted));
-		    }
-		    
-		    return result;
-		  }
-
-	/**
-	  * Extract features from the annotations around the focus annotation and within the given bounds.
-	   * 
-	   * @param view
-	   *          The JCas containing the focus annotation.
-	   * @param focusAnnotation
-	   *          The annotation whose context is to be searched.
-	   * @param boundsAnnotation
-	   *          The boundary within which context annotations may be identified.
-	   * @return The features extracted in the context of the focus annotation.
-	   */
-	public List<Feature> extractWithin(
-	      JCas view,
-	      Annotation focusAnnotation,
-	      Annotation boundsAnnotation) throws CleartkExtractorException {
-	    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
-	    return this.extract(view, focusAnnotation, bounds);
-	}
-	  
-	@Override
-	public boolean apply(Feature feature) {
-		return this.selectedFeatures.contains(this.nameFeature(feature));
-	}
-	
-	public String nameFeature(Feature feature) {
-	    return (feature.getValue() instanceof Number) ? feature.getName() : feature.getName() + ":"
-	        + feature.getValue();
-	  }
-
-	@Override
-	public void train(Iterable<Instance<OUTCOME_T>> instances) {
-		// aggregate statistics for all features
-	    this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
-
-	    for (Instance<OUTCOME_T> instance : instances) {
-	      OUTCOME_T outcome = instance.getOutcome();
-	      for (Feature feature : instance.getFeatures()) {
-	        if (this.isTransformable(feature)) {
-	          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
-	        	  chi2Evaluator.update(this.nameFeature(untransformedFeature), outcome, 1);
-	          }
-	        }else{
-	        	chi2Evaluator.update(this.nameFeature(feature), outcome, 1);
-	        }
-	      }
-	    }
-	    // Compute mutual information score for each feature
-	    Set<String> featureNames = chi2Evaluator.featValueClassCount.rowKeySet();
-
-	
-		//step3: remove small chi2 valued features
-	    Iterator<String> iter = featureNames.iterator();
-	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-	    while (iter.hasNext()){
-	    	String feat = iter.next();
-	    	Double chi2 = computeScore.apply(feat);
-	    	if(chi2 <= this.chi2Threshold){
-	    		iter.remove();
-	    	}
-	    }
-	    
-//	    this.selectedFeatures = new ArrayList<String>();
-//	    for (String feature : featureNames){
-//	    	this.selectedFeatures.add(feature);
-//	    }
-//	    
-	    //step4:get selected features
-	    this.selectedFeatures = Ordering.natural().onResultOf(
-        this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
-        featureNames);
-	    
-//	    Iterator<String> iter = featureNames.iterator();
-//	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-//	    this.selectedFeatures = new ArrayList<String>();
-//	    while (iter.hasNext()){
-//	    	String feat = iter.next();
-//	    	Double chi2 = computeScore.apply(feat);
-//	    	if(chi2 > this.chi2Threshold){
-//	    		this.selectedFeatures.add(feat);
-//	    	}
-//	    }
-//		//order the list 
-//	    this.selectedFeatures = Ordering.natural().onResultOf(
-//	          this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
-//	        		  this.selectedFeatures);
-	    
-		this.isTrained = true;
-		
-	}
-
-	@Override
-	public void save(URI uri) throws IOException {
-		if (!this.isTrained) {
-		      throw new IOException("Chi2FSExtractor: Cannot save before training.");
-		}
-		File out = new File(uri);
-	    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
-
-	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
-	    for (String feature : this.selectedFeatures) {
-	      writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
-	    }
-
-	    writer.close();
-	}
-
-	@Override
-	public void load(URI uri) throws IOException {
-		this.selectedFeatures = Lists.newArrayList();
-	    File in = new File(uri);
-	    BufferedReader reader = new BufferedReader(new FileReader(in));
-
-	    // The rest of the lines are feature + selection scores
-	    String line = null;
-	    //int n = 0;
-	    while ((line = reader.readLine()) != null ){//&& n < this.numFeatures) {
-	      String[] featureValuePair = line.split("\\t");
-	      this.selectedFeatures.add(featureValuePair[0]);
-	      //n++;
-	    }
-
-	    reader.close();
-	    this.isTrained = true;
-		
-	}
-
-	@Override
-	public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
-			Annotation annotation2) throws CleartkExtractorException {
-		int begin = annotation1.getEnd();
-	    int end = annotation2.getBegin();
-	    // FIXME: creating a new annotation may leak memory - is there a better approach?
-	    Annotation focusAnnotation = new Annotation(jCas, begin, end);
-	    return this.extract(jCas, focusAnnotation, new NoBounds());
-	}
-
-	public Collection<? extends Feature> extract(int[] entityTypeIDs, Map<Integer, List<String>> entityTagsByType, int tokenIndex, int window) {
-		List<Feature> extracted = new ArrayList<Feature>();
-	    List<Feature> result = new ArrayList<Feature>();
-	    for (int typeID : entityTypeIDs) {
-            List<String> tokenEntityTags = entityTagsByType.get(typeID);
-            int begin = Math.max(tokenIndex - window, 0);
-            int end = Math.min(tokenIndex + window, tokenEntityTags.size());
-            for (int i = begin; i < end; ++i) {
-              String featureName = String.format("EntityTag_%d_%d", typeID, i - begin);
-              extracted.add(new Feature(featureName, tokenEntityTags.get(i)));
-            }
-          }
-		if (this.isTrained){
-	    	// Filter out selected features
-		    result.addAll(Collections2.filter(extracted, this));
-	    }else{
-	    	// We haven't trained this extractor yet, so just mark the existing features
-		    // for future modification, by creating one uber-container feature
-		    result.add(new TransformableFeature(this.name, extracted));
-	    }
-	    
-	    return result;
-	}
-
-	public Collection<? extends Feature> extract(int nPreviousClassifications,
-			int tokenIndex, List<String> outcomes) {
-		List<Feature> extracted = new ArrayList<Feature>();
-	    List<Feature> result = new ArrayList<Feature>();
-		// features from previous classifications
-        for (int i = nPreviousClassifications; i > 0; --i) {
-          int index = tokenIndex - i;
-          String previousOutcome = index < 0 ? "O" : outcomes.get(index);
-          extracted.add(new Feature("PreviousOutcome_" + i, previousOutcome));
-        }
-        
-        if (this.isTrained){
-	    	// Filter out selected features
-		    result.addAll(Collections2.filter(extracted, this));
-	    }else{
-	    	// We haven't trained this extractor yet, so just mark the existing features
-		    // for future modification, by creating one uber-container feature
-		    result.add(new TransformableFeature(this.name, extracted));
-	    }
-	    
-	    return result;
-	}
-
-}
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor.Chi2Evaluator.ComputeFeatureScore;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.BetweenAnnotationsFeatureExtractor;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.classifier.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ * 
+ * Selects features via Chi-squared statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Chen Lin
+ * 
+ */
+public class Chi2NeighborFSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
+		implements SimpleFeatureExtractor , BetweenAnnotationsFeatureExtractor{
+	
+			/**
+			   * A Bounds implementation that puts no restrictions on the context.
+			   */
+			  private static class NoBounds implements Bounds {
+
+			    public NoBounds() {
+			    }
+
+			    @Override
+			    public boolean contains(Annotation annotation) {
+			      return true;
+			    }
+
+			  }
+
+			/**
+			   * A Bounds implementation that restricts the context to annotations within a given span.
+			   */
+		private static class SpanBounds implements Bounds {
+
+		private int begin;
+
+		private int end;
+
+		public SpanBounds(int begin, int end) {
+		    this.begin = begin;
+		    this.end = end;
+		}
+
+		@Override
+		public boolean contains(Annotation annotation) {
+		    return annotation.getBegin() >= this.begin && annotation.getEnd() <= this.end;
+		}
+
+	}
+
+		/**
+		   * Helper class for aggregating feature/outcome counts and computing Chi2 statistics
+		*/
+		public static class Chi2Evaluator<OUTCOME_T> {
+			 protected Multiset<OUTCOME_T> classCounts;
+
+			 protected Table<String, OUTCOME_T, Integer> featValueClassCount;
+
+			 public Chi2Evaluator() {
+			      this.classCounts = HashMultiset.<OUTCOME_T> create();
+			      this.featValueClassCount = HashBasedTable.<String, OUTCOME_T, Integer> create();
+			 }
+
+			 public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+			      Integer count = this.featValueClassCount.get(featureName, outcome);
+			      if (count == null) {
+			        count = 0;
+			      }
+			      this.featValueClassCount.put(featureName, outcome, count + occurrences);
+			      this.classCounts.add(outcome, occurrences);
+			 }
+
+			 public double Chi2Cal(String featureName) {
+			      // notation index of 0 means false, 1 mean true
+				  //Contingency Table:
+				  //    | class1 | class2 | class3 | sum
+				  //posi| 		 |        |        | posiFeatCount
+				  //nega|        |        |        | negaFeatCount
+				  //    | outcnt1| outcnt2| outcnt3| n
+				  
+				  int numOfClass = this.classCounts.elementSet().size();
+			      int[] posiOutcomeCounts = new int[numOfClass];
+			      int[] outcomeCounts = new int[numOfClass];
+			      int classId = 0;
+			      int posiFeatCount = 0;
+			      for ( OUTCOME_T clas: this.classCounts.elementSet()){
+			    	  posiOutcomeCounts[classId] = this.featValueClassCount.contains(featureName, clas)? 
+			    			  this.featValueClassCount.get(featureName, clas)
+					          : 0;
+			    	  posiFeatCount += posiOutcomeCounts[classId];
+			    	  outcomeCounts[classId] = this.classCounts.count(clas);
+			    	  classId ++;
+			      }
+			      
+			      int n = this.classCounts.size();
+			      int negaFeatCount = n - posiFeatCount;
+			      
+			      double chi2val = 0.0;
+			      
+			      if (posiFeatCount == 0 || posiFeatCount == n){ //all instances have same value on this feature, degree of freedom = 0
+			    	  return chi2val;			    	  
+			      }
+			      
+			      boolean yates = true;
+			      for (int lbl =0; lbl < numOfClass; lbl++){
+			    	  	//for positive part of feature:
+				    	  double expected = (outcomeCounts[lbl]/(double)n)*(posiFeatCount);
+				    	  if (expected > 0){
+				    		  double diff = Math.abs(posiOutcomeCounts[lbl]-expected);
+				    		  if (yates){ // apply Yates' correction
+				    			  diff -= 0.5;
+				    		  }
+				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
+				    	  }
+				    		  
+				    	  //for negative part of feature:
+				    	  expected = (outcomeCounts[lbl]/(double)n)*(negaFeatCount);
+				    	  double observ = outcomeCounts[lbl]-posiOutcomeCounts[lbl];
+				    	  if (expected > 0){
+				    		  double diff = Math.abs(observ-expected);
+				    		  if (yates){ // apply Yates' correction
+				    			  diff -= 0.5;
+				    		  }
+				    		  if (diff>0) chi2val += Math.pow(diff,2)/expected;
+				    	  }
+			      }
+
+			      return chi2val;
+			    }
+
+			    
+			 public void save(URI outputURI) throws IOException {
+			      File out = new File(outputURI);
+			      BufferedWriter writer = null;
+			      writer = new BufferedWriter(new FileWriter(out));
+
+			      // Write out header
+			      writer.append("Chi2 FS Neighbor Data\n");
+			      writer.append("Feature\t");
+			      writer.append(Joiner.on("\t").join(this.featValueClassCount.columnKeySet()));
+			      writer.append("\n");
+
+			      // Write out Chi2 values for all features
+			      for (String featureName : this.featValueClassCount.rowKeySet()) {
+			        writer.append(featureName);
+			        writer.append("\t");
+			        writer.append(String.format("%f", this.Chi2Cal(featureName)));
+			        writer.append("\n");
+			      }
+			      writer.append("\n");
+			      writer.append(this.featValueClassCount.toString());
+			      writer.close();
+			    }
+			 
+			 public ComputeFeatureScore<OUTCOME_T> getScoreFunction() {
+			      return new ComputeFeatureScore<OUTCOME_T>(this);
+			    }
+
+			    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+			      private Chi2Evaluator<OUTCOME_T> stats;
+
+			      public ComputeFeatureScore(Chi2Evaluator<OUTCOME_T> stats) {
+			        this.stats = stats;
+			      }
+
+			      @Override
+			      public Double apply(String featureName) {
+			        Double featureChi2 = stats.Chi2Cal(featureName);
+			        return featureChi2;
+			      }
+
+			    }
+	}
+			
+			
+	protected boolean isTrained;
+	private CombinedExtractor subExtractor;
+	private List<String> selectedFeatures;
+	private double chi2Threshold;
+	private Chi2Evaluator<OUTCOME_T> chi2Evaluator;
+	private Context[] contexts;
+	private Class<? extends Annotation> annotationClass;
+
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, Context... contexts) {
+		super(name);
+		this.annotationClass = annotationClass;
+		this.init(featureExtractor, 0.0);
+		this.contexts = contexts;
+	}
+	
+	public Chi2NeighborFSExtractor(String name, Class<? extends Annotation> annotationClass, CombinedExtractor featureExtractor, double thres, Context... contexts) {
+		super(name);
+		this.annotationClass = annotationClass;
+		this.init(featureExtractor, thres);
+		this.contexts = contexts;
+	}
+
+	public Chi2NeighborFSExtractor(String fsNeighborExtractorKey, Float thres) {
+		super(fsNeighborExtractorKey);
+		this.isTrained=false;
+		this.chi2Threshold = thres;
+	}
+
+	private void init(CombinedExtractor featureExtractor, double thres) {
+		this.subExtractor= featureExtractor;
+		this.chi2Threshold = thres;
+	}
+
+	@Override
+	public List<Feature> extract(JCas view, Annotation focusAnnotation)
+			throws CleartkExtractorException {
+		List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
+	    List<Feature> result = new ArrayList<Feature>();
+	    if (this.isTrained) {
+	      // Keep only the selected features
+	      result.addAll(Collections2.filter(extracted, this));
+	    } else {
+	      // We haven't trained this extractor yet, so just mark the existing features
+	      // for future modification, by creating one uber-container feature
+	      result.add(new TransformableFeature(this.name, extracted));
+	    }
+
+	    return result;
+	}
+	
+	public List<Feature> extract(JCas view, Annotation focusAnnotation, Bounds bounds)
+		      throws CleartkExtractorException {
+		    List<Feature> extracted = new ArrayList<Feature>();
+		    for (Context context : this.contexts) {
+			      extracted.addAll(context.extract(
+			          view,
+			          focusAnnotation,
+			          bounds,
+			          this.annotationClass,
+			          this.subExtractor));
+			    }
+		    List<Feature> result = new ArrayList<Feature>();
+		    if (this.isTrained){
+		    	// Keep only the selected features
+			    result.addAll(Collections2.filter(extracted, this));
+		    }else{
+		    	// We haven't trained this extractor yet, so just mark the existing features
+			    // for future modification, by creating one uber-container feature
+			    result.add(new TransformableFeature(this.name, extracted));
+		    }
+		    
+		    return result;
+		  }
+
+	/**
+	  * Extract features from the annotations around the focus annotation and within the given bounds.
+	   * 
+	   * @param view
+	   *          The JCas containing the focus annotation.
+	   * @param focusAnnotation
+	   *          The annotation whose context is to be searched.
+	   * @param boundsAnnotation
+	   *          The boundary within which context annotations may be identified.
+	   * @return The features extracted in the context of the focus annotation.
+	   */
+	public List<Feature> extractWithin(
+	      JCas view,
+	      Annotation focusAnnotation,
+	      Annotation boundsAnnotation) throws CleartkExtractorException {
+	    Bounds bounds = new SpanBounds(boundsAnnotation.getBegin(), boundsAnnotation.getEnd());
+	    return this.extract(view, focusAnnotation, bounds);
+	}
+	  
+	@Override
+	public boolean apply(Feature feature) {
+		return this.selectedFeatures.contains(this.nameFeature(feature));
+	}
+	
+	public String nameFeature(Feature feature) {
+	    return (feature.getValue() instanceof Number) ? feature.getName() : feature.getName() + ":"
+	        + feature.getValue();
+	  }
+
+	@Override
+	public void train(Iterable<Instance<OUTCOME_T>> instances) {
+		// aggregate statistics for all features
+	    this.chi2Evaluator = new Chi2Evaluator<OUTCOME_T>();
+
+	    for (Instance<OUTCOME_T> instance : instances) {
+	      OUTCOME_T outcome = instance.getOutcome();
+	      for (Feature feature : instance.getFeatures()) {
+	        if (this.isTransformable(feature)) {
+	          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
+	        	  chi2Evaluator.update(this.nameFeature(untransformedFeature), outcome, 1);
+	          }
+	        }else{
+	        	chi2Evaluator.update(this.nameFeature(feature), outcome, 1);
+	        }
+	      }
+	    }
+	    // Collect the feature names; each one is scored with Chi2 below
+	    Set<String> featureNames = chi2Evaluator.featValueClassCount.rowKeySet();
+
+	
+		//step3: remove features whose chi2 value is at or below the threshold
+	    Iterator<String> iter = featureNames.iterator();
+	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+	    while (iter.hasNext()){
+	    	String feat = iter.next();
+	    	Double chi2 = computeScore.apply(feat);
+	    	if(chi2 <= this.chi2Threshold){
+	    		iter.remove();
+	    	}
+	    }
+	    
+//	    this.selectedFeatures = new ArrayList<String>();
+//	    for (String feature : featureNames){
+//	    	this.selectedFeatures.add(feature);
+//	    }
+//	    
+	    //step4:get selected features
+	    this.selectedFeatures = Ordering.natural().onResultOf(
+        this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
+        featureNames);
+	    
+//	    Iterator<String> iter = featureNames.iterator();
+//	    ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+//	    this.selectedFeatures = new ArrayList<String>();
+//	    while (iter.hasNext()){
+//	    	String feat = iter.next();
+//	    	Double chi2 = computeScore.apply(feat);
+//	    	if(chi2 > this.chi2Threshold){
+//	    		this.selectedFeatures.add(feat);
+//	    	}
+//	    }
+//		//order the list 
+//	    this.selectedFeatures = Ordering.natural().onResultOf(
+//	          this.chi2Evaluator.getScoreFunction()).reverse().immutableSortedCopy(
+//	        		  this.selectedFeatures);
+	    
+		this.isTrained = true;
+		
+	}
+
+	/**
+	 * Writes the selected features to the given location, one per line, as
+	 * {@code featureName<TAB>score}, where the score comes from the chi2
+	 * evaluator's score function.
+	 *
+	 * @param uri location of the file to write
+	 * @throws IOException if the extractor has not been trained, or if writing fails
+	 */
+	@Override
+	public void save(URI uri) throws IOException {
+		if (!this.isTrained) {
+			throw new IOException("Chi2FSExtractor: Cannot save before training.");
+		}
+		File out = new File(uri);
+		BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+		try {
+			ComputeFeatureScore<OUTCOME_T> computeScore = this.chi2Evaluator.getScoreFunction();
+			for (String feature : this.selectedFeatures) {
+				writer.append(String.format("%s\t%f\n", feature, computeScore.apply(feature)));
+			}
+		} finally {
+			// Always release the file handle, even if scoring or writing fails part-way.
+			writer.close();
+		}
+	}
+
+	/**
+	 * Replaces the selected-feature list with the contents of the given file and
+	 * marks the extractor as trained. Each line is split on tabs and only the
+	 * first column (the feature name) is kept.
+	 *
+	 * @param uri location of the file to read
+	 * @throws IOException if the file cannot be opened or read
+	 */
+	@Override
+	public void load(URI uri) throws IOException {
+		this.selectedFeatures = Lists.newArrayList();
+		File in = new File(uri);
+		BufferedReader reader = new BufferedReader(new FileReader(in));
+		try {
+			// Every line is a feature name followed by its selection score
+			String line = null;
+			while ((line = reader.readLine()) != null) {
+				String[] featureValuePair = line.split("\\t");
+				this.selectedFeatures.add(featureValuePair[0]);
+			}
+		} finally {
+			// Always release the file handle, even if a read fails part-way.
+			reader.close();
+		}
+		this.isTrained = true;
+	}
+
+	/**
+	 * Extracts features for the span that starts where {@code annotation1} ends
+	 * and stops where {@code annotation2} begins, by delegating to
+	 * {@code extract} with a freshly created annotation and no bounds.
+	 */
+	@Override
+	public List<Feature> extractBetween(JCas jCas, Annotation annotation1,
+			Annotation annotation2) throws CleartkExtractorException {
+		// FIXME: creating a new annotation may leak memory - is there a better approach?
+		Annotation gap = new Annotation(jCas, annotation1.getEnd(), annotation2.getBegin());
+		return this.extract(jCas, gap, new NoBounds());
+	}
+
+	/**
+	 * Builds one {@code EntityTag_<typeID>_<offset>} feature per token in the
+	 * half-open window {@code [tokenIndex - window, tokenIndex + window)},
+	 * clipped to the tag list, for every requested entity type.
+	 * Once trained, only features accepted by this extractor's predicate are
+	 * returned; before training, all of them are wrapped in a single
+	 * {@link TransformableFeature}.
+	 */
+	public Collection<? extends Feature> extract(int[] entityTypeIDs, Map<Integer, List<String>> entityTagsByType, int tokenIndex, int window) {
+		List<Feature> candidates = new ArrayList<Feature>();
+		for (int typeID : entityTypeIDs) {
+			List<String> tags = entityTagsByType.get(typeID);
+			int first = Math.max(tokenIndex - window, 0);
+			int limit = Math.min(tokenIndex + window, tags.size());
+			for (int offset = 0; first + offset < limit; ++offset) {
+				String label = String.format("EntityTag_%d_%d", typeID, offset);
+				candidates.add(new Feature(label, tags.get(first + offset)));
+			}
+		}
+
+		List<Feature> result = new ArrayList<Feature>();
+		if (!this.isTrained) {
+			// Not trained yet: hand everything over in one container feature so it
+			// can be transformed later.
+			result.add(new TransformableFeature(this.name, candidates));
+			return result;
+		}
+		// Trained: keep only the features this extractor selects.
+		result.addAll(Collections2.filter(candidates, this));
+		return result;
+	}
+
+	/**
+	 * Builds one {@code PreviousOutcome_<i>} feature for each of the
+	 * {@code nPreviousClassifications} positions before {@code tokenIndex},
+	 * using {@code "O"} when the position falls before the start of
+	 * {@code outcomes}. Once trained, only features accepted by this
+	 * extractor's predicate are returned; before training, all of them are
+	 * wrapped in a single {@link TransformableFeature}.
+	 */
+	public Collection<? extends Feature> extract(int nPreviousClassifications,
+			int tokenIndex, List<String> outcomes) {
+		List<Feature> history = new ArrayList<Feature>();
+		for (int distance = nPreviousClassifications; distance > 0; --distance) {
+			int position = tokenIndex - distance;
+			String label;
+			if (position < 0) {
+				label = "O";
+			} else {
+				label = outcomes.get(position);
+			}
+			history.add(new Feature("PreviousOutcome_" + distance, label));
+		}
+
+		List<Feature> result = new ArrayList<Feature>();
+		if (!this.isTrained) {
+			// Not trained yet: hand everything over in one container feature so it
+			// can be transformed later.
+			result.add(new TransformableFeature(this.name, history));
+			return result;
+		}
+		// Trained: keep only the features this extractor selects.
+		result.addAll(Collections2.filter(history, this));
+		return result;
+	}
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2NeighborFSExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,35 +1,35 @@
-package org.apache.ctakes.temporal.ae.feature.selection;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
-import org.cleartk.classifier.feature.transform.TransformableFeature;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.Instance;
-
-import com.google.common.base.Predicate;
-import com.google.common.collect.Collections2;
-
-public abstract class FeatureSelectionExtractor<OUTCOME_T> extends
-		TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
-			
-		public FeatureSelectionExtractor(String name) {
-		    super(name);
-		}
-
-		@Override
-		public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
-		    List<Feature> features = new ArrayList<Feature>();
-		    for (Feature feature : instance.getFeatures()) {
-		    	if (this.isTransformable(feature)) {
-			        // Filter down to selected features
-			        features.addAll(Collections2.filter(((TransformableFeature) feature).getFeatures(), this));
-			    } else {
-			        // Pass non-relevant features through w/o filtering
-			        features.add(feature);
-			    }
-			}
-			return new Instance<OUTCOME_T>(instance.getOutcome(), features);
-		}
-}
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Collections2;
+
+/**
+ * Base class for trainable extractors that act as a {@link Predicate} over
+ * features: {@link #transform} keeps the sub-features of transformable
+ * features that the predicate accepts and passes all other features through.
+ */
+public abstract class FeatureSelectionExtractor<OUTCOME_T> extends
+		TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
+
+		public FeatureSelectionExtractor(String name) {
+		    super(name);
+		}
+
+		/**
+		 * Returns a new instance with the same outcome whose transformable
+		 * features have been replaced by their selected sub-features.
+		 */
+		@Override
+		public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
+		    List<Feature> kept = new ArrayList<Feature>();
+		    for (Feature candidate : instance.getFeatures()) {
+		    	if (!this.isTransformable(candidate)) {
+		    		// Not one of ours: pass it through without filtering
+		    		kept.add(candidate);
+		    		continue;
+		    	}
+		    	// Unwrap the container and keep only the selected features
+		    	TransformableFeature container = (TransformableFeature) candidate;
+		    	kept.addAll(Collections2.filter(container.getFeatures(), this));
+		    }
+		    return new Instance<OUTCOME_T>(instance.getOutcome(), kept);
+		}
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java Wed Dec 19 21:49:46 2012
@@ -1,364 +1,364 @@
-package org.apache.ctakes.temporal.ae.feature.selection;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.net.URI;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.cleartk.classifier.Feature;
-import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
-import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
-import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
-import org.cleartk.classifier.feature.transform.TransformableFeature;
-
-import com.google.common.base.Function;
-import com.google.common.base.Joiner;
-import com.google.common.collect.Collections2;
-import com.google.common.collect.HashBasedTable;
-import com.google.common.collect.HashMultiset;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Multiset;
-import com.google.common.collect.Ordering;
-import com.google.common.collect.Table;
-
-/**
- * <br>
- * Copyright (c) 2007-2012, Regents of the University of Colorado <br>
- * All rights reserved.
- * <p>
- * 
- * Selects features via mutual information statistics between the features extracted from its
- * sub-extractor and the outcome values they are paired with in classification instances.
- * 
- * @author Lee Becker
- * 
- */
-public class MutualInformationFeatureSelectionExtractor<OUTCOME_T> extends
-    FeatureSelectionExtractor<OUTCOME_T> implements SimpleFeatureExtractor {
-
-  /**
-   * Specifies how scores for each outcome should be combined/aggregated into a single score
-   */
-  public static enum CombineScoreMethod {
-    AVERAGE, // Average mutual information across all classes and take features with k-largest
-             // values
-    MAX; // Take highest mutual information value for each class
-    // MERGE, // Take k-largest mutual information values for each class and merge into a single
-    // collection - currently omitted because it requires a different extraction flow
-
-    public static class AverageScores<OUTCOME_T>
-    implements Function<Map<OUTCOME_T, Double>, Double> {
-      @Override
-      public Double apply(Map<OUTCOME_T, Double> input) {
-        Collection<Double> scores = input.values();
-        int size = scores.size();
-        double total = 0;
-
-        for (Double score : scores) {
-          total += score;
-        }
-        return total / size;
-      }
-    }
-
-    public static class MaxScores<OUTCOME_T>
-    implements Function<Map<OUTCOME_T, Double>, Double> {
-      @Override
-      public Double apply(Map<OUTCOME_T, Double> input) {
-        return Ordering.natural().max(input.values());
-      }
-    }
-  }
-
-  /**
-   * Helper class for aggregating and computing mutual information statistics
-   */
-  public static class MutualInformationStats<OUTCOME_T> {
-    protected Multiset<OUTCOME_T> classCounts;
-
-    protected Table<String, OUTCOME_T, Integer> classConditionalCounts;
-
-    protected double smoothingCount;
-
-    public MutualInformationStats(double smoothingCount) {
-      this.classCounts = HashMultiset.<OUTCOME_T> create();
-      this.classConditionalCounts = HashBasedTable.<String, OUTCOME_T, Integer> create();
-      this.smoothingCount += smoothingCount;
-    }
-
-    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
-      Integer count = this.classConditionalCounts.get(featureName, outcome);
-      if (count == null) {
-        count = 0;
-      }
-      this.classConditionalCounts.put(featureName, outcome, count + occurrences);
-      this.classCounts.add(outcome, occurrences);
-    }
-
-    public double mutualInformation(String featureName, OUTCOME_T outcome) {
-      // notation index of 0 means false, 1 mean true
-      int[] featureCounts = new int[2];
-      int[] outcomeCounts = new int[2];
-      int[][] featureOutcomeCounts = new int[2][2];
-
-      int n = this.classCounts.size();
-      featureCounts[1] = sum(this.classConditionalCounts.row(featureName).values());
-      featureCounts[0] = n - featureCounts[1];
-      outcomeCounts[1] = this.classCounts.count(outcome);
-      outcomeCounts[0] = n - outcomeCounts[1];
-
-      featureOutcomeCounts[1][1] = this.classConditionalCounts.contains(featureName, outcome)
-          ? this.classConditionalCounts.get(featureName, outcome)
-          : 0;
-      featureOutcomeCounts[1][0] = featureCounts[1] - featureOutcomeCounts[1][1];
-      featureOutcomeCounts[0][1] = outcomeCounts[1] - featureOutcomeCounts[1][1];
-      featureOutcomeCounts[0][0] = n - featureCounts[1] - outcomeCounts[1]
-          + featureOutcomeCounts[1][1];
-
-      double information = 0.0;
-      for (int nFeature = 0; nFeature <= 1; nFeature++) {
-        for (int nOutcome = 0; nOutcome <= 1; nOutcome++) {
-          featureOutcomeCounts[nFeature][nOutcome] += smoothingCount;
-          information += (double) featureOutcomeCounts[nFeature][nOutcome]
-              / (double) n
-              * Math.log(((double) n * featureOutcomeCounts[nFeature][nOutcome])
-                  / ((double) featureCounts[nFeature] * outcomeCounts[nOutcome]));
-        }
-      }
-
-      return information;
-    }
-
-    private static int sum(Collection<Integer> values) {
-      int total = 0;
-      for (int v : values) {
-        total += v;
-      }
-      return total;
-    }
-
-    public void save(URI outputURI) throws IOException {
-      File out = new File(outputURI);
-      BufferedWriter writer = null;
-      writer = new BufferedWriter(new FileWriter(out));
-
-      // Write out header
-      writer.append("Mutual Information Data\n");
-      writer.append("Feature\t");
-      writer.append(Joiner.on("\t").join(this.classConditionalCounts.columnKeySet()));
-      writer.append("\n");
-
-      // Write out Mutual Information data
-      for (String featureName : this.classConditionalCounts.rowKeySet()) {
-        writer.append(featureName);
-        for (OUTCOME_T outcome : this.classConditionalCounts.columnKeySet()) {
-          writer.append("\t");
-          writer.append(String.format("%f", this.mutualInformation(featureName, outcome)));
-        }
-        writer.append("\n");
-      }
-      writer.append("\n");
-      writer.append(this.classConditionalCounts.toString());
-      writer.close();
-    }
-
-    public ComputeFeatureScore<OUTCOME_T> getScoreFunction(CombineScoreMethod combineScoreMethod) {
-      return new ComputeFeatureScore<OUTCOME_T>(this, combineScoreMethod);
-    }
-
-    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
-
-      private MutualInformationStats<OUTCOME_T> stats;
-
-      private Function<Map<OUTCOME_T, Double>, Double> combineScoreFunction;
-
-      public ComputeFeatureScore(
-          MutualInformationStats<OUTCOME_T> stats,
-          CombineScoreMethod combineMeasureType) {
-        this.stats = stats;
-        switch (combineMeasureType) {
-          case AVERAGE:
-            this.combineScoreFunction = new CombineScoreMethod.AverageScores<OUTCOME_T>();
-          case MAX:
-            this.combineScoreFunction = new CombineScoreMethod.MaxScores<OUTCOME_T>();
-        }
-
-      }
-
-      @Override
-      public Double apply(String featureName) {
-        Set<OUTCOME_T> outcomes = stats.classConditionalCounts.columnKeySet();
-        Map<OUTCOME_T, Double> featureOutcomeMI = Maps.newHashMap();
-        for (OUTCOME_T outcome : outcomes) {
-          featureOutcomeMI.put(outcome, stats.mutualInformation(featureName, outcome));
-        }
-        return this.combineScoreFunction.apply(featureOutcomeMI);
-      }
-
-    }
-
-  }
-
-  public String nameFeature(Feature feature) {
-    return (feature.getValue() instanceof Number) ? feature.getName() : feature.getName() + ":"
-        + feature.getValue();
-  }
-
-  protected boolean isTrained;
-
-  private MutualInformationStats<OUTCOME_T> mutualInfoStats;
-
-  private CombinedExtractor subExtractor;
-
-  private int numFeatures;
-
-  private CombineScoreMethod combineScoreMethod;
-
-  private List<String> selectedFeatures;
-
-  private double smoothingCount;
-
-  public MutualInformationFeatureSelectionExtractor(String name, CombinedExtractor  extractor) {
-    super(name);
-    this.init(extractor, CombineScoreMethod.MAX, 1.0, 10);
-  }
-
-  public MutualInformationFeatureSelectionExtractor(
-      String name,
-      CombinedExtractor  extractor,
-      int numFeatures) {
-    super(name);
-    this.init(extractor, CombineScoreMethod.MAX, 1.0, numFeatures);
-  }
-
-  public MutualInformationFeatureSelectionExtractor(
-      String name,
-      CombinedExtractor  extractor,
-      CombineScoreMethod combineMeasureType,
-      double smoothingCount,
-      int numFeatures) {
-    super(name);
-    this.init(extractor, combineMeasureType, smoothingCount, numFeatures);
-  }
-
-  private void init(
-	  CombinedExtractor  extractor,
-      CombineScoreMethod method,
-      double smoothCount,
-      int n) {
-    this.subExtractor = extractor;
-    this.combineScoreMethod = method;
-    this.smoothingCount = smoothCount;
-    this.numFeatures = n;
-  }
-
-  @Override
-  public List<Feature> extract(JCas view, Annotation focusAnnotation)
-      throws CleartkExtractorException {
-
-    List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
-    List<Feature> result = new ArrayList<Feature>();
-    if (this.isTrained) {
-      // Filter out selected features
-      result.addAll(Collections2.filter(extracted, this));
-    } else {
-      // We haven't trained this extractor yet, so just mark the existing features
-      // for future modification, by creating one uber-container feature
-//      List<TransformableFeature> transExtracted = new ArrayList<TransformableFeature>();
-//      for (Feature feat: extracted){
-//    	  transExtracted.add(new TransformableFeature(feat.getName(), feat));
-//      }
-      result.add(new TransformableFeature(this.name, extracted));
-    }
-
-    return result;
-  }
-
-  @Override
-  public void train(Iterable<Instance<OUTCOME_T>> instances) {
-    // aggregate statistics for all features and classes
-    this.mutualInfoStats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
-
-    for (Instance<OUTCOME_T> instance : instances) {
-      OUTCOME_T outcome = instance.getOutcome();
-      for (Feature feature : instance.getFeatures()) {
-        if (this.isTransformable(feature)) {
-          for (Feature untransformedFeature : ((TransformableFeature) feature).getFeatures()) {
-            mutualInfoStats.update(this.nameFeature(untransformedFeature), outcome, 1);
-          }
-        }
-      }
-    }
-    // Compute mutual information score for each feature
-    Set<String> featureNames = mutualInfoStats.classConditionalCounts.rowKeySet();
-
-    this.selectedFeatures = Ordering.natural().onResultOf(
-        this.mutualInfoStats.getScoreFunction(this.combineScoreMethod)).reverse().immutableSortedCopy(
-        featureNames);
-    this.isTrained = true;
-  }
-
-  @Override
-  public void save(URI uri) throws IOException {
-    if (!this.isTrained) {
-      throw new IOException("MutualInformationFeatureExtractor: Cannot save before training.");
-    }
-    File out = new File(uri);
-    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
-    writer.append("CombineScoreType\t");
-    writer.append(this.combineScoreMethod.toString());
-    writer.append("\n");
-
-//    ComputeFeatureScore<OUTCOME_T> computeScore = this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
-    for (String feature : this.selectedFeatures) {
-      writer.append(String.format("%s\n", feature));//, computeScore.apply(feature)));
-    }
-
-    writer.close();
-
-  }
-
-  @Override
-  public void load(URI uri) throws IOException {
-    this.selectedFeatures = Lists.newArrayList();
-    File in = new File(uri);
-    BufferedReader reader = new BufferedReader(new FileReader(in));
-
-    // First line specifies the combine utility type
-    this.combineScoreMethod = CombineScoreMethod.valueOf(reader.readLine().split("\\t")[1]);
-
-    // The rest of the lines are feature + selection scores
-    String line = null;
-    int n = 0;
-    while ((line = reader.readLine()) != null && n < this.numFeatures) {
-      String featureValue = line.trim();
-      this.selectedFeatures.add(featureValue);
-      n++;
-    }
-
-    reader.close();
-    this.isTrained = true;
-  }
-
-  @Override
-  public boolean apply(Feature feature) {
-    return this.selectedFeatures.contains(this.nameFeature(feature));
-  }
-
-  public final List<String> getSelectedFeatures() {
-    return this.selectedFeatures;
-  }
-
-}
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ * <br>
+ * Copyright (c) 2007-2012, Regents of the University of Colorado <br>
+ * All rights reserved.
+ * <p>
+ * 
+ * Selects features via mutual information statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Lee Becker
+ * 
+ */
+public class MutualInformationFeatureSelectionExtractor<OUTCOME_T> extends
+    FeatureSelectionExtractor<OUTCOME_T> implements SimpleFeatureExtractor {
+
+  /**
+   * Specifies how scores for each outcome should be combined/aggregated into a single score
+   */
+  public static enum CombineScoreMethod {
+    /** Average the per-outcome mutual information values into one score. */
+    AVERAGE,
+    /** Use the largest per-outcome mutual information value as the score. */
+    MAX;
+    // MERGE, // Take k-largest mutual information values for each class and merge into a single
+    // collection - currently omitted because it requires a different extraction flow
+
+    /** Combines a map of per-outcome scores into their arithmetic mean. */
+    public static class AverageScores<OUTCOME_T>
+    implements Function<Map<OUTCOME_T, Double>, Double> {
+      @Override
+      public Double apply(Map<OUTCOME_T, Double> input) {
+        Collection<Double> values = input.values();
+        double sum = 0;
+        for (Double value : values) {
+          sum += value;
+        }
+        return sum / values.size();
+      }
+    }
+
+    /** Combines a map of per-outcome scores into their maximum. */
+    public static class MaxScores<OUTCOME_T>
+    implements Function<Map<OUTCOME_T, Double>, Double> {
+      @Override
+      public Double apply(Map<OUTCOME_T, Double> input) {
+        Collection<Double> values = input.values();
+        return Ordering.natural().max(values);
+      }
+    }
+  }
+
+  /**
+   * Helper class for aggregating and computing mutual information statistics
+   */
+  public static class MutualInformationStats<OUTCOME_T> {
+    protected Multiset<OUTCOME_T> classCounts;
+
+    protected Table<String, OUTCOME_T, Integer> classConditionalCounts;
+
+    protected double smoothingCount;
+
+    /**
+     * Creates empty count tables.
+     *
+     * @param smoothingCount amount added to each feature/outcome cell count when
+     *          mutual information is computed
+     */
+    public MutualInformationStats(double smoothingCount) {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.classConditionalCounts = HashBasedTable.<String, OUTCOME_T, Integer> create();
+      // Plain assignment: the field is being initialised, not accumulated.
+      this.smoothingCount = smoothingCount;
+    }
+
+    /**
+     * Adds {@code occurrences} to the count stored for the (feature, outcome)
+     * pair and to the overall count of the outcome.
+     */
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer previous = this.classConditionalCounts.get(featureName, outcome);
+      int updated = (previous == null ? 0 : previous) + occurrences;
+      this.classConditionalCounts.put(featureName, outcome, updated);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    /**
+     * Computes the mutual information between "feature present" and "outcome
+     * is {@code outcome}" from the accumulated counts, summing over the four
+     * present/absent combinations. The smoothing count is added to each of the
+     * four joint cell counts; the marginal counts are used unsmoothed.
+     *
+     * @param featureName name of the feature, as passed to {@code update}
+     * @param outcome the outcome value to score the feature against
+     * @return the summed mutual information term over the 2x2 contingency table
+     */
+    public double mutualInformation(String featureName, OUTCOME_T outcome) {
+      // notation index of 0 means false, 1 mean true
+      int[] featureCounts = new int[2];
+      int[] outcomeCounts = new int[2];
+      int[][] featureOutcomeCounts = new int[2][2];
+
+      // n is the total number of occurrences recorded across all outcomes
+      int n = this.classCounts.size();
+      featureCounts[1] = sum(this.classConditionalCounts.row(featureName).values());
+      featureCounts[0] = n - featureCounts[1];
+      outcomeCounts[1] = this.classCounts.count(outcome);
+      outcomeCounts[0] = n - outcomeCounts[1];
+
+      featureOutcomeCounts[1][1] = this.classConditionalCounts.contains(featureName, outcome)
+          ? this.classConditionalCounts.get(featureName, outcome)
+          : 0;
+      featureOutcomeCounts[1][0] = featureCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][1] = outcomeCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][0] = n - featureCounts[1] - outcomeCounts[1]
+          + featureOutcomeCounts[1][1];
+
+      double information = 0.0;
+      for (int nFeature = 0; nFeature <= 1; nFeature++) {
+        for (int nOutcome = 0; nOutcome <= 1; nOutcome++) {
+          // Smooth in floating point. Writing "intCell += smoothingCount" narrows
+          // the sum back to int, which silently discards a fractional smoothing
+          // count (e.g. 0.5 would add nothing).
+          double smoothed = featureOutcomeCounts[nFeature][nOutcome] + this.smoothingCount;
+          information += smoothed
+              / n
+              * Math.log((n * smoothed)
+                  / ((double) featureCounts[nFeature] * outcomeCounts[nOutcome]));
+        }
+      }
+
+      return information;
+    }
+
+    /** Adds up all the integers in the collection; an empty collection gives 0. */
+    private static int sum(Collection<Integer> values) {
+      int accumulated = 0;
+      for (Integer value : values) {
+        accumulated = accumulated + value.intValue();
+      }
+      return accumulated;
+    }
+
+    /**
+     * Writes a tab-separated table of mutual information values (one row per
+     * feature, one column per outcome), followed by a blank line and the raw
+     * count table's string form.
+     *
+     * @param outputURI location of the file to write
+     * @throws IOException if the file cannot be opened or written
+     */
+    public void save(URI outputURI) throws IOException {
+      File out = new File(outputURI);
+      BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+      try {
+        // Write out header
+        writer.append("Mutual Information Data\n");
+        writer.append("Feature\t");
+        writer.append(Joiner.on("\t").join(this.classConditionalCounts.columnKeySet()));
+        writer.append("\n");
+
+        // Write out Mutual Information data
+        for (String featureName : this.classConditionalCounts.rowKeySet()) {
+          writer.append(featureName);
+          for (OUTCOME_T outcome : this.classConditionalCounts.columnKeySet()) {
+            writer.append("\t");
+            writer.append(String.format("%f", this.mutualInformation(featureName, outcome)));
+          }
+          writer.append("\n");
+        }
+        writer.append("\n");
+        writer.append(this.classConditionalCounts.toString());
+      } finally {
+        // Always release the file handle, even if a write fails part-way.
+        writer.close();
+      }
+    }
+
+    /**
+     * Creates a scoring function over feature names that is backed by these
+     * statistics and combines per-outcome scores with the given method.
+     *
+     * @param combineScoreMethod how per-outcome scores are merged into one score
+     * @return a new score function bound to this statistics object
+     */
+    public ComputeFeatureScore<OUTCOME_T> getScoreFunction(CombineScoreMethod combineScoreMethod) {
+      return new ComputeFeatureScore<OUTCOME_T>(this, combineScoreMethod);
+    }
+
+    /**
+     * Maps a feature name to a single score: the mutual information of the
+     * feature with every known outcome, combined by the configured method.
+     */
+    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+      private MutualInformationStats<OUTCOME_T> stats;
+
+      private Function<Map<OUTCOME_T, Double>, Double> combineScoreFunction;
+
+      /**
+       * @param stats statistics used to compute per-outcome mutual information
+       * @param combineMeasureType how the per-outcome values are combined
+       */
+      public ComputeFeatureScore(
+          MutualInformationStats<OUTCOME_T> stats,
+          CombineScoreMethod combineMeasureType) {
+        this.stats = stats;
+        switch (combineMeasureType) {
+          case AVERAGE:
+            this.combineScoreFunction = new CombineScoreMethod.AverageScores<OUTCOME_T>();
+            // Without this break AVERAGE fell through and was overwritten by MAX.
+            break;
+          case MAX:
+            this.combineScoreFunction = new CombineScoreMethod.MaxScores<OUTCOME_T>();
+            break;
+          default:
+            throw new IllegalArgumentException("Unsupported combine method: " + combineMeasureType);
+        }
+
+      }
+
+      /**
+       * Scores the feature against each outcome column of the count table and
+       * combines the results.
+       */
+      @Override
+      public Double apply(String featureName) {
+        Set<OUTCOME_T> outcomes = stats.classConditionalCounts.columnKeySet();
+        Map<OUTCOME_T, Double> featureOutcomeMI = Maps.newHashMap();
+        for (OUTCOME_T outcome : outcomes) {
+          featureOutcomeMI.put(outcome, stats.mutualInformation(featureName, outcome));
+        }
+        return this.combineScoreFunction.apply(featureOutcomeMI);
+      }
+
+    }
+
+  }
+
+  /**
+   * Returns the key under which a feature is counted: the bare name for
+   * numeric-valued features, otherwise {@code name:value}.
+   */
+  public String nameFeature(Feature feature) {
+    if (feature.getValue() instanceof Number) {
+      return feature.getName();
+    }
+    return feature.getName() + ":" + feature.getValue();
+  }
+
+  protected boolean isTrained;
+
+  private MutualInformationStats<OUTCOME_T> mutualInfoStats;
+
+  private CombinedExtractor subExtractor;
+
+  private int numFeatures;
+
+  private CombineScoreMethod combineScoreMethod;
+
+  private List<String> selectedFeatures;
+
+  private double smoothingCount;
+
+  /**
+   * Creates an extractor that combines scores with {@code MAX}, uses a
+   * smoothing count of 1.0 and a feature limit of 10.
+   *
+   * @param name name passed to the superclass constructor
+   * @param extractor sub-extractor whose features are selected from
+   */
+  public MutualInformationFeatureSelectionExtractor(String name, CombinedExtractor  extractor) {
+    super(name);
+    this.init(extractor, CombineScoreMethod.MAX, 1.0, 10);
+  }
+
+  /**
+   * Creates an extractor that combines scores with {@code MAX} and uses a
+   * smoothing count of 1.0.
+   *
+   * @param name name passed to the superclass constructor
+   * @param extractor sub-extractor whose features are selected from
+   * @param numFeatures feature limit; caps how many features {@code load} reads
+   */
+  public MutualInformationFeatureSelectionExtractor(
+      String name,
+      CombinedExtractor  extractor,
+      int numFeatures) {
+    super(name);
+    this.init(extractor, CombineScoreMethod.MAX, 1.0, numFeatures);
+  }
+
+  /**
+   * Creates a fully configured extractor.
+   *
+   * @param name name passed to the superclass constructor
+   * @param extractor sub-extractor whose features are selected from
+   * @param combineMeasureType how per-outcome scores are combined
+   * @param smoothingCount smoothing count handed to the statistics helper during training
+   * @param numFeatures feature limit; caps how many features {@code load} reads
+   */
+  public MutualInformationFeatureSelectionExtractor(
+      String name,
+      CombinedExtractor  extractor,
+      CombineScoreMethod combineMeasureType,
+      double smoothingCount,
+      int numFeatures) {
+    super(name);
+    this.init(extractor, combineMeasureType, smoothingCount, numFeatures);
+  }
+
+  /** Stores the configuration shared by all constructors. */
+  private void init(
+	  CombinedExtractor  extractor,
+      CombineScoreMethod method,
+      double smoothCount,
+      int n) {
+    this.subExtractor = extractor;
+    this.combineScoreMethod = method;
+    this.smoothingCount = smoothCount;
+    this.numFeatures = n;
+  }
+
+  /**
+   * Runs the sub-extractor on the focus annotation. Once trained, returns only
+   * the features accepted by this extractor's predicate; before training,
+   * returns all of them wrapped in a single {@link TransformableFeature}.
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+
+    List<Feature> candidates = this.subExtractor.extract(view, focusAnnotation);
+    List<Feature> output = new ArrayList<Feature>();
+    if (!this.isTrained) {
+      // Not trained yet: hand everything over in one container feature so it
+      // can be transformed later.
+      output.add(new TransformableFeature(this.name, candidates));
+      return output;
+    }
+    // Trained: keep only the selected features.
+    output.addAll(Collections2.filter(candidates, this));
+    return output;
+  }
+
+  /**
+   * Counts every sub-feature of each transformable feature against the
+   * instance's outcome, then stores all seen feature names ordered by
+   * descending combined mutual information score and marks the extractor as
+   * trained.
+   */
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // Start from fresh statistics on every training run
+    MutualInformationStats<OUTCOME_T> stats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
+    this.mutualInfoStats = stats;
+
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T label = instance.getOutcome();
+      for (Feature candidate : instance.getFeatures()) {
+        if (!this.isTransformable(candidate)) {
+          continue;
+        }
+        TransformableFeature container = (TransformableFeature) candidate;
+        for (Feature inner : container.getFeatures()) {
+          stats.update(this.nameFeature(inner), label, 1);
+        }
+      }
+    }
+
+    // Rank every counted feature name, best score first
+    Set<String> seenNames = stats.classConditionalCounts.rowKeySet();
+    Function<String, Double> score = stats.getScoreFunction(this.combineScoreMethod);
+    this.selectedFeatures = Ordering.natural().onResultOf(score).reverse().immutableSortedCopy(
+        seenNames);
+    this.isTrained = true;
+  }
+
+  /**
+   * Writes a header line {@code CombineScoreType<TAB>method} followed by the
+   * selected feature names, one per line.
+   *
+   * @param uri location of the file to write
+   * @throws IOException if the extractor has not been trained, or if writing fails
+   */
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IOException("MutualInformationFeatureExtractor: Cannot save before training.");
+    }
+    File out = new File(uri);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+    try {
+      writer.append("CombineScoreType\t");
+      writer.append(this.combineScoreMethod.toString());
+      writer.append("\n");
+
+      for (String feature : this.selectedFeatures) {
+        writer.append(String.format("%s\n", feature));
+      }
+    } finally {
+      // Always release the file handle, even if a write fails part-way.
+      writer.close();
+    }
+
+  }
+
+  /**
+   * Reads the combine method from the header line, then reads at most
+   * {@code numFeatures} trimmed feature names (one per line) into the
+   * selected-feature list, and marks the extractor as trained.
+   *
+   * @param uri location of the file to read
+   * @throws IOException if the file cannot be read, or if the header line is
+   *           missing or has no tab-separated method field
+   */
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatures = Lists.newArrayList();
+    File in = new File(uri);
+    BufferedReader reader = new BufferedReader(new FileReader(in));
+    try {
+      // First line specifies the combine utility type
+      String header = reader.readLine();
+      String[] headerFields = header == null ? new String[0] : header.split("\\t");
+      if (headerFields.length < 2) {
+        // Report a bad file as an I/O problem rather than an NPE or index error
+        throw new IOException(
+            "MutualInformationFeatureExtractor: missing or malformed CombineScoreType header in " + uri);
+      }
+      this.combineScoreMethod = CombineScoreMethod.valueOf(headerFields[1]);
+
+      // The rest of the lines are feature names, best first
+      String line = null;
+      int n = 0;
+      while ((line = reader.readLine()) != null && n < this.numFeatures) {
+        String featureValue = line.trim();
+        this.selectedFeatures.add(featureValue);
+        n++;
+      }
+    } finally {
+      // Always release the file handle, even if parsing fails part-way.
+      reader.close();
+    }
+    this.isTrained = true;
+  }
+
+  /**
+   * Predicate used for filtering: accepts a feature when its key, as built by
+   * {@code nameFeature}, is in the selected-feature list.
+   */
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatures.contains(this.nameFeature(feature));
+  }
+
+  /**
+   * Returns the selected-feature list itself (not a copy), as set by
+   * {@code train} or {@code load}.
+   */
+  public final List<String> getSelectedFeatures() {
+    return this.selectedFeatures;
+  }
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java?rev=1424157&r1=1424156&r2=1424157&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java (original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java Wed Dec 19 21:49:46 2012
@@ -1,182 +1,182 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package org.apache.ctakes.temporal.eval;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.List;
-import java.util.Set;
-import java.util.TreeSet;
-import java.util.logging.FileHandler;
-import java.util.logging.Formatter;
-import java.util.logging.Level;
-import java.util.logging.LogRecord;
-import java.util.logging.Logger;
-
-import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CollectionReader;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.tcas.Annotation;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.cleartk.classifier.Instance;
-import org.cleartk.classifier.feature.transform.InstanceStream;
-import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
-import org.cleartk.eval.AnnotationStatistics;
-import org.cleartk.util.ViewURIUtil;
-import org.uimafit.factory.AggregateBuilder;
-import org.uimafit.pipeline.JCasIterable;
-import org.uimafit.pipeline.SimplePipeline;
-
-import com.google.common.base.Function;
-import com.google.common.collect.Ordering;
-
-public abstract class EvaluationOfAnnotationSpans_ImplBase extends
-    Evaluation_ImplBase<AnnotationStatistics<String>> {
-
-  private final Logger logger = Logger.getLogger(this.getClass().getName());
-
-  public void setLogging(Level level, File outputFile) throws IOException {
-    if (!outputFile.getParentFile().exists()) {
-      outputFile.getParentFile().mkdirs();
-    }
-    this.logger.setLevel(level);
-    FileHandler handler = new FileHandler(outputFile.getPath());
-    handler.setFormatter(new Formatter() {
-      @Override
-      public String format(LogRecord record) {
-        return record.getMessage() + '\n';
-      }
-    });
-    this.logger.addHandler(handler);
-  }
-
-  public EvaluationOfAnnotationSpans_ImplBase(
-      File baseDirectory,
-      File rawTextDirectory,
-      File knowtatorXMLDirectory,
-      List<Integer> patientSets,
-      Set<AnnotatorType> annotatorFlags) {
-    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, patientSets, annotatorFlags);
-  }
-
-  protected abstract AnalysisEngineDescription getDataWriterDescription(File directory)
-      throws ResourceInitializationException;
-
-  protected abstract void trainAndPackage(File directory) throws Exception;
-
-  @Override
-  protected void train(CollectionReader collectionReader, File directory) throws Exception {
-    AggregateBuilder aggregateBuilder = new AggregateBuilder();
-    aggregateBuilder.add(this.getPreprocessorTrainDescription());
-    aggregateBuilder.add(this.getDataWriterDescription(directory));
-    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
-    
-    if( EventAnnotator.featureTrim > 0 ){
-    	//Extracting features and writing instances
-        Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
-        // Collect MinMax stats for feature normalization
-        URI chi2NbFsURI = EventAnnotator.createNbFSURI(directory);
-        Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY, EventAnnotator.featureTrim);
-        chi2NbFsExtractor.train(instances);
-        chi2NbFsExtractor.save(chi2NbFsURI);
-        //now write in the libsvm format
-        this.logger.info("Write out model training data");
-        LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
-        for (Instance<String> instance : instances) {
-          instance = chi2NbFsExtractor.transform(instance);
-          dataWriter.write(instance);
-        }
-        dataWriter.finish();
-    }
-    
-    this.trainAndPackage(directory);
-  }
-
-  protected abstract AnalysisEngineDescription getAnnotatorDescription(File directory)
-      throws ResourceInitializationException;
-
-  protected abstract Collection<? extends Annotation> getGoldAnnotations(JCas jCas);
-
-  protected abstract Collection<? extends Annotation> getSystemAnnotations(JCas jCas);
-
-  @Override
-  protected AnnotationStatistics<String> test(CollectionReader collectionReader, File directory)
-      throws Exception {
-    AggregateBuilder aggregateBuilder = new AggregateBuilder();
-    aggregateBuilder.add(this.getPreprocessorTestDescription());
-    aggregateBuilder.add(this.getAnnotatorDescription(directory));
-
-    AnnotationStatistics<String> stats = new AnnotationStatistics<String>();
-    Ordering<Annotation> bySpans = Ordering.<Integer> natural().lexicographical().onResultOf(
-        new Function<Annotation, List<Integer>>() {
-          @Override
-          public List<Integer> apply(Annotation annotation) {
-            return Arrays.asList(annotation.getBegin(), annotation.getEnd());
-          }
-        });
-    for (JCas jCas : new JCasIterable(collectionReader, aggregateBuilder.createAggregate())) {
-      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
-      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
-      Collection<? extends Annotation> goldAnnotations = this.getGoldAnnotations(goldView);
-      Collection<? extends Annotation> systemAnnotations = this.getSystemAnnotations(systemView);
-      stats.add(goldAnnotations, systemAnnotations);
-
-      Set<Annotation> goldSet = new TreeSet<Annotation>(bySpans);
-      goldSet.addAll(goldAnnotations);
-      Set<Annotation> systemSet = new TreeSet<Annotation>(bySpans);
-      systemSet.addAll(systemAnnotations);
-
-      Set<Annotation> goldOnly = new TreeSet<Annotation>(bySpans);
-      goldOnly.addAll(goldSet);
-      goldOnly.removeAll(systemSet);
-
-      Set<Annotation> systemOnly = new TreeSet<Annotation>(bySpans);
-      systemOnly.addAll(systemSet);
-      systemOnly.removeAll(goldSet);
-
-      String text = jCas.getDocumentText().replaceAll("[\r\n]", " ");
-      if (!goldOnly.isEmpty() || !systemOnly.isEmpty()) {
-        this.logger.fine("Errors in : " + ViewURIUtil.getURI(jCas).toString());
-        Set<Annotation> errors = new TreeSet<Annotation>(bySpans);
-        errors.addAll(goldOnly);
-        errors.addAll(systemOnly);
-        for (Annotation annotation : errors) {
-          int begin = annotation.getBegin();
-          int end = annotation.getEnd();
-          int windowBegin = Math.max(0, begin - 50);
-          int windowEnd = Math.min(text.length(), end + 50);
-          String label = goldOnly.contains(annotation) ? "DROPPED:" : "ADDED:  ";
-          this.logger.fine(String.format(
-              "%s  ...%s[!%s!]%s...",
-              label,
-              text.substring(windowBegin, begin),
-              text.substring(begin, end),
-              text.substring(end, windowEnd)));
-        }
-      }
-    }
-    return stats;
-  }
-}
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.temporal.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.logging.FileHandler;
+import java.util.logging.Formatter;
+import java.util.logging.Level;
+import java.util.logging.LogRecord;
+import java.util.logging.Logger;
+
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.ae.feature.selection.Chi2NeighborFSExtractor;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.transform.InstanceStream;
+import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
+import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.util.ViewURIUtil;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.pipeline.JCasIterable;
+import org.uimafit.pipeline.SimplePipeline;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Ordering;
+
+/**
+ * Base class for evaluations that collect {@link AnnotationStatistics} over gold and system
+ * annotations and log the annotations whose spans appear in only one of the two.
+ * <p>
+ * Subclasses supply the data writer, the annotator, the model packaging step, and the
+ * selection of gold and system annotations from a CAS view.
+ */
+public abstract class EvaluationOfAnnotationSpans_ImplBase extends
+    Evaluation_ImplBase<AnnotationStatistics<String>> {
+
+  /** Number of characters of document text shown on each side of a mismatched annotation. */
+  private static final int CONTEXT_WINDOW = 50;
+
+  private final Logger logger = Logger.getLogger(this.getClass().getName());
+
+  /**
+   * Sets the level of this evaluation's logger and attaches a file handler that writes each
+   * log message on its own line, without any other decoration.
+   *
+   * @param level the level to set on the logger
+   * @param outputFile the file to log to; a missing parent directory is created first
+   * @throws IOException if the log file cannot be opened
+   */
+  public void setLogging(Level level, File outputFile) throws IOException {
+    // A bare file name has no parent directory, so only create one when it is present
+    File parentDirectory = outputFile.getParentFile();
+    if (parentDirectory != null && !parentDirectory.exists()) {
+      parentDirectory.mkdirs();
+    }
+    this.logger.setLevel(level);
+    FileHandler handler = new FileHandler(outputFile.getPath());
+    handler.setFormatter(new Formatter() {
+      @Override
+      public String format(LogRecord record) {
+        return record.getMessage() + '\n';
+      }
+    });
+    this.logger.addHandler(handler);
+  }
+
+  /**
+   * Passes all arguments through to the superclass constructor.
+   */
+  public EvaluationOfAnnotationSpans_ImplBase(
+      File baseDirectory,
+      File rawTextDirectory,
+      File knowtatorXMLDirectory,
+      List<Integer> patientSets,
+      Set<AnnotatorType> annotatorFlags) {
+    super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, patientSets, annotatorFlags);
+  }
+
+  /**
+   * Creates the description of the engine that is appended to the training pipeline.
+   *
+   * @param directory the training directory handed to {@link #train}
+   */
+  protected abstract AnalysisEngineDescription getDataWriterDescription(File directory)
+      throws ResourceInitializationException;
+
+  /**
+   * Called as the last step of {@link #train}, after the training pipeline has run.
+   *
+   * @param directory the training directory handed to {@link #train}
+   */
+  protected abstract void trainAndPackage(File directory) throws Exception;
+
+  /**
+   * Runs the training preprocessing and the data writer over the reader, optionally applies
+   * chi-squared neighbor feature selection, and then delegates to {@link #trainAndPackage}.
+   * <p>
+   * Feature selection only happens when {@code EventAnnotator.featureTrim} is greater than
+   * zero: the written instances are loaded back from the directory, the selector is trained on
+   * them and saved, and the instances are written again in LIBSVM format after being passed
+   * through the selector.
+   */
+  @Override
+  protected void train(CollectionReader collectionReader, File directory) throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(this.getPreprocessorTrainDescription());
+    aggregateBuilder.add(this.getDataWriterDescription(directory));
+    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+
+    if (EventAnnotator.featureTrim > 0) {
+      // Load the instances written by the pipeline above
+      Iterable<Instance<String>> instances = InstanceStream.loadFromDirectory(directory);
+      // Train the chi-squared feature selector and save it to the URI derived from the directory
+      URI chi2NbFsURI = EventAnnotator.createNbFSURI(directory);
+      Chi2NeighborFSExtractor<String> chi2NbFsExtractor = new Chi2NeighborFSExtractor<String>(
+          EventAnnotator.FS_NEIGHBOR_EXTRACTOR_KEY,
+          EventAnnotator.featureTrim);
+      chi2NbFsExtractor.train(instances);
+      chi2NbFsExtractor.save(chi2NbFsURI);
+      // Now write the transformed instances in the libsvm format
+      this.logger.info("Write out model training data");
+      LIBSVMStringOutcomeDataWriter dataWriter = new LIBSVMStringOutcomeDataWriter(directory);
+      for (Instance<String> instance : instances) {
+        dataWriter.write(chi2NbFsExtractor.transform(instance));
+      }
+      dataWriter.finish();
+    }
+
+    this.trainAndPackage(directory);
+  }
+
+  /**
+   * Creates the description of the engine that is appended to the testing pipeline.
+   *
+   * @param directory the directory handed to {@link #test}
+   */
+  protected abstract AnalysisEngineDescription getAnnotatorDescription(File directory)
+      throws ResourceInitializationException;
+
+  /**
+   * Selects the gold annotations; {@link #test} calls this with the gold view.
+   */
+  protected abstract Collection<? extends Annotation> getGoldAnnotations(JCas jCas);
+
+  /**
+   * Selects the system annotations; {@link #test} calls this with the default view.
+   */
+  protected abstract Collection<? extends Annotation> getSystemAnnotations(JCas jCas);
+
+  /**
+   * Runs the testing preprocessing and the annotator over the reader and accumulates
+   * statistics from the gold and system annotations of every CAS.
+   * <p>
+   * When FINE logging is enabled, annotations whose (begin, end) span occurs in only one of
+   * the two collections are additionally logged with their surrounding text.
+   *
+   * @return the statistics accumulated over all CASes
+   */
+  @Override
+  protected AnnotationStatistics<String> test(CollectionReader collectionReader, File directory)
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(this.getPreprocessorTestDescription());
+    aggregateBuilder.add(this.getAnnotatorDescription(directory));
+
+    AnnotationStatistics<String> stats = new AnnotationStatistics<String>();
+    // Orders annotations by begin offset, then end offset; equal spans compare as equal
+    Ordering<Annotation> bySpans = Ordering.<Integer> natural().lexicographical().onResultOf(
+        new Function<Annotation, List<Integer>>() {
+          @Override
+          public List<Integer> apply(Annotation annotation) {
+            return Arrays.asList(annotation.getBegin(), annotation.getEnd());
+          }
+        });
+    for (JCas jCas : new JCasIterable(collectionReader, aggregateBuilder.createAggregate())) {
+      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      Collection<? extends Annotation> goldAnnotations = this.getGoldAnnotations(goldView);
+      Collection<? extends Annotation> systemAnnotations = this.getSystemAnnotations(systemView);
+      stats.add(goldAnnotations, systemAnnotations);
+
+      // The error report is only ever emitted at FINE, so skip building it otherwise
+      if (this.logger.isLoggable(Level.FINE)) {
+        this.logSpanErrors(jCas, goldAnnotations, systemAnnotations, bySpans);
+      }
+    }
+    return stats;
+  }
+
+  /**
+   * Logs, at FINE level, every annotation whose span is present in only one of the gold and
+   * system collections, labelled DROPPED (gold only) or ADDED (system only) and shown with up
+   * to {@link #CONTEXT_WINDOW} characters of text on either side.
+   */
+  private void logSpanErrors(
+      JCas jCas,
+      Collection<? extends Annotation> goldAnnotations,
+      Collection<? extends Annotation> systemAnnotations,
+      Ordering<Annotation> bySpans) throws Exception {
+    Set<Annotation> goldSet = new TreeSet<Annotation>(bySpans);
+    goldSet.addAll(goldAnnotations);
+    Set<Annotation> systemSet = new TreeSet<Annotation>(bySpans);
+    systemSet.addAll(systemAnnotations);
+
+    Set<Annotation> goldOnly = new TreeSet<Annotation>(bySpans);
+    goldOnly.addAll(goldSet);
+    goldOnly.removeAll(systemSet);
+
+    Set<Annotation> systemOnly = new TreeSet<Annotation>(bySpans);
+    systemOnly.addAll(systemSet);
+    systemOnly.removeAll(goldSet);
+
+    if (goldOnly.isEmpty() && systemOnly.isEmpty()) {
+      return;
+    }
+
+    // Line breaks are replaced one-for-one so annotation offsets still line up with the text
+    String text = jCas.getDocumentText().replaceAll("[\r\n]", " ");
+    this.logger.fine("Errors in : " + ViewURIUtil.getURI(jCas).toString());
+    Set<Annotation> errors = new TreeSet<Annotation>(bySpans);
+    errors.addAll(goldOnly);
+    errors.addAll(systemOnly);
+    for (Annotation annotation : errors) {
+      int begin = annotation.getBegin();
+      int end = annotation.getEnd();
+      int windowBegin = Math.max(0, begin - CONTEXT_WINDOW);
+      int windowEnd = Math.min(text.length(), end + CONTEXT_WINDOW);
+      String label = goldOnly.contains(annotation) ? "DROPPED:" : "ADDED:  ";
+      this.logger.fine(String.format(
+          "%s  ...%s[!%s!]%s...",
+          label,
+          text.substring(windowBegin, begin),
+          text.substring(begin, end),
+          text.substring(end, windowEnd)));
+    }
+  }
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfAnnotationSpans_ImplBase.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message