incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1414245 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection: ./ Chi2FSExtractor.java FeatureSelectionExtractor.java MutualInformationFeatureSelectionExtractor.java
Date Tue, 27 Nov 2012 16:17:47 GMT
Author: clin
Date: Tue Nov 27 16:17:46 2012
New Revision: 1414245

URL: http://svn.apache.org/viewvc?rev=1414245&view=rev
Log:
test

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
  (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
  (with props)
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
  (with props)

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java?rev=1414245&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
(added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
Tue Nov 27 16:17:46 2012
@@ -0,0 +1,116 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.collect.Collections2;
+import com.google.common.collect.Lists;
+
+/**
+ * Feature extractor that wraps a {@link CombinedExtractor} and keeps only the features whose
+ * names appear in a list of selected features.
+ * <p>
+ * Before training, {@link #extract} bundles everything produced by the sub-extractor into a
+ * single {@link TransformableFeature}; once {@link #train} or {@link #load} has run, it filters
+ * the sub-extractor's output through {@link #apply}.
+ * <p>
+ * NOTE(review): the class name suggests chi-squared feature selection, but the selection step
+ * in {@link #train} is still a placeholder.
+ */
+public class Chi2FSExtractor<OUTCOME_T> extends FeatureSelectionExtractor<OUTCOME_T>
+		implements SimpleFeatureExtractor {
+
+	/** Set by {@link #train} and {@link #load}; switches {@link #extract} into filtering mode. */
+	protected boolean isTrained;
+	private CombinedExtractor subExtractor;
+	private List<String> selectedFeatures;
+	// Never assigned anywhere in this class, so it keeps its default of 0;
+	// load() treats a non-positive value as "no limit".
+	private int numFeatures;
+
+	/**
+	 * @param name             name handed to the superclass; also used as the name of the
+	 *                         container feature produced before training
+	 * @param featureExtractor extractor whose output is filtered
+	 */
+	public Chi2FSExtractor(String name, CombinedExtractor featureExtractor) {
+		super(name);
+		this.subExtractor = featureExtractor;
+	}
+
+	/**
+	 * Runs the sub-extractor and returns either the selected features (when trained) or a single
+	 * {@link TransformableFeature} holding all of them (when not yet trained).
+	 */
+	@Override
+	public List<Feature> extract(JCas view, Annotation focusAnnotation)
+			throws CleartkExtractorException {
+		List<Feature> extracted = this.subExtractor.extract(view, focusAnnotation);
+		List<Feature> result = new ArrayList<Feature>();
+		if (this.isTrained) {
+			// Keep only the selected features
+			result.addAll(Collections2.filter(extracted, this));
+		} else {
+			// We haven't trained this extractor yet, so just mark the existing features
+			// for future modification, by creating one uber-container feature
+			result.add(new TransformableFeature(this.name, extracted));
+		}
+		return result;
+	}
+
+	/** Returns true when the feature's name (see {@link #nameFeature}) is in the selected list. */
+	@Override
+	public boolean apply(Feature feature) {
+		return this.selectedFeatures.contains(this.nameFeature(feature));
+	}
+
+	/**
+	 * Builds the lookup key for a feature: the bare name for numeric values, otherwise
+	 * {@code name:value}.
+	 */
+	public String nameFeature(Feature feature) {
+		if (feature.getValue() instanceof Number) {
+			return feature.getName();
+		}
+		return feature.getName() + ":" + feature.getValue();
+	}
+
+	/**
+	 * Placeholder for the actual selection. It records an empty selection instead of
+	 * {@code null} so that {@link #apply} and {@link #save} do not fail with a
+	 * NullPointerException once the extractor is marked as trained.
+	 */
+	@Override
+	public void train(Iterable<Instance<OUTCOME_T>> instances) {
+		//step1: change cleartk instance to weka instances
+
+		//step2: set up weka Attribute Selection
+
+		//step3: get selected features
+		// TODO: not implemented yet; until then nothing is selected.
+		this.selectedFeatures = new ArrayList<String>();
+		this.isTrained = true;
+	}
+
+	/**
+	 * Writes one selected feature per line, each followed by a tab.
+	 *
+	 * @throws IOException if the extractor has not been trained or the file cannot be written
+	 */
+	@Override
+	public void save(URI uri) throws IOException {
+		if (!this.isTrained) {
+			throw new IOException("Chi2FSExtractor: Cannot save before training.");
+		}
+		File out = new File(uri);
+		BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+		try {
+			for (String feature : this.selectedFeatures) {
+				writer.append(String.format("%s\t\n", feature));
+			}
+		} finally {
+			// Close even when a write fails so the file handle is not leaked.
+			writer.close();
+		}
+	}
+
+	/**
+	 * Reads selected features from a file in the format written by {@link #save}: the text
+	 * before the first tab on each line is taken as the feature name.
+	 *
+	 * @throws IOException if the file cannot be read
+	 */
+	@Override
+	public void load(URI uri) throws IOException {
+		this.selectedFeatures = Lists.newArrayList();
+		// numFeatures is never set in this class; with the default of 0 the original loop
+		// read nothing, so a non-positive value now means "read every line".
+		int limit = (this.numFeatures > 0) ? this.numFeatures : Integer.MAX_VALUE;
+		File in = new File(uri);
+		BufferedReader reader = new BufferedReader(new FileReader(in));
+		try {
+			String line = null;
+			int n = 0;
+			while (n < limit && (line = reader.readLine()) != null) {
+				String[] featureValuePair = line.split("\\t");
+				this.selectedFeatures.add(featureValuePair[0]);
+				n++;
+			}
+		} finally {
+			// Close even when a read fails so the file handle is not leaked.
+			reader.close();
+		}
+		this.isTrained = true;
+	}
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/Chi2FSExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java?rev=1414245&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
(added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
Tue Nov 27 16:17:46 2012
@@ -0,0 +1,35 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.cleartk.classifier.feature.transform.TrainableExtractor_ImplBase;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+
+import com.google.common.base.Predicate;
+import com.google.common.collect.Collections2;
+
+public abstract class FeatureSelectionExtractor<OUTCOME_T> extends
+		TrainableExtractor_ImplBase<OUTCOME_T> implements Predicate<Feature> {
+			
+		public FeatureSelectionExtractor(String name) {
+		    super(name);
+		}
+
+		@Override
+		public Instance<OUTCOME_T> transform(Instance<OUTCOME_T> instance) {
+		    List<Feature> features = new ArrayList<Feature>();
+		    for (Feature feature : instance.getFeatures()) {
+		    	if (this.isTransformable(feature)) {
+			        // Filter down to selected features
+			        features.addAll(Collections2.filter(((TransformableFeature) feature).getFeatures(),
this));
+			    } else {
+			        // Pass non-relevant features through w/o filtering
+			        features.add(feature);
+			    }
+			}
+			return new Instance<OUTCOME_T>(instance.getOutcome(), features);
+		}
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/FeatureSelectionExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java?rev=1414245&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
(added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
Tue Nov 27 16:17:46 2012
@@ -0,0 +1,368 @@
+package org.apache.ctakes.temporal.ae.feature.selection;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import org.cleartk.classifier.feature.extractor.CleartkExtractorException;
+import org.cleartk.classifier.feature.extractor.simple.CombinedExtractor;
+import org.cleartk.classifier.feature.extractor.simple.SimpleFeatureExtractor;
+import org.apache.ctakes.temporal.ae.feature.selection.MutualInformationFeatureSelectionExtractor.CombineScoreMethod.CombineScoreFunction;
+import org.apache.ctakes.temporal.ae.feature.selection.MutualInformationFeatureSelectionExtractor.MutualInformationStats.ComputeFeatureScore;
+import org.cleartk.classifier.feature.transform.TransformableFeature;
+
+import com.google.common.base.Function;
+import com.google.common.base.Joiner;
+import com.google.common.collect.Collections2;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.HashMultiset;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Multiset;
+import com.google.common.collect.Ordering;
+import com.google.common.collect.Table;
+
+/**
+ * <br>
+ * Copyright (c) 2007-2012, Regents of the University of Colorado <br>
+ * All rights reserved.
+ * <p>
+ * 
+ * Selects features via mutual information statistics between the features extracted from its
+ * sub-extractor and the outcome values they are paired with in classification instances.
+ * 
+ * @author Lee Becker
+ * 
+ */
+public class MutualInformationFeatureSelectionExtractor<OUTCOME_T> extends
+    FeatureSelectionExtractor<OUTCOME_T> implements SimpleFeatureExtractor {
+
+  /**
+   * Specifies how scores for each outcome should be combined/aggregated into a single score
+   */
+  public static enum CombineScoreMethod {
+    /** Average mutual information across all classes and take features with k-largest values. */
+    AVERAGE,
+    /** Take highest mutual information value for each class. */
+    MAX;
+    // MERGE (take k-largest mutual information values for each class and merge into a single
+    // collection) is currently omitted because it requires a different extraction flow.
+
+    /** Collapses a per-outcome score map into one score. */
+    public abstract static class CombineScoreFunction<OUTCOME_T> implements
+        Function<Map<OUTCOME_T, Double>, Double> {
+    }
+
+    /** Arithmetic mean of all values in the map. */
+    public static class AverageScores<OUTCOME_T> extends CombineScoreFunction<OUTCOME_T> {
+      @Override
+      public Double apply(Map<OUTCOME_T, Double> input) {
+        Collection<Double> perOutcome = input.values();
+        double sum = 0;
+        for (Double value : perOutcome) {
+          sum += value;
+        }
+        return sum / perOutcome.size();
+      }
+    }
+
+    /** Largest value in the map, by natural ordering. */
+    public static class MaxScores<OUTCOME_T> extends CombineScoreFunction<OUTCOME_T> {
+      @Override
+      public Double apply(Map<OUTCOME_T, Double> input) {
+        Collection<Double> perOutcome = input.values();
+        return Ordering.natural().max(perOutcome);
+      }
+    }
+  }
+
+  /**
+   * Helper class for aggregating and computing mutual information statistics
+   */
+  public static class MutualInformationStats<OUTCOME_T> {
+    protected Multiset<OUTCOME_T> classCounts;
+
+    protected Table<String, OUTCOME_T, Integer> classConditionalCounts;
+
+    protected double smoothingCount;
+
+    /**
+     * Creates empty count tables.
+     *
+     * @param smoothingCount value added to every joint feature/outcome count when mutual
+     *                       information is computed
+     */
+    public MutualInformationStats(double smoothingCount) {
+      this.classCounts = HashMultiset.<OUTCOME_T> create();
+      this.classConditionalCounts = HashBasedTable.<String, OUTCOME_T, Integer> create();
+      // Plain assignment: the original "+=" only worked because the field starts at 0.0.
+      this.smoothingCount = smoothingCount;
+    }
+
+    public void update(String featureName, OUTCOME_T outcome, int occurrences) {
+      Integer count = this.classConditionalCounts.get(featureName, outcome);
+      if (count == null) {
+        count = 0;
+      }
+      this.classConditionalCounts.put(featureName, outcome, count + occurrences);
+      this.classCounts.add(outcome, occurrences);
+    }
+
+    /**
+     * Computes the mutual information between the presence of a feature and the presence of an
+     * outcome from the accumulated counts.
+     * <p>
+     * The smoothing count is added to each of the four joint counts as a double, so fractional
+     * smoothing values are honoured; results are unchanged for whole-number smoothing. A joint
+     * count of exactly zero contributes nothing to the sum rather than turning it into NaN.
+     *
+     * @param featureName feature key as passed to {@code update}
+     * @param outcome     outcome value as passed to {@code update}
+     * @return the summed mutual information over the four present/absent combinations
+     */
+    public double mutualInformation(String featureName, OUTCOME_T outcome) {
+      // notation index of 0 means false, 1 means true
+      int[] featureCounts = new int[2];
+      int[] outcomeCounts = new int[2];
+      int[][] featureOutcomeCounts = new int[2][2];
+
+      // Multiset.size() is the total number of occurrences recorded via update()
+      int n = this.classCounts.size();
+      featureCounts[1] = this.sum(this.classConditionalCounts.row(featureName).values());
+      featureCounts[0] = n - featureCounts[1];
+      outcomeCounts[1] = this.classCounts.count(outcome);
+      outcomeCounts[0] = n - outcomeCounts[1];
+
+      featureOutcomeCounts[1][1] = this.classConditionalCounts.contains(featureName, outcome)
+          ? this.classConditionalCounts.get(featureName, outcome)
+          : 0;
+      featureOutcomeCounts[1][0] = featureCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][1] = outcomeCounts[1] - featureOutcomeCounts[1][1];
+      featureOutcomeCounts[0][0] = n - featureCounts[1] - outcomeCounts[1]
+          + featureOutcomeCounts[1][1];
+
+      double information = 0.0;
+      for (int nFeature = 0; nFeature <= 1; nFeature++) {
+        for (int nOutcome = 0; nOutcome <= 1; nOutcome++) {
+          // Keep the smoothed count as a double; "int += double" would truncate it.
+          double jointCount = featureOutcomeCounts[nFeature][nOutcome] + this.smoothingCount;
+          if (jointCount == 0.0) {
+            // 0 * log(0) is NaN in floating point; its limit is 0, so skip the term.
+            continue;
+          }
+          information += jointCount
+              / (double) n
+              * Math.log(((double) n * jointCount)
+                  / ((double) featureCounts[nFeature] * outcomeCounts[nOutcome]));
+        }
+      }
+
+      return information;
+    }
+
+    /** Adds up all values in the collection. */
+    private int sum(Collection<Integer> values) {
+      int accumulated = 0;
+      for (Integer value : values) {
+        accumulated += value.intValue();
+      }
+      return accumulated;
+    }
+
+    /**
+     * Writes a tab-separated report: a title line, a header row of outcomes, one row per
+     * feature with its mutual information for each outcome, a blank line, and finally the
+     * string form of the raw count table.
+     *
+     * @param outputURI location of the file to write
+     * @throws IOException if the file cannot be written
+     */
+    public void save(URI outputURI) throws IOException {
+      File out = new File(outputURI);
+      BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+      try {
+        // Write out header
+        writer.append("Mutual Information Data\n");
+        writer.append("Feature\t");
+        writer.append(Joiner.on("\t").join(this.classConditionalCounts.columnKeySet()));
+        writer.append("\n");
+
+        // Write out Mutual Information data
+        for (String featureName : this.classConditionalCounts.rowKeySet()) {
+          writer.append(featureName);
+          for (OUTCOME_T outcome : this.classConditionalCounts.columnKeySet()) {
+            writer.append("\t");
+            writer.append(String.format("%f", this.mutualInformation(featureName, outcome)));
+          }
+          writer.append("\n");
+        }
+        writer.append("\n");
+        writer.append(this.classConditionalCounts.toString());
+      } finally {
+        // Close even when a write fails so the file handle is not leaked.
+        writer.close();
+      }
+    }
+
+    /** Creates a scoring function over these statistics that combines per-outcome scores as requested. */
+    public ComputeFeatureScore<OUTCOME_T> getScoreFunction(CombineScoreMethod combineScoreMethod) {
+      ComputeFeatureScore<OUTCOME_T> scorer =
+          new ComputeFeatureScore<OUTCOME_T>(this, combineScoreMethod);
+      return scorer;
+    }
+
+    /**
+     * Maps a feature name to a single score: the mutual information of the feature with every
+     * known outcome, combined by the chosen {@link CombineScoreMethod}.
+     */
+    public static class ComputeFeatureScore<OUTCOME_T> implements Function<String, Double> {
+
+      private MutualInformationStats<OUTCOME_T> stats;
+
+      private CombineScoreFunction<OUTCOME_T> combineScoreFunction;
+
+      /**
+       * @param stats              statistics to score against
+       * @param combineMeasureType how the per-outcome scores are collapsed into one value
+       * @throws IllegalArgumentException if the combine method is not handled
+       */
+      public ComputeFeatureScore(
+          MutualInformationStats<OUTCOME_T> stats,
+          CombineScoreMethod combineMeasureType) {
+        this.stats = stats;
+        switch (combineMeasureType) {
+          case AVERAGE:
+            this.combineScoreFunction = new CombineScoreMethod.AverageScores<OUTCOME_T>();
+            // Without this break AVERAGE fell through and was replaced by MaxScores.
+            break;
+          case MAX:
+            this.combineScoreFunction = new CombineScoreMethod.MaxScores<OUTCOME_T>();
+            break;
+          default:
+            throw new IllegalArgumentException("Unsupported combine method: "
+                + combineMeasureType);
+        }
+      }
+
+      /** Scores one feature against every outcome column and combines the results. */
+      @Override
+      public Double apply(String featureName) {
+        Set<OUTCOME_T> outcomes = stats.classConditionalCounts.columnKeySet();
+        Map<OUTCOME_T, Double> featureOutcomeMI = Maps.newHashMap();
+        for (OUTCOME_T outcome : outcomes) {
+          featureOutcomeMI.put(outcome, stats.mutualInformation(featureName, outcome));
+        }
+        return this.combineScoreFunction.apply(featureOutcomeMI);
+      }
+
+    }
+
+  }
+
+  /**
+   * Builds the lookup key for a feature: the bare name for numeric values, otherwise
+   * {@code name:value}.
+   */
+  public String nameFeature(Feature feature) {
+    if (feature.getValue() instanceof Number) {
+      return feature.getName();
+    }
+    return feature.getName() + ":" + feature.getValue();
+  }
+
+  protected boolean isTrained;
+
+  private MutualInformationStats<OUTCOME_T> mutualInfoStats;
+
+  private CombinedExtractor subExtractor;
+
+  private int numFeatures;
+
+  private CombineScoreMethod combineScoreMethod;
+
+  private List<String> selectedFeatures;
+
+  private double smoothingCount;
+
+  /** Uses {@link CombineScoreMethod#MAX}, a smoothing count of 1.0 and a limit of 10 features. */
+  public MutualInformationFeatureSelectionExtractor(String name, CombinedExtractor extractor) {
+    this(name, extractor, CombineScoreMethod.MAX, 1.0, 10);
+  }
+
+  /** Uses {@link CombineScoreMethod#MAX} and a smoothing count of 1.0 with the given limit. */
+  public MutualInformationFeatureSelectionExtractor(
+      String name,
+      CombinedExtractor extractor,
+      int numFeatures) {
+    this(name, extractor, CombineScoreMethod.MAX, 1.0, numFeatures);
+  }
+
+  /**
+   * @param name               name handed to the superclass
+   * @param extractor          extractor whose output is filtered
+   * @param combineMeasureType how per-outcome scores are combined
+   * @param smoothingCount     smoothing passed on to the statistics helper during training
+   * @param numFeatures        maximum number of features read back by {@code load}
+   */
+  public MutualInformationFeatureSelectionExtractor(
+      String name,
+      CombinedExtractor extractor,
+      CombineScoreMethod combineMeasureType,
+      double smoothingCount,
+      int numFeatures) {
+    super(name);
+    this.init(extractor, combineMeasureType, smoothingCount, numFeatures);
+  }
+
+  /** Stores the configuration shared by all constructors. */
+  private void init(
+      CombinedExtractor featureSource,
+      CombineScoreMethod scoreMethod,
+      double smoothing,
+      int featureLimit) {
+    this.numFeatures = featureLimit;
+    this.smoothingCount = smoothing;
+    this.combineScoreMethod = scoreMethod;
+    this.subExtractor = featureSource;
+  }
+
+  /**
+   * Runs the sub-extractor and returns either the selected features (when trained) or a single
+   * {@link TransformableFeature} wrapping everything that was extracted (when not yet trained).
+   */
+  @Override
+  public List<Feature> extract(JCas view, Annotation focusAnnotation)
+      throws CleartkExtractorException {
+    List<Feature> fromSubExtractor = this.subExtractor.extract(view, focusAnnotation);
+    List<Feature> output = new ArrayList<Feature>();
+    if (!this.isTrained) {
+      // Not trained yet: bundle the raw features into one container feature so they can be
+      // filtered later
+      output.add(new TransformableFeature(this.name, fromSubExtractor));
+      return output;
+    }
+    // Trained: keep only the features this predicate accepts
+    output.addAll(Collections2.filter(fromSubExtractor, this));
+    return output;
+  }
+
+  /**
+   * Counts feature/outcome co-occurrences over the bundled features of all instances, then
+   * stores every observed feature name ordered from highest to lowest combined score.
+   */
+  @Override
+  public void train(Iterable<Instance<OUTCOME_T>> instances) {
+    // Aggregate statistics for all features and classes
+    this.mutualInfoStats = new MutualInformationStats<OUTCOME_T>(this.smoothingCount);
+    for (Instance<OUTCOME_T> instance : instances) {
+      OUTCOME_T label = instance.getOutcome();
+      for (Feature feature : instance.getFeatures()) {
+        if (!this.isTransformable(feature)) {
+          continue;
+        }
+        TransformableFeature container = (TransformableFeature) feature;
+        for (Feature inner : container.getFeatures()) {
+          this.mutualInfoStats.update(this.nameFeature(inner), label, 1);
+        }
+      }
+    }
+
+    // Rank every observed feature by its combined mutual information score, best first
+    Set<String> candidates = this.mutualInfoStats.classConditionalCounts.rowKeySet();
+    ComputeFeatureScore<OUTCOME_T> scoreOf =
+        this.mutualInfoStats.getScoreFunction(this.combineScoreMethod);
+    this.selectedFeatures =
+        Ordering.natural().onResultOf(scoreOf).reverse().immutableSortedCopy(candidates);
+    this.isTrained = true;
+  }
+
+  /**
+   * Writes a header line naming the combine method, followed by one selected feature per line
+   * in their stored order.
+   *
+   * @throws IOException if the extractor has not been trained or the file cannot be written
+   */
+  @Override
+  public void save(URI uri) throws IOException {
+    if (!this.isTrained) {
+      throw new IOException("MutualInformationFeatureExtractor: Cannot save before training.");
+    }
+    File out = new File(uri);
+    BufferedWriter writer = new BufferedWriter(new FileWriter(out));
+    try {
+      writer.append("CombineScoreType\t");
+      writer.append(this.combineScoreMethod.toString());
+      writer.append("\n");
+
+      for (String feature : this.selectedFeatures) {
+        writer.append(String.format("%s\n", feature));
+      }
+    } finally {
+      // Close even when a write fails so the file handle is not leaked.
+      writer.close();
+    }
+  }
+
+  /**
+   * Reads a file in the format written by {@link #save}: the first line carries the combine
+   * method after a tab, and each following line is one feature name. At most
+   * {@code numFeatures} feature lines are read.
+   *
+   * @throws IOException if the file cannot be read or its header line is missing or malformed
+   */
+  @Override
+  public void load(URI uri) throws IOException {
+    this.selectedFeatures = Lists.newArrayList();
+    File in = new File(uri);
+    BufferedReader reader = new BufferedReader(new FileReader(in));
+    try {
+      // First line specifies the combine utility type
+      String header = reader.readLine();
+      String[] headerFields = (header == null) ? new String[0] : header.split("\\t");
+      if (headerFields.length < 2) {
+        // Previously surfaced as NullPointerException / ArrayIndexOutOfBoundsException
+        throw new IOException(
+            "MutualInformationFeatureExtractor: Missing or malformed CombineScoreType header in "
+                + uri);
+      }
+      this.combineScoreMethod = CombineScoreMethod.valueOf(headerFields[1]);
+
+      // The rest of the lines are selected feature names, one per line
+      String line = null;
+      int n = 0;
+      while (n < this.numFeatures && (line = reader.readLine()) != null) {
+        this.selectedFeatures.add(line.trim());
+        n++;
+      }
+    } finally {
+      // Close even when a read fails so the file handle is not leaked.
+      reader.close();
+    }
+    this.isTrained = true;
+  }
+
+  @Override
+  public boolean apply(Feature feature) {
+    return this.selectedFeatures.contains(this.nameFeature(feature));
+  }
+
+  /** Returns the stored list of selected feature names itself, not a copy. */
+  public final List<String> getSelectedFeatures() {
+    List<String> current = this.selectedFeatures;
+    return current;
+  }
+
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/feature/selection/MutualInformationFeatureSelectionExtractor.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message