incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1428980 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/EventAnnotator.java eval/EvaluationOfEventSpans.java utils/SMOTEplus.java
Date Fri, 04 Jan 2013 17:00:43 GMT
Author: clin
Date: Fri Jan  4 17:00:42 2013
New Revision: 1428980

URL: http://svn.apache.org/viewvc?rev=1428980&view=rev
Log:
Make SMOTE-plus an option for Event Detection. SMOTE can be turned off by omitting the argument
"numOfSMOTENeighbors", or by giving it a value less than 1.
Based on this argument, the SMOTE-plus algorithm will generate a corresponding number of synthetic
minority instances. The first synthetic instance will be a copy of the original minority instance.

Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1428980&r1=1428979&r2=1428980&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
(original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
Fri Jan  4 17:00:42 2013
@@ -83,6 +83,14 @@ public class EventAnnotator extends Clea
       mandatory = false,
       description = "the Chi-squared threshold at which features should be removed")
   protected Float featureSelectionThreshold = 0f;
+  
+  public static final String PARAM_SMOTE_NUM_NEIGHBORS = "NumOfNeighborForSMOTE";
+  
+  @ConfigurationParameter(
+	      name = PARAM_SMOTE_NUM_NEIGHBORS,
+	      mandatory = false,
+	      description = "the number of neighbors used for minority instances for SMOTE algorithm")
+	  protected Float smoteNumOfNeighbors = 0f;
 
   public static final String PARAM_FEATURE_SELECTION_URI = "FeatureSelectionURI";
 
@@ -96,7 +104,7 @@ public class EventAnnotator extends Clea
       Class<?> dataWriter,
       File outputDirectory,
       float downratio,
-      float featureSelect) throws ResourceInitializationException {
+      float featureSelect, float smoteNeighborNumber) throws ResourceInitializationException {
     return AnalysisEngineFactory.createPrimitiveDescription(
         EventAnnotator.class,
         CleartkAnnotator.PARAM_IS_TRAINING,
@@ -108,7 +116,9 @@ public class EventAnnotator extends Clea
         EventAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
         downratio,
         EventAnnotator.PARAM_FEATURE_SELECTION_THRESHOLD,
-        featureSelect);
+        featureSelect,
+        EventAnnotator.PARAM_SMOTE_NUM_NEIGHBORS,
+        smoteNeighborNumber);
   }
 
   public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
@@ -194,7 +204,7 @@ public class EventAnnotator extends Clea
     Random rand = new Random();
     
     //TRY SMOTE algorithm here to generate more minority class samples
-    SMOTEplus smote = new SMOTEplus();
+    SMOTEplus smote = new SMOTEplus((int)Math.ceil(this.smoteNumOfNeighbors));
         
     // classify tokens within each sentence
     for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
@@ -274,12 +284,13 @@ public class EventAnnotator extends Clea
           String outcome = outcomes.get(tokenIndex);
           // if it is an "O" down-sample it
           if (outcome.equals("O")) {
-        	  if (rand.nextDouble() <= this.probabilityOfKeepingANegativeExample)
+        	  if (rand.nextDouble() <= this.probabilityOfKeepingANegativeExample){
         		  this.dataWriter.write(new Instance<String>(outcome, features));
+        	  }		  
           }else{//for minority instances:
         	  Instance<String> minorityInst = new Instance<String>(outcome, features);
         	  this.dataWriter.write(minorityInst);
-        	  smote.addInstance(minorityInst);
+        	  smote.addInstance(minorityInst);//add minority instances to SMOTE algorithm
           }
         }
 
@@ -294,7 +305,7 @@ public class EventAnnotator extends Clea
         this.eventChunking.createChunks(jCas, tokens, outcomes);
       }
     }
-    if(this.isTraining()){ //add synthetic instances to datawriter
+    if(this.isTraining() && this.smoteNumOfNeighbors >= 1){ //add synthetic instances to datawriter, if smote is selected
     	Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
     	for( Instance<String> sytheticInst: syntheticInsts){
     		this.dataWriter.write(sytheticInst);

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java?rev=1428980&r1=1428979&r2=1428980&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
(original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/EvaluationOfEventSpans.java
Fri Jan  4 17:00:42 2013
@@ -56,6 +56,9 @@ public class EvaluationOfEventSpans exte
 
     @Option(longName = "featureSelectionThreshold", defaultValue = "0")
     public float getFeatureSelectionThreshold();
+    
+    @Option(longName = "SMOTENeighborNumber", defaultValue = "1")
+    public float getSMOTENeighborNumber();
   }
 
   public static void main(String[] args) throws Exception {
@@ -68,7 +71,8 @@ public class EvaluationOfEventSpans exte
         options.getRawTextDirectory(),
         options.getKnowtatorXMLDirectory(),
         options.getProbabilityOfKeepingANegativeExample(),
-        options.getFeatureSelectionThreshold());
+        options.getFeatureSelectionThreshold(),
+        options.getSMOTENeighborNumber());
     evaluation.setLogging(Level.FINE, new File("target/eval/ctakes-event-errors.log"));
     AnnotationStatistics<String> stats = evaluation.trainAndTest(trainItems, devItems);
     System.err.println(stats);
@@ -77,13 +81,15 @@ public class EvaluationOfEventSpans exte
   private float probabilityOfKeepingANegativeExample;
 
   private float featureSelectionThreshold;
+  
+  private float smoteNeighborNumber;
 
   public EvaluationOfEventSpans(
       File baseDirectory,
       File rawTextDirectory,
       File knowtatorXMLDirectory,
       float probabilityOfKeepingANegativeExample,
-      float featureSelectionThreshold) {
+      float featureSelectionThreshold, float numOfSmoteNeighbors) {
     super(baseDirectory, rawTextDirectory, knowtatorXMLDirectory, EnumSet.of(
         AnnotatorType.PART_OF_SPEECH_TAGS,
         AnnotatorType.CHUNKS,
@@ -95,6 +101,7 @@ public class EvaluationOfEventSpans exte
         //AnnotatorType.SEMANTIC_ROLES));
     this.probabilityOfKeepingANegativeExample = probabilityOfKeepingANegativeExample;
     this.featureSelectionThreshold = featureSelectionThreshold;
+    this.smoteNeighborNumber = numOfSmoteNeighbors;
   }
 
   @Override
@@ -107,7 +114,8 @@ public class EvaluationOfEventSpans exte
         dataWriterClass,
         directory,
         this.probabilityOfKeepingANegativeExample,
-        this.featureSelectionThreshold);
+        this.featureSelectionThreshold,
+        this.smoteNeighborNumber);
   }
 
   @Override

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java?rev=1428980&r1=1428979&r2=1428980&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
(original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
Fri Jan  4 17:00:42 2013
@@ -1,7 +1,12 @@
 package org.apache.ctakes.temporal.utils;
 
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
+import java.util.Random;
 
 import org.cleartk.classifier.Feature;
 import org.cleartk.classifier.Instance;
@@ -20,13 +25,18 @@ import com.google.common.collect.Table;
 public class SMOTEplus {
 
 	protected List<Instance<String>> minorityInsts;
+	protected List<Instance<String>> majorityInsts;
 	protected Table<Instance<String>, String, Integer> instanceFeatureCount;
+	protected Table<Instance<String>, Instance<String>, Double> interInstanceDistance;
 	protected List<Instance<String>> syntheticInsts;
+	protected final int numOfNearestNeighbors;
 	
-	public SMOTEplus() {
-		this.minorityInsts = Lists.newArrayList();
+	public SMOTEplus(int numNeighbors) {
+		this.minorityInsts 	= Lists.newArrayList();
 		this.syntheticInsts = Lists.newArrayList();
-		this.instanceFeatureCount = HashBasedTable.<Instance<String>, String, Integer> create();
+		this.instanceFeatureCount 	= HashBasedTable.<Instance<String>, String, Integer> create();
+		this.interInstanceDistance	= HashBasedTable.<Instance<String>, Instance<String>, Double> create();
+		this.numOfNearestNeighbors	= numNeighbors;
 	}
 	
 	public Iterable<Instance<String>> populateMinorityClass() {
@@ -38,60 +48,85 @@ public class SMOTEplus {
 		}
 		
 		//2. Iterate through all minority instances:
-		for (Instance<String> aMinorityInst : this.instanceFeatureCount.rowKeySet()) {
+		for (Instance<String> aMinorityInst : this.minorityInsts) {
 			//3. find its nearest neighbor minority instance:
-			//TODO: Should be modified to take nearest K neighbors
-			double minDis = Double.MAX_VALUE;
-			Instance<String> nearestNeighbor = null;
-			for (Instance<String> bMinorityInst : this.instanceFeatureCount.rowKeySet()){
-				if ( aMinorityInst==bMinorityInst || (aMinorityInst!=null && aMinorityInst.equals(bMinorityInst)) ){
-					double distance = calculateDistance(aMinorityInst, bMinorityInst);
-					if (distance < minDis){
-						minDis = distance;
-						nearestNeighbor = bMinorityInst;
-					}
+			List<Object[]> distToMe = new LinkedList<Object[]>();
+			for ( Instance<String> bInst : this.instanceFeatureCount.rowKeySet()){
+				double distance = calculateDistance(aMinorityInst, bInst);
+				distToMe.add(new Object[] {distance, bInst});
+			}
+			
+			//sort list and find nearest neighbors:
+			Collections.sort(distToMe, new Comparator<Object>(){
+				public int compare(Object o1, Object o2){
+					double dist1 = (Double) ((Object[])o1)[0];
+					double dist2 = (Double) ((Object[])o2)[0];
+					return (int) Math.ceil(dist1 - dist2);
 				}
+			});
+			
+			//populate the nearest neighbor, create synthetic data:
+			Iterator<Object[]> neighborIter = distToMe.iterator();
+			int idx = 0;
+			while( neighborIter.hasNext() && idx < this.numOfNearestNeighbors){
+				@SuppressWarnings("unchecked")
+				Instance<String> nearestNeighbor = ((Instance<String>) neighborIter.next()[1]);
+				Instance<String> sytheticInst = generateInstance(aMinorityInst, nearestNeighbor);
+				this.syntheticInsts.add(sytheticInst);
+				idx ++;
 			}
-			Instance<String> sytheticInst = generateInstance(aMinorityInst, nearestNeighbor);
-			this.syntheticInsts.add(sytheticInst);
 		}
 		
 		return this.syntheticInsts;
 	}
 
+	private static Random rand = new Random();
+	
 	private Instance<String> generateInstance(Instance<String> aMinorityInst,
 			Instance<String> nearestNeighbor) {
 		List<Feature> features = new ArrayList<Feature>();
 		//iterate through all features:
-		for( String featureName: this.instanceFeatureCount.columnKeySet()){
-			Integer valA = this.instanceFeatureCount.get(aMinorityInst, featureName);
+		for(Feature feature: aMinorityInst.getFeatures()){
+			String featureName = getFeatureName(feature);
 			Integer valB = this.instanceFeatureCount.get(nearestNeighbor, featureName);
-			if (valA != null && valB != null){
-				features.add(new Feature(featureName.split(":",2)[0],featureName.split(":",2)[1]));
+			if(valB != null){
+				features.add(feature);
 			}
 		}
-		Instance<String> syntheticInst = new Instance<String>(aMinorityInst.getOutcome(), features);
+		String outcome = rand.nextBoolean()? aMinorityInst.getOutcome() : nearestNeighbor.getOutcome();
+		Instance<String> syntheticInst = new Instance<String>(outcome, features);
 		return syntheticInst;
 	}
 
 	private double calculateDistance(Instance<String> instA,
 			Instance<String> instB) {
 		double distance = 0;
-		//iterate through all features:
-		for( String featureName: this.instanceFeatureCount.columnKeySet()){
-			Integer valA = this.instanceFeatureCount.get(instA, featureName);
-			Integer valB = this.instanceFeatureCount.get(instB, featureName);
-			if ( (valA!=null && valB == null) || (valA==null && valB != null)){
-				distance ++;
+		Double dis1 = this.interInstanceDistance.get(instA, instB);
+		Double dis2 = this.interInstanceDistance.get(instB, instA);
+
+		if (dis1 == null && dis2 == null){ //if this pair's distance hasn't been calculated, then calculate it.
+			//iterate through all features:
+			for(Feature feature: instA.getFeatures()){
+				String featureName = getFeatureName(feature);
+				Integer valB = this.instanceFeatureCount.get(instB, featureName);
+				if ( valB == null ){
+					distance ++;
+				}
 			}
+			distance = Math.pow(distance, .5);
+			this.interInstanceDistance.put(instA, instB, distance);
+		}else{
+			distance = dis1 == null?  dis2 : dis1;
 		}
-		return Math.pow(distance, .5);
+
+		return distance;
 	}
 
 	public String getFeatureName(Feature feature) {
 	    String featureName = feature.getName();
 	    Object featureValue = feature.getValue();
-	    return featureValue instanceof Number ? featureName : featureName + ":" + featureValue;
+	    //return featureValue instanceof Number ? featureName : featureName + ":" + featureValue;
+	    return featureName + ":" + featureValue;
 	  }
 
 	public void addInstance(Instance<String> minorityInst) {



Mime
View raw message