incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From c...@apache.org
Subject svn commit: r1426641 - in /incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: ae/EventAnnotator.java utils/ utils/SMOTEplus.java
Date Fri, 28 Dec 2012 21:33:18 GMT
Author: clin
Date: Fri Dec 28 21:33:17 2012
New Revision: 1426641

URL: http://svn.apache.org/viewvc?rev=1426641&view=rev
Log:
Implemented a simple version of the SMOTE algorithm. It can slightly increase performance.

Added:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
  (with props)
Modified:
    incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java

Modified: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java?rev=1426641&r1=1426640&r2=1426641&view=diff
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
(original)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/EventAnnotator.java
Fri Dec 28 21:33:17 2012
@@ -30,6 +30,7 @@ import org.apache.ctakes.temporal.ae.fea
 import org.apache.ctakes.temporal.ae.feature.PredicateArgumentExtractor;
 import org.apache.ctakes.temporal.ae.feature.selection.Chi2FeatureSelection;
 import org.apache.ctakes.temporal.ae.feature.selection.FeatureSelection;
+import org.apache.ctakes.temporal.utils.SMOTEplus;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.Chunk;
@@ -191,6 +192,10 @@ public class EventAnnotator extends Clea
     PredicateArgumentExtractor predicateArgumentExtractor = new PredicateArgumentExtractor(jCas);
 
     Random rand = new Random();
+    
+    //TRY SMOTE algorithm here to generate more minority class samples
+    SMOTEplus smote = new SMOTEplus();
+        
     // classify tokens within each sentence
     for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
       List<BaseToken> tokens = JCasUtil.selectCovered(jCas, BaseToken.class, sentence);
@@ -268,8 +273,13 @@ public class EventAnnotator extends Clea
         if (this.isTraining()) {
           String outcome = outcomes.get(tokenIndex);
           // if it is an "O" down-sample it
-          if (!outcome.equals("O") || rand.nextDouble() <= this.probabilityOfKeepingANegativeExample)
{
-            this.dataWriter.write(new Instance<String>(outcome, features));
+          if (outcome.equals("O")) {
+        	  if (rand.nextDouble() <= this.probabilityOfKeepingANegativeExample)
+        		  this.dataWriter.write(new Instance<String>(outcome, features));
+          }else{//for minority instances:
+        	  Instance<String> minorityInst = new Instance<String>(outcome, features);
+        	  this.dataWriter.write(minorityInst);
+        	  smote.addInstance(minorityInst);
           }
         }
 
@@ -284,6 +294,13 @@ public class EventAnnotator extends Clea
         this.eventChunking.createChunks(jCas, tokens, outcomes);
       }
     }
+    if(this.isTraining()){ //add synthetic instances to datawriter
+    	Iterable<Instance<String>> syntheticInsts = smote.populateMinorityClass();
+    	for( Instance<String> sytheticInst: syntheticInsts){
+    		this.dataWriter.write(sytheticInst);
+    	}
+    }
+    
   }
 
   private static Predicate<EntityMention> hasEntityType(final int typeID) {

Added: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java?rev=1426641&view=auto
==============================================================================
--- incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
(added)
+++ incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
Fri Dec 28 21:33:17 2012
@@ -0,0 +1,100 @@
+package org.apache.ctakes.temporal.utils;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.cleartk.classifier.Feature;
+import org.cleartk.classifier.Instance;
+import com.google.common.collect.HashBasedTable;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Table;
+
+/**
+ * A simple implementation of SMOTE algorithm. 
+ * Nitesh V. Chawla et al. SMOTE: Synthetic Minority Over-sampling Technique, 06/02
+ * Currently only uses the single nearest neighbor of each minority instance.
+ * May be modified to my SMOTE-plus algorithm.
+ *  
+ * @author Chen Lin
+ */
+public class SMOTEplus {
+
+	protected List<Instance<String>> minorityInsts;
+	protected Table<Instance<String>, String, Integer> instanceFeatureCount;
+	protected List<Instance<String>> syntheticInsts;
+	
+	public SMOTEplus() {
+		this.minorityInsts = Lists.newArrayList();
+		this.syntheticInsts = Lists.newArrayList();
+		this.instanceFeatureCount = HashBasedTable.<Instance<String>, String, Integer>
create();
+	}
+	
+	public Iterable<Instance<String>> populateMinorityClass() {
+		//1. populate Minority instance-Feature matrix
+		for (Instance<String> instance : this.minorityInsts) {
+		      for (Feature feature : instance.getFeatures()) {
+		    	  this.instanceFeatureCount.put(instance, getFeatureName(feature), 1);
+		      }
+		}
+		
+		//2. Iterate through all minority instances:
+		for (Instance<String> aMinorityInst : this.instanceFeatureCount.rowKeySet()) {
+			//3. find its nearest neighbor minority instance:
+			//TODO: Should be modified to take nearest K neighbors
+			double minDis = Double.MAX_VALUE;
+			Instance<String> nearestNeighbor = null;
+			for (Instance<String> bMinorityInst : this.instanceFeatureCount.rowKeySet()){
+				if ( aMinorityInst==bMinorityInst || (aMinorityInst!=null && aMinorityInst.equals(bMinorityInst))
){
+					double distance = calculateDistance(aMinorityInst, bMinorityInst);
+					if (distance < minDis){
+						minDis = distance;
+						nearestNeighbor = bMinorityInst;
+					}
+				}
+			}
+			Instance<String> sytheticInst = generateInstance(aMinorityInst, nearestNeighbor);
+			this.syntheticInsts.add(sytheticInst);
+		}
+		
+		return this.syntheticInsts;
+	}
+
+	private Instance<String> generateInstance(Instance<String> aMinorityInst,
+			Instance<String> nearestNeighbor) {
+		List<Feature> features = new ArrayList<Feature>();
+		//iterate through all features:
+		for( String featureName: this.instanceFeatureCount.columnKeySet()){
+			Integer valA = this.instanceFeatureCount.get(aMinorityInst, featureName);
+			Integer valB = this.instanceFeatureCount.get(nearestNeighbor, featureName);
+			if (valA != null && valB != null){
+				features.add(new Feature(featureName.split(":",2)[0],featureName.split(":",2)[1]));
+			}
+		}
+		Instance<String> syntheticInst = new Instance<String>(aMinorityInst.getOutcome(),
features);
+		return syntheticInst;
+	}
+
+	private double calculateDistance(Instance<String> instA,
+			Instance<String> instB) {
+		double distance = 0;
+		//iterate through all features:
+		for( String featureName: this.instanceFeatureCount.columnKeySet()){
+			Integer valA = this.instanceFeatureCount.get(instA, featureName);
+			Integer valB = this.instanceFeatureCount.get(instB, featureName);
+			if ( (valA!=null && valB == null) || (valA==null && valB != null)){
+				distance ++;
+			}
+		}
+		return Math.pow(distance, .5);
+	}
+
+	public String getFeatureName(Feature feature) {
+	    String featureName = feature.getName();
+	    Object featureValue = feature.getValue();
+	    return featureValue instanceof Number ? featureName : featureName + ":" + featureValue;
+	  }
+
+	public void addInstance(Instance<String> minorityInst) {
+		this.minorityInsts.add(minorityInst);
+	}
+}

Propchange: incubator/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/utils/SMOTEplus.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message