ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1748746 [2/2] - in /ctakes/sandbox/ctakes-coref-cleartk: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/...
Date Thu, 16 Jun 2016 15:33:02 GMT
Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,58 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class ClusterPairer extends ClusterMentionPairer_ImplBase {
+  private int sentDist;
+  public ClusterPairer(int dist){
+    sentDist = dist;
+  }
+
+  /*
+   * getPairs()
+   * Proposes clusters for the mention without limiting them to the (presumably shorter) window
+   * used by the sentence-distance pairer. A cluster is proposed when its head member ends before
+   * the mention begins, its most recent member (per ClusterUtils.getMostRecent) is no more than
+   * sentDist away according to EventCoreferenceAnnotator.sentDist, and at least two Markable
+   * members are counted up to and including that most recent member. The count is needed because
+   * in training a cluster may have several members but only one of them before the mention.
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+    for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList memberList = (NonEmptyFSList) chain.getMembers();
+      Annotation head = (Annotation) memberList.getHead();
+      // only clusters whose head member ends before the mention begins are candidates
+      boolean startsBeforeMention = head != null && mention.getBegin() > head.getEnd();
+      if(!startsBeforeMention) continue;
+
+      IdentifiedAnnotation latest = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList) chain.getMembers(), mention);
+      boolean inRange = latest != null && EventCoreferenceAnnotator.sentDist(jcas, latest, mention) <= sentDist;
+      if(!inRange) continue;
+
+      // count the Markable members up to and including the most recent one
+      int seen = 0;
+      for(Markable member : JCasUtil.select(chain.getMembers(), Markable.class)){
+        seen++;
+        if(member == latest) break;
+      }
+      if(seen > 1){
+        candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(chain, mention));
+      }
+    }
+    return candidates;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,54 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class ExactStringPairer extends ClusterMentionPairer_ImplBase {
+  // lower-cased covered text of every mention passed to getPairs() since the last reset()
+  private Set<String> markableStrings = null;
+
+  /* Starts an empty string set for a new document; must be called before getPairs(). */
+  @Override
+  public void reset(JCas jcas){
+    super.reset(jcas);
+    markableStrings = new HashSet<>();
+  }
+  /*
+   * For a mention whose exact (case-insensitive) string was already seen in this document,
+   * propose every cluster with a member, up to and including the most recent one relative to
+   * the mention, that has the same string. The sentence distance is deliberately not consulted.
+   * The mention's string is recorded last, so the first occurrence of a string yields no pairs.
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    String mentionText = mention.getCoveredText().toLowerCase(); // loop-invariant, computed once
+    if(markableStrings.contains(mentionText)){
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(m.getCoveredText().toLowerCase().equals(mentionText)){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+          // stop only after the most recent member itself has been compared
+          if(m == mostRecent) break;
+        }
+      }
+    }
+    markableStrings.add(mentionText);
+    return pairs;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,62 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class HeadwordPairer extends ClusterMentionPairer_ImplBase {
+  private Map<String, Set<Markable>> headWordMarkables = null; // lower-cased head word -> mentions seen since reset()
+
+  @Override
+  public void reset(JCas jcas){
+    super.reset(jcas);
+    headWordMarkables = new HashMap<>();
+  }
+  /* Proposes every cluster that has a member, up to and including the most recent one relative
+   * to the mention, with the same lower-cased nominal head word as the mention, then indexes
+   * the mention under that head word. A mention without a head node is logged and skipped. */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    if(headNode == null){
+      Logger.getLogger(MentionClusterCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+      return pairs;
+    }
+    String head = headNode.getCoveredText().toLowerCase();
+    if(headWordMarkables.containsKey(head)){
+      Set<Markable> headSet = headWordMarkables.get(head);
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(headSet.contains(m)){ // test each member; re-testing mostRecent made the loop pointless
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+          if(m == mostRecent) break;
+        }
+      }
+    }else{ // first mention with this head word: nothing to pair with yet
+      headWordMarkables.put(head, new HashSet<Markable>());
+    }
+    headWordMarkables.get(head).add(mention);
+    return pairs;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,73 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class SectionHeaderPairer extends ClusterMentionPairer_ImplBase {
+  private int sentDist;
+
+  public SectionHeaderPairer(int dist) {
+    this.sentDist = dist;
+  }
+
+  /*
+   * Proposes clusters that have a member on a line by itself (a section header or list item).
+   * Headers are modeled as Paragraph annotations (roughly the text between newlines) that cover
+   * exactly one Sentence; only paragraphs lying between offset 0 and the mention start are used.
+   * Clusters whose most recent member is within sentDist of the mention are skipped, since the
+   * "sentence distance" pairer is expected to propose them.
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    // header paragraphs depend only on the mention, so they are found once, on first use
+    List<Paragraph> headers = null;
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()) continue;
+
+      // clusters within sentence distance range are left to the other pair generator
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist) continue;
+
+      if(headers == null) headers = getHeaderParagraphs(jcas, mention);
+      if(hasMemberInHeader(members, headers)){
+        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    return pairs;
+  }
+
+  /* Returns the paragraphs between offset 0 and the mention start that cover exactly one sentence. */
+  private static List<Paragraph> getHeaderParagraphs(JCas jcas, Markable mention){
+    List<Paragraph> headers = new ArrayList<>();
+    for(Paragraph par : JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin())){
+      if(JCasUtil.selectCovered(jcas, Sentence.class, par).size() == 1) headers.add(par);
+    }
+    return headers;
+  }
+
+  /* True if dominates() holds for any header paragraph and any Markable member of the cluster. */
+  private boolean hasMemberInHeader(NonEmptyFSList members, List<Paragraph> headers){
+    for(Paragraph par : headers){
+      for(Markable m : JCasUtil.select(members, Markable.class)){
+        if(dominates(par, m)) return true;
+      }
+    }
+    return false;
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,70 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class SentenceDistancePairer extends ClusterMentionPairer_ImplBase {
+
+  private int sentDistance;
+
+  public SentenceDistancePairer(int distance){
+    this.sentDistance = distance;
+  }
+
+  /*
+   * Proposes only nearby clusters. The distance reported by EventCoreferenceAnnotator.sentDist
+   * between the most recent cluster member and the mention must not exceed sentDistance, unless
+   * the type names getBestEnt() returns for the mention itself include AnatomicalSiteMention or
+   * MedicationEventMention, in which case distance is not checked. In either case, when both the
+   * mention and the cluster have types, the cluster is proposed only if the sets share a type.
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    Set<String> bestAnaTypes = getBestEnt(jcas, mention);
+    // depends only on the mention, so decide once whether the distance limit applies
+    boolean checkDistance = !(bestAnaTypes.contains(AnatomicalSiteMention.class.getSimpleName()) ||
+        bestAnaTypes.contains(MedicationEventMention.class.getSimpleName()));
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()) continue;
+
+      if(checkDistance){
+        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+        if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > this.sentDistance) continue;
+      }
+
+      // skip when both correspond to named entities but share no category of named entity
+      Set<String> bestClusterTypes = getBestEnt(jcas, cluster);
+      if(bestAnaTypes.size() > 0 && bestClusterTypes.size() > 0){
+        boolean overlap = false;
+        for(String semType : bestAnaTypes){
+          if(bestClusterTypes.contains(semType)){
+            overlap = true;
+            break; // one shared type is enough
+          }
+        }
+        if(!overlap) continue;
+      }
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    }
+    return pairs;
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/ExtractSemTypePreferences.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/ExtractSemTypePreferences.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/ExtractSemTypePreferences.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/ExtractSemTypePreferences.java Thu Jun 16 15:33:01 2016
@@ -2,6 +2,7 @@ package org.apache.ctakes.coreference.da
 
 import java.io.FileNotFoundException;
 import java.io.PrintWriter;
+import java.net.MalformedURLException;
 import java.util.List;
 
 import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory;
@@ -25,7 +26,7 @@ import org.apache.uima.resource.Resource
 
 public class ExtractSemTypePreferences {
 
-  public static void main(String[] args) throws ResourceInitializationException, FileNotFoundException {
+  public static void main(String[] args) throws ResourceInitializationException, FileNotFoundException, MalformedURLException {
     CollectionReaderDescription reader = CollectionReaderFactory.createReaderDescription(LuceneCollectionReader.class,
         LuceneCollectionReader.PARAM_INDEX_DIR,
         args[0],

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Thu Jun 16 15:33:01 2016
@@ -24,13 +24,16 @@ import org.apache.ctakes.assertion.medfa
 import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
 import org.apache.ctakes.core.resource.FileLocator;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.core.util.ListFactory;
 import org.apache.ctakes.coreference.ae.CoreferenceChainScoringOutput;
 import org.apache.ctakes.coreference.ae.DeterministicMarkableAnnotator;
 import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator;
 import org.apache.ctakes.coreference.ae.MarkableSalienceAnnotator;
 import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
 import org.apache.ctakes.coreference.ae.MentionClusterRankingCoreferenceAnnotator;
 import org.apache.ctakes.coreference.ae.PersonChainAnnotator;
+import org.apache.ctakes.coreference.util.CoreferencePipelineFactory;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
 import org.apache.ctakes.temporal.ae.BackwardsTimeAnnotator;
@@ -60,6 +63,7 @@ import org.apache.ctakes.utils.distsem.W
 import org.apache.log4j.Level;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.analysis_engine.metadata.FixedFlow;
 import org.apache.uima.analysis_engine.metadata.FlowConstraints;
@@ -82,7 +86,6 @@ import org.apache.uima.flow.JCasFlow_Imp
 import org.apache.uima.flow.SimpleStep;
 import org.apache.uima.flow.Step;
 import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.cas.EmptyFSList;
 import org.apache.uima.jcas.cas.FSArray;
 import org.apache.uima.jcas.cas.FSList;
 import org.apache.uima.jcas.cas.FloatArray;
@@ -91,6 +94,11 @@ import org.apache.uima.jcas.tcas.Annotat
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.util.FileUtils;
 import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.jar.DataWriterFactory_ImplBase;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.EncodingDirectoryDataWriterFactory;
 import org.cleartk.ml.jar.JarClassifierBuilder;
 import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
 import org.cleartk.ml.svmlight.rank.SvmLightRankDataWriter;
@@ -128,11 +136,16 @@ public class EvaluationOfEventCoreferenc
     
     @Option(shortName="s")
     public String getScorerPath();
+    
+    @Option
+    public boolean getSkipTest();
   }
   
   private static Logger logger = Logger.getLogger(EvaluationOfEventCoreference.class);
   public static float COREF_PAIRS_DOWNSAMPLE = 0.5f;
   public static float COREF_CLUSTER_DOWNSAMPLE=0.5f;
+  private static final int NUM_SAMPLES = 0;
+  private static final double DROPOUT_RATE = 0.1;
   
   protected static ParameterSettings pairwiseParams = new ParameterSettings(DEFAULT_BOTH_DIRECTIONS, COREF_PAIRS_DOWNSAMPLE, "tk",
       1.0, 1.0, "linear", ComboOperator.SUM, 0.1, 0.5);
@@ -179,6 +192,9 @@ public class EvaluationOfEventCoreferenc
     if(options.getSkipDataWriting()){
       eval.skipWrite = true;
     }
+    if(options.getSkipTest()){
+      eval.skipTest = true;
+    }
     eval.evalType = options.getEvalSystem();
     eval.config = options.getConfig();
     goldOut = "gold." + eval.config + ".conll";
@@ -192,7 +208,7 @@ public class EvaluationOfEventCoreferenc
       FileUtils.deleteRecursive(workingDir);
     }
     
-    if(options.getUseExternalScorer()){
+    if(options.getUseExternalScorer() && !options.getSkipTest()){
       Pattern patt = Pattern.compile("(?:Coreference|BLANC): Recall: \\([^\\)]*\\) (\\S+)%.*Precision: \\([^\\)]*\\) (\\S+)%.*F1: (\\S+)%");
       Runtime runtime = Runtime.getRuntime();
       Process p = runtime.exec(new String[]{
@@ -229,7 +245,8 @@ public class EvaluationOfEventCoreferenc
   
   boolean skipTrain=false; 
   boolean skipWrite=false;
-  public enum EVAL_SYSTEM { BASELINE, MENTION_PAIR, MENTION_CLUSTER, CLUSTER_RANK };
+  boolean skipTest=false;
+  public enum EVAL_SYSTEM { BASELINE, MENTION_PAIR, MENTION_CLUSTER, CLUSTER_RANK, PERSON_ONLY };
   EVAL_SYSTEM evalType;
   String config=null;
   
@@ -250,8 +267,10 @@ public class EvaluationOfEventCoreferenc
   protected void train(CollectionReader collectionReader, File directory)
       throws Exception {
     if(skipTrain) return;
+    if(this.evalType == EVAL_SYSTEM.BASELINE || this.evalType == EVAL_SYSTEM.PERSON_ONLY) return;
     if(!skipWrite){
       AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
       aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
       aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
       aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
@@ -259,7 +278,6 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
 
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
-      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
@@ -269,6 +287,7 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
       //    aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class, CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME));
       aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
       if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
@@ -281,22 +300,46 @@ public class EvaluationOfEventCoreferenc
             params.probabilityOfKeepingANegativeExample
             ));
       }else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
-//        aggregateBuilder.add(EventCoreferenceAnnotator.createScoringAnnotatorDescription("/org/apache/ctakes/coreference/mention-pair" + File.separator + "model.jar"));
-        aggregateBuilder.add(MentionClusterCoreferenceAnnotator.createDataWriterDescription(
-//            LibSvmStringOutcomeDataWriter.class,
+        AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(
+            MentionClusterCoreferenceAnnotator.class,
+            CleartkAnnotator.PARAM_IS_TRAINING,
+            true,
+            MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+            params.probabilityOfKeepingANegativeExample,
+            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
             LibLinearStringOutcomeDataWriter.class,
-//            MalletStringOutcomeDataWriter.class,
-//            SvmLightStringOutcomeDataWriter.class, 
-//            TkLibSvmStringOutcomeDataWriter.class,
-            directory,
-            params.probabilityOfKeepingANegativeExample
-            ));
+            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+            directory);        
+        aggregateBuilder.add(aed);
+        for(int i = 0; i < NUM_SAMPLES; i++){
+          // after each iteration, remove the gold chains in the system view and re-copy over gold chains with some variation:
+          aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemoveAllCoreferenceAnnotations.class));
+          aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class, CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME, CopyCoreferenceRelations.PARAM_DROP_ELEMENTS, true));          
+
+          aed = AnalysisEngineFactory.createEngineDescription(
+              MentionClusterCoreferenceAnnotator.class,
+              CleartkAnnotator.PARAM_IS_TRAINING,
+              true,
+              MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+              params.probabilityOfKeepingANegativeExample,
+              MentionClusterCoreferenceAnnotator.PARAM_USE_EXISTING_ENCODERS,
+              true,
+              DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+              LibLinearStringOutcomeDataWriter.class,
+              DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+              directory);
+              
+          aggregateBuilder.add(aed);
+          
+        }
       }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
         // TODO
         aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createDataWriterDescription(
             SvmLightRankDataWriter.class, 
             directory, 
             params.probabilityOfKeepingANegativeExample));
+      }else{
+        logger.warn("Encountered a training configuration taht does not add an annotator: " + this.evalType);
       }
       Logger.getLogger(EventCoreferenceAnnotator.class).setLevel(Level.WARN);
       // create gold chains for writing out which we can then use for our scoring tool
@@ -334,13 +377,18 @@ public class EvaluationOfEventCoreferenc
   @Override
   protected AnnotationStatistics<String> test(
       CollectionReader collectionReader, File directory) throws Exception {
+    AnnotationStatistics<String> corefStats = new AnnotationStatistics<>();
+    if(this.skipTest){
+      logger.info("Skipping test");
+      return corefStats;
+    }
     AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
     aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
     aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
-    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
@@ -354,6 +402,7 @@ public class EvaluationOfEventCoreferenc
         GOLD_VIEW_NAME));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
     aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
     if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
       aggregateBuilder.add(EventCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
@@ -362,6 +411,10 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(MentionClusterCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
     }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
       aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
+    }else if(this.evalType == EVAL_SYSTEM.BASELINE){
+      aggregateBuilder.add(CoreferencePipelineFactory.getCoreferencePipeline());
+    }else{
+      logger.info("Running an evaluation that does not add an annotator: " + this.evalType);
     }
 //    aggregateBuilder.add(CoreferenceChainAnnotator.createAnnotatorDescription());
     aggregateBuilder.add(PersonChainAnnotator.createAnnotatorDescription());
@@ -383,7 +436,6 @@ public class EvaluationOfEventCoreferenc
       }
     };
      
-    AnnotationStatistics<String> corefStats = new AnnotationStatistics<>();
 
     for(Iterator<JCas> casIter =new JCasIterator(collectionReader, aggregateBuilder.createAggregate()); casIter.hasNext();){
       JCas jCas = casIter.next();
@@ -602,6 +654,10 @@ public class EvaluationOfEventCoreferenc
     @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=true, description="View containing gold standard annotations")
     private String goldViewName;
     
+    public static final String PARAM_DROP_ELEMENTS = "Dropout";
+    @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory=false)
+    private boolean dropout = false;
+
     @SuppressWarnings("synthetic-access")
     @Override
     public void process(JCas jcas) throws AnalysisEngineProcessException {
@@ -620,45 +676,20 @@ public class EvaluationOfEventCoreferenc
       
       for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
         FSList head = goldChain.getMembers();
-        NonEmptyFSList sysList = new NonEmptyFSList(jcas);
-        NonEmptyFSList listEnd = sysList;
+//        NonEmptyFSList sysList = new NonEmptyFSList(jcas);
+//        NonEmptyFSList listEnd = sysList;
+        List<List<Markable>> systemLists = new ArrayList<>(); // the gold list can be split up into many lists if we allow dropout.
         boolean removeChain = false;
+        List<Markable> prevList = null;
         
         // first one is guaranteed to be nonempty otherwise it would not be in cas
         do{
           NonEmptyFSList element = (NonEmptyFSList) head;
-          // if this is not first time through move listEnd to end.
-          if(listEnd.getHead() != null){
-            listEnd.setTail(new NonEmptyFSList(jcas));
-            listEnd.addToIndexes();
-            listEnd = (NonEmptyFSList) listEnd.getTail();
-          }
           Markable goldMarkable = (Markable) element.getHead();
-          if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
-            
-          
-            ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
-
-            for(Markable sysMarkable : depIndex.get(headNode)){
-              ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
-              if(markNode == headNode){
-                gold2sys.put(goldMarkable, sysMarkable);
-                break;
-              }
-            }
-//            if(!gold2sys.containsKey(goldMarkable)){
-//              Markable mappedGold = new Markable(jcas, goldMarkable.getBegin(), goldMarkable.getEnd());
-//              mappedGold.addToIndexes();
-//            }
-          }else{
-            // Have seen some instances where anafora writes a span that is not possible, log them
-            // so they can be found and fixed:
-            logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n", 
-                goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
-          }
+          boolean mapped = mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex);
           
-          // add markable to end of list:
-          if(gold2sys.get(goldMarkable) == null){
+          // if we can't align the gold markable with one in the system cas then don't add it:
+          if(!mapped){
             String text = "<Out of bounds>";
             if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
               text = goldMarkable.getCoveredText();
@@ -668,20 +699,50 @@ public class EvaluationOfEventCoreferenc
             removeChain = true;
             break;
           }
-          listEnd.setHead(gold2sys.get(goldMarkable));
           
+          Markable sysMarkable = gold2sys.get(goldMarkable);
+          if(!dropout || systemLists.size() == 0){
+            if(systemLists.size() == 0) systemLists.add(new ArrayList<>());
+            systemLists.get(0).add(sysMarkable);
+//            prevList = systemLists.get(0);
+//            // if this is not first time through move listEnd to end.
+//            if(listEnd.getHead() != null){
+//              listEnd.setTail(new NonEmptyFSList(jcas));
+//              listEnd.addToIndexes();
+//              listEnd = (NonEmptyFSList) listEnd.getTail();
+//            }
+//
+//            // add markable to end of list:
+//            listEnd.setHead(gold2sys.get(goldMarkable));
+          }else{
+            // 3 options: i) Do correctly (append to same list as last element), ii) Start its own list, iii) Randomly join another list
+            if(Math.random() > DROPOUT_RATE){
+              // most of the time do the right thing:
+              systemLists.get(0).add(sysMarkable);
+            }else{
+              int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
+              if(listIndex == systemLists.size()){
+                systemLists.add(new ArrayList<>());
+              }
+              systemLists.get(listIndex).add(sysMarkable);
+            }
+          }
           head = element.getTail();
         }while(head instanceof NonEmptyFSList);
         
         // don't bother copying over -- the gold chain was of person mentions
-        if(!removeChain){      
-          listEnd.setTail(new EmptyFSList(jcas));
-          listEnd.addToIndexes();
-          listEnd.getTail().addToIndexes();
-          sysList.addToIndexes();
-          CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
-          sysRel.setMembers(sysList);
-          sysRel.addToIndexes();
+        if(!removeChain){
+//          listEnd.setTail(new EmptyFSList(jcas));
+//          listEnd.addToIndexes();
+//          listEnd.getTail().addToIndexes();
+//          sysList.addToIndexes();
+          for(List<Markable> chain : systemLists){
+            if(chain.size() > 1){
+              CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
+              sysRel.setMembers(ListFactory.buildList(jcas, chain));
+              sysRel.addToIndexes();
+            }
+          }
         }
       }
       
@@ -704,8 +765,55 @@ public class EvaluationOfEventCoreferenc
           sysRel.addToIndexes();        
         }
       }
+    }
+    
+    /**
+     * Tries to align a gold markable with a system markable that has the same nominal head node.
+     * When a match is found the pair is stored in {@code gold2sys}.
+     *
+     * @param jcas CAS whose document text length bounds the valid span range; also passed to the head finder
+     * @param goldMarkable the gold markable to align
+     * @param gold2sys output map; receives {@code goldMarkable -> sysMarkable} on success
+     * @param depIndex candidate system markables keyed by dependency node
+     * @return {@code true} if a mapping was added; {@code false} if the span is out of bounds or
+     *         no candidate shares the gold markable's head node
+     */
+    private static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex){
+      int docLength = jcas.getDocumentText().length();
+      if(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= docLength){
+        // Have seen some instances where anafora writes a span that is not possible, log them
+        // so they can be found and fixed:
+        logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n", 
+            goldMarkable.getBegin(), goldMarkable.getEnd(), docLength));
+        return false;
+      }
+
+      ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+      Collection<Markable> candidates = depIndex.get(headNode);
+      if(candidates == null){
+        // nothing is indexed under this head node, so there is nothing to align with
+        return false;
+      }
+
+      for(Markable sysMarkable : candidates){
+        ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
+        // identity comparison: both sides must resolve to the very same node object
+        if(markNode == headNode){
+          gold2sys.put(goldMarkable, sysMarkable);
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+  
+  /**
+   * Annotator that removes all coreference output from the CAS indexes: every
+   * {@link CollectionTextRelation} together with the non-empty list cells of its member list,
+   * and every {@link CoreferenceRelation} together with its two arguments.
+   */
+  public static class RemoveAllCoreferenceAnnotations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      // work on a snapshot so that removals do not disturb the collection being iterated
+      List<CollectionTextRelation> chains = new ArrayList<>(JCasUtil.select(jcas, CollectionTextRelation.class));
+      for(CollectionTextRelation chain : chains){
+        // walk the member list cell by cell; the instanceof test also covers a null or
+        // empty member list, which an unconditional cast would not survive
+        FSList node = chain.getMembers();
+        while(node instanceof NonEmptyFSList){
+          NonEmptyFSList cell = (NonEmptyFSList) node;
+          cell.removeFromIndexes();
+          node = cell.getTail();
+        }
+        chain.removeFromIndexes();
+      }
+      List<CoreferenceRelation> rels = new ArrayList<>(JCasUtil.select(jcas, CoreferenceRelation.class));
+      for(CoreferenceRelation rel : rels){
+        if(rel.getArg1() != null){
+          rel.getArg1().removeFromIndexes();
+        }
+        if(rel.getArg2() != null){
+          rel.getArg2().removeFromIndexes();
+        }
+        rel.removeFromIndexes();
+      }
     }    
   }
+  
   public static class RemovePersonMarkables extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
 
     @Override
@@ -726,11 +834,10 @@ public class EvaluationOfEventCoreferenc
         if(coveredTokens.size() == 1 && coveredTokens.get(0).getPartOfSpeech().startsWith("PRP") &&
             !markable.getCoveredText().toLowerCase().equals("it")){
           toRemove.add(markable);
-        }else if((coveredTokens.size() == 2 || coveredTokens.size() == 3) && 
-            (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||
+        }else if(coveredTokens.size() > 0 && (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||
                 coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms."))){
           toRemove.add(markable);
-        }else if(markable.getCoveredText().toLowerCase().equals("patient")){
+        }else if(markable.getCoveredText().toLowerCase().endsWith("patient") || markable.getCoveredText().toLowerCase().equals("pt")){
           toRemove.add(markable);
         }
       }

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,107 @@
+package org.apache.ctakes.coreference.extractors;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bounds;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Context;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+/**
+ * A {@link Context} that wraps other contexts and turns their extracted feature values into a
+ * dense representation: for each wrapped context, the word vectors of all values found in the
+ * vector file are averaged and emitted as one numeric feature per dimension.
+ */
+public class ContinuousBag implements Context {
+
+  private final Context[] contexts;
+  private final String name;
+  private final Map<String, double[]> vectors;
+  // dimensionality of the loaded vectors; set while reading the vector file
+  private int dims;
+  
+  /**
+   * @param vecFile whitespace-separated text file with one "term v1 v2 ... vn" entry per line
+   * @param contexts the contexts whose extracted values are looked up and averaged
+   * @throws FileNotFoundException if {@code vecFile} cannot be opened
+   * @throws IllegalArgumentException if the file contains vectors of differing lengths
+   */
+  public ContinuousBag(File vecFile, Context... contexts) throws FileNotFoundException {
+    this.contexts = contexts;
+    this.vectors = readVectorFile(vecFile);
+    this.name = Feature.createName("ContinuousBag");
+  }
+  
+  /**
+   * Reads the vector file into a term -> vector map and records the vector dimensionality in
+   * {@link #dims}. Blank lines and two-token lines (a dimensions header) are skipped.
+   */
+  private Map<String, double[]> readVectorFile(File vecFile) throws FileNotFoundException{
+    Map<String, double[]> vectorMap = new HashMap<>();
+    int expectedDims = -1;
+    try(Scanner scanner = new Scanner(vecFile)){
+      while(scanner.hasNextLine()){
+        String line = scanner.nextLine().trim();
+        if(line.isEmpty()) continue; // a blank line must not reset the dimensionality to 0
+        String[] termVec = line.split("\\s+");
+        if(termVec.length == 2) continue; // some files have the first line with the dimensions
+        int lineDims = termVec.length-1;
+        if(expectedDims < 0){
+          expectedDims = lineDims;
+        }else if(lineDims != expectedDims){
+          // mixing lengths would later overrun or under-fill the summed context vector
+          throw new IllegalArgumentException("Inconsistent vector length for term '" + termVec[0] + "' in "
+              + vecFile + ": expected " + expectedDims + " but found " + lineDims);
+        }
+        double[] vector = new double[lineDims];
+        for(int i = 0; i < lineDims; i++){
+          vector[i] = Double.parseDouble(termVec[i+1]);
+        }
+        vectorMap.put(termVec[0], vector);
+      }
+    }
+    dims = Math.max(expectedDims, 0);
+    return vectorMap;
+  }
+  
+  @Override
+  public String getName() {
+    return this.name;
+  }
+
+  /**
+   * For each wrapped context, averages the vectors of the extracted feature values that have an
+   * entry in the vector map and emits one feature per dimension. A context with no known values
+   * contributes no features.
+   */
+  @Override
+  public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas, Annotation focusAnnotation, Bounds bounds,
+      Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    for (Context context : this.contexts) {
+      double[] contextVec = new double[dims];
+      int numComponents = 0;
+      for (Feature feature : context.extract(
+          jCas,
+          focusAnnotation,
+          bounds,
+          annotationClass,
+          extractor)) {
+        
+        double[] featVec = lookupVector(feature.getValue());
+        if(featVec != null){
+          addToVector(contextVec, featVec);
+          numComponents++;
+        }
+      }
+      if(numComponents > 0){
+        for(int i = 0; i < dims; i++){
+          feats.add(new Feature(Feature.createName(this.name, context.getName(), String.valueOf(i)), contextVec[i] / numComponents));
+        }
+      }
+    }
+    return feats;
+  }
+  
+  /**
+   * Finds the vector for a feature value: exact string first, then its lower-cased form.
+   * The key that is tested is always the key that is fetched, so the result is never a
+   * null vector for a value reported as present.
+   *
+   * @return the vector, or {@code null} if the value is null or unknown
+   */
+  private double[] lookupVector(Object value){
+    if(value == null){
+      return null;
+    }
+    String key = value.toString();
+    double[] vec = this.vectors.get(key);
+    if(vec == null){
+      vec = this.vectors.get(key.toLowerCase(java.util.Locale.ROOT));
+    }
+    return vec;
+  }
+  
+  /** Adds {@code vec2} to {@code vec1} element-wise, in place. */
+  private static void addToVector(double[] vec1, double[] vec2){
+    for(int i = 0; i < vec1.length; i++){
+      vec1[i] += vec2[i];
+    }
+  }
+  
+  /** Placeholder context: currently extracts nothing and always returns an empty list. */
+  public static class Surrounding implements CleartkExtractor.Context {
+
+    @Override
+    public String getName() {
+      return "Surrounding";
+    }
+
+    @Override
+    public <SEARCH_T extends Annotation> List<Feature> extract(JCas jCas, Annotation focusAnnotation, Bounds bounds,
+        Class<SEARCH_T> annotationClass, FeatureExtractor1<SEARCH_T> extractor) throws CleartkExtractorException {
+      List<Feature> feats = new ArrayList<>();
+      
+      return feats;
+    }
+    
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,61 @@
+package org.apache.ctakes.coreference.extractors;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.NamedFeatureExtractor1;
+
+/**
+ * Feature extractor that represents a token by its word-embedding vector, emitting one numeric
+ * feature per vector component. Tokens without an embedding produce no features.
+ */
+public class ContinuousTextExtractor implements NamedFeatureExtractor1<BaseToken>  {
+  private final WordEmbeddings words;
+  
+  /**
+   * @param vecFile location of the embeddings resource, resolved through {@link FileLocator}
+   * @throws CleartkExtractorException wrapping the I/O failure if the embeddings cannot be read
+   */
+  public ContinuousTextExtractor(String vecFile) throws CleartkExtractorException {
+    try {
+      words = WordVectorReader.getEmbeddings(FileLocator.getAsStream(vecFile));
+    } catch (IOException e) {
+      // the cause travels with the rethrown exception; no separate stack-trace dump needed
+      throw new CleartkExtractorException(e);
+    }
+  }
+  
+  /**
+   * Looks the token text up in the embeddings, first as written and then lower-cased.
+   *
+   * @return one feature per vector component, named {@code ContinuousText_<index>}; an empty
+   *         list when neither form of the token text has an embedding
+   */
+  @Override
+  public List<Feature> extract(JCas view, BaseToken token) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    
+    String key = token.getCoveredText();
+    if(!words.containsKey(key)){
+      // fall back to the lower-cased form; ROOT keeps the mapping locale-independent
+      key = key.toLowerCase(java.util.Locale.ROOT);
+      if(!words.containsKey(key)){
+        return feats;
+      }
+    }
+    
+    WordVector vec = words.getVector(key);
+    for(int i = 0; i < vec.size(); i++){
+      feats.add(new Feature(getFeatureName() + "_" + i, vec.getValue(i)));
+    }
+    return feats;
+  }
+
+  @Override
+  public String getFeatureName() {
+    return "ContinuousText";
+  }
+    
+}



Mime
View raw message