ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1687518 [1/3] - in /ctakes/sandbox/ctakes-coref-cleartk/src/main: java/org/apache/ctakes/coreference/ae/ java/org/apache/ctakes/coreference/ae/features/ java/org/apache/ctakes/coreference/ae/features/cluster/ java/org/apache/ctakes/corefer...
Date Thu, 25 Jun 2015 13:40:58 GMT
Author: tmill
Date: Thu Jun 25 13:40:52 2015
New Revision: 1687518

URL: http://svn.apache.org/r1687518
Log:
Changes to coref over last several weeks.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/ctakes/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/ctakes/coreference/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/ctakes/coreference/distsem/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/ctakes/coreference/distsem/mimic_vectors.txt
    ctakes/sandbox/ctakes-coref-cleartk/src/main/resources/org/apache/ctakes/coreference/pref_probs.txt
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java?rev=1687518&r1=1687517&r2=1687518&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableSalienceAnnotator.java Thu Jun 25 13:40:52 2015
@@ -5,6 +5,7 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.salience.ClinicalFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.salience.GrammaticalRoleFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.salience.MorphosyntacticFeatureExtractor;

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1687518&r1=1687517&r2=1687518&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Thu Jun 25 13:40:52 2015
@@ -1,25 +1,43 @@
 package org.apache.ctakes.coreference.ae;
 
 import java.io.File;
+import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
+import java.util.Set;
 
 import org.apache.ctakes.core.util.ListFactory;
-import org.apache.ctakes.core.util.ListIterable;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
-import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterPartOfSpeechFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
-import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
@@ -40,8 +58,6 @@ import org.cleartk.ml.jar.DirectoryDataW
 import org.cleartk.ml.jar.GenericJarClassifierFactory;
 import org.cleartk.util.ViewUriUtil;
 
-import com.google.common.collect.Lists;
-
 public class MentionClusterCoreferenceAnnotator extends CleartkAnnotator<String> {
   public static final String NO_RELATION_CATEGORY = "-NONE-";
   public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
@@ -54,6 +70,8 @@ public class MentionClusterCoreferenceAn
 
   protected Random coin = new Random(0);
 
+  boolean greedyFirst = true;
+  
   public static AnalysisEngineDescription createDataWriterDescription(
       Class<? extends DataWriter<String>> dataWriterClass,
       File outputDirectory,
@@ -82,42 +100,238 @@ public class MentionClusterCoreferenceAn
 
   private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> featureExtractors = this.getFeatureExtractors();
 
+  private Set<String> markableStrings = null;
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+  private Map<String,Set<Markable>> headWordMarkables = null;
+  
   protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
-    return Lists.newArrayList(
-        new MentionClusterAgreementFeaturesExtractor(),
-//        new MentionClusterPartOfSpeechFeaturesExtractor(),
-        new MentionClusterStringFeaturesExtractor()
-        );
+    List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterStringFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterStackFeaturesExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+    
+    try {
+      extractors.add(new MentionClusterDistSemExtractor());
+      extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    
+    return extractors;
   }
   
   protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
       JCas jcas,
       IdentifiedAnnotation mention){
+    int sentDist = 5;
+    // using linked hash set ensures no duplicates:
+    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
+    if(mention.getCoveredText().equalsIgnoreCase("this")){
+      pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
+      pairs.addAll(getClusterPairs(jcas, mention, 3));
+    }else{
+      pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
+      pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
+      pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
+      pairs.addAll(getExactStringMatchPairs(jcas, mention, sentDist));
+    }
+//    pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
+    
+    return pairs;
+  }
+  
+  /*
+   * getExactStringMatchPairs()
+   * For mentions that have the exact string repeated elsewhere in the document we want to
+   * allow matching across any distance. We don't use the sentence distance parameter here.
+   * We make use of a global variable markableStrings that is a HashSet containing all the markable
+   * strings from this document.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getExactStringMatchPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    
+    if(markableStrings.contains(mention.getCoveredText().toLowerCase())){
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(m == mostRecent) break;
+          // see if any of the members of the cluster have the exact same string as this mention
+          if(m.getCoveredText().toLowerCase().equals(mention.getCoveredText().toLowerCase())){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+        }
+      }
+    }
+    return pairs;
+  }
+  
+  /*
+   * getClusterPairs()
+   * In this method we allow linking to clusters containing more than one mention even if they
+   * are beyond a sentence distance. First we check whether the most recent mention in the cluster
+   * is within the specified sentence distance (presumably longer than the sentence distance passed into
+   * the method that constrains by distance). The wrinkle is that during training many clusters will have multiple
+   * members but only one before the focus mention. So we need to count the members of a cluster until we 
+   * get to the most recent one in the cluster. If that value is > 1 then we allow the pairing.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getClusterPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
     List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
     for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
+        continue;
+      }
+      int numMembers=0;
+      for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        numMembers++;
+        if(m == mostRecent) break;
+      }
+      if(numMembers > 1){
+        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    
+    return pairs;
+  }
+
+  /*
+   * Here we want to add only things that are nearby. First we check the semantic types
+   * of the cluster we're comparing against. If any member is an Anatomical Site or Medication,
+   * we add the cluster no matter what. Otherwise we check how many sentences are in between
+   * the mention and the latest element of the cluster.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    Set<String> bestAnaTypes = getBestEnt(jcas, (Markable) mention);
+    
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
       NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
       Annotation first = (Annotation) members.getHead();
       if(first == null || mention.getBegin() <= first.getEnd()) continue;
       
-      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) getMostRecent(members, mention);
-      if(EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > 5) continue;
-      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      // check for distance if they are not anatomical site or medication
+      if(!(bestAnaTypes.contains(AnatomicalSiteMention.class.getSimpleName()) ||
+          bestAnaTypes.contains(MedicationEventMention.class.getSimpleName()))){
+
+        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+        if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist) continue;
+      }
+
+      // check for types of cluster
+      Set<String> bestClusterTypes = getBestEnt(jcas, cluster);
+      if(bestAnaTypes.size() > 0 && bestClusterTypes.size() > 0){
+        boolean overlap = false;
+        for(String semType : bestAnaTypes){
+          if(bestClusterTypes.contains(semType)){
+            overlap = true;
+          }
+        }
+        // they both correspond to named entities but no overlap in which category of named entity.
+        if(!overlap){
+          continue;
+        }
+      }
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));      
+    }
+    return pairs;
+  }
+
+  /*
+   * getSectionHeaderPairs()
+   * Here we want to add clusters where one of the members is on a line all by itself (a section header)
+   * To do this we leverage the annotation of Paragraphs, roughly the areas between newlines. If such a 
+   * span only contains one sentence then we consider it a "header" (or, just as important, a list item).
+   * If it is a header we add it. Here we use sentDist to not bother adding things that will be added by
+   * the "sentence distance" method.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // first check if it is within sentence distance range -- if so we can ignore it because it will be included by the other pair generator
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){
+        continue;
+      }
+      
+      // now check if any of the mentions are in a section header
+      List<Paragraph> pars = JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin());
+      for(int j = 0; j < pars.size(); j++){
+        boolean match = false;
+        Paragraph par = pars.get(j); // pars.get(pars.size()-j-1);
+        List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+        if(coveredSents != null && coveredSents.size() == 1){
+          // this is sentences that are the same span as paragraphs -- how we model section headers
+          // see if any of the cluster mentions are in the section header
+          for(Markable m : JCasUtil.select(members, Markable.class)){
+            if(dominates(par, m)){
+              pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+              match = true;
+              break;
+            }
+          }
+        }
+        if(match) break;
+      }
     }
     return pairs;
   }
+  
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
 
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    String head = headNode.getCoveredText().toLowerCase();
+    if(headWordMarkables.containsKey(head)){
+      Set<Markable> headSet = headWordMarkables.get(head);
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(m == mostRecent) break;
+          if(headSet.contains(mostRecent)){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+        }
+      }      
+    }
+    
+    return pairs;
+  }
+  
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException {
     // lookup from pair of annotations to binary text relation
     // note: assumes that there will be at most one relation per pair
+    markableStrings = new HashSet<>();
+    nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    headWordMarkables = new HashMap<>();
+    
     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
     relationLookup = new HashMap<>();
     if (this.isTraining()) {
       for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
-        for(IdentifiedAnnotation mention : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+        for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          if(mention.getCoveredText().equalsIgnoreCase("this")){
+            System.err.println("Found a mention of 'this' as a pronoun");
+          }
           CollectionTextRelationIdentifiedAnnotationRelation relation = 
               new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
-//          IdentifiedAnnotation mention = (IdentifiedAnnotation) arg.getArgument();
           relation.setCluster(cluster);
           relation.setMention(mention);
           relation.setCategory("CoreferenceClusterMember");
@@ -137,16 +351,24 @@ public class MentionClusterCoreferenceAn
     
     for(Segment segment : JCasUtil.select(jCas, Segment.class)){
       for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+        String mentionText = mention.getCoveredText().toLowerCase();
         boolean singleton = true;
+        double maxScore = 0.0;
+        CollectionTextRelation maxCluster = null;
+        
         for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
           CollectionTextRelation cluster = pair.getCluster();
           // apply all the feature extractors to extract the list of features
           List<Feature> features = new ArrayList<>();
           for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.featureExtractors) {
             List<Feature> feats = extractor.extract(jCas, cluster, mention);
-            if (feats != null)  features.addAll(feats);
+            if (feats != null){
+              Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
+              features.addAll(feats);
+            }
           }
 
+          List<Feature> dupFeatures = new ArrayList<>();
           // sanity check on feature values
           for (Feature feature : features) {
             if (feature.getValue() == null) {
@@ -154,8 +376,13 @@ public class MentionClusterCoreferenceAn
               String message = String.format("Null value found in %s from %s", feature, features);
               System.err.println(message);
               //            throw new IllegalArgumentException(String.format(message, feature, features));
+//            }else{
+//              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+//                dupFeatures.add(new Feature("PRO+"+feature.getName(), feature.getValue()));
+//              }
             }
           }
+          features.addAll(dupFeatures);
 
           // during training, feed the features to the data writer
           if (this.isTraining()) {
@@ -166,23 +393,51 @@ public class MentionClusterCoreferenceAn
 
             // create a classification instance and write it to the training data
             this.dataWriter.write(new Instance<>(category, features));
+            if(!category.equals(NO_RELATION_CATEGORY)){
+              break;
+            }
           }
 
           // during classification feed the features to the classifier and create
           // annotations
           else {
             String predictedCategory = this.classify(features);
-
+            // TODO look at scores in classifier and try best-pair rather than first-pair?
+            Map<String,Double> scores = this.classifier.score(features);
+            
             // add a relation annotation if a true relation was predicted
             if (!predictedCategory.equals(NO_RELATION_CATEGORY)) {
-              createRelation(jCas, cluster, mention, predictedCategory);
-              singleton = false;
-              // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
-              // for "best first" need to keep track of all relations with scores and only keep the highest
-              break;
+//              Logger.getLogger("MCAnnotator").info(String.format("Making a pair with score %f", scores.get(predictedCategory)));
+              if(greedyFirst){
+                createRelation(jCas, cluster, mention, predictedCategory);
+                singleton = false;
+                // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
+                // for "best first" need to keep track of all relations with scores and only keep the highest
+                break;
+              }else{
+                if(scores.get(predictedCategory) > maxScore){
+                  maxScore = scores.get(predictedCategory);
+                  maxCluster = cluster;
+                }
+              }
             }
           }
         }
+        if(!greedyFirst && maxCluster != null){
+          // make a link with the max cluster
+          createRelation(jCas, maxCluster, mention, "CoreferenceClusterMember");
+        }
+        
+        markableStrings.add(mention.getCoveredText().toLowerCase());
+        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+        if(headNode != null){
+          String head = headNode.getCoveredText().toLowerCase();
+          if(!headWordMarkables.containsKey(head)){
+            headWordMarkables.put(head, new HashSet<Markable>());
+          }
+          headWordMarkables.get(head).add(mention);
+        }
+        
         // if we got this far and never matched up the 
         if(singleton){
           // make the markable it's own cluster:
@@ -267,20 +522,49 @@ public class MentionClusterCoreferenceAn
     ListFactory.append(jCas, cluster.getMembers(), mention);    
   }
 
-  private static Annotation getMostRecent(NonEmptyFSList list, Annotation focus){
-    NonEmptyFSList cur = list;
-    Annotation annot = (Annotation) cur.getHead();
-    
-    while(cur.getTail() instanceof NonEmptyFSList){
-      cur = (NonEmptyFSList) cur.getTail();
-      if(((Annotation)cur.getHead()).getEnd() < focus.getEnd()){
-        annot = (Annotation) cur.getHead();
-      }else{
-        break;
+
+  private static final boolean dominates(Annotation arg1, Annotation arg2) {
+    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+  }
+
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> semTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      semTypes.addAll(getBestEnt(jcas, member));
+    }
+    return semTypes;
+  }
+  
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
       }
     }
 
-    return annot;
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
   }
   
   public static class CollectionTextRelationIdentifiedAnnotationPair {

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1687518&r1=1687517&r2=1687518&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Thu Jun 25 13:40:52 2015
@@ -133,7 +133,7 @@ public class UMLSFeatureExtractor implem
 		return feats;
 	}
 
-	private static String getDocId(JCas jcas) throws AnalysisEngineProcessException {
+	public static String getDocId(JCas jcas) throws AnalysisEngineProcessException {
 	  String docId = null;
 	  
 	  docId = DocumentIDAnnotationUtil.getDocumentID(jcas);

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,62 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyPath;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+public class MentionClusterDepHeadExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    ConllDependencyNode mentionHead = DependencyUtility.getNominalHeadNode(jCas, mention);
+    Set<String> memberHeads = new HashSet<>();
+    Set<String> memberPaths = new HashSet<>();
+    
+    for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+      if(member.getBegin() > mention.getEnd()) break;
+      ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+      if(memberHead != null){
+        String headWord = memberHead.getCoveredText().toLowerCase();
+        memberHeads.add(headWord);
+        memberPaths.add(memberHead.getDeprel());
+      }
+//      DependencyPath path = DependencyUtility.getPathToTop(jCas, memberHead);
+    }
+    for(String headWord : memberHeads){
+//      feats.add(new Feature("MemberHead", headWord));
+    }
+    for(String path : memberPaths){
+      feats.add(new Feature("MemberRel", path));
+    }
+    
+    if(mentionHead != null){
+      String headWord = mentionHead.getCoveredText().toLowerCase();
+      feats.add(new Feature("MentionRel", mentionHead.getDeprel()));
+//      feats.add(new Feature("MentionHead", headWord));
+      if(memberHeads.contains(headWord) && !StringMatchingFeatureExtractor.isPronoun(mention)){
+        feats.add(new Feature("ClusterHeadMatchesMentionHead", true));
+      }
+    }
+    
+    return feats;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,68 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Emits a single "HEAD_SIMILARITY_WORD2VEC" feature holding the largest
+ * similarity, according to the loaded word embeddings, between the head word
+ * of the mention and the head word of any cluster member. Pronoun mentions
+ * yield no feature at all.
+ */
+public class MentionClusterDistSemExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  public static final double DEFAULT_SIM = 0.5;  
+  
+  private WordEmbeddings words = null;
+  
+  /**
+   * Loads the embeddings from the resource
+   * org/apache/ctakes/coreference/distsem/mimic_vectors.txt.
+   */
+  public MentionClusterDistSemExtractor() throws FileNotFoundException, IOException{
+    words = WordVectorReader.getEmbeddings(
+        FileLocator.getAsStream("org/apache/ctakes/coreference/distsem/mimic_vectors.txt"));
+  }
+
+  /**
+   * @return an empty list for pronoun mentions, otherwise a one-element list
+   *         with the maximum head-word similarity (0.0 when nothing could be
+   *         compared, 1.0 when a member has the identical head word)
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+
+    // do not bother with pronouns
+    if(StringMatchingFeatureExtractor.isPronoun(mention)){
+      return features;
+    }
+
+    double bestSim = 0.0;
+    ConllDependencyNode mentionNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+
+    if(mentionNode != null){
+      String mentionHead = mentionNode.getCoveredText().toLowerCase();
+      for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+        ConllDependencyNode memberNode = DependencyUtility.getNominalHeadNode(jCas, member);
+        if(memberNode == null){
+          continue;
+        }
+        String memberHead = memberNode.getCoveredText().toLowerCase();
+
+        // identical head words: similarity cannot get any higher
+        if(mentionHead.equals(memberHead)){
+          bestSim = 1.0;
+          break;
+        }
+
+        boolean bothKnown = words.containsKey(memberHead) && words.containsKey(mentionHead);
+        if(bothKnown){
+          double candidate = words.getSimilarity(mentionHead, memberHead);
+          if(candidate > bestSim){
+            bestSim = candidate;
+          }
+        }
+      }
+    }
+
+    features.add(new Feature("HEAD_SIMILARITY_WORD2VEC", bestSim));
+    return features;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSalienceFeaturesExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,36 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Reports the confidence value stored on the mention ("MC_MENTION_SALIENCE")
+ * and the largest confidence value found among the cluster members
+ * ("MC_MAX_SALIENCE"). The feature names indicate that the confidence slot is
+ * used to carry a salience score; that is set elsewhere.
+ */
+public class MentionClusterSalienceFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * @return exactly two features, mention salience followed by the maximum
+   *         member salience (0.0 when no member exceeds zero)
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> features = new ArrayList<>();
+    features.add(new Feature("MC_MENTION_SALIENCE", mention.getConfidence()));
+
+    double highest = 0.0;
+    for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+      double candidate = member.getConfidence();
+      if(candidate > highest){
+        highest = candidate;
+      }
+    }
+
+    features.add(new Feature("MC_MAX_SALIENCE", highest));
+    return features;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSectionFeaturesExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,71 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Section-header features for a (cluster, mention) pair. A paragraph is
+ * treated as a section header when it covers exactly one sentence.
+ * <ul>
+ *   <li>"AnteInHeader": some cluster member lies inside a header paragraph
+ *       that starts no later than the end of the mention.</li>
+ *   <li>"AnaInHeader": the mention itself lies inside a header paragraph.</li>
+ *   <li>"AnteHeaderHeadsAna": the paragraph immediately preceding the
+ *       mention's paragraph is a header containing a cluster member.</li>
+ * </ul>
+ */
+public class MentionClusterSectionFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * @param jcas    CAS holding the Paragraph and Sentence annotations
+   * @param cluster cluster whose members are candidate antecedents
+   * @param mention the anaphor candidate
+   * @return "AnteInHeader" and "AnaInHeader" always, plus
+   *         "AnteHeaderHeadsAna" when it applies
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    
+    // indices of header paragraphs that contain at least one cluster member
+    Set<Integer> parsWithAnteHeader = new HashSet<>();
+    
+    boolean anaInHeader = false;
+    int anaPar = -1;
+    
+    List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+    for(int i = 0; i < pars.size(); i++){
+      Paragraph par = pars.get(i);
+      if(par.getBegin() > mention.getEnd()){
+        // paragraphs that start after the mention are irrelevant
+        break;
+      }
+      
+      // a single-sentence paragraph is considered a section header
+      List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+      boolean isHeader = coveredSents != null && coveredSents.size() == 1;
+      
+      // Only header paragraphs are recorded. Previously every paragraph with
+      // a member was recorded, so the feature did not reflect headers at all.
+      if(isHeader){
+        for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+          if(member.getBegin() >= par.getBegin() && member.getEnd() <= par.getEnd()){
+            parsWithAnteHeader.add(i);
+            break;
+          }
+        }
+      }
+      
+      // find the paragraph with the anaphor
+      if(mention.getBegin() >= par.getBegin() && mention.getEnd() <= par.getEnd()){
+        anaPar = i;
+        if(isHeader){
+          anaInHeader = true;
+          break;
+        }
+      }
+    }
+
+    // Derived from the collected paragraphs. The original flag was never
+    // assigned, which made "AnteHeaderHeadsAna" unreachable.
+    boolean anteInHeader = !parsWithAnteHeader.isEmpty();
+
+    feats.add(new Feature("AnteInHeader", anteInHeader));
+    feats.add(new Feature("AnaInHeader", anaInHeader));
+    if(anteInHeader && parsWithAnteHeader.contains(anaPar-1)){
+      feats.add(new Feature("AnteHeaderHeadsAna", true));      
+    }
+
+    return feats;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterSemTypeDepPrefsFeatureExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,74 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.io.FileNotFoundException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * For the pronoun-like mentions "this", "it" and "that", looks up
+ * selectional-preference probabilities keyed by
+ * {@code <governor word>::<dependency relation>} and reports, as
+ * "InferredSemTypeMaxProb", the highest probability assigned to the semantic
+ * type (simple class name) of any annotation covering a preceding cluster
+ * member.
+ */
+public class MentionClusterSemTypeDepPrefsFeatureExtractor implements RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> {
+
+  // key -> (semantic type name -> probability), read from pref_probs.txt
+  private final Map<String,Map<String,Double>> probs = new HashMap<>();
+  
+  /**
+   * Reads the tab-separated resource org/apache/ctakes/coreference/pref_probs.txt,
+   * whose lines are expected to hold {@code key<TAB>semantic type<TAB>probability}.
+   * Lines with fewer than three fields (for example blank lines) are skipped.
+   *
+   * @throws FileNotFoundException if the resource cannot be located
+   * @throws NumberFormatException if a probability field is not a number
+   */
+  public MentionClusterSemTypeDepPrefsFeatureExtractor() throws FileNotFoundException {
+    // try-with-resources closes the scanner even if parsing fails
+    try(Scanner scanner = new Scanner(FileLocator.getAsStream("org/apache/ctakes/coreference/pref_probs.txt"))){
+      while(scanner.hasNextLine()){
+        String[] parts = scanner.nextLine().trim().split("\t");
+        if(parts.length < 3){
+          continue;
+        }
+        Map<String,Double> semProbs = probs.get(parts[0]);
+        if(semProbs == null){
+          semProbs = new HashMap<>();
+          probs.put(parts[0], semProbs);
+        }
+        semProbs.put(parts[1], Double.parseDouble(parts[2]));
+      }
+    }
+  }
+  
+  /**
+   * @return an empty list when the mention is not "this"/"it"/"that", when no
+   *         head or governor can be found for it, or when the lookup key is
+   *         unknown; otherwise a single "InferredSemTypeMaxProb" feature
+   */
+  @Override
+  public List<Feature> extract(JCas jcas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    String mentionText = mention.getCoveredText().toLowerCase();
+    
+    if(mentionText.equals("this") || mentionText.equals("it") || mentionText.equals("that")){
+      ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, mention);
+      // The head lookup can come back empty, and a node may have no governor;
+      // either way there is no key to look up.
+      if(head == null || head.getHead() == null) return feats;
+
+      String key = head.getHead().getCoveredText().toLowerCase() + "::" + head.getDeprel();
+      Map<String,Double> semProbs = probs.get(key);
+      if(semProbs == null) return feats;
+
+      double maxProb = 0.0;
+      for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        if(mention.getBegin() < m.getEnd()){
+          // during training this might happen -- see a member of a cluster that
+          // is actually subsequent to the candidate mention
+          continue;
+        }
+        List<IdentifiedAnnotation> ents = JCasUtil.selectCovering(jcas, IdentifiedAnnotation.class, m);
+        for(IdentifiedAnnotation ent : ents){
+          Double prob = semProbs.get(ent.getClass().getSimpleName());
+          if(prob != null && prob > maxProb){
+            maxProb = prob;
+          }
+        }
+      }
+      feats.add(new Feature("InferredSemTypeMaxProb", maxProb));
+    }
+    return feats;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStackFeaturesExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,68 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+/**
+ * Computes how far "down the stack" a cluster is relative to a mention: the
+ * number of other clusters whose most recent member (as returned by
+ * {@code ClusterUtils.getMostRecent}) ends later than this cluster's most
+ * recent member. Two features are emitted, both as log10(count + 1):
+ * "ClusterStackPositionInclSingleton" counts every such cluster and
+ * "ClusterStackPosition" counts only those with more than one member.
+ */
+public class MentionClusterStackFeaturesExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  /**
+   * @return the two stack-position features, or an empty list when the
+   *         cluster has no members or no most recent member can be determined
+   *         for it (the stack position is undefined in that case)
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+
+    // A cluster-size feature was tried here and did not help, so it is no
+    // longer computed.
+
+    // Guard the cast: an empty member list is not a NonEmptyFSList.
+    if(!(cluster.getMembers() instanceof NonEmptyFSList)){
+      return feats;
+    }
+    NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+    Annotation mostRecent = ClusterUtils.getMostRecent(members, mention);
+    if(mostRecent == null){
+      // Same null case that is checked for the other clusters below.
+      return feats;
+    }
+    int mostRecentEnd = mostRecent.getEnd();
+    int numIntervening = 0;
+    int numNonSingletonIntervening = 0;
+    
+    // this feature is how far down the current cluster is on the stack -- to calculate it
+    // we go over all other clusters in the cas, look at the most recent element, and
+    // see if it is more recent than the current cluster under consideration
+    for(CollectionTextRelation otherCluster : JCasUtil.select(jCas, CollectionTextRelation.class)){
+      if(otherCluster == cluster) continue;
+      if(!(otherCluster.getMembers() instanceof NonEmptyFSList)) continue;
+
+      NonEmptyFSList otherMembers = (NonEmptyFSList) otherCluster.getMembers();
+      Annotation otherMostRecent = ClusterUtils.getMostRecent(otherMembers, mention);
+      if(otherMostRecent != null && otherMostRecent.getEnd() > mostRecentEnd){
+        numIntervening++;
+        if(ClusterUtils.getSize(otherMembers) > 1){
+          numNonSingletonIntervening++;
+        }
+      }
+    }
+    
+    feats.add(new Feature("ClusterStackPositionInclSingleton", Math.log10(numIntervening+1)));
+    feats.add(new Feature("ClusterStackPosition", Math.log10(numNonSingletonIntervening+1)));
+    return feats;
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java?rev=1687518&r1=1687517&r2=1687518&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java Thu Jun 25 13:40:52 2015
@@ -1,14 +1,23 @@
 package org.apache.ctakes.coreference.ae.features.cluster;
 
-import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.*;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.contentWords;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.endMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.soonMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.startMatch;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.wordOverlap;
+import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.wordSubstring;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
 import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.utils.struct.CounterMap;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
@@ -23,34 +32,83 @@ public class MentionClusterStringFeature
     List<Feature> feats = new ArrayList<>();
     CounterMap<String> featCounts = new CounterMap<>();
     
+    if(StringMatchingFeatureExtractor.isPronoun(mention)) return feats;
+    
     String m = mention.getCoveredText();
     Set<String> mentionWords = contentWords(mention);
+    Set<String> nonHeadMentionWords = new HashSet<>(mentionWords);
+    ConllDependencyNode mentionHead = DependencyUtility.getNominalHeadNode(jCas, mention);
+    String mentionHeadString = null;
+    if(mentionHead != null){
+      mentionHeadString = mentionHead.getCoveredText().toLowerCase();
+      nonHeadMentionWords.remove(mentionHeadString);
     
-    for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
-      if(member == null){
-        System.err.println("Something that shouldn't happen has happened");
-        continue;
-      }else if(mention.getBegin() < member.getEnd()){
-        // during training this might happen -- see a member of a cluster that
-        // is actually subsequent to the candidate mention
-        continue;
+
+      int clusterSize = 0;
+      int maxNonoverlap = 0;
+
+      for(IdentifiedAnnotation member : new ListIterable<IdentifiedAnnotation>(cluster.getMembers())){
+        if(member == null){
+          System.err.println("Something that shouldn't happen has happened");
+          continue;
+        }else if(mention.getBegin() < member.getEnd()){
+          // during training this might happen -- see a member of a cluster that
+          // is actually subsequent to the candidate mention
+          continue;
+        }else if(StringMatchingFeatureExtractor.isPronoun(member)){
+          continue;
+        }
+
+        String s = member.getCoveredText();
+        Set<String> memberWords = contentWords(member);
+        Set<String> nonHeadMemberWords = new HashSet<>(memberWords);
+        ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+        String memberHeadString = null;
+        if(memberHead != null){
+          memberHeadString = memberHead.getCoveredText().toLowerCase();
+          nonHeadMemberWords.remove(memberHeadString);
+
+          if(mentionHeadString.equals(memberHeadString)){
+
+            if(m.equalsIgnoreCase(s)) featCounts.add("MC_STRING_EXACT");
+            if(startMatch(m,s)) featCounts.add("MC_STRING_START");
+            if(endMatch(m,s)) featCounts.add("MC_STRING_END");
+            if(soonMatch(m,s)) featCounts.add("MC_STRING_SOON");
+            if(wordOverlap(mentionWords, memberWords)) featCounts.add("MC_OVERLAP");
+            if(wordSubstring(mentionWords, memberWords)) featCounts.add("MC_SUB");
+
+            int nonHeadOverlap = wordNonOverlapCount(nonHeadMemberWords, nonHeadMentionWords);
+            if(nonHeadOverlap > maxNonoverlap){
+              maxNonoverlap = nonHeadOverlap;
+            }
+          }
+        }
+        clusterSize++;
       }
-      
-      String s = member.getCoveredText();
-      Set<String> memberWords = contentWords(member);
-      
-      if(m.equalsIgnoreCase(s)) featCounts.add("MC_STRING_EXACT");
-      if(startMatch(m,s)) featCounts.add("MC_STRING_START");
-      if(endMatch(m,s)) featCounts.add("MC_STRING_END");
-      if(soonMatch(m,s)) featCounts.add("MC_STRING_SOON");
-      if(wordOverlap(mentionWords, memberWords)) featCounts.add("MC_OVERLAP");
-      if(wordSubstring(mentionWords, memberWords)) featCounts.add("MC_SUB");
+      feats.add(new Feature("MC_MAX_NONOVERLAP", maxNonoverlap));
     }
     
+    
     for(String featKey : featCounts.keySet()){
-      feats.add(new Feature(featKey, featCounts.get(featKey)));
+      // normalized
+//      feats.add(new Feature(featKey, (double) featCounts.get(featKey) / clusterSize));
+      // boolean
+      feats.add(new Feature(featKey, true));
     }
     return feats;
   }
+  
+  /**
+   * Counts the words that occur in exactly one of the two sets, i.e. the size
+   * of their symmetric difference.
+   *
+   * @param w1 first word set
+   * @param w2 second word set
+   * @return number of words in w1 but not w2, plus words in w2 but not w1
+   */
+  public static int wordNonOverlapCount(Set<String> w1, Set<String> w2){
+    int onlyInFirst = 0;
+    for(String word : w1){
+      if(!w2.contains(word)){
+        onlyInFirst++;
+      }
+    }
+
+    int onlyInSecond = 0;
+    for(String word : w2){
+      if(!w1.contains(word)){
+        onlyInSecond++;
+      }
+    }
+
+    return onlyInFirst + onlyInSecond;
+  }
 
 }

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,145 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.*;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListIterable;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.cleartk.ml.Feature;
+
+public class MentionClusterUMLSFeatureExtractor implements
+    RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  String docId = null;
+  Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> coveringMap = null;
+
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    Set<String> trueFeats = new HashSet<>();
+    
+    if(docId == null || !getDocId(jCas).equals(docId)){
+      docId = getDocId(jCas);
+      coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    }
+    
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jCas, mention);
+    
+    if(head != null){
+      List<IdentifiedAnnotation> rmList = new ArrayList<>();
+      // get the entities covering this markable:
+      List<IdentifiedAnnotation> mentionEnts = new ArrayList<>(coveringMap.get(head)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
+      for(IdentifiedAnnotation ann : mentionEnts){
+        if(!(ann instanceof EntityMention || ann instanceof EventMention)){
+          rmList.add(ann);
+        }
+      }
+      for(IdentifiedAnnotation toRm : rmList){
+        mentionEnts.remove(toRm);
+      }
+      
+      Set<IdentifiedAnnotation> clusterEnts = new HashSet<>();
+      for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
+        ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+        rmList.clear();
+        // get the named entities covering this cluster member:
+        List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(memberHead)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
+        for(IdentifiedAnnotation ann : ents2){
+          if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
+            rmList.add(ann);
+          }
+        }
+        for(IdentifiedAnnotation toRm : rmList){
+          ents2.remove(toRm);
+        }
+        
+        clusterEnts.addAll(ents2);
+      }
+      
+      if(clusterEnts.size() == 0 && mentionEnts.size() > 0){
+        trueFeats.add("ClusterNoCui_MentionCui");
+      }else if(clusterEnts.size() > 0 && mentionEnts.size() == 0){
+        trueFeats.add("ClusterCui_MentionNoCui");          
+      }else if(clusterEnts.size() == 0 && mentionEnts.size() == 0){
+        trueFeats.add("ClusterMentionNoCui");
+      }else{
+        trueFeats.add("ClusterMentionBothCui");
+      }
+      
+      if((clusterEnts.size() == 0 & mentionEnts.size() > 0) ||
+          (clusterEnts.size() > 0 && mentionEnts.size() == 0)){
+        trueFeats.add("ClusterOrMentionNoCui");
+      }
+
+      for(IdentifiedAnnotation ent1 : clusterEnts){
+        HashSet<String> a1Tuis = new HashSet<>(); 
+        String a1SemType = ent1.getClass().getSimpleName();
+        trueFeats.add("ClusterSemType" + a1SemType);
+        FSArray cons1 = ent1.getOntologyConceptArr();
+        if(cons1 != null){
+          for(int i = 0; i < cons1.size(); i++){
+            if(cons1.get(i) instanceof UmlsConcept){
+              a1Tuis.add(((UmlsConcept)cons1.get(i)).getTui());
+            }
+          }
+        }
+        for(IdentifiedAnnotation ent2 : mentionEnts){
+          HashSet<String> a2Tuis = new HashSet<>();
+          String a2SemType = ent2.getClass().getSimpleName();
+          trueFeats.add("MentionSemType" + a2SemType);
+          if(alias(ent1, ent2)){
+            trueFeats.add("UMLS_ALIAS");
+            break;
+          }
+          trueFeats.add("MentionClusterSemTypePair" + a1SemType + "_" + a2SemType);
+          
+          FSArray cons2 = ent2.getOntologyConceptArr();
+          if(cons2 != null){
+            for(int i = 0; i < cons2.size(); i++){
+              if(cons2.get(i) instanceof UmlsConcept){
+                a2Tuis.add(((UmlsConcept)cons2.get(i)).getTui());
+              }
+            }
+          }
+          for(String tui1 : a1Tuis){
+//            trueFeats.add("ClusterTui_" +  tui1);
+            for(String tui2 : a2Tuis){
+//              trueFeats.add("ClusterTui_" + tui1 + "_MentionTui_ " + tui2);
+              if(tui1.equals(tui2)){
+                trueFeats.add("ClusterMentionTuiMatch");
+              }
+            }
+          }
+          for(String tui2 : a2Tuis){
+//            trueFeats.add("MentionTui_" + tui2);
+          }
+        }
+      }
+    }
+    
+    for(String feat : trueFeats){
+      feats.add(new Feature(feat, true));
+    }
+    return feats;
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1687518&r1=1687517&r2=1687518&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Thu Jun 25 13:40:52 2015
@@ -28,25 +28,26 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.PersonChainAnnotator;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.temporal.ae.BackwardsTimeAnnotator;
 import org.apache.ctakes.temporal.ae.DocTimeRelAnnotator;
 import org.apache.ctakes.temporal.ae.EventAnnotator;
-import org.apache.ctakes.temporal.eval.EvaluationOfEventTimeRelations;
-import org.apache.ctakes.temporal.eval.EvaluationOfTemporalRelations_ImplBase;
-import org.apache.ctakes.temporal.eval.Evaluation_ImplBase;
 import org.apache.ctakes.temporal.eval.EvaluationOfEventTimeRelations.ParameterSettings;
-import org.apache.ctakes.temporal.eval.EvaluationOfTemporalRelations_ImplBase.TempRelOptions;
-import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.Subcorpus;
-import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMLFormat;
+import org.apache.ctakes.temporal.eval.EvaluationOfTemporalRelations_ImplBase;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
 import org.apache.ctakes.typesystem.type.relation.RelationArgument;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
 import org.apache.ctakes.typesystem.type.textspan.Paragraph;
 import org.apache.ctakes.utils.distsem.WordEmbeddings;
 import org.apache.ctakes.utils.distsem.WordVector;
@@ -87,7 +88,7 @@ import org.apache.uima.util.FileUtils;
 import org.cleartk.eval.AnnotationStatistics;
 import org.cleartk.ml.jar.JarClassifierBuilder;
 import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
-import org.cleartk.ml.libsvm.tk.TkLibSvmStringOutcomeDataWriter;
+import org.cleartk.ml.libsvm.LibSvmStringOutcomeDataWriter;
 import org.cleartk.ml.tksvmlight.model.CompositeKernel.ComboOperator;
 import org.cleartk.util.ViewUriUtil;
 
@@ -119,7 +120,7 @@ public class EvaluationOfEventCoreferenc
   }
   
   private static Logger logger = Logger.getLogger(EvaluationOfEventCoreference.class);
-  public static float COREF_DOWNSAMPLE = 0.5f;
+  public static float COREF_DOWNSAMPLE = 0.1f;
   protected static ParameterSettings allParams = new ParameterSettings(DEFAULT_BOTH_DIRECTIONS, COREF_DOWNSAMPLE, "tk",
       1.0, 1.0, "linear", ComboOperator.SUM, 0.1, 0.5);
 
@@ -131,7 +132,7 @@ public class EvaluationOfEventCoreferenc
     List<Integer> testItems = options.getTestOnTrain() ? getTrainItems(options) : getTestItems(options);
 
     ParameterSettings params = allParams;
-    File workingDir = new File("target/eval/temporal-relations/coreference");
+    File workingDir = new File("target/eval/temporal-relations/coreference/" + options.getEvalSystem());
     if(!workingDir.exists()) workingDir.mkdirs();
     if(options.getUseTmp()){
       File tempModelDir = File.createTempFile("temporal", null, workingDir);
@@ -234,7 +235,9 @@ public class EvaluationOfEventCoreferenc
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
       aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
+      aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
       aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
       aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
       //    aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
@@ -244,14 +247,17 @@ public class EvaluationOfEventCoreferenc
       if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
         aggregateBuilder.add(EventCoreferenceAnnotator.createDataWriterDescription(
             //        TKSVMlightStringOutcomeDataWriter.class,
-            //        LibLinearStringOutcomeDataWriter.class,
-            TkLibSvmStringOutcomeDataWriter.class,
+                    LibLinearStringOutcomeDataWriter.class,
+//            TkLibSvmStringOutcomeDataWriter.class,
             directory,
             params.probabilityOfKeepingANegativeExample
             ));
       }else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
         aggregateBuilder.add(MentionClusterCoreferenceAnnotator.createDataWriterDescription(
+//            LibSvmStringOutcomeDataWriter.class,
             LibLinearStringOutcomeDataWriter.class,
+//            MalletStringOutcomeDataWriter.class,
+            
 //            TkLibSvmStringOutcomeDataWriter.class,
             directory,
             params.probabilityOfKeepingANegativeExample
@@ -301,6 +307,8 @@ public class EvaluationOfEventCoreferenc
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
+    aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
     aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
     aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
@@ -411,6 +419,62 @@ public class EvaluationOfEventCoreferenc
     
   }
   
+  /*
+   * The Relation extractors all create relation objects but don't populate the objects inside of them
+   * with pointers to the relation. This annotator copies every LocationOfTextRelation in the CAS onto
+   * the bodyLocation attribute of its first argument when that argument is a ProcedureMention,
+   * DiseaseDisorderMention or SignSymptomMention; any other argument type is left untouched.
+   */
+  public static class RelationPropagator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    /*
+     * Sets the bodyLocation of each eligible first argument. When a mention takes part in more than one
+     * location relation, the relation whose second argument has the longest span is kept.
+     */
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      for(LocationOfTextRelation locRel : JCasUtil.select(jcas, LocationOfTextRelation.class)){
+        // arg1 is the mention that receives the location; arg2 is treated as the anatomical site
+        IdentifiedAnnotation arg1 = (IdentifiedAnnotation) locRel.getArg1().getArgument();
+        IdentifiedAnnotation arg2 = (IdentifiedAnnotation) locRel.getArg2().getArgument();
+
+        // have to do this 3 different times because there is no intermediate class between EventMention and
+        // the three types that can have locations that has that location attribute.
+        // for the case where there are 2 locations, we take the one whose anatomical site argument
+        // has the longer span assuming it is more specific
+        if(arg1 instanceof ProcedureMention){
+          ProcedureMention p = (ProcedureMention)arg1;
+          if(p.getBodyLocation() == null || isLonger(arg2, p.getBodyLocation().getArg2().getArgument())){
+            p.setBodyLocation(locRel);
+          }
+        }else if(arg1 instanceof DiseaseDisorderMention){
+          DiseaseDisorderMention d = (DiseaseDisorderMention)arg1;
+          if(d.getBodyLocation() == null || isLonger(arg2, d.getBodyLocation().getArg2().getArgument())){
+            d.setBodyLocation(locRel);
+          }
+        }else if(arg1 instanceof SignSymptomMention){
+          SignSymptomMention s = (SignSymptomMention)arg1;
+          if(s.getBodyLocation() == null || isLonger(arg2, s.getBodyLocation().getArg2().getArgument())){
+            s.setBodyLocation(locRel);
+          }
+        }
+      }
+    }
+
+    /*
+     * Returns true when candidate covers strictly more characters than current, so on a tie the
+     * location that was stored first is kept. A span's length is getEnd() - getBegin(); subtracting
+     * getEnd() from itself always yields 0, which would mean a stored location is never replaced
+     * by a longer one.
+     */
+    private static boolean isLonger(Annotation candidate, Annotation current){
+      int oldSize = current.getEnd() - current.getBegin();
+      int newSize = candidate.getEnd() - candidate.getBegin();
+      return newSize > oldSize;
+    }
+
+  }
+  
   public static class ParagraphAnnotator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
 
     @Override
@@ -437,6 +501,7 @@ public class EvaluationOfEventCoreferenc
     
   }
   
+  
   public static class ParagraphVectorAnnotator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
     WordEmbeddings words = null;
 
@@ -542,10 +607,10 @@ public class EvaluationOfEventCoreferenc
                 break;
               }
             }
-            if(!gold2sys.containsKey(goldMarkable)){
-              Markable mappedGold = new Markable(jcas, goldMarkable.getBegin(), goldMarkable.getEnd());
-              mappedGold.addToIndexes();
-            }
+//            if(!gold2sys.containsKey(goldMarkable)){
+//              Markable mappedGold = new Markable(jcas, goldMarkable.getBegin(), goldMarkable.getEnd());
+//              mappedGold.addToIndexes();
+//            }
           }else{
             // Have seen some instances where anafora writes a span that is not possible, log them
             // so they can be found and fixed:
@@ -555,8 +620,12 @@ public class EvaluationOfEventCoreferenc
           
           // add markable to end of list:
           if(gold2sys.get(goldMarkable) == null){
-            logger.warn(String.format("There is a gold markable [%d, %d] which could not map to a system markable.", 
-                goldMarkable.getBegin(), goldMarkable.getEnd()));
+            String text = "<Out of bounds>";
+            if(!(goldMarkable.getBegin() < 0 || goldMarkable.getEnd() >= jcas.getDocumentText().length())){
+              text = goldMarkable.getCoveredText();
+            }
+            logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.", 
+                text, goldMarkable.getBegin(), goldMarkable.getEnd()));
             removeChain = true;
             break;
           }
@@ -611,8 +680,12 @@ public class EvaluationOfEventCoreferenc
 //      }
       List<Markable> toRemove = new ArrayList<>();
       for(Markable markable : JCasUtil.select(jcas, Markable.class)){
+        if(markable.getCoveredText().equals("I")){
+          System.err.println("Unauthorized markable 'I'");
+        }
         List<BaseToken> coveredTokens = JCasUtil.selectCovered(jcas, BaseToken.class, markable);
-        if(coveredTokens.size() == 1 && coveredTokens.get(0).getPartOfSpeech().startsWith("PRP")){
+        if(coveredTokens.size() == 1 && coveredTokens.get(0).getPartOfSpeech().startsWith("PRP") &&
+            !markable.getCoveredText().toLowerCase().equals("it")){
           toRemove.add(markable);
         }else if(coveredTokens.size() == 2 && 
             (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java?rev=1687518&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/util/ClusterUtils.java Thu Jun 25 13:40:52 2015
@@ -0,0 +1,36 @@
+package org.apache.ctakes.coreference.util;
+
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public class ClusterUtils {
+  // Walks the list from its head and returns the last element seen that ends before focus ends (null if the head ends after it).
+  public static Annotation getMostRecent(NonEmptyFSList list, Annotation focus){
+    Annotation latest = (Annotation) list.getHead();
+    final int focusEnd = focus.getEnd();
+
+    // the head already ends after the focus, so no element is treated as preceding it
+    if(latest.getEnd() > focusEnd){
+      return null;
+    }
+    NonEmptyFSList node = list;
+    while(node.getTail() instanceof NonEmptyFSList){
+      node = (NonEmptyFSList) node.getTail();
+      Annotation candidate = (Annotation) node.getHead();
+      if(candidate.getEnd() >= focusEnd){
+        break; // first element not ending strictly before the focus: stop scanning
+      }
+      latest = candidate;
+    }
+    return latest;
+  }
+
+  // Counts the elements of the list by following tail links; the starting node itself counts as one.
+  public static int getSize(NonEmptyFSList list){
+    int count = 1;
+    for(NonEmptyFSList node = list; node.getTail() instanceof NonEmptyFSList; node = (NonEmptyFSList) node.getTail()){
+      count++;
+    }
+    return count;
+  }
+}



Mime
View raw message