ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1748736 [4/5] - in /ctakes/trunk/ctakes-coreference: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/java...
Date Thu, 16 Jun 2016 14:51:51 GMT
Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/features/salience/SemanticEnvironmentFeatureExtractor.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,136 @@
+package org.apache.ctakes.coreference.ae.features.salience;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+
+import com.google.common.collect.Sets;
+
+import static org.apache.ctakes.dependency.parser.util.DependencyUtility.*;
+
+public class SemanticEnvironmentFeatureExtractor implements
+    FeatureExtractor1<Markable> {
+
+  // this is a subset of the attitude verbs listed in White et al:
+  // Discovering classes of attitude verbs using subcategorization frame distributsion
+  // NELS 2012.
+  private static Set<String> propVerbs = 
+      Sets.newHashSet("allow", "believe", "bother", "demand", "deny", "doubt", "expect", "feel", "forbid", "guess", "hate", "hear", "hope", "imagine", "need", "promise", "realize", "remember", "said", "say", "see", "suppose", "tell", "think", "understand", "want", "worry");
+  
+  public List<Feature> extract(JCas jcas, Markable markable)
+      throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+    
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      return feats;
+    }
+    Sentence sent = DependencyUtility.getSentence(jcas, markable);
+    List<ConllDependencyNode> sentNodes = DependencyUtility.getDependencyNodes(jcas, sent);
+    List<ConllDependencyNode> covering = DependencyUtility.getProgeny(head, sentNodes);
+    
+    List<EventMention> events = JCasUtil.selectCovered(jcas, EventMention.class, markable);
+    EventMention markableEvent = null;
+    for(EventMention event : events){
+      ConllDependencyNode eventHead = getNominalHeadNode(jcas, event);
+      if(eventHead == head){
+        if(markableEvent == null || (event.getEnd()-event.getBegin()) > (markableEvent.getEnd()-markableEvent.getBegin())){
+          markableEvent = event;
+        }
+      }
+    }
+    
+    boolean neg = false;
+    if(markableEvent != null){
+      neg = markableEvent.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT;
+      feats.add(new Feature("SemEnvNegation", neg));
+    }
+    
+    boolean modal = presenceOfModality(head, sentNodes);
+    feats.add(new Feature("SemEnvModality", modal));
+    
+    boolean underPropVerb = presenceOfAttitude(jcas, head);
+    feats.add(new Feature("SemEnvAttitude", underPropVerb));
+    
+    
+    // modal * pronoun, neg * pronoun
+    if(head.getPostag().startsWith("PRP") || (head.getPostag().equals("DT") && !head.getDeprel().equals("det"))){
+      feats.add(new Feature("SemEnvProTrueModal"+modal, true));
+      feats.add(new Feature("SemEnvProTrueNeg"+neg, true));
+      feats.add(new Feature("SemEnvProTrueAtt"+underPropVerb, true));
+    }else{
+      feats.add(new Feature("SemEnvProFalseModal"+modal, true));
+      feats.add(new Feature("SemEnvProFalseNeg"+neg, true));
+      feats.add(new Feature("SemEnvProFalseAtt"+underPropVerb, true));
+    }
+    
+    // modal * Proper noun
+    if(head.getPostag().equals("NNP")){
+      feats.add(new Feature("SemEnvProperTrueModal"+modal, true));
+      feats.add(new Feature("SemEnvProperTrueNeg"+neg, true));
+      feats.add(new Feature("SemEnvProperTrueAtt"+underPropVerb, true));
+    }else{
+      feats.add(new Feature("SemEnvProperFalseModal"+modal,true));
+      feats.add(new Feature("SemEnvProperFalseNeg"+neg, true));
+      feats.add(new Feature("SemEnvProperFalseAtt"+underPropVerb, true));
+    }
+    
+    boolean indefinite = false;
+    for(ConllDependencyNode node : covering){
+      if(node.getId() != 0 && (node.getPostag().equals("DT") && 
+          (node.getLemma().equals("a") || node.getLemma().equals("an")))){
+        indefinite = true;
+      }
+    }
+    feats.add(new Feature("Neg"+neg+"Indef"+indefinite, true));
+    
+    feats.add(new Feature("Neg"+neg+"Mods"+(covering.size()-1), true));
+    
+    return feats;
+  }
+
+  private static final boolean presenceOfModality(ConllDependencyNode head, List<ConllDependencyNode> sentNodes) {
+    boolean modal = false;
+    ConllDependencyNode vb = null;
+    
+    if(head.getHead() != null){
+      vb = head.getHead();
+      while(vb.getHead() != null && !vb.getPostag().startsWith("VB")){
+        vb = vb.getHead();
+      }
+      
+      for(ConllDependencyNode node : sentNodes){
+        if(node.getHead() == vb && node.getPostag().equals("MD")){
+          modal = true;
+          break;
+        }
+      }
+    }
+    return modal;
+  }
+
+  private static final boolean presenceOfAttitude(JCas jcas, ConllDependencyNode head){
+    boolean att = false;
+    
+    for(ConllDependencyNode cur : getPathToTop(jcas, head)){
+      if(propVerbs.contains(cur.getLemma())){
+        att = true;
+        break;
+      }
+    }
+    
+    return att;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,10 @@
+package org.apache.ctakes.coreference.ae.pairing;
+
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+
+/**
+ * Strategy for proposing candidate pairs for a single mention.
+ *
+ * @param <MENTION_TYPE> type of the mention being paired
+ * @param <PAIR_TYPE> type of the candidate pairs produced
+ */
+public interface AnnotationPairer<MENTION_TYPE,PAIR_TYPE> {
+  /**
+   * @param jcas the CAS view the mention belongs to
+   * @param mention the mention to find candidates for
+   * @return the candidate pairs for the mention
+   */
+  List<PAIR_TYPE> getPairs(JCas jcas, MENTION_TYPE mention);
+
+  /**
+   * Re-initialises any state the pairer keeps for the given CAS.
+   *
+   * @param jcas the CAS view about to be processed
+   */
+  void reset(JCas jcas);
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,76 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.ae.pairing.AnnotationPairer;
+//import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.MapFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public abstract class ClusterMentionPairer_ImplBase implements AnnotationPairer<Markable, CollectionTextRelationIdentifiedAnnotationPair> {
+  public abstract List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m);
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+
+  @Override
+  public void reset(JCas jcas){
+    nodeEntMap = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+  }
+  
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> semTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      semTypes.addAll(getBestEnt(jcas, member));
+    }
+    return semTypes;
+  }
+
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = MapFactory.get(getKey(jcas), markable);
+    
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = MapFactory.get(getKey(jcas), ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+  }
+
+  protected static final boolean dominates(Annotation arg1, Annotation arg2) {
+    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,58 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Proposes clusters that already have more than one member in front of the
+ * mention and whose most recent member is within a sentence-distance limit.
+ */
+public class ClusterPairer extends ClusterMentionPairer_ImplBase {
+  // maximum sentence distance between the mention and the cluster's most recent member
+  private int sentDist;
+
+  /**
+   * @param dist maximum allowed sentence distance to the most recent cluster member
+   */
+  public ClusterPairer(int dist){
+    sentDist = dist;
+  }
+
+  /**
+   * Links the mention to multi-member clusters, using a sentence distance that is
+   * presumably larger than the one used by the plain distance-based pairer.
+   * During training a cluster may have many members but only one of them before
+   * the focus mention, so members are counted only up to the most recent one and
+   * the cluster is offered when that count exceeds one.
+   *
+   * @param jcas the CAS view containing the clusters
+   * @param mention the focus mention
+   * @return one pair per qualifying cluster
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+      Annotation first = (Annotation) members.getHead();
+      if(first == null){
+        continue;
+      }
+      // the cluster must start strictly before the mention
+      if(mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null){
+        continue;
+      }
+      if(EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
+        continue;
+      }
+
+      if(countMembersThrough(cluster, mostRecent) > 1){
+        candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    return candidates;
+  }
+
+  /** Counts the cluster's markable members up to and including {@code last}. */
+  private static int countMembersThrough(CollectionTextRelation cluster, Annotation last){
+    int count = 0;
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      count++;
+      if(member == last){
+        break;
+      }
+    }
+    return count;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,54 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Proposes clusters containing an earlier member whose text is identical
+ * (ignoring case) to the mention's text, regardless of distance.
+ */
+public class ExactStringPairer extends ClusterMentionPairer_ImplBase {
+
+  // lower-cased text of every mention passed to getPairs() since the last reset()
+  private Set<String> markableStrings = null;
+
+  /**
+   * Forgets all previously seen mention strings.
+   * NOTE(review): unlike some sibling pairers this does not call super.reset(),
+   * so the inherited node index is not rebuilt here - confirm that is intended.
+   */
+  @Override
+  public void reset(JCas jcas){
+    markableStrings = new HashSet<>();
+  }
+
+  /**
+   * For mentions whose exact string was already seen in this document, allows
+   * matching across any distance; no sentence-distance parameter is used. The
+   * set of strings seen so far is kept in {@code markableStrings}, a HashSet
+   * containing all markable strings handed to this method since the last reset.
+   *
+   * @param jcas the CAS view containing the clusters
+   * @param mention the focus mention
+   * @return one pair per cluster with a matching earlier member
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+    String mentionText = mention.getCoveredText().toLowerCase();
+
+    if(markableStrings.contains(mentionText)){
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent != null && hasEarlierMemberWithText(cluster, mostRecent, mentionText)){
+          candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+        }
+      }
+    }
+    markableStrings.add(mentionText);
+    return candidates;
+  }
+
+  /**
+   * Returns true if a markable member listed before {@code stopAt} has the given
+   * lower-cased text.
+   * NOTE(review): {@code stopAt} itself is never compared because the scan stops
+   * as soon as it is reached - confirm this exclusion is intended.
+   */
+  private static boolean hasEarlierMemberWithText(CollectionTextRelation cluster, Annotation stopAt, String text){
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      if(member == stopAt){
+        return false;
+      }
+      if(member.getCoveredText().toLowerCase().equals(text)){
+        return true;
+      }
+    }
+    return false;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,62 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Proposes clusters for a mention when one of the cluster's members, up to its
+ * most recent one, was previously seen with the same lower-cased head word.
+ */
+public class HeadwordPairer extends ClusterMentionPairer_ImplBase {
+  // lower-cased head word -> mentions already passed to getPairs() with that head
+  private Map<String, Set<Markable>> headWordMarkables = null;
+
+  /** Resets the base class state and forgets all previously seen head words. */
+  @Override
+  public void reset(JCas jcas){
+    super.reset(jcas);
+    headWordMarkables = new HashMap<>();
+  }
+
+  /**
+   * Returns a pair for every cluster that has a member, no later than its most
+   * recent one, sharing the mention's head word. The mention is then recorded
+   * under its head word for use by later calls.
+   *
+   * @param jcas the CAS view containing the clusters
+   * @param mention the focus mention
+   * @return one pair per matching cluster; empty when the mention has no head node
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    if(headNode == null){
+      Logger.getLogger(MentionClusterCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+      return pairs;
+    }
+    String head = headNode.getCoveredText().toLowerCase();
+    Set<Markable> headSet = headWordMarkables.get(head);
+    if(headSet == null){
+      // first mention with this head word: nothing earlier to pair with
+      headSet = new HashSet<>();
+      headWordMarkables.put(head, headSet);
+    }else{
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          // Test the member being visited. The previous code tested mostRecent on
+          // every iteration, so earlier members with the same head were never found.
+          if(headSet.contains(m)){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+          // members listed after the most recent one are not considered
+          if(m == mostRecent) break;
+        }
+      }
+    }
+    // recorded only after the search, so the mention can never match itself
+    headSet.add(mention);
+
+    return pairs;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,73 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Proposes distant clusters that have a member inside a "header" paragraph, i.e.
+ * a paragraph that covers exactly one sentence.
+ */
+public class SectionHeaderPairer extends ClusterMentionPairer_ImplBase {
+
+  // clusters whose most recent member is within this sentence distance are skipped
+  private final int sentDist;
+
+  /**
+   * @param dist sentence distance at or below which clusters are left to other pairers
+   */
+  public SectionHeaderPairer(int dist) {
+    this.sentDist = dist;
+  }
+
+  /**
+   * Adds clusters where one of the members is on a line all by itself (a section
+   * header). This leverages the annotation of Paragraphs, roughly the areas
+   * between newlines: a paragraph containing only one sentence is considered a
+   * "header" (or, equally important, a list item). sentDist is used to avoid
+   * adding clusters that the "sentence distance" method will add anyway.
+   *
+   * @param jcas the CAS view containing the clusters
+   * @param mention the focus mention
+   * @return one pair per distant cluster with a member inside a header paragraph
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    // The header paragraphs depend only on the mention, so they are computed at
+    // most once per call (lazily) rather than once per cluster.
+    List<Paragraph> headers = null;
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // first check if it is in sentence distance range -- if so we can ignore it because it will be included by another pair generator
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){
+        continue;
+      }
+
+      // now check if any of the mentions are in a section header
+      if(headers == null){
+        headers = findHeaderParagraphs(jcas, mention);
+      }
+      if(hasMemberInAny(members, headers)){
+        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    return pairs;
+  }
+
+  /**
+   * Collects, in index order, the paragraphs lying between offset 0 and the start
+   * of the mention that cover exactly one sentence -- how section headers are modeled.
+   */
+  private static List<Paragraph> findHeaderParagraphs(JCas jcas, Markable mention){
+    List<Paragraph> headers = new ArrayList<>();
+    for(Paragraph par : JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin())){
+      List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+      if(coveredSents != null && coveredSents.size() == 1){
+        headers.add(par);
+      }
+    }
+    return headers;
+  }
+
+  /** Returns true if any markable in the member list lies within one of the given paragraphs. */
+  private static boolean hasMemberInAny(NonEmptyFSList members, List<Paragraph> headers){
+    for(Paragraph header : headers){
+      for(Markable m : JCasUtil.select(members, Markable.class)){
+        if(dominates(header, m)){
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,70 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+
+/**
+ * Proposes nearby clusters whose entity types are compatible with the mention's.
+ */
+public class SentenceDistancePairer extends ClusterMentionPairer_ImplBase {
+
+  // maximum sentence distance between the mention and a cluster's most recent member
+  private int sentDistance;
+
+  /**
+   * @param distance maximum allowed sentence distance to the most recent cluster member
+   */
+  public SentenceDistancePairer(int distance){
+    sentDistance = distance;
+  }
+
+  /**
+   * Adds only clusters that are nearby. When one of the mention's best entity
+   * types is an anatomical site or a medication, the distance check is skipped.
+   * Otherwise the number of sentences between the mention and the latest element
+   * of the cluster must not exceed the limit. In both cases, if the mention and
+   * the cluster each have entity types, they must have at least one in common.
+   *
+   * @param jcas the CAS view containing the clusters
+   * @param mention the focus mention
+   * @return one pair per qualifying cluster
+   */
+  @Override
+  public List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable mention){
+    List<CollectionTextRelationIdentifiedAnnotationPair> candidates = new ArrayList<>();
+    Set<String> mentionTypes = getBestEnt(jcas, mention);
+    boolean siteOrMedication = mentionTypes.contains(AnatomicalSiteMention.class.getSimpleName())
+        || mentionTypes.contains(MedicationEventMention.class.getSimpleName());
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = (NonEmptyFSList) cluster.getMembers();
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // check for distance if they are not anatomical site or medication
+      if(!siteOrMedication){
+        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+        if(mostRecent == null){
+          continue;
+        }
+        if(EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDistance){
+          continue;
+        }
+      }
+
+      // check for types of cluster
+      Set<String> clusterTypes = getBestEnt(jcas, cluster);
+      boolean bothTyped = !mentionTypes.isEmpty() && !clusterTypes.isEmpty();
+      if(bothTyped && !sharesElement(mentionTypes, clusterTypes)){
+        // they both correspond to named entities but no overlap in which category of named entity.
+        continue;
+      }
+      candidates.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    }
+    return candidates;
+  }
+
+  /** Returns true if at least one string occurs in both sets. */
+  private static boolean sharesElement(Set<String> left, Set<String> right){
+    for(String item : left){
+      if(right.contains(item)){
+        return true;
+      }
+    }
+    return false;
+  }
+
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,919 @@
+package org.apache.ctakes.coreference.eval;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.assertion.medfacts.cleartk.GenericCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.HistoryCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.SubjectCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.CoreferenceChainScoringOutput;
+import org.apache.ctakes.coreference.ae.DeterministicMarkableAnnotator;
+import org.apache.ctakes.coreference.ae.EventCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator;
+import org.apache.ctakes.coreference.ae.MarkableSalienceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.MentionClusterRankingCoreferenceAnnotator;
+import org.apache.ctakes.coreference.ae.PersonChainAnnotator;
+import org.apache.ctakes.coreference.util.CoreferencePipelineFactory;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.temporal.ae.BackwardsTimeAnnotator;
+import org.apache.ctakes.temporal.ae.DocTimeRelAnnotator;
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.eval.EvaluationOfEventTimeRelations.ParameterSettings;
+import org.apache.ctakes.temporal.eval.EvaluationOfTemporalRelations_ImplBase;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.LocationOfTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
+import org.apache.ctakes.utils.distsem.WordVectorReader;
+import org.apache.log4j.Level;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.analysis_engine.metadata.FixedFlow;
+import org.apache.uima.analysis_engine.metadata.FlowConstraints;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.ViewCreatorAnnotator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.FlowControllerFactory;
+import org.apache.uima.fit.pipeline.JCasIterator;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.flow.FinalStep;
+import org.apache.uima.flow.Flow;
+import org.apache.uima.flow.FlowControllerContext;
+import org.apache.uima.flow.FlowControllerDescription;
+import org.apache.uima.flow.JCasFlow_ImplBase;
+import org.apache.uima.flow.SimpleStep;
+import org.apache.uima.flow.Step;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.FloatArray;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
+import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.jar.DataWriterFactory_ImplBase;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.EncodingDirectoryDataWriterFactory;
+import org.cleartk.ml.jar.JarClassifierBuilder;
+import org.cleartk.ml.liblinear.LibLinearStringOutcomeDataWriter;
+import org.cleartk.ml.svmlight.rank.SvmLightRankDataWriter;
+import org.cleartk.ml.tksvmlight.model.CompositeKernel.ComboOperator;
+import org.cleartk.util.ViewUriUtil;
+
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+public class EvaluationOfEventCoreference extends EvaluationOfTemporalRelations_ImplBase {
+ 
+
+  /**
+   * Command-line options for the coreference evaluation, parsed in {@code main}
+   * on top of the options inherited from {@code TempRelOptions}.
+   */
+  interface CoreferenceOptions extends TempRelOptions{
+    /** Prefix that is concatenated directly in front of the gold/system output file names. */
+    @Option
+    String getOutputDirectory();
+
+    /** When set, models are written to a temporary directory that is deleted after the run. */
+    @Option
+    boolean getUseTmp();
+
+    /** When set, the training items are also used as the test items. */
+    @Option
+    boolean getTestOnTrain();
+
+    /** When set (and testing is not skipped), the external perl scorer is run on the output files. */
+    @Option(longName="external")
+    boolean getUseExternalScorer();
+
+    /** Which coreference system to evaluate. */
+    @Option(shortName="t", defaultValue={"MENTION_CLUSTER"})
+    EVAL_SYSTEM getEvalSystem();
+
+    /** Configuration name; used in the working directory path and the output file names. */
+    @Option(shortName="c", defaultValue="default")
+    String getConfig();
+
+    /** Path of the scorer script handed to perl. */
+    @Option(shortName="s")
+    String getScorerPath();
+
+    /** When set, the test phase is skipped. */
+    @Option
+    boolean getSkipTest();
+  }
+  
+  // Class-wide logger.
+  private static Logger logger = Logger.getLogger(EvaluationOfEventCoreference.class);
+  // Values handed to the ParameterSettings constructors below (second argument);
+  // presumably the probability of keeping a negative example -- confirm against ParameterSettings.
+  public static float COREF_PAIRS_DOWNSAMPLE = 0.5f;
+  public static float COREF_CLUSTER_DOWNSAMPLE=0.5f;
+  // Number of additional resampled training passes added in train(); 0 adds none.
+  private static final int NUM_SAMPLES = 0;
+  // Not referenced in the visible part of this class; NOTE(review): likely used by the dropout copy annotator -- confirm.
+  private static final double DROPOUT_RATE = 0.1;
+  
+  // Parameter sets: main() picks pairwiseParams for MENTION_PAIR and clusterParams otherwise.
+  // These must stay below the downsample constants they read during static initialisation.
+  protected static ParameterSettings pairwiseParams = new ParameterSettings(DEFAULT_BOTH_DIRECTIONS, COREF_PAIRS_DOWNSAMPLE, "tk",
+      1.0, 1.0, "linear", ComboOperator.SUM, 0.1, 0.5);
+  protected static ParameterSettings clusterParams = new ParameterSettings(DEFAULT_BOTH_DIRECTIONS, COREF_CLUSTER_DOWNSAMPLE, "tk",
+      1.0, 1.0, "linear", ComboOperator.SUM, 0.1, 0.5);
+  
+  // File names of the gold and system chain output; set in main() from the config name and read in test().
+  private static String goldOut = "";
+  private static String systemOut = "";
+  
+  /**
+   * Command-line entry point. Parses {@link CoreferenceOptions}, builds an evaluation for the
+   * selected system, runs training and testing, and optionally runs the external perl scorer
+   * on the gold and system output files, printing one row per metric plus their CoNLL average.
+   *
+   * @param args command-line arguments understood by {@link CoreferenceOptions}
+   * @throws Exception if option parsing, the pipeline, or the scorer process fails
+   */
+  public static void main(String[] args) throws Exception {
+    CoreferenceOptions options = CliFactory.parseArguments(CoreferenceOptions.class, args);
+
+    List<Integer> patientSets = options.getPatients().getList();
+    List<Integer> trainItems = getTrainItems(options);
+    List<Integer> testItems = options.getTestOnTrain() ? getTrainItems(options) : getTestItems(options);
+
+    ParameterSettings params = options.getEvalSystem() == EVAL_SYSTEM.MENTION_PAIR ? pairwiseParams : clusterParams;
+    
+    File workingDir = new File("target/eval/temporal-relations/coreference/" + options.getEvalSystem() + File.separator +  options.getConfig());
+    if(!workingDir.exists()) workingDir.mkdirs();
+    if(options.getUseTmp()){
+      // createTempFile only reserves a unique name; swap the file for a directory of that name
+      File tempModelDir = File.createTempFile("temporal", null, workingDir);
+      tempModelDir.delete();
+      tempModelDir.mkdir();
+      workingDir = tempModelDir;
+    }
+    EvaluationOfEventCoreference eval = new EvaluationOfEventCoreference(
+        workingDir,
+        options.getRawTextDirectory(),
+        options.getXMLDirectory(),
+        options.getXMLFormat(),
+        options.getSubcorpus(),
+        options.getXMIDirectory(),
+        options.getTreebankDirectory(),
+        options.getPrintErrors(),
+        options.getPrintFormattedRelations(),
+        params,
+        options.getKernelParams(),
+        options.getOutputDirectory());
+
+    if(options.getSkipTrain()){
+      eval.skipTrain = true;
+    }
+    if(options.getSkipDataWriting()){
+      eval.skipWrite = true;
+    }
+    if(options.getSkipTest()){
+      eval.skipTest = true;
+    }
+    eval.evalType = options.getEvalSystem();
+    eval.config = options.getConfig();
+    goldOut = "gold." + eval.config + ".conll";
+    systemOut = "system." + eval.config + ".conll";
+    
+    eval.prepareXMIsFor(patientSets);
+    
+    params.stats = eval.trainAndTest(trainItems, testItems);//training);//
+
+    if(options.getUseTmp()){
+      FileUtils.deleteRecursive(workingDir);
+    }
+    
+    if(options.getUseExternalScorer() && !options.getSkipTest()){
+      Pattern patt = Pattern.compile("(?:Coreference|BLANC): Recall: \\([^\\)]*\\) (\\S+)%.*Precision: \\([^\\)]*\\) (\\S+)%.*F1: (\\S+)%");
+      Runtime runtime = Runtime.getRuntime();
+      Process p = runtime.exec(new String[]{
+          "perl",
+          options.getScorerPath(),
+          "all",
+          options.getOutputDirectory() + goldOut,
+          options.getOutputDirectory() + systemOut,
+          "none"});
+      Map<String,Double> scores = new HashMap<>();
+      // try-with-resources so the scorer's output stream is closed even if parsing fails
+      try(BufferedReader reader = new BufferedReader(new InputStreamReader(p.getInputStream()))){
+        String line, metric=null;
+        System.out.println(String.format("%10s%7s%7s%7s", "Metric", "Rec", "Prec", "F1"));
+        while((line = reader.readLine()) != null){
+          line = line.trim();
+          if(line.startsWith("METRIC")){
+            metric = line.substring(7);  // everything after "METRIC"
+            metric = metric.substring(0, metric.length()-1);  // remove colon from the end
+          }else if(line.startsWith("Coreference") || line.startsWith("BLANC")){
+            // the pattern accepts both prefixes, so both kinds of summary line must reach the matcher
+            Matcher m = patt.matcher(line);
+            if(m.matches()){
+              System.out.println(String.format("%10s%7.2f%7.2f%7.2f", metric, Double.parseDouble(m.group(1)), Double.parseDouble(m.group(2)), Double.parseDouble(m.group(3))));
+              scores.put(metric, Double.parseDouble(m.group(3)));
+            }
+          }
+        }
+      }
+      
+      // the CoNLL score is the unweighted mean of the MUC, B-cubed and CEAF-e F1 values
+      if(scores.containsKey("muc") && scores.containsKey("bcub") && scores.containsKey("ceafe")){
+        double conll = (scores.get("muc") + scores.get("bcub") + scores.get("ceafe")) / 3.0;
+        System.out.println(String.format("%10s              %7.2f", "Conll", conll));
+      }
+    }
+  }
+  
+  // Phase switches set from the command line in main(): skip training entirely,
+  // skip only the training-data writing pipeline, or skip the test phase.
+  boolean skipTrain=false; 
+  boolean skipWrite=false;
+  boolean skipTest=false;
+  // The coreference systems this evaluation can run; train() and test() branch on this value.
+  public enum EVAL_SYSTEM { BASELINE, MENTION_PAIR, MENTION_CLUSTER, CLUSTER_RANK, PERSON_ONLY };
+  // System under evaluation; assigned in main() after construction.
+  EVAL_SYSTEM evalType;
+  // Configuration name; assigned in main() and used there to build the output file names.
+  String config=null;
+  
+  // Prefix concatenated directly in front of the gold/system output file names in test().
+  private String outputDirectory;
+  
+  /**
+   * Creates an evaluation; all arguments except the last two are passed straight to the superclass.
+   *
+   * @param cmdParams       classifier options as one space-separated string, or {@code null};
+   *                        double quotes are stripped before the string is split on spaces
+   * @param outputDirectory prefix concatenated in front of the gold/system output file names
+   */
+  public EvaluationOfEventCoreference(File baseDirectory,
+      File rawTextDirectory, File xmlDirectory,
+      org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMLFormat xmlFormat, Subcorpus subcorpus,
+      File xmiDirectory, File treebankDirectory, boolean printErrors,
+      boolean printRelations, ParameterSettings params, String cmdParams, String outputDirectory) {
+    super(baseDirectory, rawTextDirectory, xmlDirectory, xmlFormat, subcorpus, xmiDirectory,
+        treebankDirectory, printErrors, printRelations, params);
+    if(cmdParams == null){
+      this.kernelParams = null;
+    }else{
+      String unquoted = cmdParams.replace("\"", "");
+      this.kernelParams = unquoted.split(" ");
+    }
+    this.outputDirectory = outputDirectory;
+  }
+
+  /**
+   * Writes training data for the selected system and then trains and packages the classifier.
+   * Returns immediately when training is skipped or when the system is BASELINE or PERSON_ONLY.
+   * When only data writing is skipped, the pipeline is not run but the classifier is still
+   * trained from whatever is already in {@code directory}.
+   *
+   * @param collectionReader reader supplying the training documents
+   * @param directory        directory the data writers write to and the model is packaged in
+   * @throws Exception if the pipeline or the classifier training fails
+   */
+  @Override
+  protected void train(CollectionReader collectionReader, File directory)
+      throws Exception {
+    if(skipTrain) return;
+    if(this.evalType == EVAL_SYSTEM.BASELINE || this.evalType == EVAL_SYSTEM.PERSON_ONLY) return;
+    if(!skipWrite){
+      AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
+      aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+      aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
+      aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
+      aggregateBuilder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
+      aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
+
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "Baseline"));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
+      aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
+      aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
+      aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
+      //    aggregateBuilder.add(CopyFromGold.getDescription(/*Markable.class,*/ CoreferenceRelation.class, CollectionTextRelation.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class, CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME));
+      aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
+      // add the data-writing annotator that matches the system being trained
+      if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
+        aggregateBuilder.add(EventCoreferenceAnnotator.createDataWriterDescription(
+            //        TKSVMlightStringOutcomeDataWriter.class,
+                    LibLinearStringOutcomeDataWriter.class,
+//            LibSvmStringOutcomeDataWriter.class,
+//            TkLibSvmStringOutcomeDataWriter.class,
+            directory,
+            params.probabilityOfKeepingANegativeExample
+            ));
+      }else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
+        AnalysisEngineDescription aed = AnalysisEngineFactory.createEngineDescription(
+            MentionClusterCoreferenceAnnotator.class,
+            CleartkAnnotator.PARAM_IS_TRAINING,
+            true,
+            MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+            params.probabilityOfKeepingANegativeExample,
+            DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+            LibLinearStringOutcomeDataWriter.class,
+            DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+            directory);
+        aggregateBuilder.add(aed);
+        for(int i = 0; i < NUM_SAMPLES; i++){
+          // after each iteration, remove the gold chains in the system view and re-copy over gold chains with some variation:
+          aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemoveAllCoreferenceAnnotations.class));
+          aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class, CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME, CopyCoreferenceRelations.PARAM_DROP_ELEMENTS, true));
+
+          aed = AnalysisEngineFactory.createEngineDescription(
+              MentionClusterCoreferenceAnnotator.class,
+              CleartkAnnotator.PARAM_IS_TRAINING,
+              true,
+              MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+              params.probabilityOfKeepingANegativeExample,
+              MentionClusterCoreferenceAnnotator.PARAM_USE_EXISTING_ENCODERS,
+              true,
+              DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+              LibLinearStringOutcomeDataWriter.class,
+              DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+              directory);
+
+          aggregateBuilder.add(aed);
+
+        }
+      }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
+        // TODO
+        aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createDataWriterDescription(
+            SvmLightRankDataWriter.class, 
+            directory, 
+            params.probabilityOfKeepingANegativeExample));
+      }else{
+        logger.warn("Encountered a training configuration that does not add an annotator: " + this.evalType);
+      }
+      Logger.getLogger(EventCoreferenceAnnotator.class).setLevel(Level.WARN);
+      // create gold chains for writing out which we can then use for our scoring tool
+      //    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CoreferenceChainScoringOutput.class,
+      //        CoreferenceChainScoringOutput.PARAM_OUTPUT_DIR,
+      //        this.outputDirectory + "train"));
+      FlowControllerDescription corefFlowControl = FlowControllerFactory.createFlowControllerDescription(CorefEvalFlowController.class);
+      aggregateBuilder.setFlowControllerDescription(corefFlowControl);
+
+      SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+    }
+    String[] optArray;
+
+    // explicit kernel parameters from the command line win; otherwise derive options from params
+    if(this.kernelParams == null){
+      ArrayList<String> svmOptions = new ArrayList<>();
+      svmOptions.add("-c"); svmOptions.add(""+params.svmCost);        // svm cost
+      svmOptions.add("-t"); svmOptions.add(""+params.svmKernelIndex); // kernel index 
+      svmOptions.add("-d"); svmOptions.add("3");                      // degree parameter for polynomial
+      svmOptions.add("-g"); svmOptions.add(""+params.svmGamma);
+      if(params.svmKernelIndex==ParameterSettings.SVM_KERNELS.indexOf("tk")){
+        svmOptions.add("-S"); svmOptions.add(""+params.secondKernelIndex);   // second kernel index (similar to -t) for composite kernel
+        String comboFlag = (params.comboOperator == ComboOperator.SUM ? "+" : params.comboOperator == ComboOperator.PRODUCT ? "*" : params.comboOperator == ComboOperator.TREE_ONLY ? "T" : "V");
+        svmOptions.add("-C"); svmOptions.add(comboFlag);
+        svmOptions.add("-L"); svmOptions.add(""+params.lambda);
+        svmOptions.add("-T"); svmOptions.add(""+params.tkWeight);
+        svmOptions.add("-N"); svmOptions.add("3");   // normalize trees and features
+      }
+      optArray = svmOptions.toArray(new String[]{});
+    }else{
+      optArray = this.kernelParams;
+    }
+    JarClassifierBuilder.trainAndPackage(directory, optArray);
+  }
+
+  /**
+   * Runs the test pipeline for the selected system over the given documents and collects
+   * pairwise coreference statistics by comparing the relations in the gold view with those in
+   * the default view. Returns empty statistics when testing is skipped.
+   *
+   * @param collectionReader reader supplying the test documents
+   * @param directory        directory containing the trained {@code model.jar}
+   * @return statistics accumulated over all documents
+   * @throws Exception if the pipeline fails
+   */
+  @Override
+  protected AnnotationStatistics<String> test(
+      CollectionReader collectionReader, File directory) throws Exception {
+    AnnotationStatistics<String> corefStats = new AnnotationStatistics<>();
+    if(this.skipTest){
+      logger.info("Skipping test");
+      return corefStats;
+    }
+    AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
+    aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(GenericCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(HistoryCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(SubjectCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ParagraphVectorAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
+    aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
+    aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
+    aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
+    // first scoring-output engine: configured with the gold view name and the gold output file
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CoreferenceChainScoringOutput.class,
+        CoreferenceChainScoringOutput.PARAM_OUTPUT_FILENAME,
+        this.outputDirectory + goldOut,
+        CoreferenceChainScoringOutput.PARAM_GOLD_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableHeadTreeCreator.class));
+    aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
+    // add the annotator for the system under evaluation; trained systems load model.jar from directory
+    if(this.evalType == EVAL_SYSTEM.MENTION_PAIR){
+      aggregateBuilder.add(EventCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
+    }else if(this.evalType == EVAL_SYSTEM.MENTION_CLUSTER){
+//      aggregateBuilder.add(EventCoreferenceAnnotator.createScoringAnnotatorDescription("/org/apache/ctakes/coreference/mention-pair" + File.separator + "model.jar"));
+      aggregateBuilder.add(MentionClusterCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
+    }else if(this.evalType == EVAL_SYSTEM.CLUSTER_RANK){
+      aggregateBuilder.add(MentionClusterRankingCoreferenceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
+    }else if(this.evalType == EVAL_SYSTEM.BASELINE){
+      aggregateBuilder.add(CoreferencePipelineFactory.getCoreferencePipeline());
+    }else{
+      logger.info("Running an evaluation that does not add an annotator: " + this.evalType);
+    }
+//    aggregateBuilder.add(CoreferenceChainAnnotator.createAnnotatorDescription());
+    aggregateBuilder.add(PersonChainAnnotator.createAnnotatorDescription());
+    // second scoring-output engine: no gold view parameter, writes to the system output file
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CoreferenceChainScoringOutput.class,
+        CoreferenceChainScoringOutput.PARAM_OUTPUT_FILENAME,
+        this.outputDirectory + systemOut));
+
+    FlowControllerDescription corefFlowControl = FlowControllerFactory.createFlowControllerDescription(CorefEvalFlowController.class);
+    aggregateBuilder.setFlowControllerDescription(corefFlowControl);
+//    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(XMIWriter.class));
+    // key used to align gold and system relations: the relation wrapped as HashableArguments
+    Function<CoreferenceRelation, ?> getSpan = new Function<CoreferenceRelation, HashableArguments>() {
+      public HashableArguments apply(CoreferenceRelation relation) {
+        return new HashableArguments(relation);
+      }
+    };
+    // every relation gets the same outcome label
+    Function<CoreferenceRelation, String> getOutcome = new Function<CoreferenceRelation,String>() {
+      public String apply(CoreferenceRelation relation){
+        return "Coreference";
+      }
+    };
+     
+
+    for(Iterator<JCas> casIter =new JCasIterator(collectionReader, aggregateBuilder.createAggregate()); casIter.hasNext();){
+      JCas jCas = casIter.next();
+      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      Collection<CoreferenceRelation> goldRelations = JCasUtil.select(
+          goldView,
+          CoreferenceRelation.class);
+      Collection<CoreferenceRelation> systemRelations = JCasUtil.select(
+          systemView,
+          CoreferenceRelation.class);
+      corefStats.add(goldRelations, systemRelations, getSpan, getOutcome);
+      if(this.printErrors){
+        // index both sides by argument key, then report every key in sorted order
+        Map<HashableArguments, BinaryTextRelation> goldMap = Maps.newHashMap();
+        for (BinaryTextRelation relation : goldRelations) {
+          goldMap.put(new HashableArguments(relation), relation);
+        }
+        Map<HashableArguments, BinaryTextRelation> systemMap = Maps.newHashMap();
+        for (BinaryTextRelation relation : systemRelations) {
+          systemMap.put(new HashableArguments(relation), relation);
+        }
+        Set<HashableArguments> all = Sets.union(goldMap.keySet(), systemMap.keySet());
+        List<HashableArguments> sorted = Lists.newArrayList(all);
+        Collections.sort(sorted);
+        for (HashableArguments key : sorted) {
+          BinaryTextRelation goldRelation = goldMap.get(key);
+          BinaryTextRelation systemRelation = systemMap.get(key);
+          if (goldRelation == null) {
+            System.out.println("System added: " + formatRelation(systemRelation));
+          } else if (systemRelation == null) {
+            System.out.println("System dropped: " + formatRelation(goldRelation));
+          } else if (!systemRelation.getCategory().equals(goldRelation.getCategory())) {
+            String label = systemRelation.getCategory();
+            System.out.printf("System labeled %s for %s\n", label, formatRelation(goldRelation));
+          } else{
+            System.out.println("Nailed it! " + formatRelation(systemRelation));
+          }
+        }
+      }
+    }
+
+    return corefStats;
+  }
+  
+  public static class AnnotationComparator implements Comparator<Annotation> {
+
+    @Override
+    public int compare(Annotation o1, Annotation o2) {
+      if(o1.getBegin() < o2.getBegin()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() < o2.getEnd()){
+        return -1;
+      }else if(o1.getBegin() == o2.getBegin() && o1.getEnd() > o2.getEnd()){
+        return 1;
+      }else if(o2.getBegin() < o1.getBegin()){
+        return 1;
+      }else{
+        return 0;
+      }
+    }
+  }
+  /**
+   * Logs the identifier of each document it processes. When the document id utility reports
+   * that no id is present, the file name taken from the CAS URI is logged instead.
+   */
+  public static class DocumentIDPrinter extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    static Logger logger = Logger.getLogger(DocumentIDPrinter.class);
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      String docId = DocumentIDAnnotationUtil.getDocumentID(jCas);
+      // compare by value: reference equality only works while the very same String instance is returned
+      if(DocumentIDAnnotationUtil.NO_DOCUMENT_ID.equals(docId)){
+        docId = new File(ViewUriUtil.getURI(jCas)).getName();
+      }
+      logger.info(String.format("Processing %s\n", docId));
+    }
+    
+  }
+  
+  /**
+   * The Relation extractors all create relation objects but don't populate the objects inside of them
+   * with pointers to the relation. This annotator stores each location-of relation on its first
+   * argument when that argument is a procedure, disease/disorder or sign/symptom mention.
+   */
+  public static class RelationPropagator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      for(LocationOfTextRelation locRel : JCasUtil.select(jcas, LocationOfTextRelation.class)){
+        IdentifiedAnnotation arg1 = (IdentifiedAnnotation) locRel.getArg1().getArgument();
+        IdentifiedAnnotation arg2 = (IdentifiedAnnotation) locRel.getArg2().getArgument();
+        // have to do this 3 different times because there is no intermediate class between EventMention and
+        // the three types that can have locations that has that location attribute.
+        // for the case where there are 2 locations, we take the one whose anatomical site argument
+        // has the longer span assuming it is more specific
+        if(arg1 instanceof ProcedureMention){
+          ProcedureMention p = (ProcedureMention)arg1;
+          if(p.getBodyLocation() == null || isLongerSpan(arg2, p.getBodyLocation().getArg2().getArgument())){
+            p.setBodyLocation(locRel);
+          }
+        }else if(arg1 instanceof DiseaseDisorderMention){
+          DiseaseDisorderMention d = (DiseaseDisorderMention)arg1;
+          if(d.getBodyLocation() == null || isLongerSpan(arg2, d.getBodyLocation().getArg2().getArgument())){
+            d.setBodyLocation(locRel);
+          }
+        }else if(arg1 instanceof SignSymptomMention){
+          SignSymptomMention s = (SignSymptomMention)arg1;
+          if(s.getBodyLocation() == null || isLongerSpan(arg2, s.getBodyLocation().getArg2().getArgument())){
+            s.setBodyLocation(locRel);
+          }
+        }
+      }
+    }
+
+    /**
+     * Tells whether the candidate covers strictly more characters than the current annotation.
+     * The span length is end minus begin; the earlier code subtracted end from end, so the
+     * candidate length was always 0 and an existing location was never replaced.
+     */
+    private static boolean isLongerSpan(Annotation candidate, Annotation current){
+      int candidateSize = candidate.getEnd() - candidate.getBegin();
+      int currentSize = current.getEnd() - current.getBegin();
+      return candidateSize > currentSize;
+    }
+
+  }
+  
+  /**
+   * Creates a {@code Paragraph} annotation for each run of tokens that is terminated by a
+   * newline token. Newline tokens that immediately follow a paragraph end are skipped, and a
+   * trailing run of tokens that is not followed by a newline token produces no paragraph.
+   */
+  public static class ParagraphAnnotator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      List<BaseToken> tokens = new ArrayList<>(JCasUtil.select(jcas, BaseToken.class));
+      BaseToken previous = null;
+      // index of the token that will open the next paragraph
+      int parStart = 0;
+
+      for(int idx = 0; idx < tokens.size(); idx++){
+        BaseToken current = tokens.get(idx);
+        if(current instanceof NewlineToken){
+          if(parStart == idx){
+            // still inside a run of newlines: push the paragraph start past this one
+            parStart++;
+          }else if(previous != null){
+            // a newline after real content closes the paragraph at the preceding token
+            Paragraph par = new Paragraph(jcas, tokens.get(parStart).getBegin(), previous.getEnd());
+            par.addToIndexes();
+            parStart = idx + 1;
+          }
+        }
+        previous = current;
+      }
+
+    }
+
+  }
+  
+  
+  /**
+   * Builds one vector per {@code Paragraph} by summing the embeddings of its lower-cased word
+   * tokens and scaling the sum to unit length. Each vector is indexed as a {@code FloatArray},
+   * and all of them are collected, in paragraph order, in one indexed {@code FSArray}.
+   */
+  public static class ParagraphVectorAnnotator extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    WordEmbeddings words = null;
+
+    /**
+     * Loads the word embeddings from the classpath resource.
+     *
+     * @throws ResourceInitializationException if the embeddings cannot be read
+     */
+    @Override
+    public void initialize(final UimaContext context) throws ResourceInitializationException{
+      // let the uimaFIT base class store the context and run its own initialisation first
+      super.initialize(context);
+      try {
+        words = WordVectorReader.getEmbeddings(FileLocator.getAsStream("org/apache/ctakes/coreference/distsem/mimic_vectors.txt"));
+      } catch (IOException e) {
+        e.printStackTrace();
+        throw new ResourceInitializationException(e);
+      }
+    }
+    
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      List<Paragraph> pars = new ArrayList<>(JCasUtil.select(jcas, Paragraph.class));
+      FSArray parVecs = new FSArray(jcas, pars.size());
+      for(int parNum = 0; parNum < pars.size(); parNum++){
+        Paragraph par = pars.get(parNum);
+        float[] parVec = new float[words.getDimensionality()];
+
+        List<BaseToken> tokens = JCasUtil.selectCovered(BaseToken.class, par);
+        for(int i = 0; i < tokens.size(); i++){
+          BaseToken token = tokens.get(i);
+          if(token instanceof WordToken){
+            String word = token.getCoveredText().toLowerCase();
+            // words without an embedding contribute nothing to the paragraph vector
+            if(words.containsKey(word)){
+              WordVector wv = words.getVector(word);
+              for(int j = 0; j < parVec.length; j++){
+                parVec[j] += wv.getValue(j);
+              }
+            }
+          }
+        }
+        normalize(parVec);
+        FloatArray vec = new FloatArray(jcas, words.getDimensionality());
+        vec.copyFromArray(parVec, 0, 0, parVec.length);
+        vec.addToIndexes();
+        parVecs.set(parNum, vec);
+      }
+      parVecs.addToIndexes();
+    }
+
+    /**
+     * Scales the vector in place to unit Euclidean length. An all-zero vector (for example a
+     * paragraph with no known words) is left unchanged, because dividing by its zero norm
+     * would fill it with NaN.
+     */
+    private static final void normalize(float[] vec) {
+      double sumOfSquares = 0.0;
+      for(int i = 0; i < vec.length; i++){
+        sumOfSquares += (vec[i]*vec[i]);
+      }
+      double norm = Math.sqrt(sumOfSquares);
+      if(norm == 0.0){
+        return;
+      }
+      for(int i = 0; i < vec.length; i++){
+        vec[i] /= norm;
+      }
+    }
+  }
+  
+  /**
+   * Copies the gold coreference annotations from the gold view into the view being processed.
+   * Every gold markable is replaced by the system markable that has the same nominal head node;
+   * a gold chain containing a markable that cannot be mapped this way is not copied at all.
+   * Pairwise {@link CoreferenceRelation}s are copied when both of their arguments were mapped.
+   * <p>
+   * When the optional dropout parameter is set, chain members are occasionally (with probability
+   * {@code DROPOUT_RATE}) put into a different or a brand new list, so one gold chain may be
+   * copied as several smaller chains. Lists with fewer than two members are never copied.
+   */
+  public static class CopyCoreferenceRelations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    public static final String PARAM_GOLD_VIEW = "GoldViewName";
+    @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=true, description="View containing gold standard annotations")
+    private String goldViewName;
+
+    public static final String PARAM_DROP_ELEMENTS = "Dropout";
+    @ConfigurationParameter(name = PARAM_DROP_ELEMENTS, mandatory=false)
+    private boolean dropout = false;
+
+    @SuppressWarnings("synthetic-access")
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      JCas goldView = null;
+      try {
+        goldView = jcas.getView(goldViewName);
+      } catch (CASException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      // gold markable -> system markable sharing its nominal head; filled while copying the chains
+      // and reused below for the pairwise relations
+      Map<Markable,Markable> gold2sys = new HashMap<>();
+      Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
+
+      for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
+        FSList head = goldChain.getMembers();
+        // the gold list can be split up into many lists if we allow dropout.
+        List<List<Markable>> systemLists = new ArrayList<>();
+        boolean removeChain = false;
+
+        // first one is guaranteed to be nonempty otherwise it would not be in cas
+        do{
+          NonEmptyFSList element = (NonEmptyFSList) head;
+          Markable goldMarkable = (Markable) element.getHead();
+          boolean mapped = mapGoldMarkable(jcas, goldMarkable, gold2sys, depIndex);
+
+          // if we can't align the gold markable with one in the system cas then drop the whole chain:
+          if(!mapped){
+            String text = "<Out of bounds>";
+            if(isInBounds(jcas, goldMarkable)){
+              text = goldMarkable.getCoveredText();
+            }
+            logger.warn(String.format("There is a gold markable %s [%d, %d] which could not map to a system markable.", 
+                text, goldMarkable.getBegin(), goldMarkable.getEnd()));
+            removeChain = true;
+            break;
+          }
+
+          Markable sysMarkable = gold2sys.get(goldMarkable);
+          if(!dropout || systemLists.size() == 0){
+            // no dropout, or very first member of the chain: always goes to the first list
+            if(systemLists.size() == 0){
+              systemLists.add(new ArrayList<Markable>());
+            }
+            systemLists.get(0).add(sysMarkable);
+          }else if(Math.random() > DROPOUT_RATE){
+            // most of the time do the right thing:
+            systemLists.get(0).add(sysMarkable);
+          }else{
+            // corrupt the chain on purpose: ceil() yields an index in 1..size (0 only if random()
+            // returns exactly 0.0), i.e. one of the other lists, or size, which starts a new list
+            int listIndex = (int) Math.ceil(Math.random() * systemLists.size());
+            if(listIndex == systemLists.size()){
+              systemLists.add(new ArrayList<Markable>());
+            }
+            systemLists.get(listIndex).add(sysMarkable);
+          }
+          head = element.getTail();
+        }while(head instanceof NonEmptyFSList);
+
+        // only copy chains whose members could all be mapped
+        if(!removeChain){
+          for(List<Markable> chain : systemLists){
+            if(chain.size() > 1){
+              CollectionTextRelation sysRel = new CollectionTextRelation(jcas);
+              sysRel.setMembers(ListFactory.buildList(jcas, chain));
+              sysRel.addToIndexes();
+            }
+          }
+        }
+      }
+
+      for(CoreferenceRelation goldRel : JCasUtil.select(goldView, CoreferenceRelation.class)){
+        if((gold2sys.containsKey(goldRel.getArg1().getArgument()) && gold2sys.containsKey(goldRel.getArg2().getArgument()))){
+          CoreferenceRelation sysRel = new CoreferenceRelation(jcas);
+          sysRel.setCategory(goldRel.getCategory());
+          sysRel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
+
+          RelationArgument arg1 = new RelationArgument(jcas);
+          arg1.setArgument(gold2sys.get(goldRel.getArg1().getArgument()));
+          sysRel.setArg1(arg1);
+          arg1.addToIndexes();
+
+          RelationArgument arg2 = new RelationArgument(jcas);
+          arg2.setArgument(gold2sys.get(goldRel.getArg2().getArgument()));
+          sysRel.setArg2(arg2);
+          arg2.addToIndexes();
+
+          sysRel.addToIndexes();
+        }
+      }
+    }
+
+    /**
+     * Checks that the span of the markable lies inside the document text of {@code jcas}.
+     * Annotation end offsets are exclusive, so an end equal to the text length is still valid
+     * (the markable is simply the last thing in the document).
+     */
+    private static boolean isInBounds(JCas jcas, Markable markable){
+      return markable.getBegin() >= 0 && markable.getEnd() <= jcas.getDocumentText().length();
+    }
+
+    /**
+     * Looks for a system markable covering the nominal head node of {@code goldMarkable} whose
+     * own nominal head is that same node and, if found, records the pair in {@code gold2sys}.
+     *
+     * @return true if a system markable was found and recorded, false otherwise (including the
+     *         case of a gold span lying outside the document, which is logged)
+     */
+    private static boolean mapGoldMarkable(JCas jcas, Markable goldMarkable, Map<Markable,Markable> gold2sys, Map<ConllDependencyNode, Collection<Markable>> depIndex){
+      if(!isInBounds(jcas, goldMarkable)){
+        // Have seen some instances where anafora writes a span that is not possible, log them
+        // so they can be found and fixed:
+        logger.warn(String.format("There is a markable with span [%d, %d] in a document with length %d\n", 
+            goldMarkable.getBegin(), goldMarkable.getEnd(), jcas.getDocumentText().length()));
+        return false;
+      }
+
+      ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+      for(Markable sysMarkable : depIndex.get(headNode)){
+        ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
+        if(markNode == headNode){
+          gold2sys.put(goldMarkable, sysMarkable);
+          return true;
+        }
+      }
+      return false;
+    }
+  }
+  
+  /**
+   * Removes every {@link CollectionTextRelation} (together with the non-empty nodes of its member
+   * list) and every {@link CoreferenceRelation} (together with its two arguments) from the
+   * indexes of the processed view.
+   */
+  public static class RemoveAllCoreferenceAnnotations extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      // copy first: annotations must not be removed from an index while iterating over it
+      List<CollectionTextRelation> chains = new ArrayList<>(JCasUtil.select(jcas, CollectionTextRelation.class));
+      for(CollectionTextRelation chain : chains){
+        // a plain while loop also copes with a chain whose member list is empty or missing;
+        // such a chain is still removed instead of aborting with a ClassCastException/NPE
+        FSList node = chain.getMembers();
+        while(node instanceof NonEmptyFSList){
+          NonEmptyFSList listNode = (NonEmptyFSList) node;
+          listNode.removeFromIndexes();
+          node = listNode.getTail();
+        }
+        chain.removeFromIndexes();
+      }
+      List<CoreferenceRelation> rels = new ArrayList<>(JCasUtil.select(jcas, CoreferenceRelation.class));
+      for(CoreferenceRelation rel : rels){
+        rel.getArg1().removeFromIndexes();
+        rel.getArg2().removeFromIndexes();
+        rel.removeFromIndexes();
+      }
+    }
+  }
+  
+  public static class RemovePersonMarkables extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+//      JCas systemView=null, goldView=null;
+//      try{
+//        systemView = jcas.getView(CAS.NAME_DEFAULT_SOFA);
+//        goldView = jcas.getView(GOLD_VIEW_NAME);
+//      }catch(Exception e){
+//        throw new AnalysisEngineProcessException(e);
+//      }
+      List<Markable> toRemove = new ArrayList<>();
+      for(Markable markable : JCasUtil.select(jcas, Markable.class)){
+        if(markable.getCoveredText().equals("I")){
+          System.err.println("Unauthorized markable 'I'");
+        }
+        List<BaseToken> coveredTokens = JCasUtil.selectCovered(jcas, BaseToken.class, markable);
+        if(coveredTokens.size() == 1 && coveredTokens.get(0).getPartOfSpeech().startsWith("PRP") &&
+            !markable.getCoveredText().toLowerCase().equals("it")){
+          toRemove.add(markable);
+        }else if(coveredTokens.size() > 0 && (coveredTokens.get(0).getCoveredText().startsWith("Mr.") || coveredTokens.get(0).getCoveredText().startsWith("Dr.") ||
+                coveredTokens.get(0).getCoveredText().startsWith("Mrs.") || coveredTokens.get(0).getCoveredText().startsWith("Ms."))){
+          toRemove.add(markable);
+        }else if(markable.getCoveredText().toLowerCase().endsWith("patient") || markable.getCoveredText().toLowerCase().equals("pt")){
+          toRemove.add(markable);
+        }
+      }
+      
+      for(Markable markable : toRemove){
+        markable.removeFromIndexes();
+      }
+    } 
+  }
+  
+  /**
+   * Flow controller that borrows from the UIMA implementation of FixedFlowController and its
+   * internal Flow object. The only difference: right after the step named like
+   * {@link DocumentIDPrinter} has run, the gold view is checked for coreference relations, and if
+   * there are none the flow ends for that CAS, so no time is wasted running coref code on
+   * documents whose answers would not be printed anyway.
+   */
+  public static class CorefEvalFlowController extends org.apache.uima.flow.JCasFlowController_ImplBase {
+    List<String> mSequence;
+
+    /**
+     * Reads the fixed flow of the aggregate into {@code mSequence}.
+     *
+     * @throws ResourceInitializationException if the aggregate does not declare a fixed flow
+     */
+    @Override
+    public void initialize(FlowControllerContext context)
+        throws ResourceInitializationException {
+      super.initialize(context);
+
+      FlowConstraints constraints = context.getAggregateMetadata().getFlowConstraints();
+      mSequence = new ArrayList<>();
+      if (!(constraints instanceof FixedFlow)) {
+        throw new ResourceInitializationException(ResourceInitializationException.FLOW_CONTROLLER_REQUIRES_FLOW_CONSTRAINTS,
+                new Object[]{this.getClass().getName(), "fixedFlow", context.getAggregateMetadata().getSourceUrlString()});
+      }
+      for (String delegateKey : ((FixedFlow) constraints).getFixedFlow()) {
+        mSequence.add(delegateKey);
+      }
+    }
+
+    @Override
+    public Flow computeFlow(JCas jcas) throws AnalysisEngineProcessException {
+      return new CorefEvalFlow(jcas, 0);
+    }
+
+    /** Flow over {@code mSequence} for a single CAS. */
+    class CorefEvalFlow extends JCasFlow_ImplBase {
+
+      private JCas jcas;
+      private int currentStep;
+
+      public CorefEvalFlow(JCas jcas, int step){
+        this.jcas = jcas;
+        this.currentStep = step;
+      }
+
+      @Override
+      public Step next() {
+        // nothing left in the sequence: we are done
+        if (currentStep >= mSequence.size()) {
+          return new FinalStep();
+        }
+
+        // directly after the document id printer: stop early unless there are gold relations
+        boolean afterIdPrinter =
+            currentStep > 0 && mSequence.get(currentStep-1).equals(DocumentIDPrinter.class.getName());
+        if (afterIdPrinter) {
+          try {
+            JCas goldView = jcas.getView(GOLD_VIEW_NAME);
+            if (JCasUtil.select(goldView, CoreferenceRelation.class).isEmpty()) {
+              System.out.println("Skipping this document with no coreference relations.");
+              return new FinalStep();
+            }
+          } catch (CASException e) {
+            // no need to stop flow -- just go ahead to default simple step.
+            e.printStackTrace();
+          }
+        }
+
+        // otherwise hand the CAS to the next delegate of the fixed sequence
+        String nextKey = mSequence.get(currentStep);
+        currentStep++;
+        return new SimpleStep(nextKey);
+      }
+    }
+  }
+}

Added: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSalience.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSalience.java?rev=1748736&view=auto
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSalience.java (added)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSalience.java Thu Jun 16 14:51:51 2016
@@ -0,0 +1,239 @@
+package org.apache.ctakes.coreference.eval;
+
+import java.io.File;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.coreference.ae.DeterministicMarkableAnnotator;
+import org.apache.ctakes.coreference.ae.MarkableSalienceAnnotator;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.DocumentIDPrinter;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.RemovePersonMarkables;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.temporal.eval.Evaluation_ImplBase;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.ViewCreatorAnnotator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.JCasIterator;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.cleartk.eval.AnnotationStatistics;
+import org.cleartk.ml.jar.JarClassifierBuilder;
+import org.cleartk.ml.liblinear.LibLinearBooleanOutcomeDataWriter;
+
+import com.google.common.base.Function;
+import com.lexicalscope.jewel.cli.CliFactory;
+
+public class EvaluationOfMarkableSalience extends Evaluation_ImplBase<AnnotationStatistics<Boolean>> {
+
+  /**
+   * Command line entry point: parses the options, prepares the XMIs for the requested patient
+   * sets, trains on the training items, tests on the test items and prints the resulting
+   * statistics followed by the confusion matrix.
+   */
+  public static void main(String[] args) throws Exception {
+    final Options opts = CliFactory.parseArguments(Options.class, args);
+    final List<Integer> patients = opts.getPatients().getList();
+    final List<Integer> trainingIds = getTrainItems(opts);
+    final List<Integer> testingIds = getTestItems(opts);
+
+    final File outputDir = new File("target/eval/salience");
+    final EvaluationOfMarkableSalience evaluation = new EvaluationOfMarkableSalience(
+        outputDir,
+        opts.getRawTextDirectory(),
+        opts.getXMLDirectory(),
+        opts.getXMLFormat(),
+        opts.getSubcorpus(),
+        opts.getXMIDirectory(),
+        null);
+    evaluation.prepareXMIsFor(patients);
+
+    final AnnotationStatistics<Boolean> results = evaluation.trainAndTest(trainingIds, testingIds);
+    System.out.println(results);
+    System.out.println(results.confusions());
+  }
+
+  /** Passes all arguments through, unchanged and in the same order, to the superclass constructor. */
+  public EvaluationOfMarkableSalience(
+      File baseDirectory,
+      File rawTextDirectory,
+      File xmlDirectory,
+      Evaluation_ImplBase.XMLFormat xmlFormat,
+      Evaluation_ImplBase.Subcorpus subcorpus,
+      File xmiDirectory,
+      File treebankDirectory) {
+    super(
+        baseDirectory,
+        rawTextDirectory,
+        xmlDirectory,
+        xmlFormat,
+        subcorpus,
+        xmiDirectory,
+        treebankDirectory);
+  }
+
+  @Override
+  protected void train(CollectionReader collectionReader, File directory)
+      throws Exception {
+    AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+    aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SetGoldConfidence.class, SetGoldConfidence.PARAM_GOLD_VIEW, GOLD_VIEW_NAME));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(MarkableSalienceAnnotator.createDataWriterDescription(
+        LibLinearBooleanOutcomeDataWriter.class,
+        directory
+        )));
+    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+    // s=0 -> logistic regression with L2-norm (gives probabilistic outputs)
+    String[] optArray = new String[]{ "-s", "0", "-c", "1", "-w1", "1"};
+    JarClassifierBuilder.trainAndPackage(directory, optArray);
+  }
+
+  @Override
+  protected AnnotationStatistics<Boolean> test(
+      CollectionReader collectionReader, File directory) throws Exception {
+    AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+    aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+    
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ViewCreatorAnnotator.class, ViewCreatorAnnotator.PARAM_VIEW_NAME, "PseudoGold"));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CreatePseudoGoldMarkables.class, CreatePseudoGoldMarkables.PARAM_GOLD_VIEW, GOLD_VIEW_NAME, CreatePseudoGoldMarkables.PARAM_PSEUDO_GOLD_VIEW, "PseudoGold"));
+    aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription(directory.getAbsolutePath() + File.separator + "model.jar"));
+    AnnotationStatistics<Boolean> stats = new AnnotationStatistics<>();
+    
+    for(Iterator<JCas> casIter = new JCasIterator(collectionReader, aggregateBuilder.createAggregate()); casIter.hasNext();){
+      JCas jCas = casIter.next();
+      JCas goldView = jCas.getView("PseudoGold");
+      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      
+      stats.add(JCasUtil.select(goldView, Markable.class),
+          JCasUtil.select(systemView, Markable.class),
+          AnnotationStatistics.<Markable>annotationToSpan(),
+          mapConfidenceToBoolean());      
+    }
+    
+    
+    return stats;
+  }
+  
+  /**
+   * Sets the confidence of a system markable to 1.0 when it corresponds to a member of a gold
+   * coreference chain. A system markable corresponds to a gold markable when it covers the gold
+   * markable's nominal head node and has that very node as its own nominal head. Only the first
+   * such system markable found per gold markable is updated.
+   */
+  public static class SetGoldConfidence extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    public static final String PARAM_GOLD_VIEW = "GoldViewName";
+    @ConfigurationParameter(name=PARAM_GOLD_VIEW, mandatory=true, description="View containing gold standard annotations")
+    private String goldViewName;
+
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      JCas goldView = null;
+      try {
+        goldView = jcas.getView(goldViewName);
+      } catch (CASException e) {
+        // the cause travels with the wrapping exception, no separate stack trace needed
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
+
+      // iterate over every gold coreference chain
+      for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
+        FSList head = goldChain.getMembers();
+
+        // iterate over every gold markable in the chain
+        // first one is guaranteed to be nonempty otherwise it would not be in cas
+        do{
+          NonEmptyFSList element = (NonEmptyFSList) head;
+          Markable goldMarkable = (Markable) element.getHead();
+          // skip impossible spans; the end offset is exclusive, so end == text length is fine
+          if(goldMarkable.getBegin() >= 0 && goldMarkable.getEnd() <= jcas.getDocumentText().length()){
+            // get the head of this markable, then check if there are any system markables with the same
+            // head, and if so, that markable is "true" for being coreferent, AKA high confidence.
+            ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+
+            for(Markable sysMarkable : depIndex.get(headNode)){
+              ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
+              if(markNode == headNode){
+                sysMarkable.setConfidence(1.0f);
+                break;
+              }
+            }
+          }
+          head = element.getTail();
+        }while(head instanceof NonEmptyFSList);
+      }
+    }
+  }
+  
+  /**
+   * Fills the pseudo-gold view with a copy (same span) of every system markable. A copy gets
+   * confidence 1.0 if the system markable corresponds to a member of a gold coreference chain,
+   * and 0.0 otherwise. A system markable corresponds to a gold markable when it covers the gold
+   * markable's nominal head node and has that very node as its own nominal head; only the first
+   * such system markable found per gold markable counts.
+   */
+  public static class CreatePseudoGoldMarkables extends org.apache.uima.fit.component.JCasAnnotator_ImplBase {
+
+    public static final String PARAM_PSEUDO_GOLD_VIEW = "PseudoViewName";
+    @ConfigurationParameter(name = PARAM_PSEUDO_GOLD_VIEW)
+    private String fakeGoldName;
+
+    public static final String PARAM_GOLD_VIEW = "GoldViewName";
+    @ConfigurationParameter(name = PARAM_GOLD_VIEW)
+    private String goldViewName;
+
+    @Override
+    public void process(JCas jcas) throws AnalysisEngineProcessException {
+      JCas fakeView = null;
+      JCas goldView = null;
+
+      try{
+        fakeView = jcas.getView(fakeGoldName);
+        goldView = jcas.getView(goldViewName);
+      }catch(CASException e){
+        throw new AnalysisEngineProcessException(e);
+      }
+      // create a set of markables that map to gold
+      Set<Markable> sys = new HashSet<>();
+      Map<ConllDependencyNode,Collection<Markable>> depIndex = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, Markable.class);
+
+      // iterate over every gold coreference chain
+      for(CollectionTextRelation goldChain : JCasUtil.select(goldView, CollectionTextRelation.class)){
+        FSList head = goldChain.getMembers();
+
+        // iterate over every gold markable in the chain
+        // first one is guaranteed to be nonempty otherwise it would not be in cas
+        do{
+          NonEmptyFSList element = (NonEmptyFSList) head;
+          Markable goldMarkable = (Markable) element.getHead();
+          // skip impossible spans; the end offset is exclusive, so end == text length is fine
+          if(goldMarkable.getBegin() >= 0 && goldMarkable.getEnd() <= jcas.getDocumentText().length()){
+            // get the head of this markable, then check if there are any system markables with the same
+            // head, and if so, that markable is "true" for being coreferent, AKA high confidence.
+            ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, goldMarkable);
+
+            for(Markable sysMarkable : depIndex.get(headNode)){
+              ConllDependencyNode markNode = DependencyUtility.getNominalHeadNode(jcas, sysMarkable);
+              if(markNode == headNode){
+                sys.add(sysMarkable);
+                break;
+              }
+            }
+          }
+          head = element.getTail();
+        }while(head instanceof NonEmptyFSList);
+      }
+
+      // add all system markables to pseudo-gold, with confidence based on whether they map
+      for(Markable markable : JCasUtil.select(jcas, Markable.class)){
+        Markable fakeMarkable = new Markable(fakeView, markable.getBegin(), markable.getEnd());
+        fakeMarkable.setConfidence(sys.contains(markable) ? 1.0f : 0.0f);
+        fakeMarkable.addToIndexes();
+      }
+    }
+  }
+  
+  /**
+   * Builds the function used to turn a markable into its boolean outcome: true when the
+   * confidence of the markable is above 0.5.
+   * Note that this is predicting non-singletons rather than singletons.
+   */
+  public static Function<Markable,Boolean> mapConfidenceToBoolean(){
+    final Function<Markable,Boolean> aboveThreshold = new Function<Markable,Boolean>() {
+      @Override
+      public Boolean apply(Markable markable) {
+        final boolean confident = markable.getConfidence() > 0.5;
+        return Boolean.valueOf(confident);
+      }
+    };
+    return aboveThreshold;
+  }
+}



Mime
View raw message