ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1748746 [1/2] - in /ctakes/sandbox/ctakes-coref-cleartk: ./ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/ae/features/cluster/ src/main/...
Date Thu, 16 Jun 2016 15:33:02 GMT
Author: tmill
Date: Thu Jun 16 15:33:01 2016
New Revision: 1748746

URL: http://svn.apache.org/viewvc?rev=1748746&view=rev
Log:
Final checkin of code used for experiments (mostly for archival purposes as required classes have been copied to ctakes trunk)

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterPairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ExactStringPairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/HeadwordPairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SectionHeaderPairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/SentenceDistancePairer.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousBag.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/extractors/ContinuousTextExtractor.java
Removed:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterPartOfSpeechFeaturesExtractor.java
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/pom.xml
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/ExtractSemTypePreferences.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfEventCoreference.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/pom.xml?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/pom.xml (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/pom.xml Thu Jun 16 15:33:01 2016
@@ -29,8 +29,23 @@
     	<artifactId>ctakes-coreference</artifactId>
     </dependency>
     <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-coreference-res</artifactId>
+    </dependency>
+    <dependency>
     	<groupId>org.apache.ctakes</groupId>
     	<artifactId>ctakes-temporal</artifactId>
     </dependency>
+    <dependency>
+    	<groupId>org.chboston.cnlp</groupId>
+    	<artifactId>umls-graph-api</artifactId>
+    	<version>0.0.1-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+    	<groupId>org.apache.lucene</groupId>
+    	<artifactId>lucene-core</artifactId>
+    	<version>3.6.2</version>
+    </dependency>
   </dependencies>
+  <version>0.0.1</version>
 </project>

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java Thu Jun 16 15:33:01 2016
@@ -10,6 +10,7 @@ import java.util.List;
 
 import org.apache.ctakes.constituency.parser.util.TreeUtils;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
@@ -23,6 +24,7 @@ import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.EmptyFSList;
 import org.apache.uima.jcas.cas.FSList;
 import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.cas.TOP;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.util.ViewUriUtil;
@@ -99,6 +101,11 @@ public class CoreferenceChainScoringOutp
     if(rels.size() == 0){
       return;
     }
+    
+    // Build a map from every markable that is in a chain to the number of the chain it is in (the number is not important as long as
+    // the numbers are distinct, so we just number the chains in the order UIMA gives them to us).
+    // This has to be backward compatible with the older coref module that added RelationArguments to a chain instead of Markables.
+    // So we grab the chain elements, check their type, then grab the markable annotation depending on that type.
     for(CollectionTextRelation chain : rels){
       FSList members = chain.getMembers();
       // if we are doing cluster-mention coreference, some clusters will be singletons, we do not use those in conll scoring
@@ -106,7 +113,14 @@ public class CoreferenceChainScoringOutp
           ((NonEmptyFSList)members).getTail() instanceof EmptyFSList) continue;
       
       while(members instanceof NonEmptyFSList){
-        Annotation mention = (Annotation) ((NonEmptyFSList) members).getHead();
+        TOP head = ((NonEmptyFSList) members).getHead();
+        Annotation mention = null;
+        if(head instanceof Annotation){
+          mention = (Annotation) head;
+        }else{
+          mention = ((RelationArgument)head).getArgument();
+        }
+//        Annotation mention = (Annotation) ((NonEmptyFSList) members).getHead();
         ent2chain.put(mention, chainNum);
         members = ((NonEmptyFSList)members).getTail();
         System.out.print("Mention: " + mention.getCoveredText());
@@ -248,11 +262,19 @@ public class CoreferenceChainScoringOutp
         endSet.add(ind);
 //        endStack.push(ind);
       }
-//      for(int ind : endMention){
-//        buff.append(ind);
-//        buff.append(')');
-//        buff.append('|');
-//      }
+
+      // In some datasets markables end in the middle of a token -- this is a problem because our check above is for all markables that cover the
+      // current token. In this case the markable end will still be unused when we get to the end of the sentence. We'll just hack it by throwing
+      // them on the last token of the sentence.
+      if(endSentToken && endSet.size() > 0){
+        System.err.println("Error! There are opened markables that never closed! Putting them on the end of the sentence.");
+        for(int ind : endSet){
+          buff.append(ind);
+          buff.append(')');
+          buff.append('|');
+        }
+        endSet.clear();
+      }
       if(buff.length() > 0){
         out.println(buff.substring(0,  buff.length()-1));
       }else{

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Thu Jun 16 15:33:01 2016
@@ -14,8 +14,10 @@ import java.util.List;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.ctakes.coreference.ae.features.AttributeFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.CorefSyntaxFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.DistSemFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.SalienceFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.SectionFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.TemporalFeatureExtractor;
@@ -138,17 +140,22 @@ public class EventCoreferenceAnnotator e
     featureExtractorList.add(new TokenFeaturesExtractor());
     featureExtractorList.add(new PartOfSpeechFeaturesExtractor());
     featureExtractorList.add(new PhraseChunkingExtractor());
-    featureExtractorList.add(new NamedEntityFeaturesExtractor());
+//    featureExtractorList.add(new NamedEntityFeaturesExtractor()); // same features in UMLSFeatureExtractor
     featureExtractorList.add(new DependencyTreeFeaturesExtractor());
-    featureExtractorList.add(new DependencyPathFeaturesExtractor());
+//    featureExtractorList.add(new DependencyPathFeaturesExtractor()); // not in mention-cluster version
     
 //    featureList.add(new DistanceFeatureExtractor());
     featureExtractorList.add(new StringMatchingFeatureExtractor());
-    featureExtractorList.add(new TokenFeatureExtractor());
+    featureExtractorList.add(new TokenFeatureExtractor()); // agreement features
     featureExtractorList.add(new SectionFeatureExtractor());
     featureExtractorList.add(new UMLSFeatureExtractor());
-    featureExtractorList.add(new CorefSyntaxFeatureExtractor());
+    featureExtractorList.add(new CorefSyntaxFeatureExtractor()); // dep head feature
     featureExtractorList.add(new TemporalFeatureExtractor());
+    
+    // added for feature parity with cluster version:
+    featureExtractorList.add(new SalienceFeatureExtractor());
+    featureExtractorList.add(new AttributeFeatureExtractor());
+    
 //    featureExtractorList.add(new ChainStackFeatureExtractor());
     
 //    featureExtractorList.add(new DocumentStructureTreeExtractor());
@@ -435,40 +442,59 @@ public class EventCoreferenceAnnotator e
     // intermediate data structures we use.
     for(int i = 0; i < segMarkables.size(); i++){
       Markable ana = segMarkables.get(i);
-      if(this.isTraining()){
-        for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
-          FSList head = chain.getMembers();
-          Markable last = null;
-          while(head instanceof NonEmptyFSList){
-            Markable m = (Markable) ((NonEmptyFSList)head).getHead();
-
-            // ignore markables past the current anaphor or equal to it
-            if(m == null || m.getEnd() > ana.getEnd()){
-              break;
-            }
-            if(!(m.getBegin() == ana.getBegin() && m.getEnd() == ana.getEnd())){
-              last = m;
-            }
-            head = ((NonEmptyFSList)head).getTail();
+//      if(this.isTraining()){
+      for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        FSList head = chain.getMembers();
+        Markable last = null;
+        while(head instanceof NonEmptyFSList){
+          Markable m = (Markable) ((NonEmptyFSList)head).getHead();
+
+          // ignore markables past the current anaphor or equal to it
+          if(m == null || m.getEnd() > ana.getEnd()){
+            break;
           }
-          if(last != null){
-            pairs.add(new IdentifiedAnnotationPair(last, ana));
+          if(!(m.getBegin() == ana.getBegin() && m.getEnd() == ana.getEnd())){
+            last = m;
           }
+          head = ((NonEmptyFSList)head).getTail();
+        }
+        if(last != null){
+          pairs.add(new IdentifiedAnnotationPair(last, ana));
         }
-//      }else{
-//        for(LinkedHashSet<Markable> chain : chains.values()){
-//          Markable last = null;
-//          for(Markable element : chain){
-//            last = element;
-//          }
-//          pairs.add(new IdentifiedAnnotationPair(last, ana));
-//        }
       }
+//      }
     }
     
     return pairs;
   }
   
+  public List<IdentifiedAnnotationPair> getHeadwordMatchingPairs(JCas jcas, Annotation segment){
+    List<IdentifiedAnnotationPair> pairs = new ArrayList<>();
+    List<Markable> segMarkables = new ArrayList<>(JCasUtil.selectCovered(jcas, Markable.class, segment));
+    for(int i = 0; i < segMarkables.size(); i++){
+      Markable ana = segMarkables.get(i);
+      ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, ana);
+      String headword = null;
+      if(headNode != null){
+        headword = headNode.getCoveredText().toLowerCase();
+      }else{
+        continue;
+      }
+      List<Markable> previousMarkables = JCasUtil.selectCovered(jcas, Markable.class, 0, ana.getBegin());
+      for(int j = 0; j < previousMarkables.size(); j++){
+        Markable ante = previousMarkables.get(j);
+        ConllDependencyNode anteNode = DependencyUtility.getNominalHeadNode(jcas, ante);
+        if(anteNode != null){
+          String anteHeadword = anteNode.getCoveredText().toLowerCase();
+          if(headword.equals(anteHeadword)){
+            pairs.add(new IdentifiedAnnotationPair(ante, ana));
+          }
+        }
+      }
+    }
+    return pairs;
+  }
+  
   @Override
   protected String classify(List<Feature> features)
       throws CleartkProcessingException {
@@ -644,7 +670,9 @@ public class EventCoreferenceAnnotator e
       
       pairs.addAll(getClosePairs(jcas, segment, 0.0));
       pairs.addAll(getSectionHeaderPairs(jcas, segment, 0.0));
-//      
+      pairs.addAll(getAlreadyLinkedPairs(jcas, segment));
+      pairs.addAll(getHeadwordMatchingPairs(jcas, segment));
+//    
 //      pairs.addAll(getConfidentPairs(jcas, segment, 0.25));
 //      if(!isTraining()){
 //        Collections.sort(pairs, new MarkableConfidenceComparator());

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MarkableHeadTreeCreator.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,85 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.util.Comparator;
+import java.util.Map;
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.SharedHashMap;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+
+import com.sun.istack.logging.Logger;
+
+public class MarkableHeadTreeCreator extends JCasAnnotator_ImplBase {
+
+  private static final String MAP_KEY = "MarkableHeadMap";
+  private SharedHashMap<Markable,ConllDependencyNode> markable2head; 
+  
+  private static final Logger logger = Logger.getLogger(MarkableHeadTreeCreator.class);
+  
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
+    
+  }
+  
+  @Override
+  public void process(JCas jcas) throws AnalysisEngineProcessException {
+    Map<Markable,ConllDependencyNode> treeMap = UimaBinaryTreeMap.createInstance(getKey(jcas));
+    
+    for(Markable m: JCasUtil.select(jcas, Markable.class)){
+      ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, m);
+      treeMap.put(m, headNode);
+//      markable2head.put(m, headNode);
+    }
+  }
+  
+  public static String getKey(JCas jcas){
+    String docId = null;
+    try{
+      docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
+    }catch(Exception e){
+      e.printStackTrace();
+    }
+    if(docId == null || docId == DocumentIDAnnotationUtil.NO_DOCUMENT_ID){
+      try {
+        docId = ViewUriUtil.getURI(jcas).toString();
+      } catch (AnalysisEngineProcessException e) {
+        e.printStackTrace();
+        logger.warning("No document ID found using traditional methods. Using ad hoc combination");
+        String docText = jcas.getDocumentText();
+        docId = docText.substring(0, Math.min(20, docText.length())) + "_hash=" + docText.hashCode(); 
+      }
+    }
+    return docId + "-" + MAP_KEY;
+  }
+
+  public static class MarkableDepheadPairComparator implements Comparator<Markable> {
+
+    @Override
+    public int compare(Markable m1, Markable m2) {
+      // look at the start first
+      if(m1.getBegin() < m2.getBegin()){
+        return -1;
+      }else if(m2.getBegin() < m1.getBegin()){
+        return 1;
+      }else if(m1.getEnd() < m2.getEnd()){
+        return -1;
+      }else if(m2.getEnd() < m1.getEnd()){
+        return 1;
+      }else{
+        // m1 and m2 have the exact same span
+        return 0;
+      }
+    }
+  }
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Thu Jun 16 15:33:01 2016
@@ -3,14 +3,11 @@ package org.apache.ctakes.coreference.ae
 import java.io.File;
 import java.io.IOException;
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashMap;
-import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
-import java.util.Set;
 
 import org.apache.ctakes.core.util.ListFactory;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
@@ -25,7 +22,11 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
-import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterMentionPairer_ImplBase;
+import org.apache.ctakes.coreference.ae.pairing.cluster.ClusterPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.HeadwordPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SectionHeaderPairer;
+import org.apache.ctakes.coreference.ae.pairing.cluster.SentenceDistancePairer;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
@@ -33,14 +34,10 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
 import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
-import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
-import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
-import org.apache.ctakes.typesystem.type.textspan.Paragraph;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
-import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
@@ -49,13 +46,13 @@ import org.apache.uima.fit.util.JCasUtil
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.EmptyFSList;
 import org.apache.uima.jcas.cas.NonEmptyFSList;
-import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.cleartk.ml.CleartkAnnotator;
 import org.cleartk.ml.CleartkProcessingException;
 import org.cleartk.ml.DataWriter;
 import org.cleartk.ml.Feature;
 import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.feature.extractor.FeatureExtractor1;
 import org.cleartk.ml.jar.DefaultDataWriterFactory;
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
@@ -72,10 +69,18 @@ public class MentionClusterCoreferenceAn
       description = "probability that a negative example should be retained for training")
   protected double probabilityOfKeepingANegativeExample = 0.5;
 
+  public static final String PARAM_USE_EXISTING_ENCODERS="UseExistingEncoders";
+  @ConfigurationParameter(name = PARAM_USE_EXISTING_ENCODERS,
+      mandatory=false,
+      description = "Whether to use encoders in output directory during data writing; if we are making multiple calls")
+  private boolean useExistingEncoders=false;
+      
   protected Random coin = new Random(0);
 
   boolean greedyFirst = true;
   
+  private static DataWriter<String> classDataWriter = null;
+  
   public static AnalysisEngineDescription createDataWriterDescription(
       Class<? extends DataWriter<String>> dataWriterClass,
       File outputDirectory,
@@ -104,11 +109,9 @@ public class MentionClusterCoreferenceAn
 
   private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
   private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+  private List<ClusterMentionPairer_ImplBase> pairExtractors = this.getPairExtractors();
   
-  private Set<String> markableStrings = null;
-  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
-  private Map<String,Set<Markable>> headWordMarkables = null;
-  private Map<HashableArguments,Double> pairScores = null;
+//  private Set<String> markableStrings = null;
   
   protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
     List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
@@ -120,12 +123,13 @@ public class MentionClusterCoreferenceAn
     extractors.add(new MentionClusterStackFeaturesExtractor());
     extractors.add(new MentionClusterSalienceFeaturesExtractor());
     extractors.add(new MentionClusterAttributeFeaturesExtractor());
-    extractors.add(new MentionClusterAttributeVectorExtractor());
+//    extractors.add(new MentionClusterAttributeVectorExtractor()); // does nothing yet
     
 //    extractors.add(new MentionClusterDistanceFeaturesExtractor());
     
     try {
-      extractors.add(new MentionClusterDistSemExtractor());
+//      extractors.add(new MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/mimic_vectors.txt"));
+//      extractors.add(new MentionClusterDistSemExtractor("org/apache/ctakes/coreference/distsem/deps.words"));
       extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
     } catch (IOException e) {
       e.printStackTrace();
@@ -143,214 +147,59 @@ public class MentionClusterCoreferenceAn
     extractors.add(new MentionClusterDepHeadExtractor());
     extractors.add(new MentionClusterSalienceFeaturesExtractor());
 
-    extractors.add(new MentionClusterMentionFeaturesExtractor());
+//    try{
+//      extractors.add(new MentionClusterMentionFeaturesExtractor("org/apache/ctakes/coreference/distsem/ties1mil.lowercase.txt"));
+//    }catch(CleartkExtractorException e){
+//      e.printStackTrace();
+//    }
     extractors.add(new MentionClusterAttributeFeaturesExtractor());
 
     return extractors;
   }
   
-  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
-      JCas jcas,
-      IdentifiedAnnotation mention){
+  protected List<ClusterMentionPairer_ImplBase> getPairExtractors(){
+    List<ClusterMentionPairer_ImplBase> pairers = new ArrayList<>();
     int sentDist = 5;
-    // using linked hash set ensures no duplicates:
-    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
-    pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
-    pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
-    pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
-    pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
-    
-    return pairs;
+    pairers.add(new SentenceDistancePairer(sentDist));
+    pairers.add(new SectionHeaderPairer(sentDist));
+    pairers.add(new ClusterPairer(Integer.MAX_VALUE));
+    pairers.add(new HeadwordPairer());
+    return pairers;
   }
   
-  /*
-   * getExactStringMatchPairs()
-   * For mentions that have the exact string repeated elsewhere in the document we want to
-   * allow matching across any distance. We don't use the sentence distance parameter here.
-   * We make use of a global variable markableStrings that is a HashSet containig all the markable
-   * strings from this document.
-   */
-  private List<CollectionTextRelationIdentifiedAnnotationPair> getExactStringMatchPairs(
-      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
-    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
-    
-    if(markableStrings.contains(mention.getCoveredText().toLowerCase())){
-      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
-        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
-        if(mostRecent == null) continue;
-
-        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
-          if(m == mostRecent) break;
-          // see if any of the members of the cluster have the exact same string as this 
-          if(m.getCoveredText().toLowerCase().equals(mention.getCoveredText().toLowerCase())){
-            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
-            break;
-          }
-        }
-      }
+  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas,
+      Markable mention){
+    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();   
+    for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
+      pairs.addAll(pairer.getPairs(jcas, mention));
     }
+   
     return pairs;
   }
   
-  /*
-   * getClusterPairs()
-   * In this method we allow to link to clusters containing more than one mention even if they
-   * are beyond a sentence distance. First we check whether the most recent mention in the cluster
-   * is within the specified sentence distance (presumably longer than the sentence distance passed into
-   * the method that constrains by distance). The wrinkle is that during training many clusters will have multiple
-   * members but only one before the focus mention. So we need to count the members of a cluster until we 
-   * get to the most recent one in the cluster. If that value is > 1 then we allow the pairing.
-   */
-  private List<CollectionTextRelationIdentifiedAnnotationPair> getClusterPairs(
-      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
-    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
-    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
-      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
-      Annotation first = (Annotation) members.getHead();
-      if(first == null || mention.getBegin() <= first.getEnd()){
-        continue;
-      }
-
-      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
-      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
-        continue;
-      }
-      int numMembers=0;
-      for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
-        numMembers++;
-        if(m == mostRecent) break;
-      }
-      if(numMembers > 1){
-        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
-      }
+  private void resetPairers(JCas jcas){
+    for(ClusterMentionPairer_ImplBase pairer : this.pairExtractors){
+      pairer.reset(jcas);
     }
-    
-    return pairs;
   }
-
-  /*
-   * Here we want to add only things that are nearby. First we check the semantic types
-   * of the cluster we're comparing against. If any member is an Anatomical Site or Medication,
-   * we add the cluster no matter what. Otherwise we check how many sentences are in between
-   * the mention and the latest element of the cluster.
-   */
-  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
-    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
-    Set<String> bestAnaTypes = getBestEnt(jcas, (Markable) mention);
+   
+  @Override
+  public void initialize(UimaContext context) throws ResourceInitializationException {
+    super.initialize(context);
     
-    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
-      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
-      Annotation first = (Annotation) members.getHead();
-      if(first == null || mention.getBegin() <= first.getEnd()) continue;
-      
-      // check for distance if they are not anatomical site or medication
-      if(!(bestAnaTypes.contains(AnatomicalSiteMention.class.getSimpleName()) ||
-          bestAnaTypes.contains(MedicationEventMention.class.getSimpleName()))){
-
-        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
-        if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist) continue;
-      }
-
-      // check for types of cluster
-      Set<String> bestClusterTypes = getBestEnt(jcas, cluster);
-      if(bestAnaTypes.size() > 0 && bestClusterTypes.size() > 0){
-        boolean overlap = false;
-        for(String semType : bestAnaTypes){
-          if(bestClusterTypes.contains(semType)){
-            overlap = true;
-          }
-        }
-        // they both correspond to named entities but no overlap in which category of named entity.
-        if(!overlap){
-          continue;
-        }
-      }
-      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));      
-    }
-    return pairs;
-  }
-
-  /*
-   * getSectionHeaderPairs()
-   * Here we want to add clusters where one of the members is on a line all by itself (a section header)
-   * To do this we leverage the annotatino of Paragraphs, roughly the areas between newlines. If such a 
-   * span only contains one sentence then we consider it a "header" (or also as important a list item).
-   * If it is a header we add it. Here we use sentDist to not bother adding things that will be added by
-   * the "sentence distance" method.
-   */
-  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
-    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
-    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
-      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
-      Annotation first = (Annotation) members.getHead();
-      if(first == null || mention.getBegin() <= first.getEnd()){
-        continue;
-      }
-
-      // first check if it is sentence distance range -- if so we can ignore because it will be include by other pair generator
-      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
-      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){
-        continue;
-      }
-      
-      // now check if any of the mentions are in a section header
-      List<Paragraph> pars = JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin());
-      for(int j = 0; j < pars.size(); j++){
-        boolean match = false;
-        Paragraph par = pars.get(j); // pars.get(pars.size()-j-1);
-        List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
-        if(coveredSents != null && coveredSents.size() == 1){
-          // this is sentences that are the same span as paragraphs -- how we model section headers
-          // see if any of the cluster mentions are in the section header
-          for(Markable m : JCasUtil.select(members, Markable.class)){
-            if(dominates(par, m)){
-              pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
-              match = true;
-              break;
-            }
-          }
-        }
-        if(match) break;
-      }
+    if(this.useExistingEncoders && classDataWriter != null){
+      this.dataWriter = classDataWriter;
+    }else if(this.isTraining()){
+      classDataWriter = this.dataWriter;
     }
-    return pairs;
-  }
-  
-  protected List<CollectionTextRelationIdentifiedAnnotationPair> getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
-    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
-
-    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
-    if(headNode == null){
-      Logger.getLogger(MentionClusterCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
-      return pairs;
-    }
-    String head = headNode.getCoveredText().toLowerCase();
-    if(headWordMarkables.containsKey(head)){
-      Set<Markable> headSet = headWordMarkables.get(head);
-      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
-        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
-        if(mostRecent == null) continue;
-        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
-          if(headSet.contains(mostRecent)){
-            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
-            break;
-          }
-          if(m == mostRecent) break;
-        }
-      }      
-    }
-    
-    return pairs;
   }
   
   @Override
   public void process(JCas jCas) throws AnalysisEngineProcessException {
     // lookup from pair of annotations to binary text relation
     // note: assumes that there will be at most one relation per pair
-    markableStrings = new HashSet<>();
-    nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
-    headWordMarkables = new HashMap<>();
-//    pairScores = getMarkablePairScores(jCas);
+    this.resetPairers(jCas);
     
     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
     relationLookup = new HashMap<>();
@@ -378,7 +227,7 @@ public class MentionClusterCoreferenceAn
     
     for(Segment segment : JCasUtil.select(jCas, Segment.class)){
       for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
-        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+//        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
         boolean singleton = true;
         double maxScore = 0.0;
         CollectionTextRelation maxCluster = null;
@@ -444,50 +293,7 @@ public class MentionClusterCoreferenceAn
           }
           
           features.addAll(dupFeatures);
-          
-          
-          // pairwise score features:
-          /*
-          double minPairScore = 1.0;
-          double maxPairScore = 0.0;
-          double avePairScore = 0.0;
-          int numPairs=0;
-          for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
-            if(member.getBegin() > mention.getBegin()) break;
-            numPairs++;
-            HashableArguments markablePair = new HashableArguments(member, mention);
-            Double score = pairScores.get(markablePair);
-            if(score == null){
-              markablePair = new HashableArguments(mention, member);
-              score = pairScores.get(markablePair);
-            }
-            if(score == null){
-              score = 0.0;
-            }
-            if(score != null){
-              avePairScore += score;
-              if(score > maxPairScore){
-                maxPairScore = score;
-              }
-              if(score < minPairScore){
-                minPairScore = score;
-              }
-            }
-          }
-          
-          features.add(new Feature("PAIRWISE_MAX", maxPairScore));
-          */
-//          features.add(new Feature("PAIRWISE_MIN", minPairScore));
-//          if(numPairs > 0){
-//            avePairScore /= numPairs;
-//          }else{
-//            avePairScore = 0.0;
-//          }
-//          if(Double.isNaN(avePairScore)){
-//            Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
-//          }
-//          features.add(new Feature("PAIRWISE_AVE", avePairScore));
-
+                   
           // during training, feed the features to the data writer
           if (this.isTraining()) {
             String category = this.getRelationCategory(relationLookup, cluster, mention);
@@ -498,6 +304,7 @@ public class MentionClusterCoreferenceAn
             // create a classification instance and write it to the training data
             this.dataWriter.write(new Instance<>(category, features));
             if(!category.equals(NO_RELATION_CATEGORY)){
+              singleton = false;
               break;
             }
           }
@@ -513,7 +320,7 @@ public class MentionClusterCoreferenceAn
             if (!predictedCategory.equals(NO_RELATION_CATEGORY)) {
 //              Logger.getLogger("MCAnnotator").info(String.format("Making a pair with score %f", scores.get(predictedCategory)));
               if(greedyFirst){
-                createRelation(jCas, cluster, mention, predictedCategory);
+                createRelation(jCas, cluster, mention, predictedCategory, scores.get(predictedCategory));
                 singleton = false;
                 // break here for "closest-first" greedy decoding strategy (Soon et al., 2001), terminology from Lasalle and Denis (2013),
                 // for "best first" need to keep track of all relations with scores and only keep the highest
@@ -527,22 +334,13 @@ public class MentionClusterCoreferenceAn
             }
           }
         }
-        if(!greedyFirst && maxCluster != null){
+        if(!this.isTraining() && !greedyFirst && maxCluster != null){
           // make a link with the max cluster
-          createRelation(jCas, maxCluster, mention, "CoreferenceClusterMember");
-        }
-        
-        markableStrings.add(mention.getCoveredText().toLowerCase());
-        
-        if(headNode != null){
-          String head = headNode.getCoveredText().toLowerCase();
-          if(!headWordMarkables.containsKey(head)){
-            headWordMarkables.put(head, new HashSet<Markable>());
-          }
-          headWordMarkables.get(head).add(mention);
+          createRelation(jCas, maxCluster, mention, "CoreferenceClusterMember", maxScore);
         }
-        
-        // if we got this far and never matched up the 
+                       
+        // if we got this far and never matched up the markable then add it to list.
+        // do this even during training -- adds non-chain markables to antecedent list which will be seen during testing.
         if(singleton){
           // make the markable it's own cluster:
           CollectionTextRelation chain = new CollectionTextRelation(jCas);
@@ -560,19 +358,7 @@ public class MentionClusterCoreferenceAn
     removeSingletonClusters(jCas);
   }
   
-  private boolean headMatches(String head, List<Feature> feats){
-    boolean match = false;
-    for(Feature feat : feats){
-      if(feat.getName().equals("ClusterHeadMatchesMentionHead")){
-        if(feat.getValue().equals(true)){
-          match = true;
-        }
-        break;
-      }
-    }
-    return match;
-  }
-  
+ 
   /**
    * Looks up the arguments in the specified lookup table and converts the
    * relation into a label for classification
@@ -628,12 +414,14 @@ public class MentionClusterCoreferenceAn
       JCas jCas,
       CollectionTextRelation cluster,
       IdentifiedAnnotation mention,
-      String predictedCategory) {
+      String predictedCategory,
+      Double confidence) {
     // add the relation to the CAS
     CollectionTextRelationIdentifiedAnnotationRelation relation = new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
     relation.setCluster(cluster);
     relation.setMention(mention);
     relation.setCategory(predictedCategory);
+    relation.setConfidence(confidence);
     relation.addToIndexes();
     
 //    RelationArgument arg = new RelationArgument(jCas);
@@ -656,10 +444,11 @@ public class MentionClusterCoreferenceAn
     }
   }
   
-  private static final boolean dominates(Annotation arg1, Annotation arg2) {
-    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
-  }
+//  private static final boolean dominates(Annotation arg1, Annotation arg2) {
+//    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+//  }
 
+  /*
   public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
     Set<String> semTypes = new HashSet<>();
     for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
@@ -699,7 +488,7 @@ public class MentionClusterCoreferenceAn
     }
     return bestEnts;
   }
-  
+  */
   
   public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
     Map<HashableArguments, Double> scoreMap = new HashMap<>();

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java Thu Jun 16 15:33:01 2016
@@ -54,6 +54,7 @@ import org.cleartk.ml.CleartkAnnotator;
 import org.cleartk.ml.CleartkProcessingException;
 import org.cleartk.ml.DataWriter;
 import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
 import org.cleartk.ml.feature.extractor.FeatureExtractor1;
 import org.cleartk.ml.jar.DefaultDataWriterFactory;
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
@@ -144,7 +145,11 @@ public class MentionClusterRankingCorefe
     extractors.add(new MentionClusterDepHeadExtractor());
     extractors.add(new MentionClusterSalienceFeaturesExtractor());
 
-    extractors.add(new MentionClusterMentionFeaturesExtractor());
+    try {
+      extractors.add(new MentionClusterMentionFeaturesExtractor());
+    } catch (CleartkExtractorException e) {
+      e.printStackTrace();
+    }
     extractors.add(new MentionClusterAttributeFeaturesExtractor());
 
     return extractors;

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/AttributeFeatureExtractor.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,65 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isGeneric;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isHistory;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isNegated;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isPatient;
+import static org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor.isUncertain;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.TimeMention;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+
+/**
+ * Describes an antecedent/anaphor pair through per-mention boolean flags (the isNegated, isUncertain, isGeneric,
+ * isPatient and isHistory helpers of TokenFeatureExtractor, plus a TimeMention check) and three agreement flags.
+ */
+public class AttributeFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+  /** Returns the anaphor flags, then the antecedent flags, then the agreement features, in that order. */
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana)
+      throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+    // anaphor flags; only the values compared further down are kept in locals
+    final boolean negAna = isNegated(ana);
+    feats.add(new Feature("MC_ana_NEGATED", negAna));
+    final boolean uncAna = isUncertain(ana);
+    feats.add(new Feature("MC_ana_UNCERTAIN", uncAna));
+    feats.add(new Feature("MC_ana_GENERIC", isGeneric(ana)));
+    feats.add(new Feature("MC_ana_PATIENT", isPatient(ana)));
+    feats.add(new Feature("MC_ana_HISTORY", isHistory(ana)));
+    final boolean timexAna = isTimex(ana);
+    feats.add(new Feature("MC_ana_TIMEX", timexAna));
+
+    // antecedent flags, same layout as above
+    final boolean negAnte = isNegated(ante);
+    feats.add(new Feature("MC_ante_NEGATED", negAnte));
+    final boolean uncAnte = isUncertain(ante);
+    feats.add(new Feature("MC_ante_UNCERTAIN", uncAnte));
+    feats.add(new Feature("MC_ante_GENERIC", isGeneric(ante)));
+    feats.add(new Feature("MC_ante_PATIENT", isPatient(ante)));
+    feats.add(new Feature("MC_ante_HISTORY", isHistory(ante)));
+    final boolean timexAnte = isTimex(ante);
+    feats.add(new Feature("MC_ante_TIMEX", timexAnte));
+
+    // agreement between the two mentions on negation, uncertainty and timex status
+    feats.add(new Feature("MC_AGREE_NEG", negAnte == negAna));
+    feats.add(new Feature("MC_AGREE_UNC", uncAnte == uncAna));
+    feats.add(new Feature("MC_AGREE_TIMEX", timexAnte == timexAna));
+    return feats;
+  }
+
+  /** True when JCasUtil.selectCovered finds at least one TimeMention covered by the given annotation. */
+  private boolean isTimex(Annotation a){
+    return !JCasUtil.selectCovered(TimeMention.class, a).isEmpty();
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/SalienceFeatureExtractor.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,24 @@
+package org.apache.ctakes.coreference.ae.features;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/** Emits the confidence value stored on each argument as the antecedent and anaphor salience features. */
+public class SalienceFeatureExtractor implements RelationFeaturesExtractor<IdentifiedAnnotation, IdentifiedAnnotation> {
+
+  @Override
+  public List<Feature> extract(JCas jCas, IdentifiedAnnotation ante, IdentifiedAnnotation ana)
+      throws AnalysisEngineProcessException {
+    // the salience values are read straight from getConfidence(); antecedent first, anaphor second
+    List<Feature> salience = new ArrayList<>(2);
+    salience.add(new Feature("MP_ANTE_SALIENCE", ante.getConfidence()));
+    salience.add(new Feature("MP_ANA_SALIENCE", ana.getConfidence()));
+    return salience;
+  }
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java Thu Jun 16 15:33:01 2016
@@ -1,15 +1,17 @@
 package org.apache.ctakes.coreference.ae.features;
 
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
 import java.util.ArrayList;
 import java.util.List;
 
-import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
@@ -101,7 +103,7 @@ public class TokenFeatureExtractor imple
 	// FYI - old code used treebanknode types and found head using head rules filled in by the parser
 	// not sure if there is an appreciable difference...
 	public static boolean numberSingular(JCas jcas, Annotation arg, String s1){
-	  ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, arg);
+    ConllDependencyNode head = UimaBinaryTreeMap.get(getKey(jcas), arg);
 //		List<BaseToken> tokens = new ArrayList<>(JCasUtil.selectCovered(BaseToken.class, arg));
 //		for (int i = tokens.size()-1; i >=0; i--){
 //			BaseToken t = tokens.get(i);

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/UMLSFeatureExtractor.java Thu Jun 16 15:33:01 2016
@@ -19,6 +19,7 @@ import org.apache.uima.analysis_engine.A
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
+import org.chboston.cnlp.graphapi.GraphFunctions;
 import org.cleartk.ml.Feature;
 import org.cleartk.util.ViewUriUtil;
 
@@ -50,7 +51,7 @@ public class UMLSFeatureExtractor implem
 		  if(head1 != null && head2 != null){
 		    List<IdentifiedAnnotation> ents1 = new ArrayList<>(coveringMap.get(head1)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'
 		    for(IdentifiedAnnotation ann : ents1){
-		      if(!(ann instanceof EntityMention || ann instanceof EventMention)){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention) || ann.getClass() == EventMention.class){
 		        rmList.add(ann);
 		      }
 		    }
@@ -60,7 +61,7 @@ public class UMLSFeatureExtractor implem
 		    rmList.clear();
 		    List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(head2)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
 		    for(IdentifiedAnnotation ann : ents2){
-		      if(!(ann instanceof EntityMention || ann instanceof EventMention)){
+		      if(!(ann instanceof EntityMention || ann instanceof EventMention)|| ann.getClass() == EventMention.class){
 		        rmList.add(ann);
 		      }
 		    }
@@ -101,7 +102,13 @@ public class UMLSFeatureExtractor implem
 	          feats.add(new Feature("Arg2SemType" + a2SemType, true));
 		        if(alias(ent1, ent2)){
 		          feats.add(new Feature("UMLS_ALIAS", true));
-		          break;
+//		          break;
+		        }
+		        if(!alias(ent1, ent2) && isHypernym(ent1, ent2)){
+		          feats.add(new Feature("IS_HYPERNYM", true));
+		        }
+		        if(!alias(ent1, ent2) && isHyponym(ent1, ent2)){
+		          feats.add(new Feature("IS_HYPONYM", true));
 		        }
 		        feats.add(new Feature("Arg1Arg2SemType" + a1SemType + "_" + a2SemType, true));
 		        
@@ -150,27 +157,60 @@ public class UMLSFeatureExtractor implem
 
   public static boolean alias(IdentifiedAnnotation a1, IdentifiedAnnotation a2){  
 	  if(a1 != null && a2 != null){
-	    FSArray fsa = a1.getOntologyConceptArr();
-	    if(fsa != null){
-	      HashSet<String> cuis = new HashSet<>();
-	      for(int i = 0; i < fsa.size(); i++){
-	        if(fsa.get(i) instanceof UmlsConcept){
-	          cuis.add(((UmlsConcept)fsa.get(i)).getCui());
-	        }
-	      }
-
-	      fsa = a2.getOntologyConceptArr();
-	      if(fsa != null){
-	        for(int i = 0; i < fsa.size(); i++){
-	          if(fsa.get(i) instanceof UmlsConcept){
-	            if(cuis.contains(((UmlsConcept)fsa.get(i)).getCui())){
-	              return true;
-	            }
-	          }
+	    for(UmlsConcept concept1 : JCasUtil.select(a1.getOntologyConceptArr(), UmlsConcept.class)){
+	      String cui = concept1.getCui();
+	      for(UmlsConcept concept2 : JCasUtil.select(a2.getOntologyConceptArr(), UmlsConcept.class)){
+	        if(cui.equals(concept2.getCui())){
+	          return true;
 	        }
 	      }
-	    }
+	    }	  
 	  }
 		return false;
 	}
+  
+  /**
+   * Checks whether any UMLS concept attached to {@code a1} stands in the
+   * {@code GraphFunctions.isa} relation to any UMLS concept attached to {@code a2}.
+   *
+   * @param a1 first annotation; its CUIs are passed as the first argument of {@code isa}
+   * @param a2 second annotation; its CUIs are passed as the second argument of {@code isa}
+   * @return true if {@code isa} holds for at least one concept pair; false otherwise,
+   *         including when either annotation is null or has no ontology concept array
+   */
+  public static boolean isHypernym(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    // Annotations without UMLS concepts may have a null concept array, which
+    // JCasUtil.select cannot iterate, so treat that case as "no relation".
+    if(a1 == null || a2 == null || a1.getOntologyConceptArr() == null || a2.getOntologyConceptArr() == null){
+      return false;
+    }
+    for(UmlsConcept concept1 : JCasUtil.select(a1.getOntologyConceptArr(), UmlsConcept.class)){
+      for(UmlsConcept concept2 : JCasUtil.select(a2.getOntologyConceptArr(), UmlsConcept.class)){
+        if(GraphFunctions.isa(concept1.getCui(), concept2.getCui())){
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+  
+  /**
+   * Mirror of {@code isHypernym}: returns the result of calling it with the
+   * two arguments swapped.
+   */
+  public static boolean isHyponym(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    return isHypernym(a2, a1);
+  }
+  
+  /**
+   * Finds the signed graph distance of smallest magnitude between any UMLS concept of
+   * {@code a1} and any UMLS concept of {@code a2}. A non-negative result of
+   * {@code GraphFunctions.minDistance(cui1, cui2)} is used as is; otherwise the reversed
+   * lookup is tried and its result negated. For the isa relation the distance is thus
+   * positive and for the inverse isa relation negative, which is why candidates are
+   * compared by absolute value.
+   *
+   * @return the signed distance, or {@code Integer.MAX_VALUE} if either annotation is
+   *         null, has no ontology concept array, or no concept pair yields a
+   *         non-negative {@code minDistance} in either direction
+   */
+  public static int graphDistance(IdentifiedAnnotation a1, IdentifiedAnnotation a2){
+    int distance = Integer.MAX_VALUE;
+
+    // Annotations without UMLS concepts may have a null concept array, which
+    // JCasUtil.select cannot iterate, so report "no distance" for them.
+    if(a1 == null || a2 == null || a1.getOntologyConceptArr() == null || a2.getOntologyConceptArr() == null){
+      return distance;
+    }
+
+    for(UmlsConcept concept1 : JCasUtil.select(a1.getOntologyConceptArr(), UmlsConcept.class)){
+      String cui1 = concept1.getCui();
+      for(UmlsConcept concept2 : JCasUtil.select(a2.getOntologyConceptArr(), UmlsConcept.class)){
+        String cui2 = concept2.getCui();
+        int len = GraphFunctions.minDistance(cui1, cui2);
+        if(len < 0){
+          // nothing usable in this direction: try the reverse and flip the sign
+          len = GraphFunctions.minDistance(cui2, cui1);
+          if(len < 0){
+            len = Integer.MAX_VALUE;
+          }else{
+            len = -len;
+          }
+        }
+        if(Math.abs(len) < Math.abs(distance)){
+          distance = len;
+        }
+      }
+    }
+    return distance;
+  }
 }

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterAttributeFeaturesExtractor.java Thu Jun 16 15:33:01 2016
@@ -72,9 +72,9 @@ public class MentionClusterAttributeFeat
     
     features.add(new Feature("MC_AGREE_NEG", matchNeg));
     features.add(new Feature("MC_AGREE_UNC", matchUnc));
-    features.add(new Feature("MC_AGREE_GEN", matchGen));
-    features.add(new Feature("MC_AGREE_SUBJ", matchSubj));
-    features.add(new Feature("MC_AGREE_HIST", matchHist));
+//    features.add(new Feature("MC_AGREE_GEN", matchGen));
+//    features.add(new Feature("MC_AGREE_SUBJ", matchSubj));
+//    features.add(new Feature("MC_AGREE_HIST", matchHist));
     
     features.add(new Feature("MC_AGREE_TIMEX", clusterTimex == mentionTimex));
 

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDepHeadExtractor.java Thu Jun 16 15:33:01 2016
@@ -1,5 +1,7 @@
 package org.apache.ctakes.coreference.ae.features.cluster;
 
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
 import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
@@ -7,13 +9,12 @@ import java.util.Set;
 
 import org.apache.ctakes.core.util.ListIterable;
 import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
-import org.apache.ctakes.dependency.parser.util.DependencyPath;
-import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
 import org.cleartk.ml.Feature;
@@ -28,13 +29,13 @@ public class MentionClusterDepHeadExtrac
       IdentifiedAnnotation mention) throws AnalysisEngineProcessException {
     List<Feature> feats = new ArrayList<>();
     
-    ConllDependencyNode mentionHead = DependencyUtility.getNominalHeadNode(jCas, mention);
+    ConllDependencyNode mentionHead = UimaBinaryTreeMap.get(getKey(jCas), mention);
     Set<String> memberHeads = new HashSet<>();
     Set<String> memberPaths = new HashSet<>();
     
     for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
       if(member.getBegin() > mention.getEnd()) break;
-      ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+      ConllDependencyNode memberHead = UimaBinaryTreeMap.get(getKey(jCas), member);
       if(memberHead != null){
         String headWord = memberHead.getCoveredText().toLowerCase();
         memberHeads.add(headWord);
@@ -64,7 +65,7 @@ public class MentionClusterDepHeadExtrac
   @Override
   public List<Feature> extract(JCas jCas, Markable mention) throws CleartkExtractorException {
     List<Feature> feats = new ArrayList<>();
-    ConllDependencyNode mentionHead = DependencyUtility.getNominalHeadNode(jCas, mention);
+    ConllDependencyNode mentionHead = UimaBinaryTreeMap.get(getKey(jCas), mention);
 
     if(mentionHead != null){
       feats.add(new Feature("MentionRel", mentionHead.getDeprel()));

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistSemExtractor.java Thu Jun 16 15:33:01 2016
@@ -11,13 +11,17 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
 import org.apache.ctakes.utils.distsem.WordEmbeddings;
+import org.apache.ctakes.utils.distsem.WordVector;
 import org.apache.ctakes.utils.distsem.WordVectorReader;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.cleartk.ml.Feature;
 
 public class MentionClusterDistSemExtractor implements
@@ -42,10 +46,14 @@ public class MentionClusterDistSemExtrac
     if(StringMatchingFeatureExtractor.isPronoun(mention)) return feats;
     
     double maxSim = 0.0;
+    double maxPhraseSim = 0.0;
     
     ConllDependencyNode mentionNode = DependencyUtility.getNominalHeadNode(jCas, mention);
-    // first, do not bother with pronouns:
     
+    double[] mentionVec = getPhraseVec(mention);
+    boolean exactMatch = false;
+    
+    // first, do not bother with pronouns:
     String mentionHead = mentionNode != null ? mentionNode.getCoveredText().toLowerCase() : null;
     if(mentionHead != null){
       for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
@@ -54,11 +62,21 @@ public class MentionClusterDistSemExtrac
           // is actually subsequent to the candidate mention
           break;
         }
+
+        double[] memberVec = getPhraseVec(member);
+        
+        double phraseSim = 0.0;
+        for(int i = 0; i < memberVec.length; i++){
+          phraseSim += (mentionVec[i] * memberVec[i]);
+        }
+        if(phraseSim > maxPhraseSim){
+          maxPhraseSim = phraseSim;
+        }
+        
         ConllDependencyNode memberNode = DependencyUtility.getNominalHeadNode(jCas, member);
         String memberHead = memberNode != null ? memberNode.getCoveredText().toLowerCase() : null;
         if(mentionHead.equals(memberHead)){
-          maxSim = 0.0;
-          break;
+          exactMatch = true;
         }
         if(memberNode != null && words.containsKey(memberHead) && words.containsKey(mentionHead)){
           double sim = words.getSimilarity(mentionHead, memberHead);
@@ -68,10 +86,45 @@ public class MentionClusterDistSemExtrac
         }
       }
     }
+    if(exactMatch){
+      maxSim = 0.0;
+    }
     
     feats.add(new Feature("HEAD_SIMILARITY_WORD2VEC", maxSim));
+//    feats.add(new Feature("PHRASE_SIMILARITY_WORD2VEC", maxPhraseSim));
     
     return feats;
   }
 
+  // Sums the embeddings of the in-vocabulary tokens covered by the annotation and
+  // scales the sum to unit L2 length; stays all zeros if no covered token is known.
+  private double[] getPhraseVec(Annotation annotation){
+    double[] phraseVec = new double[words.getDimensionality()];
+
+    for(BaseToken token : JCasUtil.selectCovered(BaseToken.class, annotation)){
+      String word = token.getCoveredText().toLowerCase();
+      if(words.containsKey(word)){
+        WordVector vec = words.getVector(word);
+        for(int i = 0; i < phraseVec.length; i++){
+          phraseVec[i] += vec.getValue(i);
+        }
+      }
+    }
+
+    // The norm is computed from the summed vector only; accumulating anything
+    // into it while summing the tokens would skew the normalization.
+    double sumOfSquares = 0.0;
+    for(double val : phraseVec){
+      sumOfSquares += (val * val);
+    }
+    double vecLength = Math.sqrt(sumOfSquares);
+
+    if(vecLength > 0.0){
+      for(int i = 0; i < phraseVec.length; i++){
+        phraseVec[i] /= vecLength;
+      }
+    }
+
+    return phraseVec;
+  }
 }

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java Thu Jun 16 15:33:01 2016
@@ -1,9 +1,12 @@
 package org.apache.ctakes.coreference.ae.features.cluster;
 
+import java.io.File;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 
+import org.apache.ctakes.coreference.ae.features.TokenFeatureExtractor;
+import org.apache.ctakes.coreference.extractors.ContinuousTextExtractor;
 import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
@@ -30,7 +33,7 @@ import org.cleartk.ml.feature.extractor.
 public class MentionClusterMentionFeaturesExtractor implements FeatureExtractor1<Markable> {
 
   private FeatureExtractor1<BaseToken> coveredText = new CoveredTextExtractor<>();
-  private FeatureExtractor1<Markable> tokenContext = new CleartkExtractor<Markable,BaseToken>(
+  private FeatureExtractor1<Markable> tokenIdentityContext = new CleartkExtractor<Markable,BaseToken>(
       BaseToken.class,
       coveredText,
       new FirstCovered(1),
@@ -38,6 +41,9 @@ public class MentionClusterMentionFeatur
       new Bag(new Covered()),
       new Preceding(3),
       new Following(3));
+  
+  private FeatureExtractor1<BaseToken> continuousText = null;
+  private FeatureExtractor1<Markable> tokenVectorContext = null;      
 
   private FeatureExtractor1<BaseToken> pos = new TypePathExtractor<>(BaseToken.class, "partOfSpeech");
 
@@ -56,18 +62,42 @@ public class MentionClusterMentionFeatur
       "mention1pos",
       tokenPOS);
 
+  /** Creates the extractor without a word-vector file by delegating with a null argument. */
+  public MentionClusterMentionFeaturesExtractor() throws CleartkExtractorException{
+    this(null);
+  }
+  
+  /**
+   * Creates the extractor. When {@code vectorFile} is non-null, also builds a
+   * {@code ContinuousTextExtractor} over that file and a token-vector context
+   * extractor that applies it to the first and last covered tokens and to one
+   * token on either side. With a null argument both vector fields keep their
+   * null defaults.
+   */
+  public MentionClusterMentionFeaturesExtractor(String vectorFile) throws CleartkExtractorException {
+    if(vectorFile == null){
+      return;
+    }
+    this.continuousText = new ContinuousTextExtractor(vectorFile);
+    this.tokenVectorContext = new CleartkExtractor<Markable,BaseToken>(
+        BaseToken.class,
+        this.continuousText,
+        new FirstCovered(1),
+        new LastCovered(1),
+//          new Bag(new Covered()),
+        new Preceding(1),
+        new Following(1));
+  }
+  
   @Override
   public List<Feature> extract(JCas view, Markable focusAnnotation) throws CleartkExtractorException {
     List<Feature> feats = new ArrayList<>();
     
     // token features:
-    feats.addAll(tokenContext.extract(view, focusAnnotation));
+    feats.addAll(tokenIdentityContext.extract(view, focusAnnotation));
     
-    feats.add(new Feature("NumCoveredTokens", JCasUtil.selectCovered(BaseToken.class, focusAnnotation).size()));
+    
+    // token vector features:
+//    if(this.tokenVectorContext != null){
+//      feats.addAll(this.tokenVectorContext.extract(view, focusAnnotation));
+//    }
     
     // pos features:
     feats.addAll(mentionFeaturesExtractor.extract(view, focusAnnotation));
     
+    // Always do num covered and dep features
+    feats.add(new Feature("NumCoveredTokens", JCasUtil.selectCovered(BaseToken.class, focusAnnotation).size()));
     feats.addAll(DependencyTreeFeaturesExtractor.extractForNode(view, focusAnnotation, "dep"));
     
     return feats;

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterStringFeaturesExtractor.java Thu Jun 16 15:33:01 2016
@@ -1,5 +1,6 @@
 package org.apache.ctakes.coreference.ae.features.cluster;
 
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
 import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.contentWords;
 import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.endMatch;
 import static org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor.soonMatch;
@@ -14,12 +15,12 @@ import java.util.Set;
 
 import org.apache.ctakes.core.util.ListIterable;
 import org.apache.ctakes.coreference.ae.features.StringMatchingFeatureExtractor;
-import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.utils.struct.CounterMap;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
 import org.cleartk.ml.Feature;
@@ -37,12 +38,12 @@ public class MentionClusterStringFeature
     String m = mention.getCoveredText();
     Set<String> mentionWords = contentWords(mention);
     Set<String> nonHeadMentionWords = new HashSet<>(mentionWords);
-    ConllDependencyNode mentionHead = DependencyUtility.getNominalHeadNode(jCas, mention);
+    ConllDependencyNode mentionHead = UimaBinaryTreeMap.get(getKey(jCas), mention);
+    
     String mentionHeadString = null;
     if(mentionHead != null){
       mentionHeadString = mentionHead.getCoveredText().toLowerCase();
       nonHeadMentionWords.remove(mentionHeadString);
-    
 
       int clusterSize = 0;
       int maxNonoverlap = 0;
@@ -62,7 +63,7 @@ public class MentionClusterStringFeature
         String s = member.getCoveredText();
         Set<String> memberWords = contentWords(member);
         Set<String> nonHeadMemberWords = new HashSet<>(memberWords);
-        ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+        ConllDependencyNode memberHead = UimaBinaryTreeMap.get(getKey(jCas), member);
         String memberHeadString = null;
         if(memberHead != null){
           memberHeadString = memberHead.getCoveredText().toLowerCase();

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java?rev=1748746&r1=1748745&r2=1748746&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterUMLSFeatureExtractor.java Thu Jun 16 15:33:01 2016
@@ -1,6 +1,8 @@
 package org.apache.ctakes.coreference.ae.features.cluster;
 
-import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.*;
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.alias;
+import static org.apache.ctakes.coreference.ae.features.UMLSFeatureExtractor.getDocId;
 
 import java.util.ArrayList;
 import java.util.Collection;
@@ -10,7 +12,6 @@ import java.util.Map;
 import java.util.Set;
 
 import org.apache.ctakes.core.util.ListIterable;
-import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
@@ -19,6 +20,7 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
@@ -44,7 +46,7 @@ public class MentionClusterUMLSFeatureEx
       coveringMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
     }
     
-    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jCas, mention);
+    ConllDependencyNode head = UimaBinaryTreeMap.get(getKey(jCas), mention);
     
     if(head != null){
       List<IdentifiedAnnotation> rmList = new ArrayList<>();
@@ -61,7 +63,7 @@ public class MentionClusterUMLSFeatureEx
       
       Set<IdentifiedAnnotation> clusterEnts = new HashSet<>();
       for(Markable member : new ListIterable<Markable>(cluster.getMembers())){
-        ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jCas, member);
+        ConllDependencyNode memberHead = UimaBinaryTreeMap.get(getKey(jCas), member);
         rmList.clear();
         // get the named entities covering this cluster member:
         List<IdentifiedAnnotation> ents2 = new ArrayList<>(coveringMap.get(memberHead)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head2.getBegin(), head2.getEnd());
@@ -91,7 +93,8 @@ public class MentionClusterUMLSFeatureEx
           (clusterEnts.size() > 0 && mentionEnts.size() == 0)){
         trueFeats.add("ClusterOrMentionNoCui");
       }
-
+      
+      int minDistance = Integer.MAX_VALUE;
       for(IdentifiedAnnotation ent1 : clusterEnts){
         HashSet<String> a1Tuis = new HashSet<>(); 
         String a1SemType = ent1.getClass().getSimpleName();
@@ -108,10 +111,26 @@ public class MentionClusterUMLSFeatureEx
           HashSet<String> a2Tuis = new HashSet<>();
           String a2SemType = ent2.getClass().getSimpleName();
 //          trueFeats.add("MentionSemType" + a2SemType);
+                   
           if(alias(ent1, ent2)){
             trueFeats.add("UMLS_ALIAS");
-            break;
           }
+
+          /*
+          if(!trueFeats.contains("UMLS_ALIAS") && isHypernym(ent1, ent2)){
+            trueFeats.add("IS_HYPERNYM");
+          }
+          
+          if(!trueFeats.contains("UMLS_ALIAS") && isHyponym(ent1, ent2)){
+            trueFeats.add("IS_HYPONYM");
+          }
+          */
+
+//          int pairDist = graphDistance(ent1, ent2);
+//          if(Math.abs(pairDist) < Math.abs(minDistance)){
+//            minDistance = pairDist;
+//          }
+          
           trueFeats.add("MentionClusterSemTypePair" + a1SemType + "_" + a2SemType);
           
           FSArray cons2 = ent2.getOntologyConceptArr();
@@ -136,8 +155,18 @@ public class MentionClusterUMLSFeatureEx
           }
         }
       }
+//      double distFeat = 0.0;
+//      if(minDistance != Integer.MAX_VALUE){
+//        distFeat = 1.0 / minDistance;
+//        if(distFeat < 0){
+//          feats.add(new Feature("AncestorDistance", -distFeat));
+//        }else{
+//          feats.add(new Feature("DescendentDistance", distFeat));
+//        }
+//      }        
     }
     
+    
     for(String feat : trueFeats){
       feats.add(new Feature(feat, true));
     }
@@ -158,8 +187,8 @@ public class MentionClusterUMLSFeatureEx
       throw new CleartkExtractorException(e);
     }
     
-    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jCas, mention);
-    
+    ConllDependencyNode head = UimaBinaryTreeMap.get(getKey(jCas), mention);
+
     List<IdentifiedAnnotation> rmList = new ArrayList<>();
     // get the entities covering this markable:
     List<IdentifiedAnnotation> mentionEnts = new ArrayList<>(coveringMap.get(head)); //JCasUtil.selectCovering(jCas, IdentifiedAnnotation.class, head1.getBegin(), head1.getEnd());'

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/AnnotationPairer.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,10 @@
+package org.apache.ctakes.coreference.ae.pairing;
+
+import java.util.List;
+
+import org.apache.uima.jcas.JCas;
+
+/**
+ * Contract for components that produce candidate pairs for a mention.
+ *
+ * @param <MENTION_TYPE> type of the mention handed to {@code getPairs}
+ * @param <PAIR_TYPE> type of the pairs returned for that mention
+ */
+public interface AnnotationPairer<MENTION_TYPE,PAIR_TYPE> {
+  /** Returns the pairs produced for {@code mention} within {@code jcas}. */
+  List<PAIR_TYPE> getPairs(JCas jcas, MENTION_TYPE mention);
+
+  /**
+   * Resets the pairer for {@code jcas}; presumably invoked when a new document
+   * starts so implementations can rebuild per-document state (confirm against callers).
+   */
+  void reset(JCas jcas);
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java?rev=1748746&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/pairing/cluster/ClusterMentionPairer_ImplBase.java Thu Jun 16 15:33:01 2016
@@ -0,0 +1,76 @@
+package org.apache.ctakes.coreference.ae.pairing.cluster;
+
+import static org.apache.ctakes.coreference.ae.MarkableHeadTreeCreator.getKey;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import org.apache.ctakes.coreference.ae.MentionClusterCoreferenceAnnotator.CollectionTextRelationIdentifiedAnnotationPair;
+import org.apache.ctakes.coreference.ae.pairing.AnnotationPairer;
+//import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.utils.struct.UimaBinaryTreeMap;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+public abstract class ClusterMentionPairer_ImplBase implements AnnotationPairer<Markable, CollectionTextRelationIdentifiedAnnotationPair> {
+  public abstract List<CollectionTextRelationIdentifiedAnnotationPair> getPairs(JCas jcas, Markable m);
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+
+  @Override
+  public void reset(JCas jcas){
+    nodeEntMap = JCasUtil.indexCovering(jcas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+  }
+  
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> semTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      semTypes.addAll(getBestEnt(jcas, member));
+    }
+    return semTypes;
+  }
+
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    ConllDependencyNode head = UimaBinaryTreeMap.get(getKey(jcas), markable);
+    
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = UimaBinaryTreeMap.get(getKey(jcas), ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+  }
+
+  protected static final boolean dominates(Annotation arg1, Annotation arg2) {
+    return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
+  }
+}



Mime
View raw message