ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1713449 - in /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference: ae/ ae/features/ ae/features/cluster/ eval/
Date Mon, 09 Nov 2015 14:48:12 GMT
Author: tmill
Date: Mon Nov  9 14:48:12 2015
New Revision: 1713449

URL: http://svn.apache.org/viewvc?rev=1713449&view=rev
Log:
Variety of minor changes; syncing with remotes.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
Removed:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfMarkableSpans.java
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/CoreferenceChainScoringOutput.java Mon Nov  9 14:48:12 2015
@@ -8,6 +8,7 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.List;
 
+import org.apache.ctakes.constituency.parser.util.TreeUtils;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
 import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
@@ -124,18 +125,21 @@ public class CoreferenceChainScoringOutp
     Multiset<Integer> endSet = HashMultiset.create();
     int tokenId = 0;
     int sentId = 0;
+    BaseToken nextToken = tokens.get(0);
 
     for(int i = 0; i < tokens.size(); i++){
-      BaseToken token = tokens.get(i);
+      boolean endSentToken = false;
+      BaseToken token = nextToken;
+      if(i+1 < tokens.size()){
+        nextToken = tokens.get(i+1);
+        if(nextToken instanceof NewlineToken || (token.getCoveredText().equals(".") && !(endSet.size() > 0))){
+          endSentToken = true;
+        }
+      }
 
       // if we see a newline token at the end of a sentence break the sentence
       // only print out if we are not at the start of the sentence:
       if(token instanceof NewlineToken){
-        if(tokenId > 0){
-          out.println();
-          tokenId = 0;
-          sentId++;
-        }
         continue;
       }
       
@@ -172,6 +176,7 @@ public class CoreferenceChainScoringOutp
           }
         }
       }
+
       
       out.print(filename.getPath());
       out.print('\t');
@@ -179,12 +184,23 @@ public class CoreferenceChainScoringOutp
       out.print('\t');
       out.print(tokenId++);
       out.print('\t');
-      out.print(token instanceof NewlineToken ? "Newline" : token.getCoveredText());
+      out.print(token instanceof NewlineToken ? "Newline" : TreeUtils.escapePunct(token.getCoveredText()));
       out.print('\t');
       out.print(token.getPartOfSpeech());
       out.print('\t');
-      // parse bit -- can ignore?
-      out.print('-');  out.print('\t');
+      // parse bit -- assume flat parse
+      if(tokenId == 1){
+        out.print("(NOPARSE*");
+        // special case for one word sentences:
+        if(endSentToken){
+          out.print(")");
+        }
+      }else if(endSentToken){
+        out.print("*)");
+      }else{
+        out.print("*");
+      }      
+      out.print('\t');
       // predicate lemma -- can ignore?
       out.print('-'); out.print('\t');
       // predicate frameset id -- can ignore?
@@ -206,6 +222,7 @@ public class CoreferenceChainScoringOutp
           buff.append(')');
           buff.append('|');
         }
+        endSet.remove(ind);
 //        endMention.remove(ind);
       }
       for(int ind : wholeMention){
@@ -241,9 +258,12 @@ public class CoreferenceChainScoringOutp
       }else{
         out.println("_");
       }
-//    }
-//    out.println();
-//      lastToken = token;
+     
+      if(endSentToken){
+        out.println();
+        tokenId = 0;
+        sentId++;
+      }
     }
     if(!isGold){
       icOut.println("#end document");

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/DeterministicMarkableAnnotator.java Mon Nov  9 14:48:12 2015
@@ -60,7 +60,11 @@ public class DeterministicMarkableAnnota
     for(Segment seg : JCasUtil.select(jCas, Segment.class)){
       for(ConllDependencyNode node : JCasUtil.selectCovered(jCas, ConllDependencyNode.class, seg)){
         String nodeText = node.getCoveredText().toLowerCase();
-        TerminalTreebankNode term = JCasUtil.selectCovered(TerminalTreebankNode.class, node).get(0);
+        List<TerminalTreebankNode> terms = JCasUtil.selectCovered(TerminalTreebankNode.class, node);
+        TerminalTreebankNode term = null;
+        if(terms.size() > 0){
+          term = terms.get(0);
+        }
         
         if(node.getId() == 0){
           continue;
@@ -71,7 +75,7 @@ public class DeterministicMarkableAnnota
         // 1) get nouns, and expand the markable to the phrase they cover
         // 2) get determiners like "this" and "these"
         // 3) non-passive "it"
-        if((node.getPostag().equals("NN") || node.getPostag().equals("NNS")) && term.getNodeType().startsWith("N")){
+        if(node.getPostag().startsWith("NN") && term != null && term.getNodeType().startsWith("N")){
           if(node.getForm().matches("\\s+")) continue;
           // TODO fix this godawful hack:
           if(nodeText.equals("date") || nodeText.equals("tablet") || nodeText.equals("hg") || nodeText.equals("lb") || nodeText.equals("status")
@@ -83,7 +87,7 @@ public class DeterministicMarkableAnnota
           int end = node.getEnd();
 //          if(node.getHead().getId() != 0){
             List<ConllDependencyNode> progeny = getProgeny(node, getDependencyNodes(jCas, getSentence(jCas, node)));
-            progeny = removeConjunctionNodes(node, progeny);
+            progeny = removeUnannotatedNodes(node, progeny);
             if(progeny.size() > 0){
               for(ConllDependencyNode child : progeny){
                 if(child.getBegin() < begin){
@@ -109,6 +113,12 @@ public class DeterministicMarkableAnnota
               end = prevToken.getEnd();
             }
           }
+          
+          Matcher m = headerPatt.matcher(nodeText);
+          if(m.find()){
+            begin = begin + m.end();
+          }
+
           Markable markable = new Markable(jCas, begin, end);
           markable.addToIndexes();
         }else if(node.getPostag().equals("DT") && !node.getDeprel().equals("det")){
@@ -123,7 +133,9 @@ public class DeterministicMarkableAnnota
     }
   }
 
-  private static List<ConllDependencyNode> removeConjunctionNodes(ConllDependencyNode originalNode,
+  // Post-process to remove those kinds of nodes which may or may not be correctly parsed but do not tend to align with gold annotated
+  // markables (and usually our intuitions as well, so it's not completely hacky).
+  private static List<ConllDependencyNode> removeUnannotatedNodes(ConllDependencyNode originalNode,
       List<ConllDependencyNode> progeny) {
     List<ConllDependencyNode> filtered = new ArrayList<>();
     
@@ -133,7 +145,8 @@ public class DeterministicMarkableAnnota
       boolean blockedByConj = false;
       for(ConllDependencyNode pathEl : DependencyUtility.getPath(progeny, node, originalNode)){
         if(pathEl == originalNode) continue;
-        if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",") || pathEl.getDeprel().equals("meta")){
+        if(pathEl.getDeprel().equals("conj") || pathEl.getDeprel().equals("cc") || pathEl.getPostag().equals(".") || pathEl.getPostag().equals(",") || pathEl.getDeprel().equals("meta") 
+            || pathEl.getCoveredText().matches("(([A-Z][\\.\\:\\)])|(#\\d+)|(\\d+[\\.\\:\\)]))")){
           blockedByConj = true;
           break;
         }

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Mon Nov  9 14:48:12 2015
@@ -477,10 +477,10 @@ public class EventCoreferenceAnnotator e
     String category = super.classifier.classify(features);
     
         
-    if(this.scoreAll && category.equals(NO_RELATION_CATEGORY)){
+    if(this.scoreAll){
       Map<String,Double> scores = super.classifier.score(features);
-      category = IDENTITY_RELATION;
       this.lastScore = scores.get(IDENTITY_RELATION);
+      category = IDENTITY_RELATION;
     }
     return category;
   }

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Mon Nov  9 14:48:12 2015
@@ -16,6 +16,8 @@ import org.apache.ctakes.core.util.ListF
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistanceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
@@ -24,8 +26,6 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
 import org.apache.ctakes.coreference.util.ClusterUtils;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
-import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
-import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator.IdentifiedAnnotationPair;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
@@ -55,6 +55,7 @@ import org.cleartk.ml.CleartkProcessingE
 import org.cleartk.ml.DataWriter;
 import org.cleartk.ml.Feature;
 import org.cleartk.ml.Instance;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
 import org.cleartk.ml.jar.DefaultDataWriterFactory;
 import org.cleartk.ml.jar.DirectoryDataWriterFactory;
 import org.cleartk.ml.jar.GenericJarClassifierFactory;
@@ -82,7 +83,7 @@ public class MentionClusterCoreferenceAn
         MentionClusterCoreferenceAnnotator.class,
         CleartkAnnotator.PARAM_IS_TRAINING,
         true,
-        RelationExtractorAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+        MentionClusterCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
         downsamplingRate,
         DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
         dataWriterClass,
@@ -100,8 +101,9 @@ public class MentionClusterCoreferenceAn
         modelPath);
   }
 
-  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> featureExtractors = this.getFeatureExtractors();
-
+  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+  private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+  
   private Set<String> markableStrings = null;
   private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
   private Map<String,Set<Markable>> headWordMarkables = null;
@@ -116,6 +118,7 @@ public class MentionClusterCoreferenceAn
     extractors.add(new MentionClusterDepHeadExtractor());
     extractors.add(new MentionClusterStackFeaturesExtractor());
     extractors.add(new MentionClusterSalienceFeaturesExtractor());
+//    extractors.add(new MentionClusterDistanceFeaturesExtractor());
     
     try {
       extractors.add(new MentionClusterDistSemExtractor());
@@ -127,22 +130,29 @@ public class MentionClusterCoreferenceAn
     return extractors;
   }
   
+  protected List<FeatureExtractor1<Markable>> getMentionExtractors(){
+    List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+    // mention features from pairwise system:
+    extractors.add(new MentionClusterMentionFeaturesExtractor());
+
+    return extractors;
+  }
+  
   protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
       JCas jcas,
       IdentifiedAnnotation mention){
     int sentDist = 5;
     // using linked hash set ensures no duplicates:
     LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
-    if(mention.getCoveredText().equalsIgnoreCase("this")){
-      pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
-      pairs.addAll(getClusterPairs(jcas, mention, 3));
-    }else{
+//    if(mention.getCoveredText().equalsIgnoreCase("this")){
+//      pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
+//      pairs.addAll(getClusterPairs(jcas, mention, 3));
+//    }else{
       pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
       pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
       pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
-//      pairs.addAll(getExactStringMatchPairs(jcas, mention, sentDist));
       pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
-    }
+//    }
     
     return pairs;
   }
@@ -189,6 +199,12 @@ public class MentionClusterCoreferenceAn
       JCas jcas, IdentifiedAnnotation mention, int sentDist) {
     List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
     for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
       IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
       if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
         continue;
@@ -298,6 +314,10 @@ public class MentionClusterCoreferenceAn
     List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
 
     ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    if(headNode == null){
+      Logger.getLogger(MentionClusterCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+      return pairs;
+    }
     String head = headNode.getCoveredText().toLowerCase();
     if(headWordMarkables.containsKey(head)){
       Set<Markable> headSet = headWordMarkables.get(head);
@@ -324,7 +344,7 @@ public class MentionClusterCoreferenceAn
     markableStrings = new HashSet<>();
     nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
     headWordMarkables = new HashMap<>();
-    pairScores = getMarkablePairScores(jCas);
+//    pairScores = getMarkablePairScores(jCas);
     
     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
     relationLookup = new HashMap<>();
@@ -352,6 +372,7 @@ public class MentionClusterCoreferenceAn
     
     for(Segment segment : JCasUtil.select(jCas, Segment.class)){
       for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
         String mentionText = mention.getCoveredText().toLowerCase();
         boolean singleton = true;
         double maxScore = 0.0;
@@ -361,7 +382,7 @@ public class MentionClusterCoreferenceAn
           CollectionTextRelation cluster = pair.getCluster();
           // apply all the feature extractors to extract the list of features
           List<Feature> features = new ArrayList<>();
-          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.featureExtractors) {
+          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
             List<Feature> feats = extractor.extract(jCas, cluster, mention);
             if (feats != null){
 //              Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
@@ -369,6 +390,10 @@ public class MentionClusterCoreferenceAn
             }
           }
           
+          for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+            features.addAll(extractor.extract(jCas, mention));
+          }
+          
           // here is where feature conjunctions can go (dupFeatures)
           List<Feature> dupFeatures = new ArrayList<>();
           // sanity check on feature values
@@ -378,15 +403,43 @@ public class MentionClusterCoreferenceAn
               String message = String.format("Null value found in %s from %s", feature, features);
               System.err.println(message);
               //            throw new IllegalArgumentException(String.format(message, feature, features));
-//            }else{
-//              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
-//                dupFeatures.add(new Feature("PRO+"+feature.getName(), feature.getValue()));
-//              }
+            }else{
+              String prefix = null;
+              //  Durret and Klein style feature conjunctions: pronoun type or pos tag. maybe try umls semantic-type?
+              /*
+              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+                prefix = "PRO_"+mentionText;
+              }else if(headNode != null && headNode.getPostag() != null){
+                prefix = headNode.getPostag();                
+              }else{
+                prefix = "UNK";
+              }
+              */
+              // headword-based feature conjunctions
+/*              if(headNode != null && headNode.getCoveredText() != null && headMatches(headNode.getCoveredText().toLowerCase(), features)){
+                prefix = "HEAD_MATCH";
+              }else{
+                prefix = "NO_HEAD_MATCH";
+              }
+*/
+              
+              // UMLS semantic type feature conjunctions
+              for(Feature feat : features){
+                if(feat.getName().startsWith("ClusterSemType")){
+                  dupFeatures.add(new Feature(feat.getName()+"_"+feature.getName(), feature.getValue()));
+                }
+              }
+              
+              if(prefix != null){
+                dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+              }
             }
           }
           features.addAll(dupFeatures);
           
+          
           // pairwise score features:
+          /*
           double minPairScore = 1.0;
           double maxPairScore = 0.0;
           double avePairScore = 0.0;
@@ -400,6 +453,9 @@ public class MentionClusterCoreferenceAn
               markablePair = new HashableArguments(mention, member);
               score = pairScores.get(markablePair);
             }
+            if(score == null){
+              score = 0.0;
+            }
             if(score != null){
               avePairScore += score;
               if(score > maxPairScore){
@@ -410,17 +466,19 @@ public class MentionClusterCoreferenceAn
               }
             }
           }
+          
           features.add(new Feature("PAIRWISE_MAX", maxPairScore));
-          features.add(new Feature("PAIRWISE_MIN", minPairScore));
-          if(numPairs > 0){
-            avePairScore /= numPairs;
-          }else{
-            avePairScore = 0.0;
-          }
-          if(Double.isNaN(avePairScore)){
-            Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
-          }
-          features.add(new Feature("PAIRWISE_AVE", avePairScore));
+          */
+//          features.add(new Feature("PAIRWISE_MIN", minPairScore));
+//          if(numPairs > 0){
+//            avePairScore /= numPairs;
+//          }else{
+//            avePairScore = 0.0;
+//          }
+//          if(Double.isNaN(avePairScore)){
+//            Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
+//          }
+//          features.add(new Feature("PAIRWISE_AVE", avePairScore));
 
           // during training, feed the features to the data writer
           if (this.isTraining()) {
@@ -468,7 +526,6 @@ public class MentionClusterCoreferenceAn
         
         markableStrings.add(mention.getCoveredText().toLowerCase());
         
-        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
         if(headNode != null){
           String head = headNode.getCoveredText().toLowerCase();
           if(!headWordMarkables.containsKey(head)){
@@ -491,6 +548,21 @@ public class MentionClusterCoreferenceAn
         }
       }
     }
+    
+    removeSingletonClusters(jCas);
+  }
+  
+  private boolean headMatches(String head, List<Feature> feats){
+    boolean match = false;
+    for(Feature feat : feats){
+      if(feat.getName().equals("ClusterHeadMatchesMentionHead")){
+        if(feat.getValue().equals(true)){
+          match = true;
+        }
+        break;
+      }
+    }
+    return match;
   }
   
   /**
@@ -562,6 +634,20 @@ public class MentionClusterCoreferenceAn
   }
 
 
+  private void removeSingletonClusters(JCas jcas){
+    List<CollectionTextRelation> toRemove = new ArrayList<>();
+    for(CollectionTextRelation rel : JCasUtil.select(jcas, CollectionTextRelation.class)){     
+      NonEmptyFSList head = (NonEmptyFSList) rel.getMembers();
+      if(head.getTail() instanceof EmptyFSList){
+        toRemove.add(rel);
+      }
+    }
+    
+    for(CollectionTextRelation rel : toRemove){
+      rel.removeFromIndexes();
+    }
+  }
+  
   private static final boolean dominates(Annotation arg1, Annotation arg2) {
     return (arg1.getBegin() <= arg2.getBegin() && arg1.getEnd() >= arg2.getEnd());
   }

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterRankingCoreferenceAnnotator.java Mon Nov  9 14:48:12 2015
@@ -0,0 +1,655 @@
+package org.apache.ctakes.coreference.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Random;
+import java.util.Set;
+
+import org.apache.ctakes.core.util.ListFactory;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterAgreementFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDepHeadExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterMentionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
+import org.apache.ctakes.coreference.util.ClusterUtils;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textsem.MedicationEventMention;
+import org.apache.ctakes.typesystem.type.textspan.Paragraph;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.ml.CleartkAnnotator;
+import org.cleartk.ml.CleartkProcessingException;
+import org.cleartk.ml.DataWriter;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.jar.DefaultDataWriterFactory;
+import org.cleartk.ml.jar.DirectoryDataWriterFactory;
+import org.cleartk.ml.jar.GenericJarClassifierFactory;
+import org.cleartk.ml.svmlight.rank.QidInstance;
+import org.cleartk.util.ViewUriUtil;
+
+public class MentionClusterRankingCoreferenceAnnotator extends CleartkAnnotator<Double> {
+  public static final String NO_RELATION_CATEGORY = "-NONE-";
+  public static final String CLUSTER_RELATION_CATEGORY = "CoreferenceClusterMember";
+  
+  public static final String PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE =
+      "ProbabilityOfKeepingANegativeExample";
+  @ConfigurationParameter(
+      name = PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE,
+      mandatory = false,
+      description = "probability that a negative example should be retained for training")
+  protected double probabilityOfKeepingANegativeExample = 0.5;
+
+  protected Random coin = new Random(0);
+
+  boolean greedyFirst = true;
+  
+  private int qid = 0;
+  
+  /**
+   * Builds the description of this annotator configured for training.
+   *
+   * @param dataWriterClass  data writer class handed to the default data writer factory
+   * @param outputDirectory  directory handed to the directory data writer factory
+   * @param downsamplingRate value for PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE
+   * @return the engine description with training switched on
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<?>> dataWriterClass,
+      File outputDirectory,
+      float downsamplingRate) throws ResourceInitializationException {
+    // name/value pairs, in the order the factory expects them
+    Object[] trainingSettings = {
+        CleartkAnnotator.PARAM_IS_TRAINING, true,
+        MentionClusterRankingCoreferenceAnnotator.PARAM_PROBABILITY_OF_KEEPING_A_NEGATIVE_EXAMPLE, downsamplingRate,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME, dataWriterClass,
+        DirectoryDataWriterFactory.PARAM_OUTPUT_DIRECTORY, outputDirectory
+    };
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterRankingCoreferenceAnnotator.class,
+        trainingSettings);
+  }
+
+  public static AnalysisEngineDescription createAnnotatorDescription(
+      String modelPath) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        MentionClusterRankingCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath);
+  }
+
+  private List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> relationExtractors = this.getFeatureExtractors();
+  private List<FeatureExtractor1<Markable>> mentionExtractors = this.getMentionExtractors();
+  
+  private Set<String> markableStrings = null;
+  private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
+  private Map<String,Set<Markable>> headWordMarkables = null;
+  private Map<HashableArguments,Double> pairScores = null;
+  
+  protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
+    List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
+    extractors.add(new MentionClusterAgreementFeaturesExtractor());
+    extractors.add(new MentionClusterStringFeaturesExtractor());
+    extractors.add(new MentionClusterSectionFeaturesExtractor());
+    extractors.add(new MentionClusterUMLSFeatureExtractor());
+    extractors.add(new MentionClusterDepHeadExtractor());
+    extractors.add(new MentionClusterStackFeaturesExtractor());
+    extractors.add(new MentionClusterSalienceFeaturesExtractor());
+//    extractors.add(new MentionClusterDistanceFeaturesExtractor());
+    
+    try {
+      extractors.add(new MentionClusterDistSemExtractor());
+      extractors.add(new MentionClusterSemTypeDepPrefsFeatureExtractor());
+    } catch (IOException e) {
+      e.printStackTrace();
+    }
+    
+    return extractors;
+  }
+  
+  /**
+   * Assembles the extractors that look at the mention alone.
+   *
+   * @return a list holding the single mention feature extractor
+   */
+  protected List<FeatureExtractor1<Markable>> getMentionExtractors(){
+    // mention features from pairwise system
+    FeatureExtractor1<Markable> mentionFeatures = new MentionClusterMentionFeaturesExtractor();
+
+    List<FeatureExtractor1<Markable>> extractors = new ArrayList<>();
+    extractors.add(mentionFeatures);
+    return extractors;
+  }
+  
+  protected Iterable<CollectionTextRelationIdentifiedAnnotationPair> getCandidateRelationArgumentPairs(
+      JCas jcas,
+      IdentifiedAnnotation mention){
+    int sentDist = 5;
+    // using linked hash set ensures no duplicates:
+    LinkedHashSet<CollectionTextRelationIdentifiedAnnotationPair> pairs = new LinkedHashSet<>();
+//    if(mention.getCoveredText().equalsIgnoreCase("this")){
+//      pairs.addAll(getSentenceDistancePairs(jcas, mention, 1));
+//      pairs.addAll(getClusterPairs(jcas, mention, 3));
+//    }else{
+      pairs.addAll(getSentenceDistancePairs(jcas, mention, sentDist));
+      pairs.addAll(getSectionHeaderPairs(jcas, mention, sentDist));
+      pairs.addAll(getClusterPairs(jcas, mention, Integer.MAX_VALUE));
+      pairs.addAll(getHeadwordMatchPairs(jcas, mention, sentDist));
+//    }
+    
+    return pairs;
+  }
+  
+  /*
+   * getExactStringMatchPairs()
+   * For a mention whose exact (lower-cased) string has already been recorded in the
+   * markableStrings field, pair it with every cluster that has a member with that same
+   * string. Only members listed before the cluster's most recent mention are compared.
+   * The sentence distance parameter is not used, so matches are allowed across any distance.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getExactStringMatchPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    String mentionText = mention.getCoveredText().toLowerCase();
+
+    // nothing to do unless this string has been seen earlier in the document
+    if(!markableStrings.contains(mentionText)){
+      return pairs;
+    }
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+      if(mostRecent == null){
+        continue;
+      }
+
+      for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        if(member == mostRecent){
+          break;
+        }
+        // one member with the same string is enough to pair the whole cluster
+        String memberText = member.getCoveredText().toLowerCase();
+        if(memberText.equals(mentionText)){
+          pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+          break;
+        }
+      }
+    }
+    return pairs;
+  }
+  
+  /*
+   * getClusterPairs()
+   * Pairs the mention with clusters that already hold more than one member before it.
+   * A cluster is considered only if its first member ends before the mention begins and
+   * its most recent member is within sentDist sentences of the mention. Because training
+   * clusters may contain members that come after the mention, members are counted only
+   * up to and including the most recent one; the pair is added when that count exceeds 1.
+   */
+  private List<CollectionTextRelationIdentifiedAnnotationPair> getClusterPairs(
+      JCas jcas, IdentifiedAnnotation mention, int sentDist) {
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      boolean startsBeforeMention = first != null && mention.getBegin() > first.getEnd();
+      if(!startsBeforeMention){
+        continue;
+      }
+
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null){
+        continue;
+      }
+      if(EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
+        continue;
+      }
+
+      // count members up to and including the most recent one
+      int membersSoFar = 0;
+      for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+        membersSoFar++;
+        if(member == mostRecent){
+          break;
+        }
+      }
+      if(membersSoFar <= 1){
+        continue;
+      }
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    }
+
+    return pairs;
+  }
+
+  /*
+   * Pairs the mention with nearby clusters. If the best entity types of the mention
+   * include Anatomical Site or Medication, the distance test is skipped; otherwise the
+   * cluster's most recent member must be within sentDist sentences of the mention.
+   * When both the mention and the cluster have entity types, they must share at least one.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSentenceDistancePairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    Set<String> mentionTypes = getBestEnt(jcas, (Markable) mention);
+    boolean exemptFromDistance =
+        mentionTypes.contains(AnatomicalSiteMention.class.getSimpleName()) ||
+        mentionTypes.contains(MedicationEventMention.class.getSimpleName());
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // distance only matters for mentions that are not anatomical sites or medications
+      if(!exemptFromDistance){
+        IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+        if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) > sentDist){
+          continue;
+        }
+      }
+
+      // when both sides are typed, require at least one shared entity type
+      Set<String> clusterTypes = getBestEnt(jcas, cluster);
+      if(!mentionTypes.isEmpty() && !clusterTypes.isEmpty()){
+        boolean sharedType = false;
+        for(String semType : mentionTypes){
+          if(clusterTypes.contains(semType)){
+            sharedType = true;
+            break;
+          }
+        }
+        if(!sharedType){
+          continue;
+        }
+      }
+      pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+    }
+    return pairs;
+  }
+
+  /*
+   * getSectionHeaderPairs()
+   * Pairs the mention with clusters that have a member on a line all by itself (a section
+   * header). Paragraph annotations are used for this: a paragraph lying between the start
+   * of the document and the mention that covers exactly one sentence is treated as a
+   * "header" (this also catches list items). Clusters whose most recent member is within
+   * sentDist sentences are skipped here because the "sentence distance" method already
+   * adds them.
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getSectionHeaderPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+    // The header paragraphs depend only on the mention, not on the cluster, so they are
+    // computed at most once per call (lazily, the first time a cluster needs them).
+    List<Paragraph> headers = null;
+
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList members = ((NonEmptyFSList)cluster.getMembers());
+      Annotation first = (Annotation) members.getHead();
+      if(first == null || mention.getBegin() <= first.getEnd()){
+        continue;
+      }
+
+      // first check if it is in sentence distance range -- if so we can ignore it because
+      // it will be included by the other pair generator
+      IdentifiedAnnotation mostRecent = (IdentifiedAnnotation) ClusterUtils.getMostRecent(members, mention);
+      if(mostRecent == null || EventCoreferenceAnnotator.sentDist(jcas, mostRecent, mention) <= sentDist){
+        continue;
+      }
+
+      if(headers == null){
+        headers = new ArrayList<>();
+        for(Paragraph par : JCasUtil.selectCovered(jcas, Paragraph.class, 0, mention.getBegin())){
+          // a paragraph that covers exactly one sentence is how section headers are modeled
+          List<Sentence> coveredSents = JCasUtil.selectCovered(jcas, Sentence.class, par);
+          if(coveredSents != null && coveredSents.size() == 1){
+            headers.add(par);
+          }
+        }
+      }
+
+      // now check whether any of the cluster mentions sits inside a section header
+      boolean inHeader = false;
+      for(Paragraph header : headers){
+        for(Markable m : JCasUtil.select(members, Markable.class)){
+          if(dominates(header, m)){
+            inHeader = true;
+            break;
+          }
+        }
+        if(inHeader){
+          break;
+        }
+      }
+      if(inHeader){
+        pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+      }
+    }
+    return pairs;
+  }
+  
+  /**
+   * Pairs the mention with every cluster that has a member sharing the mention's head word.
+   * Head words of previously processed markables are looked up in the headWordMarkables
+   * field; cluster members are examined in list order up to and including the member
+   * returned by ClusterUtils.getMostRecent.
+   *
+   * @param jcas     the CAS being processed
+   * @param mention  the mention to find candidate clusters for
+   * @param sentDist not used by this generator
+   * @return the candidate pairs, empty if the mention has no nominal head node
+   */
+  protected List<CollectionTextRelationIdentifiedAnnotationPair> getHeadwordMatchPairs(JCas jcas, IdentifiedAnnotation mention, int sentDist){
+    List<CollectionTextRelationIdentifiedAnnotationPair> pairs = new ArrayList<>();
+
+    ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jcas, mention);
+    if(headNode == null){
+      Logger.getLogger(MentionClusterRankingCoreferenceAnnotator.class).warn("There is a markable with no dependency node covering it.");
+      return pairs;
+    }
+    String head = headNode.getCoveredText().toLowerCase();
+    if(headWordMarkables.containsKey(head)){
+      Set<Markable> headSet = headWordMarkables.get(head);
+      for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        Annotation mostRecent = ClusterUtils.getMostRecent((NonEmptyFSList)cluster.getMembers(), mention);
+        if(mostRecent == null) continue;
+        for(Markable m : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          // test the member being visited; testing mostRecent on every pass made the
+          // loop ignore all earlier members of the cluster
+          if(headSet.contains(m)){
+            pairs.add(new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention));
+            break;
+          }
+          if(m == mostRecent) break;
+        }
+      }
+    }
+
+    return pairs;
+  }
+  
+  /**
+   * Processes one document: visits every Markable of every Segment, scores it against its
+   * candidate clusters, and either writes ranking instances (training) or attaches the
+   * mention to the best-scoring cluster (classification). Mentions that are not attached
+   * start a new one-member cluster; clusters that still have a single member at the end
+   * are removed from the indexes.
+   *
+   * @param jCas the CAS to process
+   * @throws AnalysisEngineProcessException declared by the overridden method
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+    // reset the per-document state used by the candidate pair generators
+    // lookup from pair of annotations to binary text relation
+    // note: assumes that there will be at most one relation per pair
+    markableStrings = new HashSet<>();
+    nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
+    headWordMarkables = new HashMap<>();
+//    pairScores = getMarkablePairScores(jCas);
+    
+    Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
+    relationLookup = new HashMap<>();
+    // in training, index a membership relation for every (cluster, member) pair already in the CAS
+    if (this.isTraining()) {
+      for (CollectionTextRelation cluster : JCasUtil.select(jCas, CollectionTextRelation.class)) {
+        for(IdentifiedAnnotation mention : JCasUtil.select(cluster.getMembers(), Markable.class)){
+          CollectionTextRelationIdentifiedAnnotationRelation relation = 
+              new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+          relation.setCluster(cluster);
+          relation.setMention(mention);
+          relation.setCategory("CoreferenceClusterMember");
+          relation.addToIndexes();
+          // The key is a list of args so we can do bi-directional lookup
+          CollectionTextRelationIdentifiedAnnotationPair key = new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+          // a duplicate key is only reported on stderr; the newer relation replaces the older one
+          if(relationLookup.containsKey(key)){
+            String cat = relationLookup.get(key).getCategory();
+            System.err.println("Error in: "+ ViewUriUtil.getURI(jCas).toString());
+            System.err.println("Error! This attempted relation " + relation.getCategory() + " already has a relation " + cat + " at this span: " + mention.getCoveredText());
+          }
+          relationLookup.put(key, relation);
+        }
+      }
+    }
+
+    
+    for(Segment segment : JCasUtil.select(jCas, Segment.class)){
+      for(Markable mention : JCasUtil.selectCovered(jCas, Markable.class, segment)){
+        ConllDependencyNode headNode = DependencyUtility.getNominalHeadNode(jCas, mention);
+        String mentionText = mention.getCoveredText().toLowerCase();
+        // NOTE(review): 'singleton' is assigned here but never read in this method
+        boolean singleton = true;
+        // best classifier score seen so far for this mention; only updated in classification mode
+        double maxScore = 0.0;
+        CollectionTextRelation maxCluster = null;
+        
+        for(CollectionTextRelationIdentifiedAnnotationPair pair : this.getCandidateRelationArgumentPairs(jCas, mention)){
+          CollectionTextRelation cluster = pair.getCluster();
+          // apply all the feature extractors to extract the list of features
+          List<Feature> features = new ArrayList<>();
+          for (RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation> extractor : this.relationExtractors) {
+            List<Feature> feats = extractor.extract(jCas, cluster, mention);
+            if (feats != null){
+//              Logger.getRootLogger().info(String.format("For cluster with %d mentions, %d %s features", JCasUtil.select(cluster.getMembers(), Markable.class).size(), feats.size(), extractor.getClass().getSimpleName()));
+              features.addAll(feats);
+            }
+          }
+          
+          for(FeatureExtractor1<Markable> extractor : this.mentionExtractors){
+            features.addAll(extractor.extract(jCas, mention));
+          }
+          
+          // here is where feature conjunctions can go (dupFeatures)
+          List<Feature> dupFeatures = new ArrayList<>();
+          // sanity check on feature values
+          for (Feature feature : features) {
+            if (feature.getValue() == null) {
+              feature.setValue("NULL");
+              String message = String.format("Null value found in %s from %s", feature, features);
+              System.err.println(message);
+              //            throw new IllegalArgumentException(String.format(message, feature, features));
+            }else{
+              // every non-null feature is duplicated under a prefix: "PRO_" plus the pronoun for
+              // it/this/that, otherwise the head node's POS tag, otherwise "UNK"
+              String prefix = "";
+              if(mentionText.equals("it") || mentionText.equals("this") || mentionText.equals("that")){
+                prefix = "PRO_"+mentionText;
+              }else if(headNode != null && headNode.getPostag() != null){
+                prefix = headNode.getPostag();                
+              }else{
+                prefix = "UNK";
+              }
+              dupFeatures.add(new Feature(prefix+"_"+feature.getName(), feature.getValue()));
+            }
+          }
+          features.addAll(dupFeatures);    
+
+          // during training, feed the features to the data writer
+          // create a classification instance and write it to the training data
+
+          if (this.isTraining()) {
+            String category = this.getRelationCategory(relationLookup, cluster, mention);
+            // null means this negative example was sampled out
+            if (category == null) {
+              continue;
+            }
+            double outVal = 1.0;
+            if(category.equals(NO_RELATION_CATEGORY)){
+              outVal = 0.0;
+            }
+
+            // all instances written for the same mention share one qid
+            QidInstance<Double> inst = new QidInstance<>();
+            inst.setQid(String.valueOf(qid));
+            inst.addAll(features);
+            inst.setOutcome(outVal);
+            this.dataWriter.write(inst);
+            // stop looking at further candidates once a positive instance has been written
+            if(!category.equals(NO_RELATION_CATEGORY)){
+              break;
+            }
+          }
+
+          // during classification feed the features to the classifier and create
+          // annotations
+          else {
+            Double prediction = this.classify(features);
+            // keep the highest-scoring cluster; scores of 0 or below never win
+            if(prediction > maxScore){
+              maxScore = prediction;
+              maxCluster = cluster;
+            }
+          }
+        }
+        
+        // record this mention's string and head word for later mentions in the document
+        markableStrings.add(mention.getCoveredText().toLowerCase());
+        
+        if(headNode != null){
+          String head = headNode.getCoveredText().toLowerCase();
+          if(!headWordMarkables.containsKey(head)){
+            headWordMarkables.put(head, new HashSet<Markable>());
+          }
+          headWordMarkables.get(head).add(mention);
+        }
+        
+        // attach the mention to the best cluster if one scored above zero; maxScore is
+        // never raised in training mode, so training always takes the else branch
+        if(maxScore > 0){
+          createRelation(jCas, maxCluster, mention, CLUSTER_RELATION_CATEGORY);
+        }else{
+          // make the markable its own cluster:
+          CollectionTextRelation chain = new CollectionTextRelation(jCas);
+          NonEmptyFSList list = new NonEmptyFSList(jCas);
+          list.setHead(mention);
+          list.setTail(new EmptyFSList(jCas));
+          chain.setMembers(list);
+          chain.addToIndexes();
+          list.addToIndexes();
+          list.getTail().addToIndexes();
+        }
+        // one query id per processed mention
+        qid++;
+      }
+    }
+    
+    // clusters that never gained a second member are dropped again
+    removeSingletonClusters(jCas);
+  }
+  
+  /**
+   * Determines the training label for a (cluster, mention) pair.
+   *
+   * @param relationLookup known relations keyed by (cluster, mention) pair
+   * @param cluster        the candidate cluster
+   * @param mention        the mention being paired with the cluster
+   * @return the category of the known relation if there is one; otherwise
+   *         NO_RELATION_CATEGORY with the configured probability of keeping a
+   *         negative example, and <i>null</i> when the pair should be skipped
+   */
+  protected String getRelationCategory(
+      Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention) {
+    CollectionTextRelationIdentifiedAnnotationPair key =
+        new CollectionTextRelationIdentifiedAnnotationPair(cluster, mention);
+    CollectionTextRelationIdentifiedAnnotationRelation known = relationLookup.get(key);
+    if (known != null) {
+      return known.getCategory();
+    }
+
+    // negative pair: the coin decides whether it is kept
+    boolean keepNegative = coin.nextDouble() <= this.probabilityOfKeepingANegativeExample;
+    return keepNegative ? NO_RELATION_CATEGORY : null;
+  }
+
+  /**
+   * Scores a feature list by handing it to this annotator's <code>classifier</code>.
+   * Subclasses may override this to plug in a different scoring procedure.
+   *
+   * @param features
+   *          The features to be scored.
+   * @return The value the classifier returns for the features.
+   * @throws CleartkProcessingException if the classifier fails
+   */
+  protected Double classify(List<Feature> features) throws CleartkProcessingException {
+    Double score = this.classifier.classify(features);
+    return score;
+  }
+
+  /**
+   * Records that a mention belongs to a cluster: indexes a membership relation
+   * between the two and appends the mention to the cluster's member list.
+   * 
+   * @param jCas
+   *          - JCas object, needed to create new UIMA types
+   * @param cluster
+   *          - Cluster the mention is added to
+   * @param mention
+   *          - Mention being added
+   * @param predictedCategory
+   *          - Category stored on the new relation
+   */
+  protected void createRelation(
+      JCas jCas,
+      CollectionTextRelation cluster,
+      IdentifiedAnnotation mention,
+      String predictedCategory) {
+    // index the cluster/mention membership relation
+    CollectionTextRelationIdentifiedAnnotationRelation membership =
+        new CollectionTextRelationIdentifiedAnnotationRelation(jCas);
+    membership.setCluster(cluster);
+    membership.setMention(mention);
+    membership.setCategory(predictedCategory);
+    membership.addToIndexes();
+
+    // extend the cluster's member list with the mention
+    ListFactory.append(jCas, cluster.getMembers(), mention);
+  }
+
+
+  /**
+   * Removes from the indexes every cluster whose member list holds exactly one element,
+   * i.e. whose first list cell is directly followed by the empty list.
+   *
+   * @param jcas the CAS whose CollectionTextRelation annotations are inspected
+   */
+  private void removeSingletonClusters(JCas jcas){
+    List<CollectionTextRelation> singletons = new ArrayList<>();
+    for(CollectionTextRelation cluster : JCasUtil.select(jcas, CollectionTextRelation.class)){
+      NonEmptyFSList firstCell = (NonEmptyFSList) cluster.getMembers();
+      boolean hasSingleMember = firstCell.getTail() instanceof EmptyFSList;
+      if(hasSingleMember){
+        singletons.add(cluster);
+      }
+    }
+
+    // removal happens only after the pass over the index has finished
+    for(CollectionTextRelation cluster : singletons){
+      cluster.removeFromIndexes();
+    }
+  }
+  
+  /** Returns true when the span of arg1 fully contains the span of arg2 (shared boundaries count). */
+  private static final boolean dominates(Annotation arg1, Annotation arg2) {
+    if(arg1.getBegin() > arg2.getBegin()){
+      return false;
+    }
+    return arg1.getEnd() >= arg2.getEnd();
+  }
+
+  /**
+   * Collects the best entity type names of all Markable members of a cluster.
+   *
+   * @param jcas    the CAS being processed
+   * @param cluster the cluster whose members are inspected
+   * @return the union of the per-member type names; empty if no member has any
+   */
+  public Set<String> getBestEnt(JCas jcas, CollectionTextRelation cluster){
+    Set<String> clusterTypes = new HashSet<>();
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      Set<String> memberTypes = getBestEnt(jcas, member);
+      clusterTypes.addAll(memberTypes);
+    }
+    return clusterTypes;
+  }
+  
+  /**
+   * Finds the simple class names of the entities that best correspond to a markable.
+   * Candidates are the entities listed for the markable's nominal head node in the
+   * nodeEntMap field that have ontology concepts and the same nominal head node. Among
+   * them the one with the longest span wins; entities with exactly the same span length
+   * as the winner are reported as well.
+   *
+   * @param jcas     the CAS being processed
+   * @param markable the markable to look up
+   * @return the type names found; empty when the markable has no head node, no entities
+   *         are listed for that node, or no candidate qualifies
+   */
+  public Set<String> getBestEnt(JCas jcas, Markable markable){
+    Set<String> bestEnts = new HashSet<>();
+    ConllDependencyNode head = DependencyUtility.getNominalHeadNode(jcas, markable);
+    if(head == null){
+      // no dependency node for this markable, so there is nothing to compare against
+      return bestEnts;
+    }
+    Collection<IdentifiedAnnotation> coveringEnts = nodeEntMap.get(head);
+    if(coveringEnts == null){
+      return bestEnts;
+    }
+
+    IdentifiedAnnotation bestEnt = null;
+    Set<IdentifiedAnnotation> otherBestEnts = new HashSet<>();
+    for(IdentifiedAnnotation ent : coveringEnts){
+      if(ent.getOntologyConceptArr() == null) continue; // skip non-umls entities.
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == head){
+        if(bestEnt == null){
+          bestEnt = ent;
+        }else if((ent.getEnd()-ent.getBegin()) > (bestEnt.getEnd() - bestEnt.getBegin())){
+          // if the span of this entity is bigger than the biggest existing one,
+          // the ties recorded for the previous winner no longer apply:
+          bestEnt = ent;
+          otherBestEnts = new HashSet<>();
+        }else if((ent.getEnd()-ent.getBegin()) == (bestEnt.getEnd() - bestEnt.getBegin())){
+          // there is another one with the exact same span and possibly different type!
+          otherBestEnts.add(ent);
+        }
+      }
+    }
+
+    if(bestEnt!=null){
+      bestEnts.add(bestEnt.getClass().getSimpleName());
+      for(IdentifiedAnnotation other : otherBestEnts){
+        bestEnts.add(other.getClass().getSimpleName());
+      }
+    }
+    return bestEnts;
+  }
+  
+  
+  /**
+   * Builds a lookup from argument pair to confidence for every CoreferenceRelation in the CAS.
+   * If several relations share the same pair, the one visited last wins.
+   *
+   * @param jCas the CAS to read relations from
+   * @return map from the (arg1, arg2) pair of each relation to its confidence
+   */
+  public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
+    Map<HashableArguments, Double> scoreMap = new HashMap<>();
+    for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){
+      IdentifiedAnnotation firstArg = (IdentifiedAnnotation) reln.getArg1().getArgument();
+      IdentifiedAnnotation secondArg = (IdentifiedAnnotation) reln.getArg2().getArgument();
+      HashableArguments key = new HashableArguments(firstArg, secondArg);
+      scoreMap.put(key, reln.getConfidence());
+    }
+    return scoreMap;
+  }
+  
+  public static class CollectionTextRelationIdentifiedAnnotationPair {
+    private final CollectionTextRelation cluster;
+    private final IdentifiedAnnotation mention;
+    
+    public CollectionTextRelationIdentifiedAnnotationPair(CollectionTextRelation cluster, IdentifiedAnnotation mention){
+      this.cluster = cluster;
+      this.mention = mention;
+    }
+    
+    public final CollectionTextRelation getCluster(){
+      return this.cluster;
+    }
+    
+    public final IdentifiedAnnotation getMention(){
+      return this.mention;
+    }
+    
+    @Override
+    public boolean equals(Object obj) {
+      CollectionTextRelationIdentifiedAnnotationPair other = (CollectionTextRelationIdentifiedAnnotationPair) obj;
+      return (this.cluster == other.cluster &&
+          this.mention == other.mention);
+    }
+    
+    @Override
+    public int hashCode() {
+      return 31*cluster.hashCode() + (mention==null ? 0 : mention.hashCode());
+    }
+  }
+
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java?rev=1713449&r1=1713448&r2=1713449&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TemporalFeatureExtractor.java Mon Nov  9 14:48:12 2015
@@ -20,12 +20,15 @@ public class TemporalFeatureExtractor im
     List<Feature> feats = new ArrayList<>();
     
     String a1dtr = getDocTimeRelForArg(jCas, arg1);
-    feats.add(new Feature("Arg1DTR_" + a1dtr, true));
     String a2dtr = getDocTimeRelForArg(jCas, arg2);
+
+    feats.add(new Feature("Arg1DTR_" + a1dtr, true));
     feats.add(new Feature("Arg2DTR_" + a2dtr, true));
     
-    if(a1dtr.equals(a2dtr) && !a1dtr.equals("NA")){
-      feats.add(new Feature("DTR_Match", true));      
+    if(a1dtr.equals(a2dtr)){
+      if(!a1dtr.equals("NA")){
+        feats.add(new Feature("DTR_Match", true));
+      }
     }
     
     return feats;

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterDistanceFeaturesExtractor.java Mon Nov  9 14:48:12 2015
@@ -0,0 +1,79 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.ml.Feature;
+
+/**
+ * Distance features between a candidate mention and a cluster of markables.
+ *
+ * For every {@link Markable} member of the cluster, counts the {@link BaseToken},
+ * {@link Markable} and {@link Sentence} annotations that
+ * {@code JCasUtil.selectBetween} returns for the (member, mention) pair, and keeps
+ * the minimum of each count over all members.
+ */
+public class MentionClusterDistanceFeaturesExtractor
+    implements RelationFeaturesExtractor<CollectionTextRelation, IdentifiedAnnotation> {
+
+  // Divisors applied to the raw minimum counts before they are emitted as feature values.
+  // NOTE(review): presumably rough corpus-level upper bounds -- confirm before changing.
+  private static final double TOKEN_DISTANCE_SCALE = 4000.0;
+  private static final double MARKABLE_DISTANCE_SCALE = 900.0;
+  private static final double SENTENCE_DISTANCE_SCALE = 350.0;
+
+  /**
+   * Computes the scaled minimum token, markable and sentence distances.
+   *
+   * @param jCas the JCas being processed (not read directly by this method)
+   * @param cluster the cluster whose Markable members are compared with the mention
+   * @param mention the annotation whose distance to the cluster is measured
+   * @return the features MinTokenDistance, MinMarkableDistance and MinSentDistance,
+   *         or an empty list when the cluster has no Markable member
+   */
+  @Override
+  public List<Feature> extract(JCas jCas, CollectionTextRelation cluster, IdentifiedAnnotation mention)
+      throws AnalysisEngineProcessException {
+    List<Feature> feats = new ArrayList<>();
+
+    int minDistance = Integer.MAX_VALUE;
+    int neMinDistance = Integer.MAX_VALUE;
+    int sentMinDistance = Integer.MAX_VALUE;
+    boolean sawMember = false;
+
+    for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+      sawMember = true;
+
+      int dist = JCasUtil.selectBetween(BaseToken.class, member, mention).size();
+      minDistance = Math.min(minDistance, dist);
+
+      int neDist = JCasUtil.selectBetween(Markable.class, member, mention).size();
+      neMinDistance = Math.min(neMinDistance, neDist);
+
+      int sentDist = JCasUtil.selectBetween(Sentence.class, member, mention).size();
+      sentMinDistance = Math.min(sentMinDistance, sentDist);
+    }
+
+    // With no Markable member the minima still hold Integer.MAX_VALUE; emitting them
+    // would produce huge bogus distances, so return no features instead.
+    if(!sawMember){
+      return feats;
+    }
+
+    feats.add(new Feature("MinTokenDistance", minDistance / TOKEN_DISTANCE_SCALE));
+    feats.add(new Feature("MinMarkableDistance", neMinDistance / MARKABLE_DISTANCE_SCALE));
+    feats.add(new Feature("MinSentDistance", sentMinDistance / SENTENCE_DISTANCE_SCALE));
+
+    return feats;
+  }
+
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java?rev=1713449&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java (added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/cluster/MentionClusterMentionFeaturesExtractor.java Mon Nov  9 14:48:12 2015
@@ -0,0 +1,101 @@
+package org.apache.ctakes.coreference.ae.features.cluster;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.features.DependencyTreeFeaturesExtractor;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.cleartk.ml.Feature;
+import org.cleartk.ml.feature.extractor.CleartkExtractor;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Bag;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Covered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.FirstCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Following;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.LastCovered;
+import org.cleartk.ml.feature.extractor.CleartkExtractor.Preceding;
+import org.cleartk.ml.feature.extractor.CleartkExtractorException;
+import org.cleartk.ml.feature.extractor.CoveredTextExtractor;
+import org.cleartk.ml.feature.extractor.DistanceExtractor;
+import org.cleartk.ml.feature.extractor.FeatureExtractor1;
+import org.cleartk.ml.feature.extractor.NamingExtractor1;
+import org.cleartk.ml.feature.extractor.TypePathExtractor;
+
+/**
+ * Extracts features describing a single {@link Markable} on its own.
+ *
+ * Only the token-context features are currently emitted; the part-of-speech,
+ * token-count and dependency-tree features are disabled (commented out) in
+ * {@link #extract(JCas, Markable)}.
+ */
+public class MentionClusterMentionFeaturesExtractor implements FeatureExtractor1<Markable> {
+
+  /** Covered text of a single token. */
+  private final FeatureExtractor1<BaseToken> coveredText = new CoveredTextExtractor<>();
+
+  /**
+   * Token text in and around the mention: first and last covered token, a bag of all
+   * covered tokens, and the three tokens preceding and following the mention.
+   */
+  private final FeatureExtractor1<Markable> tokenContext = new CleartkExtractor<Markable,BaseToken>(
+      BaseToken.class,
+      coveredText,
+      new FirstCovered(1),
+      new LastCovered(1),
+      new Bag(new Covered()),
+      new Preceding(3),
+      new Following(3));
+
+  /** Reads the {@code partOfSpeech} type path of a single token. */
+  private final FeatureExtractor1<BaseToken> pos = new TypePathExtractor<>(BaseToken.class, "partOfSpeech");
+
+  /**
+   * All part-of-speech tags of the mention as a bag
+   */
+  private final FeatureExtractor1<Markable> tokenPOS = new CleartkExtractor<Markable,BaseToken>(
+      BaseToken.class,
+      pos,
+      new Bag(new Covered()));
+
+  /**
+   * The part-of-speech bag wrapped in a naming extractor configured with "mention1pos".
+   * Not used while the pos features in {@link #extract(JCas, Markable)} stay disabled.
+   */
+  private final FeatureExtractor1<Markable> mentionFeaturesExtractor = new NamingExtractor1<>(
+      "mention1pos",
+      tokenPOS);
+
+  /**
+   * Returns the token-context features of the given markable.
+   *
+   * @param view the JCas handed on to the token-context extractor
+   * @param focusAnnotation the markable to describe
+   * @return the features produced by the token-context extractor
+   * @throws CleartkExtractorException if the token-context extractor throws it
+   */
+  @Override
+  public List<Feature> extract(JCas view, Markable focusAnnotation) throws CleartkExtractorException {
+    List<Feature> feats = new ArrayList<>();
+
+    // token features:
+    feats.addAll(tokenContext.extract(view, focusAnnotation));
+
+    // Disabled: number of tokens covered by the mention.
+//    feats.add(new Feature("NumCoveredTokens_" + JCasUtil.selectCovered(BaseToken.class, focusAnnotation).size()));
+
+    // Disabled: pos features.
+//    feats.addAll(mentionFeaturesExtractor.extract(view, focusAnnotation));
+
+    // Disabled: dependency-tree features.
+//    feats.addAll(DependencyTreeFeaturesExtractor.extractForNode(view, focusAnnotation, "dep"));
+
+    return feats;
+  }
+
+}



Mime
View raw message