ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1698371 - in /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae: EventCoreferenceAnnotator.java MentionClusterCoreferenceAnnotator.java
Date Fri, 28 Aug 2015 16:44:43 GMT
Author: tmill
Date: Fri Aug 28 16:44:43 2015
New Revision: 1698371

URL: http://svn.apache.org/r1698371
Log:
Added features and api for including pairwise scoring in cluster-based coreference.

Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java?rev=1698371&r1=1698370&r2=1698371&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/EventCoreferenceAnnotator.java Fri Aug 28 16:44:43 2015
@@ -66,6 +66,7 @@ import org.cleartk.ml.jar.GenericJarClas
 
 public class EventCoreferenceAnnotator extends RelationExtractorAnnotator {
 
+  public static final String IDENTITY_RELATION = "Identity";
 
   public static final int DEFAULT_SENT_DIST = 5;
   public static final String PARAM_SENT_DIST = "SentenceDistance";
@@ -76,12 +77,19 @@ public class EventCoreferenceAnnotator e
   public static final String PARAM_PAR_SIM = "PararaphSimilarity";
   @ConfigurationParameter(name = PARAM_PAR_SIM, mandatory = false, description = "Similarity required to pair paragraphs for coreference")
   private double simThreshold = DEFAULT_PAR_SIM;
+
+  public static final boolean DEFAULT_SCORE_ALL = false;
+  public static final String PARAM_SCORE_ALL = "ScoreAllPairs";
+  @ConfigurationParameter(name = PARAM_SCORE_ALL, mandatory = false, description = "Whether to score all pairs (as in a feature detector")
+  private boolean scoreAll = DEFAULT_SCORE_ALL;
   
   private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
   private Map<Markable,Set<String>> markableEnts = null;
   private List<Markable> markablesByConfidence = null;
   private Map<Annotation,NonEmptyFSList> chains = null;
-
+  private double lastScore;
+  
+  
   private Logger logger = Logger.getLogger(EventCoreferenceAnnotator.class);
   
   public static AnalysisEngineDescription createDataWriterDescription(
@@ -110,6 +118,18 @@ public class EventCoreferenceAnnotator e
         modelPath);
   }
   
+  public static AnalysisEngineDescription createScoringAnnotatorDescription(String modelPath)
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createEngineDescription(
+        EventCoreferenceAnnotator.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        modelPath,
+        EventCoreferenceAnnotator.PARAM_SCORE_ALL,
+        true);
+  }
+  
   @Override
   protected List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> getFeatureExtractors() {
     List<RelationFeaturesExtractor<IdentifiedAnnotation,IdentifiedAnnotation>> featureExtractorList = new ArrayList<>();
@@ -157,7 +177,7 @@ public class EventCoreferenceAnnotator e
       markableEnts.put(m, getBestEnt(jCas, m));
     }
     super.process(jCas);
-    if(!this.isTraining()){
+    if(!this.isTraining() && !this.scoreAll){
       for(NonEmptyFSList chainHead : new HashSet<>(chains.values())){
         CollectionTextRelation chain = new CollectionTextRelation(jCas);
         chain.setMembers(chainHead);
@@ -453,7 +473,16 @@ public class EventCoreferenceAnnotator e
   protected String classify(List<Feature> features)
       throws CleartkProcessingException {
     numClassifications++;
-    return super.classify(features);
+    
+    String category = super.classifier.classify(features);
+    
+        
+    if(this.scoreAll && category.equals(NO_RELATION_CATEGORY)){
+      Map<String,Double> scores = super.classifier.score(features);
+      category = IDENTITY_RELATION;
+      this.lastScore = scores.get(IDENTITY_RELATION);
+    }
+    return category;
   }
   
   @Override
@@ -474,22 +503,17 @@ public class EventCoreferenceAnnotator e
       JCas jCas,
       IdentifiedAnnotation ante,
       IdentifiedAnnotation ana,
-      String predictedCategory) {
-    // check if its already been linked
-    if(!foundAnaphors.contains(ana)){
+      String predictedCategory) { 
+    if(this.scoreAll){
+      // do this first -- if we need to score all pairs then it doesn't really make sense to talk about
+      // "found anaphors" since we're not in finding mode.
+      CoreferenceRelation relation = buildRelation(jCas, ante, ana, predictedCategory);
+      relation.setConfidence(this.lastScore);
+      relation.addToIndexes();
+    } // check if its already been linked
+    else if(!foundAnaphors.contains(ana)){
       // add the relation to the CAS
-      RelationArgument relArg1 = new RelationArgument(jCas);
-      relArg1.setArgument(ante);
-      relArg1.setRole("Antecedent");
-      relArg1.addToIndexes();
-      RelationArgument relArg2 = new RelationArgument(jCas);
-      relArg2.setArgument(ana);
-      relArg2.setRole("Anaphor");
-      relArg2.addToIndexes();
-      CoreferenceRelation relation = new CoreferenceRelation(jCas);
-      relation.setArg1(relArg1);
-      relation.setArg2(relArg2);
-      relation.setCategory(predictedCategory);
+      CoreferenceRelation relation = buildRelation(jCas, ante, ana, predictedCategory);
       relation.addToIndexes();
       foundAnaphors.add(ana);
       if(!chains.containsKey(ante)){
@@ -524,6 +548,22 @@ public class EventCoreferenceAnnotator e
     }
   }
   
+  private CoreferenceRelation buildRelation(JCas jCas, Annotation ante, Annotation ana, String predictedCategory){
+    RelationArgument relArg1 = new RelationArgument(jCas);
+    relArg1.setArgument(ante);
+    relArg1.setRole("Antecedent");
+    relArg1.addToIndexes();
+    RelationArgument relArg2 = new RelationArgument(jCas);
+    relArg2.setArgument(ana);
+    relArg2.setRole("Anaphor");
+    relArg2.addToIndexes();
+    CoreferenceRelation relation = new CoreferenceRelation(jCas);
+    relation.setArg1(relArg1);
+    relation.setArg2(relArg2);
+    relation.setCategory(predictedCategory);
+    return relation;
+  }
+  
   @Override
   protected String getRelationCategory(
       Map<List<Annotation>, BinaryTextRelation> relationLookup,

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java?rev=1698371&r1=1698370&r2=1698371&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/MentionClusterCoreferenceAnnotator.java Fri Aug 28 16:44:43 2015
@@ -7,7 +7,6 @@ import java.util.Collection;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedHashSet;
-import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Random;
@@ -19,16 +18,19 @@ import org.apache.ctakes.coreference.ae.
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterDistSemExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSalienceFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSectionFeaturesExtractor;
+import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStackFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterStringFeaturesExtractor;
 import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterUMLSFeatureExtractor;
-import org.apache.ctakes.coreference.ae.features.cluster.MentionClusterSemTypeDepPrefsFeatureExtractor;
 import org.apache.ctakes.coreference.util.ClusterUtils;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
+import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator.IdentifiedAnnotationPair;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
 import org.apache.ctakes.typesystem.type.relation.CollectionTextRelationIdentifiedAnnotationRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
 import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
 import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -103,6 +105,7 @@ public class MentionClusterCoreferenceAn
   private Set<String> markableStrings = null;
   private Map<ConllDependencyNode,Collection<IdentifiedAnnotation>> nodeEntMap = null;
   private Map<String,Set<Markable>> headWordMarkables = null;
+  private Map<HashableArguments,Double> pairScores = null;
   
   protected List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> getFeatureExtractors() {
     List<RelationFeaturesExtractor<CollectionTextRelation,IdentifiedAnnotation>> extractors = new ArrayList<>();
@@ -321,6 +324,7 @@ public class MentionClusterCoreferenceAn
     markableStrings = new HashSet<>();
     nodeEntMap = JCasUtil.indexCovering(jCas, ConllDependencyNode.class, IdentifiedAnnotation.class);
     headWordMarkables = new HashMap<>();
+    pairScores = getMarkablePairScores(jCas);
     
     Map<CollectionTextRelationIdentifiedAnnotationPair, CollectionTextRelationIdentifiedAnnotationRelation> relationLookup;
     relationLookup = new HashMap<>();
@@ -364,7 +368,8 @@ public class MentionClusterCoreferenceAn
               features.addAll(feats);
             }
           }
-
+          
+          // here is where feature conjunctions can go (dupFeatures)
           List<Feature> dupFeatures = new ArrayList<>();
           // sanity check on feature values
           for (Feature feature : features) {
@@ -380,6 +385,42 @@ public class MentionClusterCoreferenceAn
             }
           }
           features.addAll(dupFeatures);
+          
+          // pairwise score features:
+          double minPairScore = 1.0;
+          double maxPairScore = 0.0;
+          double avePairScore = 0.0;
+          int numPairs=0;
+          for(Markable member : JCasUtil.select(cluster.getMembers(), Markable.class)){
+            if(member.getBegin() > mention.getBegin()) break;
+            numPairs++;
+            HashableArguments markablePair = new HashableArguments(member, mention);
+            Double score = pairScores.get(markablePair);
+            if(score == null){
+              markablePair = new HashableArguments(mention, member);
+              score = pairScores.get(markablePair);
+            }
+            if(score != null){
+              avePairScore += score;
+              if(score > maxPairScore){
+                maxPairScore = score;
+              }
+              if(score < minPairScore){
+                minPairScore = score;
+              }
+            }
+          }
+          features.add(new Feature("PAIRWISE_MAX", maxPairScore));
+          features.add(new Feature("PAIRWISE_MIN", minPairScore));
+          if(numPairs > 0){
+            avePairScore /= numPairs;
+          }else{
+            avePairScore = 0.0;
+          }
+          if(Double.isNaN(avePairScore)){
+            Logger.getLogger(MentionClusterCoreferenceAnnotator.class).error("Pairwise average feature found with value NaN");
+          }
+          features.add(new Feature("PAIRWISE_AVE", avePairScore));
 
           // during training, feed the features to the data writer
           if (this.isTraining()) {
@@ -565,6 +606,16 @@ public class MentionClusterCoreferenceAn
     return bestEnts;
   }
   
+  
+  public Map<HashableArguments, Double> getMarkablePairScores(JCas jCas){
+    Map<HashableArguments, Double> scoreMap = new HashMap<>();
+    for(CoreferenceRelation reln : JCasUtil.select(jCas, CoreferenceRelation.class)){
+      HashableArguments pair = new HashableArguments((IdentifiedAnnotation)reln.getArg1().getArgument(), (IdentifiedAnnotation)reln.getArg2().getArgument());
+      scoreMap.put(pair, reln.getConfidence());
+    }
+    return scoreMap;
+  }
+  
   public static class CollectionTextRelationIdentifiedAnnotationPair {
     private final CollectionTextRelation cluster;
     private final IdentifiedAnnotation mention;



Mime
View raw message