ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1716147 - in /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data: Analyzer_ImplBase.java CoreferenceAttributeAnalyzer.java CoreferenceLinkDistanceAnalyzer.java
Date Tue, 24 Nov 2015 14:24:32 GMT
Author: tmill
Date: Tue Nov 24 14:24:32 2015
New Revision: 1716147

URL: http://svn.apache.org/viewvc?rev=1716147&view=rev
Log:
New code to look at consistency of coref attributes.

Added:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/Analyzer_ImplBase.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceAttributeAnalyzer.java
Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceLinkDistanceAnalyzer.java

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/Analyzer_ImplBase.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/Analyzer_ImplBase.java?rev=1716147&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/Analyzer_ImplBase.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/Analyzer_ImplBase.java
Tue Nov 24 14:24:32 2015
@@ -0,0 +1,54 @@
+package org.apache.ctakes.coreference.data;
+
+import java.io.File;
+import java.io.FilenameFilter;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.ctakes.temporal.eval.CommandLine;
+import org.apache.ctakes.temporal.eval.THYMEData;
+import org.apache.log4j.Logger;
+
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Shared base for the coreference corpus analyzers in this package. It declares
+ * the command-line options they accept and a helper that picks the text files
+ * belonging to the training portion of the corpus.
+ */
+public abstract class Analyzer_ImplBase {
+  /**
+   * Command-line options shared by the analyzers. Subclasses obtain an instance
+   * through {@code CliFactory.parseArguments(Options.class, args)}.
+   */
+  static interface Options {
+
+    @Option(
+        shortName = "i",
+        description = "specify the path to the directory containing the text files")
+    public File getInputDirectory();
+    
+    @Option(
+        shortName = "x",
+        description = "Specify the path to the directory containing the xmis")
+    public File getXMIDirectory();
+    
+    @Option( longName = "patients" )
+    public CommandLine.IntegerRanges getPatients();
+  }
+
+  static Logger logger = Logger.getLogger("Coref Analyzer");
+  
+  /** Suffix of the XMI files; removing it from an XMI file name gives the text file name. */
+  private static final String XMI_SUFFIX = ".xmi";
+  
+  /**
+   * Collects the text files whose XMI counterpart belongs to a training patient.
+   * <p>
+   * Every file in {@code xmiDir} whose name ends in {@code .xmi} is inspected. The
+   * patient id is read from characters 2-4 of the file name, and the file is kept
+   * when that id is contained in the list returned by
+   * {@code THYMEData.getPatientSets(patientSets, THYMEData.TRAIN_REMAINDERS)}.
+   * XMI files whose name carries no parsable id at that position are skipped with
+   * a warning instead of aborting the whole run.
+   *
+   * @param patientSets patient ids to choose from (passed through to {@code THYMEData})
+   * @param textDir directory holding the raw text files
+   * @param xmiDir directory holding the XMI files
+   * @return the selected files, each resolved against {@code textDir} with the
+   *         {@code .xmi} suffix removed; never {@code null}
+   * @throws IllegalArgumentException if {@code xmiDir} cannot be listed
+   */
+  public static Collection<File> getTrainFiles(List<Integer> patientSets, File textDir, File xmiDir){
+    List<Integer> trainItems = THYMEData.getPatientSets( patientSets, THYMEData.TRAIN_REMAINDERS);
+
+    Collection<File> files = new HashSet<>();
+    
+    File[] xmiFiles = xmiDir.listFiles(new FilenameFilter(){
+
+      @Override
+      public boolean accept(File dir, String name) {
+        return name.endsWith(XMI_SUFFIX);
+      }});
+    
+    // File.listFiles returns null (not an empty array) when the path is not a
+    // directory or cannot be read; fail with a message that names the path.
+    if(xmiFiles == null){
+      throw new IllegalArgumentException(
+          "Cannot list XMI directory (not a directory, or an I/O error occurred): " + xmiDir);
+    }
+    
+    for(File xmiFile : xmiFiles){
+      String name = xmiFile.getName();
+      Integer pid = parsePatientId(name);
+      if(pid == null){
+        logger.warn("Skipping XMI file without a numeric patient id at characters 2-4: " + name);
+        continue;
+      }
+      if(trainItems.contains(pid)){
+        files.add(new File(textDir, name.substring(0, name.length() - XMI_SUFFIX.length())));
+      }
+    }
+    return files;
+  }
+
+  /** Reads the patient id from characters 2-4 of a file name; returns null when absent or non-numeric. */
+  private static Integer parsePatientId(String name){
+    if(name.length() < 5){
+      return null;
+    }
+    try{
+      return Integer.valueOf(name.substring(2, 5));
+    }catch(NumberFormatException e){
+      return null;
+    }
+  }
+}

Added: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceAttributeAnalyzer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceAttributeAnalyzer.java?rev=1716147&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceAttributeAnalyzer.java
(added)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceAttributeAnalyzer.java
Tue Nov 24 14:24:32 2015
@@ -0,0 +1,152 @@
+package org.apache.ctakes.coreference.data;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.coreference.ae.DeterministicMarkableAnnotator;
+import org.apache.ctakes.coreference.ae.MarkableSalienceAnnotator;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.CopyCoreferenceRelations;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.DocumentIDPrinter;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.RelationPropagator;
+import org.apache.ctakes.coreference.eval.EvaluationOfEventCoreference.RemovePersonMarkables;
+import org.apache.ctakes.dependency.parser.util.DependencyUtility;
+import org.apache.ctakes.temporal.ae.BackwardsTimeAnnotator;
+import org.apache.ctakes.temporal.ae.DocTimeRelAnnotator;
+import org.apache.ctakes.temporal.ae.EventAnnotator;
+import org.apache.ctakes.temporal.eval.CommandLine;
+import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMIReader;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.uima.UIMAException;
+import org.apache.uima.collection.CollectionReaderDescription;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.JCasIterable;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Command-line tool that measures how consistently the members of each
+ * coreference chain agree on polarity (negation) and uncertainty.
+ * <p>
+ * For every {@link CollectionTextRelation} in the training documents, each
+ * member markable is mapped to an {@link IdentifiedAnnotation} sharing its
+ * nominal head. A chain "agrees" on an attribute when either none or all of
+ * the mapped members carry it. Disagreeing chains are logged and summary
+ * counts are printed to standard output.
+ */
+public class CoreferenceAttributeAnalyzer extends Analyzer_ImplBase{
+  
+  public static final String GOLD_VIEW_NAME = "GoldView";
+  
+  /**
+   * Builds the annotation pipeline, runs it over the training files selected by
+   * the command-line options, and prints the agreement statistics.
+   *
+   * @param args command-line arguments, parsed into {@code Options}
+   * @throws UIMAException if the pipeline cannot be created or fails while running
+   * @throws IOException declared for the reader/pipeline setup
+   */
+  public static void main(String[] args) throws UIMAException, IOException {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    CollectionReaderDescription reader = UriCollectionReader.getDescriptionFromFiles(
+        getTrainFiles(options.getPatients().getList(), options.getInputDirectory(), options.getXMIDirectory()));
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIReader.class,
+        XMIReader.PARAM_XMI_DIRECTORY,
+        options.getXMIDirectory()));
+    aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDPrinter.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RelationPropagator.class));
+    aggregateBuilder.add(EventAnnotator.createAnnotatorDescription());
+    aggregateBuilder.add(BackwardsTimeAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/timeannotator/model.jar"));
+    aggregateBuilder.add(DocTimeRelAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/doctimerel/model.jar"));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DeterministicMarkableAnnotator.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(RemovePersonMarkables.class));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyCoreferenceRelations.class,
+        CopyCoreferenceRelations.PARAM_GOLD_VIEW, GOLD_VIEW_NAME));
+    aggregateBuilder.add(MarkableSalienceAnnotator.createAnnotatorDescription("/org/apache/ctakes/temporal/ae/salience/model.jar"));
+
+    int numNegChainMatches = 0;
+    int numUncChainMatches = 0;
+    int numChains = 0;
+    
+    for(JCas jcas : new JCasIterable(reader, aggregateBuilder.createAggregateDescription())){
+      // Per-document cache. A null value is stored on purpose so that markables
+      // without a matching entity are not looked up again; hence containsKey/put
+      // rather than computeIfAbsent, which would not remember a null result.
+      Map<Markable,IdentifiedAnnotation> mark2ent = new HashMap<>();
+      for(CollectionTextRelation chain : JCasUtil.select(jcas, CollectionTextRelation.class)){
+        // A chain without members cannot be cast to NonEmptyFSList and would
+        // contribute nothing to the counts below, so skip it.
+        if(!(chain.getMembers() instanceof NonEmptyFSList)){
+          continue;
+        }
+        int numNeg = 0;
+        int numUnc = 0;
+        int numMarkables = 0;
+        // First element of the chain; only used to identify the chain in log messages.
+        Markable head = (Markable) ((NonEmptyFSList)chain.getMembers()).getHead();
+        for(Markable member : JCasUtil.select(chain.getMembers(), Markable.class)){
+          if(!mark2ent.containsKey(member)){
+            mark2ent.put(member, getSameHeadEntity(jcas, member));
+          }
+          IdentifiedAnnotation ent = mark2ent.get(member);
+          if(ent == null) continue;
+          
+          numMarkables++;
+          if(ent.getPolarity() == CONST.NE_POLARITY_NEGATION_PRESENT){
+            numNeg++;
+          }
+          if(ent.getUncertainty() == CONST.NE_UNCERTAINTY_PRESENT){
+            numUnc++;
+          }
+        }
+        // only bother if some of the chains had markables corresponding to named entities:
+        if(numMarkables > 0){
+          numChains++;
+          // Agreement means the attribute is set on none or on all mapped members.
+          if(numNeg == 0 || numNeg == numMarkables){
+            numNegChainMatches++;
+          }else{
+            logger.warn("Found a chain with disagreement over negation: " +
+               String.format("First element: \"%s\", span: (%d, %d)",
+                   head.getCoveredText(), head.getBegin(), head.getEnd()));
+          }
+          if(numUnc == 0 || numUnc == numMarkables){
+            numUncChainMatches++;
+          }else{
+            logger.warn("Found a chain with disagreement over uncertainty: " +
+               String.format("First element: \"%s\", span: (%d, %d)",
+                   head.getCoveredText(), head.getBegin(), head.getEnd()));
+          }
+        }
+      }
+      
+    }
+    
+    // print out some statistics:
+    System.out.println(String.format("Negation: There are %d chains in the corpus with some UMLS named entity element and %d of them negation status agrees",
+        numChains, numNegChainMatches));
+    // This line reports the uncertainty counts, so it is labelled accordingly.
+    System.out.println(String.format("Uncertainty: There are %d chains in the corpus with some UMLS named entity element and %d of them uncertainty status agrees",
+        numChains, numUncChainMatches));
+    
+  }
+
+  /**
+   * Finds an annotation covered by the markable that has the same nominal head
+   * node as the markable itself.
+   *
+   * @param jcas the CAS containing both the markable and the candidate annotations
+   * @param member the markable to find a counterpart for
+   * @return the matching annotation with the smallest begin offset, or
+   *         {@code null} when no covered annotation shares the markable's head
+   */
+  private static IdentifiedAnnotation getSameHeadEntity(JCas jcas,
+      Markable member) {
+    IdentifiedAnnotation bestEnt = null;
+    ConllDependencyNode memberHead = DependencyUtility.getNominalHeadNode(jcas, member);
+    
+    for(IdentifiedAnnotation ent : JCasUtil.selectCovered(IdentifiedAnnotation.class, member)){
+      // must be one of the UMLS semantic types
+      // NOTE(review): the original tested "instanceof EntityMention" twice, so the
+      // duplicate operand was dropped without changing behaviour. As written this
+      // SKIPS EntityMention instances, which seems at odds with the comment above;
+      // the second operand may have been meant to be a different type and/or the
+      // test negated -- confirm the intended filter.
+      if(ent instanceof EntityMention) continue;
+      
+      ConllDependencyNode entHead = DependencyUtility.getNominalHeadNode(jcas, ent);
+      if(entHead == memberHead){
+        // this entity has the same head as the markable
+        if(bestEnt == null){
+          // first entity that has the same head as the markable
+          bestEnt = ent;
+        }
+        else{
+          // there is already an entity with the same head as this markable -- see if this one is bigger
+          // (Bigger is preferable)
+          // An earlier begin offset is used as the stand-in for "bigger";
+          // end offsets are not compared.
+          if(ent.getBegin() < bestEnt.getBegin()){
+            bestEnt = ent;
+          }
+        }
+      }
+    }
+    return bestEnt;
+  }
+  
+}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceLinkDistanceAnalyzer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceLinkDistanceAnalyzer.java?rev=1716147&r1=1716146&r2=1716147&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceLinkDistanceAnalyzer.java
(original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/data/CoreferenceLinkDistanceAnalyzer.java
Tue Nov 24 14:24:32 2015
@@ -1,7 +1,5 @@
 package org.apache.ctakes.coreference.data;
 
-import java.io.File;
-import java.io.FilenameFilter;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -40,27 +38,14 @@ import org.cleartk.util.ae.UriToDocument
 import org.cleartk.util.cr.UriCollectionReader;
 
 import com.lexicalscope.jewel.cli.CliFactory;
-import com.lexicalscope.jewel.cli.Option;
 
-public class CoreferenceLinkDistanceAnalyzer {
-  static interface Options {
-
-    @Option(
-        shortName = "i",
-        description = "specify the path to the directory containing the text files")
-    public File getInputDirectory();
-    
-    @Option(
-        shortName = "x",
-        description = "Specify the path to the directory containing the xmis")
-    public File getXMIDirectory();
-  }
+public class CoreferenceLinkDistanceAnalyzer extends Analyzer_ImplBase {
   
   public static final String GOLD_VIEW_NAME = "GoldView";
   
   public static void main(String[] args) throws UIMAException, IOException {
     Options options = CliFactory.parseArguments(Options.class, args);
-    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(getFiles(options.getInputDirectory(),
options.getXMIDirectory()));
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(getTrainFiles(options.getPatients().getList(),
options.getInputDirectory(), options.getXMIDirectory()));
     AggregateBuilder aggregateBuilder = new AggregateBuilder();
     aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
     aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
@@ -283,20 +268,4 @@ public class CoreferenceLinkDistanceAnal
     sim = sim / (v1norm * v2norm);
     return sim;
   }
-  
-  public static Collection<File> getFiles(File textDir, File xmiDir){
-    Collection<File> files = new HashSet<>();
-    
-    File[] xmiFiles = xmiDir.listFiles(new FilenameFilter(){
-
-      public boolean accept(File dir, String name) {
-        return name.endsWith("xmi");
-      }});
-    
-    for(File xmiFile : xmiFiles){
-      String name = xmiFile.getName();
-      files.add(new File(textDir, name.substring(0, name.length()-4)));
-    }
-    return files;
-  }
 }



Mime
View raw message