ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1498893 - in /ctakes/sandbox/ctakes-coref-cleartk: desc/analysis_engine/ src/main/java/org/apache/ctakes/coreference/ae/ src/main/java/org/apache/ctakes/coreference/ae/features/ src/main/java/org/apache/ctakes/coreference/eval/
Date Tue, 02 Jul 2013 11:53:54 GMT
Author: tmill
Date: Tue Jul  2 11:53:53 2013
New Revision: 1498893

URL: http://svn.apache.org/r1498893
Log:
Checked in a bunch of support code copied over from relation extractor.

Modified:
    ctakes/sandbox/ctakes-coref-cleartk/desc/analysis_engine/CoreferencePreprocessor.xml
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/desc/analysis_engine/CoreferencePreprocessor.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/desc/analysis_engine/CoreferencePreprocessor.xml?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/desc/analysis_engine/CoreferencePreprocessor.xml (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/desc/analysis_engine/CoreferencePreprocessor.xml Tue Jul  2 11:53:53 2013
@@ -5,6 +5,9 @@
     <delegateAnalysisEngine key="ConstituencyParserAnnotator">
       <import location="../../../ctakes-constituency-parser/desc/ConstituencyParserAnnotator.xml"/>
     </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="Chunker">
+      <import location="../../../ctakes-chunker/desc/Chunker.xml"/>
+    </delegateAnalysisEngine>
     <delegateAnalysisEngine key="TokenizerAnnotator">
       <import location="../../../ctakes-core/desc/analysis_engine/TokenizerAnnotator.xml"/>
     </delegateAnalysisEngine>
@@ -14,24 +17,21 @@
     <delegateAnalysisEngine key="SentenceDetectorAnnotator">
       <import location="../../../ctakes-core/desc/analysis_engine/SentenceDetectorAnnotator.xml"/>
     </delegateAnalysisEngine>
-    <delegateAnalysisEngine key="SimpleSegmentAnnotator">
-      <import location="../../../ctakes-core/desc/analysis_engine/SimpleSegmentAnnotator.xml"/>
-    </delegateAnalysisEngine>
-    <delegateAnalysisEngine key="POSTagger">
-      <import location="../../../ctakes-pos-tagger/desc/POSTagger.xml"/>
-    </delegateAnalysisEngine>
-    <delegateAnalysisEngine key="Chunker">
-      <import location="../../../ctakes-chunker/desc/Chunker.xml"/>
-    </delegateAnalysisEngine>
     <delegateAnalysisEngine key="LookupWindowAnnotator">
       <import location="../../../ctakes-clinical-pipeline/desc/analysis_engine/LookupWindowAnnotator.xml"/>
     </delegateAnalysisEngine>
     <delegateAnalysisEngine key="DictionaryLookupAnnotator">
-      <import location="../../../ctakes-dictionary-lookup/desc/analysis_engine/DictionaryLookupAnnotator.xml"/>
+      <import location="../../../ctakes-dictionary-lookup/desc/analysis_engine/DictionaryLookupAnnotatorUMLS.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="SimpleSegmentAnnotator">
+      <import location="../../../ctakes-core/desc/analysis_engine/SimpleSegmentAnnotator.xml"/>
     </delegateAnalysisEngine>
     <delegateAnalysisEngine key="LvgAnnotator">
       <import location="../../../ctakes-lvg/desc/analysis_engine/LvgAnnotator.xml"/>
     </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="POSTagger">
+      <import location="../../../ctakes-pos-tagger/desc/POSTagger.xml"/>
+    </delegateAnalysisEngine>
   </delegateAnalysisEngineSpecifiers>
   <analysisEngineMetaData>
     <name>CorefPreProcessor</name>

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/GoldCoreferenceReader.java Tue Jul  2 11:53:53 2013
@@ -9,6 +9,7 @@ import java.util.HashMap;
 
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.coreference.util.Span;
+import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
 import org.apache.ctakes.typesystem.type.relation.RelationArgument;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
@@ -41,6 +42,7 @@ public class GoldCoreferenceReader exten
 	public void process(JCas jcas) throws AnalysisEngineProcessException {
 		HashMap<String, Integer> goldSpan2id = new HashMap<String, Integer>();
 		ArrayList<Span> goldSpans = new ArrayList<Span>();
+		HashMap<String[], IdentifiedAnnotation> mentions = new HashMap<String[], IdentifiedAnnotation>();
 		
 		String docId = DocumentIDAnnotationUtil.getDocumentID(jcas);
 		File f = new File(goldDir + File.separator + docId);
@@ -51,18 +53,22 @@ public class GoldCoreferenceReader exten
 			String l;
 			while ((l = br.readLine())!=null) {
 				String[] spanPair = l.split("\\t");
-				IdentifiedAnnotation anteMention = new IdentifiedAnnotation(jcas);
+				IdentifiedAnnotation anteMention = null; //new IdentifiedAnnotation(jcas);
 				if (!goldSpan2id.containsKey(spanPair[0])){
 					goldSpan2id.put(spanPair[0], ++id);
 					String[] s = spanPair[0].split("[-:]");
 					int[] a = new int[s.length];
 					for (int i = 0; i < s.length; i++)
 						a[i] = Integer.parseInt(s[i]);
-//					goldSpans.add(new Span(a));
-					anteMention.setBegin(a[0]);
-					anteMention.setEnd(a[a.length-1]);
+					goldSpans.add(new Span(a));
+					anteMention = new IdentifiedAnnotation(jcas);
+	        anteMention.setBegin(a[0]);
+	        anteMention.setEnd(a[a.length-1]);					
+					mentions.put(spanPair, anteMention);
+				}else{
+				  anteMention = mentions.get(spanPair);
 				}
-				IdentifiedAnnotation anaMention = new IdentifiedAnnotation(jcas);
+				IdentifiedAnnotation anaMention = null; //new IdentifiedAnnotation(jcas);
 				if (!goldSpan2id.containsKey(spanPair[1])){
 					goldSpan2id.put(spanPair[1], ++id);
 					String[] s = spanPair[1].split("[-:]");
@@ -70,13 +76,23 @@ public class GoldCoreferenceReader exten
 					for (int i = 0; i < s.length; i++)
 						a[i] = Integer.parseInt(s[i]);
 					goldSpans.add(new Span(a));
+					anaMention = new IdentifiedAnnotation(jcas);
 					anaMention.setBegin(a[0]);
 					anaMention.setEnd(a[a.length-1]);
+					mentions.put(spanPair, anaMention);
+				}else{
+				  anaMention = mentions.get(spanPair);
 				}
+				
 				RelationArgument arg1 = new RelationArgument(jcas);
 				arg1.setArgument(anteMention);
+				arg1.setRole("antecedent");
 				RelationArgument arg2 = new RelationArgument(jcas);
+				arg2.setArgument(anaMention);
+				arg2.setRole("anaphor");
 				CoreferenceRelation rel = new CoreferenceRelation(jcas);
+				rel.setCategory("Identity");
+				rel.setDiscoveryTechnique(CONST.REL_DISCOVERY_TECH_GOLD_ANNOTATION);
 				rel.setArg1(arg1);
 				rel.setArg2(arg2);
 				rel.addToIndexes();

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java Tue Jul  2 11:53:53 2013
@@ -16,6 +16,7 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
@@ -30,6 +31,8 @@ import org.uimafit.util.JCasUtil;
 
 public class NamedEntityCoreferenceResolver extends RelationExtractorAnnotator {
 	
+  private static Logger logger = Logger.getLogger(NamedEntityCoreferenceResolver.class);
+
   public static AnalysisEngineDescription createDataWriterDescription(
       Class<? extends DataWriter<String>> dataWriterClass,
       File outputDirectory) throws ResourceInitializationException {
@@ -80,6 +83,7 @@ public class NamedEntityCoreferenceResol
 				if(contains(arg1, arg2) || contains(arg2, arg1)) continue;
 				
 				pairs.add(new IdentifiedAnnotationPair(arg1, arg2));
+//				logger.info(String.format("Adding candidate pair: (%s, %s)\n", arg1.getCoveredText(), arg2.getCoveredText()));
 			}
 		}
 		return pairs;
@@ -99,7 +103,7 @@ public class NamedEntityCoreferenceResol
 
 	private List<IdentifiedAnnotation> getDocumentMarkables(JCas jcas, Annotation coveringAnnotation) {
 	  List<IdentifiedAnnotation> mentions = new ArrayList<IdentifiedAnnotation>();
-		Collection<EntityMention> entityMentions = (JCasUtil.select(jcas, EntityMention.class));
+		Collection<EntityMention> entityMentions = JCasUtil.select(jcas, EntityMention.class);
 		Collection<EventMention> eventMentions = JCasUtil.select(jcas, EventMention.class);
 		mentions.addAll(entityMentions);
 		mentions.addAll(eventMentions);

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/DistanceFeatureExtractor.java Tue Jul  2 11:53:53 2013
@@ -20,9 +20,9 @@ public class DistanceFeatureExtractor im
 			IdentifiedAnnotation arg2) throws AnalysisEngineProcessException {
 		List<Feature> feats = new ArrayList<Feature>();
 		feats.add(new Feature("TOK_DIST",
-				  JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / CorefConst.TOKEN_DIST));
+				  JCasUtil.selectCovered(jCas, BaseToken.class, arg1.getBegin(), arg2.getEnd()).size() / (double)CorefConst.TOKEN_DIST));
 		feats.add(new Feature("SENT_DIST",
-				JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() / CorefConst.NE_DIST));
+				JCasUtil.selectCovered(jCas, Sentence.class, arg1.getBegin(), arg2.getEnd()).size() / (double) CorefConst.NE_DIST));
 		return feats;
 	}
 

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java Tue Jul  2 11:53:53 2013
@@ -1,17 +1,33 @@
 package org.apache.ctakes.coreference.eval;
 
 import java.io.File;
+import java.net.URI;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
+import java.util.Map;
+import java.util.Set;
 
 import org.apache.ctakes.coreference.ae.NamedEntityCoreferenceResolver;
-import org.apache.ctakes.coreference.eval.PreprocessAndWriteXmi.Options;
+import org.apache.ctakes.relationextractor.eval.RelationExtractorEvaluation.HashableArguments;
 import org.apache.ctakes.relationextractor.eval.XMIReader;
-import org.apache.ctakes.typesystem.type.syntax.Chunk;
-import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.Feature;
 import org.apache.uima.collection.CollectionReader;
 import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.TOP;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.CasCopier;
 import org.cleartk.classifier.jar.JarClassifierBuilder;
 import org.cleartk.classifier.libsvm.LIBSVMStringOutcomeDataWriter;
 import org.cleartk.eval.AnnotationStatistics;
@@ -20,12 +36,20 @@ import org.cleartk.util.Options_ImplBase
 import org.cleartk.util.ViewURIUtil;
 import org.kohsuke.args4j.Option;
 import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.JCasIterable;
 import org.uimafit.pipeline.SimplePipeline;
 import org.uimafit.util.JCasUtil;
 
+import com.google.common.base.Function;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import com.google.common.collect.Sets;
+
 public class EvaluationOfCoreferencePairs extends
 		Evaluation_ImplBase<File, AnnotationStatistics<String>> {
   public static class Options extends Options_ImplBase {
@@ -41,14 +65,25 @@ public class EvaluationOfCoreferencePair
         usage = "specify the directory contraining the xmis for the training partition",
         required = true)
     public File traingDirectory;
+    
+    @Option(name = "--print-errors", required = false)
+    public boolean printErrors=false;
+    
+    @Option(name = "--print-relations", required = false)
+    public boolean printRelations=false;
+
   }
   
 	public static final String GOLD_VIEW_NAME = "GOLD_VIEW";
 //  private boolean xmiExists = false;
 //  private File xmiDirectory = null;
+  private boolean printErrors=false;
+  private boolean printRelations=false;
   
-	public EvaluationOfCoreferencePairs(File baseDirectory) {
+	public EvaluationOfCoreferencePairs(File baseDirectory, boolean printErrors, boolean printRelations) {
 		super(baseDirectory);
+		this.printErrors = printErrors;
+		this.printRelations = printRelations;
 	}
 
 	@Override
@@ -73,8 +108,8 @@ public class EvaluationOfCoreferencePair
 			throws Exception {
 	  AggregateBuilder aggregateBuilder = new AggregateBuilder();
  
-	  // TODO: Annotator to add everything to gold standard view?
-	  
+//	  aggregateBuilder.add(RemoveSystemMarkables.class);
+    aggregateBuilder.add(CopyFromGold.getDescription(CoreferenceRelation.class));
 	  aggregateBuilder.add(
 	      NamedEntityCoreferenceResolver.createDataWriterDescription(
 	          LIBSVMStringOutcomeDataWriter.class,
@@ -95,289 +130,108 @@ public class EvaluationOfCoreferencePair
 	@Override
 	protected AnnotationStatistics<String> test(CollectionReader collectionReader,
 			File directory) throws Exception {
-		// TODO Auto-generated method stub
-		return null;
-	}
-
-//  protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
-//    return this.xmiExists
-//        ? this.getXMIReadingPreprocessorAggregateBuilder()
-//        : this.getXMIWritingPreprocessorAggregateBuilder();
-//  }
-//
-//  protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
-//    AggregateBuilder aggregateBuilder = new AggregateBuilder();
-//    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
-//    return aggregateBuilder;
-//  }
-
-	/*
-  protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
-      throws Exception {
     AggregateBuilder aggregateBuilder = new AggregateBuilder();
-    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
-
-    // read manual annotations into gold view
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        ViewCreatorAnnotator.class,
-        ViewCreatorAnnotator.PARAM_VIEW_NAME,
-        GOLD_VIEW_NAME));
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        ViewTextCopierAnnotator.class,
-        ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
-        CAS.NAME_DEFAULT_SOFA,
-        ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
-        GOLD_VIEW_NAME));
-//    aggregateBuilder.add(
-//        THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory),
-//        CAS.NAME_DEFAULT_SOFA,
-//        GOLD_VIEW_NAME);
-
-    // identify segments
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
-    // identify sentences
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        SentenceDetector.class,
-        "MaxentModel",
-        ExternalResourceFactory.createExternalResourceDescription(
-            SuffixMaxentModelResourceImpl.class,
-            FileLocator.locateFile("org/apache/ctakes/core/sentdetect/sdmed.mod").toURI().toURL())));
-    // identify tokens
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
-    // merge some tokens
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
-
-    // identify part-of-speech tags
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        POSTagger.class,
-        TypeSystemDescriptionFactory.createTypeSystemDescription(),
-        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
-        POSTagger.POS_MODEL_FILE_PARAM,
-        "org/apache/ctakes/postagger/models/mayo-pos.zip",
-        POSTagger.TAG_DICTIONARY_PARAM,
-        "org/apache/ctakes/postagger/models/tag.dictionary.txt",
-        POSTagger.CASE_SENSITIVE_PARAM,
-        true));
-
-    // identify chunks
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        Chunker.class,
-        Chunker.CHUNKER_MODEL_FILE_PARAM,
-        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunk-model.claims-1.5.zip"),
-        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
-        DefaultChunkCreator.class));
-
-    // identify UMLS named entities
-
-    // adjust NP in NP NP to span both
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        ChunkAdjuster.class,
-        ChunkAdjuster.PARAM_CHUNK_PATTERN,
-        new String[] { "NP", "NP" },
-        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-        1));
-    // adjust NP in NP PP NP to span all three
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        ChunkAdjuster.class,
-        ChunkAdjuster.PARAM_CHUNK_PATTERN,
-        new String[] { "NP", "PP", "NP" },
-        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
-        2));
-    // add lookup windows for each NP
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
-    // maximize lookup windows
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        OverlapAnnotator.class,
-        "A_ObjectClass",
-        LookupWindowAnnotation.class,
-        "B_ObjectClass",
-        LookupWindowAnnotation.class,
-        "OverlapType",
-        "A_ENV_B",
-        "ActionType",
-        "DELETE",
-        "DeleteAction",
-        new String[] { "selector=B" }));
-    // add UMLS on top of lookup windows
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        UmlsDictionaryLookupAnnotator.class,
-        "ctakes.umlsaddr",
-        "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
-        "ctakes.umlsvendor",
-        "NLM-6515182895",
-        "LookupDescriptor",
-        ExternalResourceFactory.createExternalResourceDescription(
-            FileResourceImpl.class,
-            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/LookupDesc_Db.xml").getAbsoluteFile()),
-        "DbConnection",
-        ExternalResourceFactory.createExternalResourceDescription(
-            JdbcConnectionResourceImpl.class,
-            "",
-            JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
-            "org.hsqldb.jdbcDriver",
-            JdbcConnectionResourceImpl.PARAM_URL,
-            // Should be the following but it's WAY too slow
-            // "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
-            "jdbc:hsqldb:file:target/unpacked/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
-        "RxnormIndexReader",
-        ExternalResourceFactory.createExternalResourceDescription(
-            LuceneIndexReaderResourceImpl.class,
-            "",
-            "UseMemoryIndex",
-            true,
-            "IndexDirectory",
-            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/rxnorm_index").getAbsoluteFile()),
-        "OrangeBookIndexReader",
-        ExternalResourceFactory.createExternalResourceDescription(
-            LuceneIndexReaderResourceImpl.class,
-            "",
-            "UseMemoryIndex",
-            true,
-            "IndexDirectory",
-            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/OrangeBook").getAbsoluteFile())));
-
-    // add lvg annotator
-    String[] XeroxTreebankMap = {
-        "adj|JJ",
-        "adv|RB",
-        "aux|AUX",
-        "compl|CS",
-        "conj|CC",
-        "det|DET",
-        "modal|MD",
-        "noun|NN",
-        "prep|IN",
-        "pron|PRP",
-        "verb|VB" };
-    String[] ExclusionSet = {
-        "and",
-        "And",
-        "by",
-        "By",
-        "for",
-        "For",
-        "in",
-        "In",
-        "of",
-        "Of",
-        "on",
-        "On",
-        "the",
-        "The",
-        "to",
-        "To",
-        "with",
-        "With" };
-    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
-        LvgAnnotator.class,
-        "UseSegments",
-        false,
-        "SegmentsToSkip",
-        new String[0],
-        "UseCmdCache",
-        false,
-        "CmdCacheFileLocation",
-        "/org/apache/ctakes/lvg/2005_norm.voc",
-        "CmdCacheFrequencyCutoff",
-        20,
-        "ExclusionSet",
-        ExclusionSet,
-        "XeroxTreebankMap",
-        XeroxTreebankMap,
-        "LemmaCacheFileLocation",
-        "/org/apache/ctakes/lvg/2005_lemma.voc",
-        "UseLemmaCache",
-        false,
-        "LemmaCacheFrequencyCutoff",
-        20,
-        "PostLemmas",
-        true,
-        "LvgCmdApi",
-        ExternalResourceFactory.createExternalResourceDescription(
-            LvgCmdApiResourceImpl.class,
-            new File(LvgCmdApiResourceImpl.class.getResource(
-                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
-    aggregateBuilder.add(lvgAnnotator);
-
-    // add dependency parser
-//    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class));
-
-    // add semantic role labeler
-//    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class));
-
-    // add constituency parser
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
-
-    // write out the CAS after all the above annotations
-    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
-        XMIWriter.class,
-        XMIWriter.PARAM_XMI_DIRECTORY,
-        this.xmiDirectory));
-
-    return aggregateBuilder;
-  }
-
-  public static class XMIWriter extends JCasAnnotator_ImplBase {
-
-    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
-
-    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
-    private File xmiDirectory;
+    
+    aggregateBuilder.add(CopyFromGold.getDescription(EventMention.class, EntityMention.class));
+    aggregateBuilder.add(
+        NamedEntityCoreferenceResolver.createAnnotatorDescription(directory)
+            );
+    AnnotationStatistics<String> stats = new AnnotationStatistics<String>();
+    Function<CoreferenceRelation, ?> getSpan = new Function<CoreferenceRelation, HashableArguments>() {
+      public HashableArguments apply(CoreferenceRelation relation) {
+        return new HashableArguments(relation);
+      }
+    };
+    Function<CoreferenceRelation, String> getOutcome = AnnotationStatistics.annotationToFeatureValue("category");
 
-    @Override
-    public void initialize(UimaContext context) throws ResourceInitializationException {
-      super.initialize(context);
-      if (!this.xmiDirectory.exists()) {
-        this.xmiDirectory.mkdirs();
+    for (JCas jCas : new JCasIterable(collectionReader, aggregateBuilder.createAggregate())) {
+      JCas goldView = jCas.getView(GOLD_VIEW_NAME);
+      JCas systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      Collection<CoreferenceRelation> goldRelations = JCasUtil.select(
+          goldView,
+          CoreferenceRelation.class);
+      Collection<CoreferenceRelation> systemRelations = JCasUtil.select(
+          systemView,
+          CoreferenceRelation.class);
+      stats.add(goldRelations, systemRelations, getSpan, getOutcome);
+
+      if(this.printRelations){
+        URI uri = ViewURIUtil.getURI(jCas);
+        String[] path = uri.getPath().split("/");
+        printRelationAnnotations(path[path.length - 1], systemRelations);
       }
-    }
 
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
-      try {
-        FileOutputStream outputStream = new FileOutputStream(xmiFile);
-        try {
-          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
-          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
-          serializer.serialize(jCas.getCas(), handler);
-        } finally {
-          outputStream.close();
+      if(this.printErrors){
+        Map<HashableArguments, BinaryTextRelation> goldMap = Maps.newHashMap();
+        for (BinaryTextRelation relation : goldRelations) {
+          goldMap.put(new HashableArguments(relation), relation);
+        }
+        Map<HashableArguments, BinaryTextRelation> systemMap = Maps.newHashMap();
+        for (BinaryTextRelation relation : systemRelations) {
+          systemMap.put(new HashableArguments(relation), relation);
+        }
+        Set<HashableArguments> all = Sets.union(goldMap.keySet(), systemMap.keySet());
+        List<HashableArguments> sorted = Lists.newArrayList(all);
+        Collections.sort(sorted);
+        for (HashableArguments key : sorted) {
+          BinaryTextRelation goldRelation = goldMap.get(key);
+          BinaryTextRelation systemRelation = systemMap.get(key);
+          if (goldRelation == null) {
+            System.out.println("System added: " + formatRelation(systemRelation));
+          } else if (systemRelation == null) {
+            System.out.println("System dropped: " + formatRelation(goldRelation));
+          } else if (!systemRelation.getCategory().equals(goldRelation.getCategory())) {
+            String label = systemRelation.getCategory();
+            System.out.printf("System labeled %s for %s\n", label, formatRelation(systemRelation));
+          }
         }
-      } catch (SAXException e) {
-        throw new AnalysisEngineProcessException(e);
-      } catch (IOException e) {
-        throw new AnalysisEngineProcessException(e);
       }
     }
+    return stats;
+
+	}
+	
+  private static String formatRelation(BinaryTextRelation relation) {
+    IdentifiedAnnotation arg1 = (IdentifiedAnnotation)relation.getArg1().getArgument();
+    IdentifiedAnnotation arg2 = (IdentifiedAnnotation)relation.getArg2().getArgument();
+    String text = arg1.getCAS().getDocumentText();
+    int begin = Math.min(arg1.getBegin(), arg2.getBegin());
+    int end = Math.max(arg1.getBegin(), arg2.getBegin());
+    begin = Math.max(0, begin - 50);
+    end = Math.min(text.length(), end + 50);
+    return String.format(
+        "%s(%s(type=%d), %s(type=%d)) in ...%s...",
+        relation.getCategory(),
+        arg1.getCoveredText(),
+        arg1.getTypeID(),
+        arg2.getCoveredText(),
+        arg2.getTypeID(),
+        text.substring(begin, end).replaceAll("[\r\n]", " "));
   }
-  */
-/*
-  public static class XMIReader extends JCasAnnotator_ImplBase {
 
-    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+  private static void printRelationAnnotations(String fileName, Collection<? extends BinaryTextRelation> relations) {
 
-    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
-    private File xmiDirectory;
+    for(BinaryTextRelation binaryTextRelation : relations) {
 
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
-      try {
-        FileInputStream inputStream = new FileInputStream(xmiFile);
-        try {
-          XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
-        } finally {
-          inputStream.close();
-        }
-      } catch (SAXException e) {
-        throw new AnalysisEngineProcessException(e);
-      } catch (IOException e) {
-        throw new AnalysisEngineProcessException(e);
-      }
+      Annotation arg1 = binaryTextRelation.getArg1().getArgument();
+      Annotation arg2 = binaryTextRelation.getArg2().getArgument();
+
+      String arg1Type = arg1.getClass().getSimpleName();
+      String arg2Type = arg2.getClass().getSimpleName();
+
+      int arg1Begin = arg1.getBegin();
+      int arg1End = arg1.getEnd();
+      int arg2Begin = arg2.getBegin();
+      int arg2End = arg2.getEnd();
+
+      String category = binaryTextRelation.getCategory();
+
+      System.out.format("%s\t%s\t%s\t%d\t%d\t%s\t%d\t%d\n", 
+          fileName, category, arg1Type, arg1Begin, arg1End, arg2Type, arg2Begin, arg2End);
     }
   }
-  */
+
+
   static File getXMIFile(File xmiDirectory, File textFile) {
     return new File(xmiDirectory, textFile.getName() + ".xmi");
   }
@@ -386,13 +240,46 @@ public class EvaluationOfCoreferencePair
     return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath()));
   }
 
-  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+  public static class CopyFromGold extends JCasAnnotator_ImplBase {
+
+    public static AnalysisEngineDescription getDescription(Class<?>... classes)
+        throws ResourceInitializationException {
+      return AnalysisEngineFactory.createPrimitiveDescription(
+          CopyFromGold.class,
+          CopyFromGold.PARAM_ANNOTATION_CLASSES,
+          classes);
+    }
+
+    public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses";
+
+    @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true)
+    private Class<? extends TOP>[] annotationClasses;
 
     @Override
     public void process(JCas jCas) throws AnalysisEngineProcessException {
-      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
-        if (chunk.getChunkType().equals("NP")) {
-          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+      JCas goldView, systemView;
+      try {
+        goldView = jCas.getView(GOLD_VIEW_NAME);
+        systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA);
+      } catch (CASException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+      for (Class<? extends TOP> annotationClass : this.annotationClasses) {
+        for (TOP annotation : Lists.newArrayList(JCasUtil.select(systemView, annotationClass))) {
+          if (annotation.getClass().equals(annotationClass)) {
+            annotation.removeFromIndexes();
+          }
+        }
+      }
+      CasCopier copier = new CasCopier(goldView.getCas(), systemView.getCas());
+      Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA);
+      for (Class<? extends TOP> annotationClass : this.annotationClasses) {
+        for (TOP annotation : JCasUtil.select(goldView, annotationClass)) {
+          TOP copy = (TOP) copier.copyFs(annotation);
+          if (copy instanceof Annotation) {
+            copy.setFeatureValue(sofaFeature, systemView.getSofa());
+          }
+          copy.addToIndexes(systemView);
         }
       }
     }
@@ -408,7 +295,7 @@ public class EvaluationOfCoreferencePair
 	  List<File> trainItems = getFiles(options.traingDirectory);
 	  List<File> testItems = getFiles(options.testDirectory);
 	  
-	  EvaluationOfCoreferencePairs eval = new EvaluationOfCoreferencePairs(new File("target/models/"));
+	  EvaluationOfCoreferencePairs eval = new EvaluationOfCoreferencePairs(new File("target/models/"), options.printErrors, options.printRelations);
 	  AnnotationStatistics<String> stats = eval.trainAndTest(trainItems, testItems);
 	  System.err.println(stats);
 	}

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java?rev=1498893&r1=1498892&r2=1498893&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java (original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/PreprocessAndWriteXmi.java Tue Jul  2 11:53:53 2013
@@ -21,11 +21,36 @@ package org.apache.ctakes.coreference.ev
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
+import java.net.URISyntaxException;
 
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
 import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.core.resource.SuffixMaxentModelResourceImpl;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.coreference.ae.GoldCoreferenceReader;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.uima.UIMAException;
 import org.apache.uima.UIMAFramework;
 import org.apache.uima.UimaContext;
@@ -44,16 +69,22 @@ import org.apache.uima.util.XMLParser;
 import org.apache.uima.util.XMLSerializer;
 import org.cleartk.util.Options_ImplBase;
 import org.cleartk.util.ViewURIUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
 import org.cleartk.util.cr.FilesCollectionReader;
 import org.kohsuke.args4j.Option;
 import org.uimafit.component.JCasAnnotator_ImplBase;
 import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.component.ViewTextCopierAnnotator;
 import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.factory.AggregateBuilder;
 import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.CollectionReaderFactory;
 import org.uimafit.factory.ConfigurationParameterFactory;
+import org.uimafit.factory.ExternalResourceFactory;
+import org.uimafit.factory.TypePrioritiesFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
 import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -88,7 +119,7 @@ public class PreprocessAndWriteXmi {
 	}
 	
 
-	public static void main(String[] args) throws IOException, UIMAException {
+	public static void main(String[] args) throws IOException, UIMAException, URISyntaxException {
 	    Options options = new Options();
 	    options.parseOptions(args);
     
@@ -117,15 +148,14 @@ public class PreprocessAndWriteXmi {
 	}
 
 	
-	public static AnalysisEngine createPreprocessingAE(File preprocessDescFile) throws IOException, InvalidXMLException, ResourceInitializationException {
+	public static AnalysisEngine createPreprocessingAE(File preprocessDescFile) throws IOException, InvalidXMLException, ResourceInitializationException, URISyntaxException {
 	    // create the pre-processing pipeline
 	      XMLParser parser = UIMAFramework.getXMLParser();
 	      XMLInputSource source = new XMLInputSource(preprocessDescFile);
 	      AnalysisEngineDescription desc = parser.parseAnalysisEngineDescription(source);
 	      return UIMAFramework.produceAnalysisEngine(desc);
 	}
-	
-	
+
 	public static AnalysisEngine createGoldAnnotator(File goldRoot)
 			throws ResourceInitializationException {
 		// pipeline to read manual annotations into the gold view, not the default view
@@ -223,4 +253,16 @@ public class PreprocessAndWriteXmi {
 
   }
 
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        if (chunk.getChunkType().equals("NP")) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
 }



Mime
View raw message