Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 9766211364 for ; Thu, 31 Jul 2014 17:58:43 +0000 (UTC) Received: (qmail 57959 invoked by uid 500); 31 Jul 2014 17:58:43 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 57922 invoked by uid 500); 31 Jul 2014 17:58:43 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 57913 invoked by uid 99); 31 Jul 2014 17:58:43 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 31 Jul 2014 17:58:43 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 31 Jul 2014 17:58:39 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 62562238918A; Thu, 31 Jul 2014 17:57:40 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1614944 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Date: Thu, 31 Jul 2014 17:57:40 -0000 To: commits@ctakes.apache.org From: clin@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140731175740.62562238918A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: clin Date: Thu Jul 31 17:57:39 2014 New Revision: 1614944 URL: http://svn.apache.org/r1614944 Log: enable i2b2 xml writer to handle null arg2 Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java?rev=1614944&r1=1614943&r2=1614944&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java (original) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/eval/Evaluation_ImplBase.java Thu Jul 31 17:57:39 2014 @@ -120,348 +120,348 @@ import com.google.common.collect.Lists; import com.lexicalscope.jewel.cli.Option; public abstract class Evaluation_ImplBase extends - org.cleartk.eval.Evaluation_ImplBase { +org.cleartk.eval.Evaluation_ImplBase { - private static Logger LOGGER = Logger.getLogger(Evaluation_ImplBase.class); + private static Logger LOGGER = Logger.getLogger(Evaluation_ImplBase.class); - public static final String GOLD_VIEW_NAME = "GoldView"; - - enum XMLFormat { Knowtator, Anafora, I2B2 } - - static interface Options { - - @Option(longName = "text", defaultToNull = true) - public File getRawTextDirectory(); - - @Option(longName = "xml") - public File getXMLDirectory(); - - @Option(longName = "format", defaultValue="Anafora") - public XMLFormat getXMLFormat(); - - @Option(longName = "xmi") - public File getXMIDirectory(); - - @Option(longName = "patients") - public CommandLine.IntegerRanges getPatients(); - - @Option(longName = "treebank", defaultToNull=true) - public File getTreebankDirectory(); - - @Option(longName = "coreference", defaultToNull=true) - public File getCoreferenceDirectory(); - - @Option - public boolean getUseGoldTrees(); - - @Option - public boolean getGrid(); - - @Option - public boolean getPrintErrors(); - - @Option - public boolean getPrintOverlappingSpans(); - - @Option - public boolean getTest(); - - @Option(longName = "kernelParams", defaultToNull=true) - public String getKernelParams(); - - @Option(defaultToNull=true) - public String getI2B2Output(); - } - - protected File rawTextDirectory; - - protected File xmlDirectory; - - protected XMLFormat xmlFormat; - - protected File xmiDirectory; - - private boolean xmiExists; - - protected File treebankDirectory; - - protected File coreferenceDirectory; - - protected boolean printErrors = false; - - protected boolean printOverlapping = false; - - protected String i2b2Output = null; - - protected String[] kernelParams; - - public Evaluation_ImplBase( - File baseDirectory, - File rawTextDirectory, - File xmlDirectory, - XMLFormat xmlFormat, - File xmiDirectory, - File treebankDirectory, - File coreferenceDirectory) { - super(baseDirectory); - this.rawTextDirectory = rawTextDirectory; - this.xmlDirectory = xmlDirectory; - this.xmlFormat = xmlFormat; - this.xmiDirectory = xmiDirectory; - this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0; - this.treebankDirectory = treebankDirectory; - this.coreferenceDirectory = coreferenceDirectory; - } - - public Evaluation_ImplBase( - File baseDirectory, - File rawTextDirectory, - File xmlDirectory, - XMLFormat xmlFormat, - File xmiDirectory, - File treebankDirectory) { - this(baseDirectory, rawTextDirectory, xmlDirectory, xmlFormat, - xmiDirectory, treebankDirectory, null); - } - - public void setI2B2Output(String outDir){ - i2b2Output = outDir; - } - - public void prepareXMIsFor(List patientSets) throws Exception { - boolean needsXMIs = false; - for (File textFile : this.getFilesFor(patientSets)) { - if (!getXMIFile(this.xmiDirectory, textFile).exists()) { - needsXMIs = true; - break; - } - } - if (needsXMIs) { - CollectionReader reader = this.getCollectionReader(patientSets); - AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate(); - SimplePipeline.runPipeline(reader, engine); - } - this.xmiExists = true; - } - - private List getFilesFor(List patientSets) throws FileNotFoundException { - List files = new ArrayList(); + public static final String GOLD_VIEW_NAME = "GoldView"; + + enum XMLFormat { Knowtator, Anafora, I2B2 } + + static interface Options { + + @Option(longName = "text", defaultToNull = true) + public File getRawTextDirectory(); + + @Option(longName = "xml") + public File getXMLDirectory(); + + @Option(longName = "format", defaultValue="Anafora") + public XMLFormat getXMLFormat(); + + @Option(longName = "xmi") + public File getXMIDirectory(); + + @Option(longName = "patients") + public CommandLine.IntegerRanges getPatients(); + + @Option(longName = "treebank", defaultToNull=true) + public File getTreebankDirectory(); + + @Option(longName = "coreference", defaultToNull=true) + public File getCoreferenceDirectory(); + + @Option + public boolean getUseGoldTrees(); + + @Option + public boolean getGrid(); + + @Option + public boolean getPrintErrors(); + + @Option + public boolean getPrintOverlappingSpans(); + + @Option + public boolean getTest(); + + @Option(longName = "kernelParams", defaultToNull=true) + public String getKernelParams(); + + @Option(defaultToNull=true) + public String getI2B2Output(); + } + + protected File rawTextDirectory; + + protected File xmlDirectory; + + protected XMLFormat xmlFormat; + + protected File xmiDirectory; + + private boolean xmiExists; + + protected File treebankDirectory; + + protected File coreferenceDirectory; + + protected boolean printErrors = false; + + protected boolean printOverlapping = false; + + protected String i2b2Output = null; + + protected String[] kernelParams; + + public Evaluation_ImplBase( + File baseDirectory, + File rawTextDirectory, + File xmlDirectory, + XMLFormat xmlFormat, + File xmiDirectory, + File treebankDirectory, + File coreferenceDirectory) { + super(baseDirectory); + this.rawTextDirectory = rawTextDirectory; + this.xmlDirectory = xmlDirectory; + this.xmlFormat = xmlFormat; + this.xmiDirectory = xmiDirectory; + this.xmiExists = this.xmiDirectory.exists() && this.xmiDirectory.listFiles().length > 0; + this.treebankDirectory = treebankDirectory; + this.coreferenceDirectory = coreferenceDirectory; + } + + public Evaluation_ImplBase( + File baseDirectory, + File rawTextDirectory, + File xmlDirectory, + XMLFormat xmlFormat, + File xmiDirectory, + File treebankDirectory) { + this(baseDirectory, rawTextDirectory, xmlDirectory, xmlFormat, + xmiDirectory, treebankDirectory, null); + } + + public void setI2B2Output(String outDir){ + i2b2Output = outDir; + } + + public void prepareXMIsFor(List patientSets) throws Exception { + boolean needsXMIs = false; + for (File textFile : this.getFilesFor(patientSets)) { + if (!getXMIFile(this.xmiDirectory, textFile).exists()) { + needsXMIs = true; + break; + } + } + if (needsXMIs) { + CollectionReader reader = this.getCollectionReader(patientSets); + AnalysisEngine engine = this.getXMIWritingPreprocessorAggregateBuilder().createAggregate(); + SimplePipeline.runPipeline(reader, engine); + } + this.xmiExists = true; + } + + private List getFilesFor(List patientSets) throws FileNotFoundException { + List files = new ArrayList(); if (this.rawTextDirectory == null - && this.xmlFormat == XMLFormat.Anafora) { - for (File dir : this.xmlDirectory.listFiles()) { - Set ids = new HashSet(); - for (Integer set : patientSets) { - ids.add(String.format("ID%03d", set)); - } - if (dir.isDirectory()) { - if (ids.contains(dir.getName().substring(0, 5))) { - File file = new File(dir, dir.getName()); - if (file.exists()) { - files.add(file); - } else { - LOGGER.warn("Missing note: " + file); - } - } else { - LOGGER.info("Skipping note: " + dir); - } - } - } + && this.xmlFormat == XMLFormat.Anafora) { + for (File dir : this.xmlDirectory.listFiles()) { + Set ids = new HashSet(); + for (Integer set : patientSets) { + ids.add(String.format("ID%03d", set)); + } + if (dir.isDirectory()) { + if (ids.contains(dir.getName().substring(0, 5))) { + File file = new File(dir, dir.getName()); + if (file.exists()) { + files.add(file); + } else { + LOGGER.warn("Missing note: " + file); + } + } else { + LOGGER.info("Skipping note: " + dir); + } + } + } } else if(this.xmlFormat == XMLFormat.I2B2) { - File trainDir = new File(this.xmlDirectory, "training"); - File testDir = new File(this.xmlDirectory, "test"); - for (Integer pt : patientSets){ - File xmlTrain = new File(trainDir, pt+".xml"); - File train = new File(trainDir, pt+".xml.txt"); - if(train.exists()){ - if(xmlTrain.exists()){ - files.add(train); - }else{ - System.err.println("Text file in training has no corresponding xml -- skipping: " + train); - } - } - File xmlTest = new File(testDir, pt+".xml"); - File test = new File(testDir, pt+".xml.txt"); - if(xmlTest.exists()){ - if(test.exists()){ - files.add(test); - }else{ - throw new FileNotFoundException("Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set."); - } - } - assert !(train.exists() && test.exists()); - } + File trainDir = new File(this.xmlDirectory, "training"); + File testDir = new File(this.xmlDirectory, "test"); + for (Integer pt : patientSets){ + File xmlTrain = new File(trainDir, pt+".xml"); + File train = new File(trainDir, pt+".xml.txt"); + if(train.exists()){ + if(xmlTrain.exists()){ + files.add(train); + }else{ + System.err.println("Text file in training has no corresponding xml -- skipping: " + train); + } + } + File xmlTest = new File(testDir, pt+".xml"); + File test = new File(testDir, pt+".xml.txt"); + if(xmlTest.exists()){ + if(test.exists()){ + files.add(test); + }else{ + throw new FileNotFoundException("Could not find the test text file -- for cTAKES usage you must copy the text files into the xml directory for the test set."); + } + } + assert !(train.exists() && test.exists()); + } } else { - for (Integer set : patientSets) { - final int setNum = set; - for (File file : rawTextDirectory.listFiles(new FilenameFilter(){ - @Override - public boolean accept(File dir, String name) { - return name.contains(String.format("ID%03d", setNum)); - }})) { - // skip hidden files like .svn - if (!file.isHidden()) { - if(xmlFormat == XMLFormat.Knowtator){ - files.add(file); - }else{ - // look for equivalent in xml directory: - File xmlFile = new File(xmlDirectory, file.getName()); - if(xmlFile.exists()){ - if(coreferenceDirectory != null){ - // verify that coref version of xml exists - File corefFile = new File(coreferenceDirectory, file.getName()+".Coreference.gold.completed.xml"); - if(corefFile.exists() && xmlFile.exists()){ - files.add(file); - }else{ - System.err.println("Missing coref patient file : " + corefFile); - } - }else{ - files.add(file); - } - }else{ - System.err.println("Missing patient file : " + xmlFile); - } - } - } - } - } - } - return files; - } - - @Override - protected CollectionReader getCollectionReader(List patientSets) throws Exception { - return UriCollectionReader.getCollectionReaderFromFiles(this.getFilesFor(patientSets)); - } - - protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception { - return this.xmiExists - ? this.getXMIReadingPreprocessorAggregateBuilder() - : this.getXMIWritingPreprocessorAggregateBuilder(); - } - - protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException { - AggregateBuilder aggregateBuilder = new AggregateBuilder(); - aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - XMIReader.class, - XMIReader.PARAM_XMI_DIRECTORY, - this.xmiDirectory)); - return aggregateBuilder; - } - - protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() - throws Exception { - AggregateBuilder aggregateBuilder = new AggregateBuilder(); - aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); - - // read manual annotations into gold view - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - ViewCreatorAnnotator.class, - ViewCreatorAnnotator.PARAM_VIEW_NAME, - GOLD_VIEW_NAME)); - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - ViewTextCopierAnnotator.class, - ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME, - CAS.NAME_DEFAULT_SOFA, - ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME, - GOLD_VIEW_NAME)); - switch (this.xmlFormat) { - case Anafora: - aggregateBuilder.add( - THYMEAnaforaXMLReader.getDescription(this.xmlDirectory), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME); - break; - case Knowtator: - aggregateBuilder.add( - THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME); - break; - case I2B2: - aggregateBuilder.add( - I2B2TemporalXMLReader.getDescription(this.xmlDirectory), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME); - break; - } - - if(this.coreferenceDirectory != null){ - aggregateBuilder.add( - THYMEAnaforaXMLReader.getDescription(this.coreferenceDirectory), - CAS.NAME_DEFAULT_SOFA, - GOLD_VIEW_NAME); - } - - // identify segments - if(this.xmlFormat == XMLFormat.I2B2){ - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class)); - }else{ - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class)); - } - // identify sentences - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - SentenceDetector.class, - SentenceDetector.SD_MODEL_FILE_PARAM, - "org/apache/ctakes/core/sentdetect/sd-med-model.zip")); - // identify tokens - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class)); - // merge some tokens - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class)); - - // identify part-of-speech tags - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - POSTagger.class, - TypeSystemDescriptionFactory.createTypeSystemDescription(), - TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class), - POSTagger.POS_MODEL_FILE_PARAM, - "org/apache/ctakes/postagger/models/mayo-pos.zip")); - - // identify chunks - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - Chunker.class, - Chunker.CHUNKER_MODEL_FILE_PARAM, - FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"), - Chunker.CHUNKER_CREATOR_CLASS_PARAM, - DefaultChunkCreator.class)); - - // identify UMLS named entities - - // adjust NP in NP NP to span both - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - ChunkAdjuster.class, - ChunkAdjuster.PARAM_CHUNK_PATTERN, - new String[] { "NP", "NP" }, - ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, - 1)); - // adjust NP in NP PP NP to span all three - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - ChunkAdjuster.class, - ChunkAdjuster.PARAM_CHUNK_PATTERN, - new String[] { "NP", "PP", "NP" }, - ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, - 2)); - // add lookup windows for each NP - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class)); - // maximize lookup windows - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - OverlapAnnotator.class, - "A_ObjectClass", - LookupWindowAnnotation.class, - "B_ObjectClass", - LookupWindowAnnotation.class, - "OverlapType", - "A_ENV_B", - "ActionType", - "DELETE", - "DeleteAction", - new String[] { "selector=B" })); - // add UMLS on top of lookup windows - aggregateBuilder.add( - UmlsDictionaryLookupAnnotator.createAnnotatorDescription() - ); + for (Integer set : patientSets) { + final int setNum = set; + for (File file : rawTextDirectory.listFiles(new FilenameFilter(){ + @Override + public boolean accept(File dir, String name) { + return name.contains(String.format("ID%03d", setNum)); + }})) { + // skip hidden files like .svn + if (!file.isHidden()) { + if(xmlFormat == XMLFormat.Knowtator){ + files.add(file); + }else{ + // look for equivalent in xml directory: + File xmlFile = new File(xmlDirectory, file.getName()); + if(xmlFile.exists()){ + if(coreferenceDirectory != null){ + // verify that coref version of xml exists + File corefFile = new File(coreferenceDirectory, file.getName()+".Coreference.gold.completed.xml"); + if(corefFile.exists() && xmlFile.exists()){ + files.add(file); + }else{ + System.err.println("Missing coref patient file : " + corefFile); + } + }else{ + files.add(file); + } + }else{ + System.err.println("Missing patient file : " + xmlFile); + } + } + } + } + } + } + return files; + } + + @Override + protected CollectionReader getCollectionReader(List patientSets) throws Exception { + return UriCollectionReader.getCollectionReaderFromFiles(this.getFilesFor(patientSets)); + } + + protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception { + return this.xmiExists + ? this.getXMIReadingPreprocessorAggregateBuilder() + : this.getXMIWritingPreprocessorAggregateBuilder(); + } + + protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException { + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + XMIReader.class, + XMIReader.PARAM_XMI_DIRECTORY, + this.xmiDirectory)); + return aggregateBuilder; + } + + protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() + throws Exception { + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); + + // read manual annotations into gold view + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ViewCreatorAnnotator.class, + ViewCreatorAnnotator.PARAM_VIEW_NAME, + GOLD_VIEW_NAME)); + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ViewTextCopierAnnotator.class, + ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME, + CAS.NAME_DEFAULT_SOFA, + ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME, + GOLD_VIEW_NAME)); + switch (this.xmlFormat) { + case Anafora: + aggregateBuilder.add( + THYMEAnaforaXMLReader.getDescription(this.xmlDirectory), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME); + break; + case Knowtator: + aggregateBuilder.add( + THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME); + break; + case I2B2: + aggregateBuilder.add( + I2B2TemporalXMLReader.getDescription(this.xmlDirectory), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME); + break; + } + + if(this.coreferenceDirectory != null){ + aggregateBuilder.add( + THYMEAnaforaXMLReader.getDescription(this.coreferenceDirectory), + CAS.NAME_DEFAULT_SOFA, + GOLD_VIEW_NAME); + } - /* + // identify segments + if(this.xmlFormat == XMLFormat.I2B2){ + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class)); + }else{ + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SegmentsFromBracketedSectionTagsAnnotator.class)); + } + // identify sentences + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + SentenceDetector.class, + SentenceDetector.SD_MODEL_FILE_PARAM, + "org/apache/ctakes/core/sentdetect/sd-med-model.zip")); + // identify tokens + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class)); + // merge some tokens + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class)); + + // identify part-of-speech tags + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + POSTagger.class, + TypeSystemDescriptionFactory.createTypeSystemDescription(), + TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class), + POSTagger.POS_MODEL_FILE_PARAM, + "org/apache/ctakes/postagger/models/mayo-pos.zip")); + + // identify chunks + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + Chunker.class, + Chunker.CHUNKER_MODEL_FILE_PARAM, + FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"), + Chunker.CHUNKER_CREATOR_CLASS_PARAM, + DefaultChunkCreator.class)); + + // identify UMLS named entities + + // adjust NP in NP NP to span both + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 1)); + // adjust NP in NP PP NP to span all three + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "PP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 2)); + // add lookup windows for each NP + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class)); + // maximize lookup windows + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + OverlapAnnotator.class, + "A_ObjectClass", + LookupWindowAnnotation.class, + "B_ObjectClass", + LookupWindowAnnotation.class, + "OverlapType", + "A_ENV_B", + "ActionType", + "DELETE", + "DeleteAction", + new String[] { "selector=B" })); + // add UMLS on top of lookup windows + aggregateBuilder.add( + UmlsDictionaryLookupAnnotator.createAnnotatorDescription() + ); + + /* // add lvg annotator String[] XeroxTreebankMap = { "adj|JJ", @@ -524,391 +524,396 @@ public abstract class Evaluation_ImplBas new File(LvgCmdApiResourceImpl.class.getResource( "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI()))); aggregateBuilder.add(lvgAnnotator); - */ - aggregateBuilder.add(LvgAnnotator.createAnnotatorDescription()); + */ + aggregateBuilder.add(LvgAnnotator.createAnnotatorDescription()); + + // add dependency parser + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class)); - // add dependency parser - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class)); + // add semantic role labeler + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class)); - // add semantic role labeler - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class)); + // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps + if(this.treebankDirectory != null){ + aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory)); + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class)); + }else{ + // add ctakes constituency parses to system view + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class, + ConstituencyParser.PARAM_MODEL_FILENAME, + "org/apache/ctakes/constituency/parser/models/thyme.bin")); + // "org/apache/ctakes/constituency/parser/models/sharp-3.1.bin")); + // "org/apache/ctakes/constituency/parser/models/thymeNotempeval.bin")); + // aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class, + // BerkeleyParserWrapper.PARAM_MODEL_FILENAME, + // + // "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin")); + // "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin")); + } + // write out the CAS after all the above annotations + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + XMIWriter.class, + XMIWriter.PARAM_XMI_DIRECTORY, + this.xmiDirectory)); + + return aggregateBuilder; + } + + public static List selectExact(JCas jCas, Class annotationClass, Segment segment) { + List annotations = Lists.newArrayList(); + for (T annotation : JCasUtil.selectCovered(jCas, annotationClass, segment)) { + if (annotation.getClass().equals(annotationClass)) { + annotations.add(annotation); + } + } + return annotations; + } + + public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) { + if (chunk.getChunkType().equals("NP")) { + new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes(); + } + } + } + } + + public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + List lws = new ArrayList(JCasUtil.select(jCas, LookupWindowAnnotation.class)); + // we'll navigate backwards so that as we delete things we shorten the list from the back + for(int i = lws.size()-2; i >= 0; i--){ + LookupWindowAnnotation lw1 = lws.get(i); + LookupWindowAnnotation lw2 = lws.get(i+1); + if(lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()){ + /// lw1 envelops or encloses lw2 + lws.remove(i+1); + lw2.removeFromIndexes(); + } + } + + } + + } + + public static class EntityMentionRemover extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) { + mention.removeFromIndexes(); + } + } + } + + public static class EventMentionRemover extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) { + mention.removeFromIndexes(); + } + } + } + + // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed + public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase { + private static Pattern SECTION_PATTERN = Pattern.compile( + "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])", + Pattern.DOTALL); + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText()); + while (matcher.find()) { + Segment segment = new Segment(jCas); + segment.setBegin(matcher.start() + matcher.group(1).length()); + segment.setEnd(matcher.end() - matcher.group(3).length()); + segment.setId(matcher.group(2)); + segment.addToIndexes(); + } + } + } + + static File getXMIFile(File xmiDirectory, File textFile) { + return new File(xmiDirectory, textFile.getName() + ".xmi"); + } + + static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException { + return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath())); + } + + public static class XMIWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; + + @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true) + private File xmiDirectory; + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + if (!this.xmiDirectory.exists()) { + this.xmiDirectory.mkdirs(); + } + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + File xmiFile = getXMIFile(this.xmiDirectory, jCas); + try { + FileOutputStream outputStream = new FileOutputStream(xmiFile); + try { + XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem()); + ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler(); + serializer.serialize(jCas.getCas(), handler); + } finally { + outputStream.close(); + } + } catch (SAXException e) { + throw new AnalysisEngineProcessException(e); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + } + + public static class XMIReader extends JCasAnnotator_ImplBase { + + public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; + + @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true) + private File xmiDirectory; + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + File xmiFile = getXMIFile(this.xmiDirectory, jCas); + try { + FileInputStream inputStream = new FileInputStream(xmiFile); + try { + XmiCasDeserializer.deserialize(inputStream, jCas.getCas()); + } finally { + inputStream.close(); + } + } catch (SAXException e) { + throw new AnalysisEngineProcessException(e); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + } + + public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase { + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + JCas goldView, systemView; + try { + goldView = jCas.getView(GOLD_VIEW_NAME); + systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA); + } catch (CASException e) { + e.printStackTrace(); + throw new AnalysisEngineProcessException(); + } + for(TimeMention mention : JCasUtil.select(goldView, TimeMention.class)){ + // for each time expression, get the treebank node with the same span. + List nodes = JCasUtil.selectCovered(systemView, TreebankNode.class, mention); + TreebankNode sameSpanNode = null; + for(TreebankNode node : nodes){ + if(node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()){ + sameSpanNode = node; + break; + } + } + if(sameSpanNode != null){ + // look at node at the position of the timex3. + if(sameSpanNode.getNodeType().equals("PP")){ + // if it is a PP it should be moved down to the NP + int numChildren = sameSpanNode.getChildren().size(); + if(numChildren == 2 && sameSpanNode.getChildren(0).getNodeType().equals("IN") && sameSpanNode.getChildren(1).getNodeType().equals("NP")){ + // move the time span to this node: + TreebankNode mentionNode = sameSpanNode.getChildren(numChildren-1); + mention.setBegin(mentionNode.getBegin()); + mention.setEnd(mentionNode.getEnd()); + } + } + }else{ + // if there is no matching tree span, see if the DT to the left would help. + // now adjust for missing DT to the left + List precedingPreterms = JCasUtil.selectPreceding(systemView, TerminalTreebankNode.class, mention, 1); + if(precedingPreterms != null && precedingPreterms.size() == 1){ + TerminalTreebankNode leftTerm = precedingPreterms.get(0); + if(leftTerm.getNodeType().equals("DT")){ + // now see if adding this would make it match a tree + List matchingNodes = JCasUtil.selectCovered(systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd()); + for(TreebankNode node : matchingNodes){ + if(node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd()){ + sameSpanNode = node; + break; + } + } + if(sameSpanNode != null){ + // adding the DT to the left of th emention made it match a tree: + System.err.println("Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText()); + mention.setBegin(leftTerm.getBegin()); + } + } + } + } + } + } + } + + + public static class CopyFromGold extends JCasAnnotator_ImplBase { + + public static AnalysisEngineDescription getDescription(Class... classes) + throws ResourceInitializationException { + return AnalysisEngineFactory.createPrimitiveDescription( + CopyFromGold.class, + CopyFromGold.PARAM_ANNOTATION_CLASSES, + classes); + } + + public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; + + @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true) + private Class[] annotationClasses; + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + JCas goldView, systemView; + try { + goldView = jCas.getView(GOLD_VIEW_NAME); + systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA); + } catch (CASException e) { + throw new AnalysisEngineProcessException(e); + } + for (Class annotationClass : this.annotationClasses) { + for (TOP annotation : Lists.newArrayList(JCasUtil.select(systemView, annotationClass))) { + if (annotation.getClass().equals(annotationClass)) { + annotation.removeFromIndexes(); + } + } + } + CasCopier copier = new CasCopier(goldView.getCas(), systemView.getCas()); + Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); + for (Class annotationClass : this.annotationClasses) { + for (TOP annotation : JCasUtil.select(goldView, annotationClass)) { + TOP copy = (TOP) copier.copyFs(annotation); + if (copy instanceof Annotation) { + copy.setFeatureValue(sofaFeature, systemView.getSofa()); + } + copy.addToIndexes(systemView); + } + } + } + } + + public static class WriteI2B2XML extends JCasAnnotator_ImplBase { + public static final String PARAM_OUTPUT_DIR="PARAM_OUTPUT_DIR"; + @ConfigurationParameter(mandatory=true,description="Output directory to write xml files to.",name=PARAM_OUTPUT_DIR) + protected String outputDir; + + @Override + public void process(JCas jcas) throws AnalysisEngineProcessException { + try { + // get the output file name from the input file name and output directory. + File outDir = new File(outputDir); + if(!outDir.exists()) outDir.mkdirs(); + File inFile = new File(ViewURIUtil.getURI(jcas)); + String outFile = inFile.getName().replace(".txt", ""); + + // build the xml + DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); + Document doc = docBuilder.newDocument(); + Element rootElement = doc.createElement("ClinicalNarrativeTemporalAnnotation"); + Element textElement = doc.createElement("TEXT"); + Element tagsElement = doc.createElement("TAGS"); + textElement.setTextContent(jcas.getDocumentText()); + rootElement.appendChild(textElement); + rootElement.appendChild(tagsElement); + doc.appendChild(rootElement); + + Map argToId = new HashMap<>(); + int id=0; + for(TimeMention timex : JCasUtil.select(jcas, TimeMention.class)){ + Element timexElement = doc.createElement("TIMEX3"); + String timexID = "T"+id; id++; + argToId.put(timex, timexID); + timexElement.setAttribute("id", timexID); + timexElement.setAttribute("start", String.valueOf(timex.getBegin()+1)); + timexElement.setAttribute("end", String.valueOf(timex.getEnd()+1)); + timexElement.setAttribute("text", timex.getCoveredText()); + timexElement.setAttribute("type", "NA"); + timexElement.setAttribute("val", "NA"); + timexElement.setAttribute("mod", "NA"); + tagsElement.appendChild(timexElement); + } + + id = 0; + for(EventMention event : JCasUtil.select(jcas, EventMention.class)){ + if (event.getClass().equals(EventMention.class)) { + // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events + Element eventEl = doc.createElement("EVENT"); + String eventID = "E"+id; id++; + argToId.put(event, eventID); + eventEl.setAttribute("id", eventID); + eventEl.setAttribute("start", String.valueOf(event.getBegin()+1)); + eventEl.setAttribute("end", String.valueOf(event.getEnd()+1)); + eventEl.setAttribute("text", event.getCoveredText()); + eventEl.setAttribute("modality", "NA"); + eventEl.setAttribute("polarity", "NA"); + eventEl.setAttribute("type", "NA"); + tagsElement.appendChild(eventEl); + } + } + + id = 0; + for(TemporalTextRelation rel : JCasUtil.select(jcas, TemporalTextRelation.class)){ + Element linkEl = doc.createElement("TLINK"); + String linkID = "TL"+id; id++; + linkEl.setAttribute("id", linkID); + Annotation arg1 = rel.getArg1().getArgument(); + linkEl.setAttribute("fromID", argToId.get(arg1)); + linkEl.setAttribute("fromText", arg1.getCoveredText()); + Annotation arg2 = rel.getArg2().getArgument(); + if(arg2!=null){ + linkEl.setAttribute("toID", argToId.get(arg2)); + linkEl.setAttribute("toText", arg2.getCoveredText()); + }else{ + linkEl.setAttribute("toID", "Discharge"); + linkEl.setAttribute("toText", "Discharge"); + } + linkEl.setAttribute("type", rel.getCategory()); + tagsElement.appendChild(linkEl); + } + + // boilerplate xml-writing code: + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + Transformer transformer = transformerFactory.newTransformer(); + transformer.setOutputProperty(OutputKeys.INDENT, "yes"); + transformer.setOutputProperty(OutputKeys.METHOD, "xml"); + DOMSource source = new DOMSource(doc); + StreamResult result = new StreamResult(new File(outputDir, outFile)); + transformer.transform(source, result); + } catch (ParserConfigurationException e) { + e.printStackTrace(); + throw new AnalysisEngineProcessException(e); + } catch (TransformerConfigurationException e) { + e.printStackTrace(); + throw new AnalysisEngineProcessException(e); + } catch (TransformerException e) { + e.printStackTrace(); + throw new AnalysisEngineProcessException(e); + } + + } - // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps - if(this.treebankDirectory != null){ - aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory)); - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class)); - }else{ - // add ctakes constituency parses to system view - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class, - ConstituencyParser.PARAM_MODEL_FILENAME, - "org/apache/ctakes/constituency/parser/models/thyme.bin")); -// "org/apache/ctakes/constituency/parser/models/sharp-3.1.bin")); -// "org/apache/ctakes/constituency/parser/models/thymeNotempeval.bin")); -// aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(BerkeleyParserWrapper.class, -// BerkeleyParserWrapper.PARAM_MODEL_FILENAME, -// -// "org/apache/ctakes/constituency/parser/models/thyme.gcg.4sm.bin")); -// "org/apache/ctakes/constituency/parser/models/thyme.4sm.bin")); - } - // write out the CAS after all the above annotations - aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( - XMIWriter.class, - XMIWriter.PARAM_XMI_DIRECTORY, - this.xmiDirectory)); - - return aggregateBuilder; - } - - public static List selectExact(JCas jCas, Class annotationClass, Segment segment) { - List annotations = Lists.newArrayList(); - for (T annotation : JCasUtil.selectCovered(jCas, annotationClass, segment)) { - if (annotation.getClass().equals(annotationClass)) { - annotations.add(annotation); - } - } - return annotations; - } - - public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) { - if (chunk.getChunkType().equals("NP")) { - new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes(); - } - } - } - } - - public static class RemoveEnclosedLookupWindows extends JCasAnnotator_ImplBase { - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - List lws = new ArrayList(JCasUtil.select(jCas, LookupWindowAnnotation.class)); - // we'll navigate backwards so that as we delete things we shorten the list from the back - for(int i = lws.size()-2; i >= 0; i--){ - LookupWindowAnnotation lw1 = lws.get(i); - LookupWindowAnnotation lw2 = lws.get(i+1); - if(lw1.getBegin() <= lw2.getBegin() && lw1.getEnd() >= lw2.getEnd()){ - /// lw1 envelops or encloses lw2 - lws.remove(i+1); - lw2.removeFromIndexes(); - } - } - - } - - } - - public static class EntityMentionRemover extends JCasAnnotator_ImplBase { - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) { - mention.removeFromIndexes(); - } - } - } - - public static class EventMentionRemover extends JCasAnnotator_ImplBase { - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) { - mention.removeFromIndexes(); - } - } - } - - // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed - public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase { - private static Pattern SECTION_PATTERN = Pattern.compile( - "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])", - Pattern.DOTALL); - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText()); - while (matcher.find()) { - Segment segment = new Segment(jCas); - segment.setBegin(matcher.start() + matcher.group(1).length()); - segment.setEnd(matcher.end() - matcher.group(3).length()); - segment.setId(matcher.group(2)); - segment.addToIndexes(); - } - } - } - - static File getXMIFile(File xmiDirectory, File textFile) { - return new File(xmiDirectory, textFile.getName() + ".xmi"); - } - - static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException { - return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath())); - } - - public static class XMIWriter extends JCasAnnotator_ImplBase { - - public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; - - @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true) - private File xmiDirectory; - - @Override - public void initialize(UimaContext context) throws ResourceInitializationException { - super.initialize(context); - if (!this.xmiDirectory.exists()) { - this.xmiDirectory.mkdirs(); - } - } - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - File xmiFile = getXMIFile(this.xmiDirectory, jCas); - try { - FileOutputStream outputStream = new FileOutputStream(xmiFile); - try { - XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem()); - ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler(); - serializer.serialize(jCas.getCas(), handler); - } finally { - outputStream.close(); - } - } catch (SAXException e) { - throw new AnalysisEngineProcessException(e); - } catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - } - - public static class XMIReader extends JCasAnnotator_ImplBase { - - public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; - - @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true) - private File xmiDirectory; - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - File xmiFile = getXMIFile(this.xmiDirectory, jCas); - try { - FileInputStream inputStream = new FileInputStream(xmiFile); - try { - XmiCasDeserializer.deserialize(inputStream, jCas.getCas()); - } finally { - inputStream.close(); - } - } catch (SAXException e) { - throw new AnalysisEngineProcessException(e); - } catch (IOException e) { - throw new AnalysisEngineProcessException(e); - } - } - } - - public static class TimexAnnotationCorrector extends JCasAnnotator_ImplBase { - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - JCas goldView, systemView; - try { - goldView = jCas.getView(GOLD_VIEW_NAME); - systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA); - } catch (CASException e) { - e.printStackTrace(); - throw new AnalysisEngineProcessException(); - } - for(TimeMention mention : JCasUtil.select(goldView, TimeMention.class)){ - // for each time expression, get the treebank node with the same span. - List nodes = JCasUtil.selectCovered(systemView, TreebankNode.class, mention); - TreebankNode sameSpanNode = null; - for(TreebankNode node : nodes){ - if(node.getBegin() == mention.getBegin() && node.getEnd() == mention.getEnd()){ - sameSpanNode = node; - break; - } - } - if(sameSpanNode != null){ - // look at node at the position of the timex3. - if(sameSpanNode.getNodeType().equals("PP")){ - // if it is a PP it should be moved down to the NP - int numChildren = sameSpanNode.getChildren().size(); - if(numChildren == 2 && sameSpanNode.getChildren(0).getNodeType().equals("IN") && sameSpanNode.getChildren(1).getNodeType().equals("NP")){ - // move the time span to this node: - TreebankNode mentionNode = sameSpanNode.getChildren(numChildren-1); - mention.setBegin(mentionNode.getBegin()); - mention.setEnd(mentionNode.getEnd()); - } - } - }else{ - // if there is no matching tree span, see if the DT to the left would help. - // now adjust for missing DT to the left - List precedingPreterms = JCasUtil.selectPreceding(systemView, TerminalTreebankNode.class, mention, 1); - if(precedingPreterms != null && precedingPreterms.size() == 1){ - TerminalTreebankNode leftTerm = precedingPreterms.get(0); - if(leftTerm.getNodeType().equals("DT")){ - // now see if adding this would make it match a tree - List matchingNodes = JCasUtil.selectCovered(systemView, TreebankNode.class, leftTerm.getBegin(), mention.getEnd()); - for(TreebankNode node : matchingNodes){ - if(node.getBegin() == leftTerm.getBegin() && node.getEnd() == mention.getEnd()){ - sameSpanNode = node; - break; - } - } - if(sameSpanNode != null){ - // adding the DT to the left of th emention made it match a tree: - System.err.println("Adding DT: " + leftTerm.getCoveredText() + " to TIMEX: " + mention.getCoveredText()); - mention.setBegin(leftTerm.getBegin()); - } - } - } - } - } - } - } - - - public static class CopyFromGold extends JCasAnnotator_ImplBase { - - public static AnalysisEngineDescription getDescription(Class... classes) - throws ResourceInitializationException { - return AnalysisEngineFactory.createPrimitiveDescription( - CopyFromGold.class, - CopyFromGold.PARAM_ANNOTATION_CLASSES, - classes); - } - - public static final String PARAM_ANNOTATION_CLASSES = "AnnotationClasses"; - - @ConfigurationParameter(name = PARAM_ANNOTATION_CLASSES, mandatory = true) - private Class[] annotationClasses; - - @Override - public void process(JCas jCas) throws AnalysisEngineProcessException { - JCas goldView, systemView; - try { - goldView = jCas.getView(GOLD_VIEW_NAME); - systemView = jCas.getView(CAS.NAME_DEFAULT_SOFA); - } catch (CASException e) { - throw new AnalysisEngineProcessException(e); - } - for (Class annotationClass : this.annotationClasses) { - for (TOP annotation : Lists.newArrayList(JCasUtil.select(systemView, annotationClass))) { - if (annotation.getClass().equals(annotationClass)) { - annotation.removeFromIndexes(); - } - } - } - CasCopier copier = new CasCopier(goldView.getCas(), systemView.getCas()); - Feature sofaFeature = jCas.getTypeSystem().getFeatureByFullName(CAS.FEATURE_FULL_NAME_SOFA); - for (Class annotationClass : this.annotationClasses) { - for (TOP annotation : JCasUtil.select(goldView, annotationClass)) { - TOP copy = (TOP) copier.copyFs(annotation); - if (copy instanceof Annotation) { - copy.setFeatureValue(sofaFeature, systemView.getSofa()); - } - copy.addToIndexes(systemView); - } - } - } - } - - public static class WriteI2B2XML extends JCasAnnotator_ImplBase { - public static final String PARAM_OUTPUT_DIR="PARAM_OUTPUT_DIR"; - @ConfigurationParameter(mandatory=true,description="Output directory to write xml files to.",name=PARAM_OUTPUT_DIR) - protected String outputDir; - - @Override - public void process(JCas jcas) throws AnalysisEngineProcessException { - try { - // get the output file name from the input file name and output directory. - File outDir = new File(outputDir); - if(!outDir.exists()) outDir.mkdirs(); - File inFile = new File(ViewURIUtil.getURI(jcas)); - String outFile = inFile.getName().replace(".txt", ""); - - // build the xml - DocumentBuilderFactory docFactory = DocumentBuilderFactory.newInstance(); - DocumentBuilder docBuilder = docFactory.newDocumentBuilder(); - Document doc = docBuilder.newDocument(); - Element rootElement = doc.createElement("ClinicalNarrativeTemporalAnnotation"); - Element textElement = doc.createElement("TEXT"); - Element tagsElement = doc.createElement("TAGS"); - textElement.setTextContent(jcas.getDocumentText()); - rootElement.appendChild(textElement); - rootElement.appendChild(tagsElement); - doc.appendChild(rootElement); - - Map argToId = new HashMap<>(); - int id=0; - for(TimeMention timex : JCasUtil.select(jcas, TimeMention.class)){ - Element timexElement = doc.createElement("TIMEX3"); - String timexID = "T"+id; id++; - argToId.put(timex, timexID); - timexElement.setAttribute("id", timexID); - timexElement.setAttribute("start", String.valueOf(timex.getBegin()+1)); - timexElement.setAttribute("end", String.valueOf(timex.getEnd()+1)); - timexElement.setAttribute("text", timex.getCoveredText()); - timexElement.setAttribute("type", "NA"); - timexElement.setAttribute("val", "NA"); - timexElement.setAttribute("mod", "NA"); - tagsElement.appendChild(timexElement); - } - - id = 0; - for(EventMention event : JCasUtil.select(jcas, EventMention.class)){ - if (event.getClass().equals(EventMention.class)) { - // this ensures we are only looking at THYME events and not ctakes-dictionary-lookup events - Element eventEl = doc.createElement("EVENT"); - String eventID = "E"+id; id++; - argToId.put(event, eventID); - eventEl.setAttribute("id", eventID); - eventEl.setAttribute("start", String.valueOf(event.getBegin()+1)); - eventEl.setAttribute("end", String.valueOf(event.getEnd()+1)); - eventEl.setAttribute("text", event.getCoveredText()); - eventEl.setAttribute("modality", "NA"); - eventEl.setAttribute("polarity", "NA"); - eventEl.setAttribute("type", "NA"); - tagsElement.appendChild(eventEl); - } - } - - id = 0; - for(TemporalTextRelation rel : JCasUtil.select(jcas, TemporalTextRelation.class)){ - Element linkEl = doc.createElement("TLINK"); - String linkID = "TL"+id; id++; - linkEl.setAttribute("id", linkID); - Annotation arg1 = rel.getArg1().getArgument(); - linkEl.setAttribute("fromID", argToId.get(arg1)); - linkEl.setAttribute("fromText", arg1.getCoveredText()); - Annotation arg2 = rel.getArg2().getArgument(); - linkEl.setAttribute("toID", argToId.get(arg2)); - linkEl.setAttribute("toText", arg2.getCoveredText()); - linkEl.setAttribute("type", rel.getCategory()); - tagsElement.appendChild(linkEl); - } - - // boilerplate xml-writing code: - TransformerFactory transformerFactory = TransformerFactory.newInstance(); - Transformer transformer = transformerFactory.newTransformer(); - transformer.setOutputProperty(OutputKeys.INDENT, "yes"); - transformer.setOutputProperty(OutputKeys.METHOD, "xml"); - DOMSource source = new DOMSource(doc); - StreamResult result = new StreamResult(new File(outputDir, outFile)); - transformer.transform(source, result); - } catch (ParserConfigurationException e) { - e.printStackTrace(); - throw new AnalysisEngineProcessException(e); - } catch (TransformerConfigurationException e) { - e.printStackTrace(); - throw new AnalysisEngineProcessException(e); - } catch (TransformerException e) { - e.printStackTrace(); - throw new AnalysisEngineProcessException(e); - } - - } - - } + } }