ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1706472 - in /ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor: ae/DeepPheAnaforaXMLReader.java pipelines/AnaforaGoldStandardEvaluationPipeline.java
Date Fri, 02 Oct 2015 19:18:37 GMT
Author: dligach
Date: Fri Oct  2 19:18:36 2015
New Revision: 1706472

URL: http://svn.apache.org/viewvc?rev=1706472&view=rev
Log:
very rough draft of an anafora reader for location_of

Added:
    ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java
  (with props)
    ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java
  (with props)

Added: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java?rev=1706472&view=auto
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java	(added)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java	Fri Oct  2 19:18:36 2015
@@ -0,0 +1,225 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.relationextractor.ae;
+
+import java.io.File;
+import java.io.IOException;
+import java.net.MalformedURLException;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.ctakes.typesystem.type.refsem.Event;
+import org.apache.ctakes.typesystem.type.refsem.EventProperties;
+import org.apache.ctakes.typesystem.type.relation.AspectualTextRelation;
+import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CollectionTextRelation;
+import org.apache.ctakes.typesystem.type.relation.CoreferenceRelation;
+import org.apache.ctakes.typesystem.type.relation.RelationArgument;
+import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.Markable;
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.EmptyFSList;
+import org.apache.uima.jcas.cas.NonEmptyFSList;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.jdom2.Element;
+import org.jdom2.JDOMException;
+import org.jdom2.input.SAXBuilder;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+
+public class DeepPheAnaforaXMLReader extends JCasAnnotator_ImplBase {
+  
+  /** Class-wide log4j logger; the static helpers below report problems through it. */
+  private static final Logger LOGGER = Logger.getLogger(DeepPheAnaforaXMLReader.class);
+
+  /** Name of the configuration parameter bound to {@code anaforaDirectory}. */
+  public static final String PARAM_ANAFORA_DIRECTORY = "anaforaDirectory";
+
+  // NOTE(review): no method in this class reads this field; process() looks for the XML file
+  // next to the text file named by the CAS URI instead.
+  @ConfigurationParameter(
+      name = PARAM_ANAFORA_DIRECTORY,
+      description = "root directory of the Anafora-annotated files, with one subdirectory for "
+          + "each annotated file")
+  private File anaforaDirectory;
+
+  /** Name of the configuration parameter bound to {@code anaforaXMLSuffixes}. */
+  public static final String PARAM_ANAFORA_XML_SUFFIXES = "anaforaSuffixes";
+
+  // NOTE(review): no method in this class reads this field; process() appends one fixed
+  // suffix to the text file name instead of trying these in order.
+  @ConfigurationParameter(
+      name = PARAM_ANAFORA_XML_SUFFIXES,
+      mandatory = false,
+      description = "list of suffixes that might be added to a file name to identify the Anafora "
+          + "XML annotations file; only the first suffix corresponding to a file will be used")
+  private String[] anaforaXMLSuffixes = new String[] {
+      ".Temporal-Relations.gold.completed.xml",
+      ".Temporal-Relation.gold.completed.xml",
+      ".Temporal.dave.completed.xml",
+      ".Temporal-Relation-Adjudication.gold.completed.xml",
+      ".Temporal-Entity-Adjudication.gold.completed.xml",
+      ".temporal.Temporal-Adjudication.gold.completed.xml",
+      ".temporal.Temporal-Entities.gold.completed.xml",
+      ".Temporal-Entity.gold.completed.xml",
+      ".Gold_Temporal_Entities.xml",
+      ".Gold_Temporal_Relations.xml"};
+
+  /**
+   * Builds an engine description for this annotator without setting any configuration
+   * parameter.
+   *
+   * <p>NOTE(review): {@code anaforaDirectory} is not declared with {@code mandatory = false};
+   * confirm that an engine built from this description initializes without it.
+   *
+   * @return the description of {@code DeepPheAnaforaXMLReader}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription getDescription() throws ResourceInitializationException {
+    final Class<DeepPheAnaforaXMLReader> readerClass = DeepPheAnaforaXMLReader.class;
+    return AnalysisEngineFactory.createEngineDescription(readerClass);
+  }
+
+  /**
+   * Builds an engine description for this annotator with the Anafora root directory set.
+   *
+   * @param anaforaDirectory value for the {@code anaforaDirectory} configuration parameter
+   * @return the description of {@code DeepPheAnaforaXMLReader}
+   * @throws ResourceInitializationException if the description cannot be created
+   */
+  public static AnalysisEngineDescription getDescription(File anaforaDirectory)
+      throws ResourceInitializationException {
+    AnalysisEngineDescription description = AnalysisEngineFactory.createEngineDescription(
+        DeepPheAnaforaXMLReader.class,
+        PARAM_ANAFORA_DIRECTORY,
+        anaforaDirectory);
+    return description;
+  }
+
+  /**
+   * Locates the Anafora XML file that belongs to the document in the CAS and loads its
+   * entities into the CAS.
+   *
+   * <p>The XML file name is the path of the CAS URI with the fixed suffix
+   * {@code .UmlsDeepPhe.dave.inprogress.xml} appended.
+   *
+   * @param jCas the CAS whose URI names the source text file and which receives the annotations
+   * @throws AnalysisEngineProcessException if the XML file cannot be read or parsed
+   */
+  @Override
+  public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+    // the annotation file sits next to the text file and differs only by its suffix
+    final String annotationFileName =
+        ViewUriUtil.getURI(jCas).getPath() + ".UmlsDeepPhe.dave.inprogress.xml";
+    LOGGER.info("processing xml file: " + annotationFileName);
+
+    final File annotationFile = new File(annotationFileName);
+    processXmlFile(jCas, annotationFile);
+  }
+  
+  /**
+   * Parses one Anafora XML file and adds an annotation to the CAS for every entity whose type
+   * is handled here ("Disease_Disorder", "Anatomical_site", "Metastasis"). Entities of any
+   * other type are skipped.
+   *
+   * <p>An entity without a usable span is reported through {@code error} and skipped, so one
+   * malformed entity does not abort the rest of the document.
+   *
+   * @param jCas the CAS that receives the annotations
+   * @param xmlFile the Anafora XML file to read
+   * @throws AnalysisEngineProcessException if the file cannot be read or is not well-formed XML
+   */
+  private static void processXmlFile(JCas jCas, File xmlFile) throws AnalysisEngineProcessException {
+
+    Element dataElem;
+    try {
+      dataElem = new SAXBuilder().build(xmlFile.toURI().toURL()).getRootElement();
+    } catch (JDOMException e) {
+      throw new AnalysisEngineProcessException(e);
+    } catch (IOException e) {
+      // also covers MalformedURLException, which is a subclass of IOException
+      throw new AnalysisEngineProcessException(e);
+    }
+
+    for (Element annotationsElem : dataElem.getChildren("annotations")) {
+
+      // NOTE(review): nothing reads this map yet; it is filled for later use
+      Map<String, Annotation> idToAnnotation = Maps.newHashMap();
+
+      for (Element entityElem : annotationsElem.getChildren("entity")) {
+
+        String id = removeSingleChildText(entityElem, "id", null);
+        Element spanElem = removeSingleChild(entityElem, "span", id);
+        String type = removeSingleChildText(entityElem, "type", id);
+        Element propertiesElem = removeSingleChild(entityElem, "properties", id);
+
+        // a missing span has already been logged by removeSingleChild; without one the
+        // entity cannot be placed in the text
+        if (spanElem == null) {
+          continue;
+        }
+        int[] span = parseEnclosingSpan(spanElem.getText(), id);
+        if (span == null) {
+          continue;
+        }
+        int begin = span[0];
+        int end = span[1];
+
+        // constant-first equals keeps a missing type (null) from throwing
+        Annotation annotation;
+        if ("Disease_Disorder".equals(type)) {
+          EventMention eventMention = new EventMention(jCas, begin, end);
+          Event event = new Event(jCas);
+          // NOTE(review): eventProperties is indexed but never attached to the event;
+          // confirm whether that is intended
+          EventProperties eventProperties = new EventProperties(jCas);
+          eventProperties.addToIndexes();
+          event.addToIndexes();
+          eventMention.setEvent(event);
+          eventMention.addToIndexes();
+          annotation = eventMention;
+        } else if ("Anatomical_site".equals(type)) {
+          AnatomicalSiteMention anatomicalSiteMention = new AnatomicalSiteMention(jCas, begin, end);
+          anatomicalSiteMention.addToIndexes();
+          annotation = anatomicalSiteMention;
+        } else if ("Metastasis".equals(type)) {
+          DiseaseDisorderMention diseaseDisorderMention = new DiseaseDisorderMention(jCas, begin, end);
+          diseaseDisorderMention.addToIndexes();
+          annotation = diseaseDisorderMention;
+        } else {
+          continue; // not going to worry about other types for the moment
+        }
+
+        // match the annotation to its ID for later use
+        idToAnnotation.put(id, annotation);
+
+        // make sure all XML has been consumed
+        removeSingleChild(entityElem, "parentsType", id);
+        List<String> children = Lists.newArrayList();
+        if (propertiesElem != null) {
+          for (Element child : propertiesElem.getChildren()) {
+            children.add(child.getName());
+          }
+        }
+        for (Element child : entityElem.getChildren()) {
+          children.add(child.getName());
+        }
+        if (!children.isEmpty()) {
+          error("unprocessed children " + children, id);
+        }
+      }
+    }
+  }
+
+  /**
+   * Computes the smallest single span that encloses every 'begin,end' pair of an Anafora span
+   * string (pairs are separated by ';'). UIMA doesn't support disjoint spans, so the
+   * enclosing span is used instead.
+   *
+   * @param spanText text of the span element
+   * @param id ID of the entity being read, used only in the error message
+   * @return a two-element array {begin, end}, or null (after logging) if any pair is malformed
+   */
+  private static int[] parseEnclosingSpan(String spanText, String id) {
+    int begin = Integer.MAX_VALUE;
+    int end = Integer.MIN_VALUE;
+    for (String spanString : spanText.split(";")) {
+      String[] beginEndStrings = spanString.split(",");
+      if (beginEndStrings.length != 2) {
+        error("span not of the format 'number,number'", id);
+        return null;
+      }
+      int spanBegin;
+      int spanEnd;
+      try {
+        spanBegin = Integer.parseInt(beginEndStrings[0].trim());
+        spanEnd = Integer.parseInt(beginEndStrings[1].trim());
+      } catch (NumberFormatException e) {
+        error("span not of the format 'number,number'", id);
+        return null;
+      }
+      begin = Math.min(begin, spanBegin);
+      end = Math.max(end, spanEnd);
+    }
+    return new int[] {begin, end};
+  }
+
+  /**
+   * Looks up the child element with the given name, logging an error unless there is exactly
+   * one such child.
+   *
+   * @param elem parent element to search
+   * @param elemName name of the child element
+   * @param causeID ID of the annotation being read, used only in the error message
+   * @return the first matching child, or null if there is none
+   */
+  private static Element getSingleChild(Element elem, String elemName, String causeID) {
+    List<Element> matches = elem.getChildren(elemName);
+    int count = matches.size();
+    if (count != 1) {
+      error(String.format("not exactly one '%s' child", elemName), causeID);
+    }
+    if (count > 0) {
+      return matches.get(0);
+    }
+    return null;
+  }
+
+  private static Element removeSingleChild(Element elem, String elemName, String causeID)
{
+    Element child = getSingleChild(elem, elemName, causeID);
+    elem.removeChildren(elemName);
+    return child;
+  }
+
+  /**
+   * Reads the text of the child with the given name and detaches every child of that name
+   * from the parent.
+   *
+   * @param elem parent element to search and modify
+   * @param elemName name of the child element
+   * @param causeID ID of the annotation being read, used only in error messages
+   * @return the child's text, or null (after logging) if the child is missing or its text is
+   *         empty
+   */
+  private static String removeSingleChildText(Element elem, String elemName, String causeID) {
+    Element child = getSingleChild(elem, elemName, causeID);
+    if (child == null) {
+      // getSingleChild has already logged the missing child; there is nothing to remove
+      return null;
+    }
+    String text = child.getText();
+    if (text.isEmpty()) {
+      error(String.format("an empty '%s' child", elemName), causeID);
+      text = null;
+    }
+    elem.removeChildren(elemName);
+    return text;
+  }
+
+  private static void error(String found, String id) {
+    LOGGER.error(String.format("found %s in annotation with ID %s", found, id));
+  }
+}

Propchange: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/ae/DeepPheAnaforaXMLReader.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java?rev=1706472&view=auto
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java	(added)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java	Fri Oct  2 19:18:36 2015
@@ -0,0 +1,169 @@
+package org.apache.ctakes.relationextractor.pipelines;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.relationextractor.ae.DeepPheAnaforaXMLReader;
+import org.apache.ctakes.relationextractor.eval.SHARPXMI.CopyDocumentTextToGoldView;
+import org.apache.ctakes.relationextractor.eval.SHARPXMI.DocumentIDAnnotator;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.component.ViewCreatorAnnotator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.google.common.io.CharStreams;
+
+public class AnaforaGoldStandardEvaluationPipeline {
+
+  // NOTE(review): INPUT_DIR and OUTPUT_DIR are absolute paths on a developer machine
+  /** Directory handed to the Anafora reader; also locates the default input document. */
+  public static final File INPUT_DIR = new File("/Users/dima/Boston/Data/DeepPhe/Metastasis/patient93_report028_NOTE/");
+  // NOTE(review): "Dima" is capitalized here but not in INPUT_DIR; confirm which is intended
+  /** Directory the XMI files are written to. */
+  public static final String OUTPUT_DIR = "/Users/Dima/Boston/Out/";
+  /** Name of the CAS view that receives the gold annotations. */
+  public static final String GOLD_VIEW_NAME = "GoldView";
+
+  /**
+   * Runs the gold-standard pipeline over a set of text files.
+   *
+   * @param args paths of the text files to process; when empty, the single file inside
+   *        {@code INPUT_DIR} that carries the same name as the directory is processed
+   * @throws Exception if the pipeline cannot be built or fails while running
+   */
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<>();
+    if (args.length == 0) {
+      // derived from INPUT_DIR so the default document cannot drift from the constant
+      files.add(new File(INPUT_DIR, INPUT_DIR.getName()));
+    } else {
+      for (String path : args) {
+        files.add(new File(path));
+      }
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  protected static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() throws Exception
{
+
+    AggregateBuilder builder = new AggregateBuilder();
+    builder.add(UriToDocumentTextAnnotator.getDescription());
+    
+//    File preprocessDescFile = new File("desc/analysis_engine/RelationExtractorPreprocessor.xml");
+//    XMLParser parser = UIMAFramework.getXMLParser();
+//    XMLInputSource source = new XMLInputSource(preprocessDescFile);
+//    builder.add(parser.parseAnalysisEngineDescription(source));
+    
+    builder.add(AnalysisEngineFactory.createEngineDescription(
+        ViewCreatorAnnotator.class,
+        ViewCreatorAnnotator.PARAM_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    builder.add(AnalysisEngineFactory.createEngineDescription(CopyDocumentTextToGoldView.class));
+    builder.add(AnalysisEngineFactory.createEngineDescription(DocumentIDAnnotator.class),
+        CAS.NAME_DEFAULT_SOFA,
+        GOLD_VIEW_NAME);
+    
+    builder.add(
+        DeepPheAnaforaXMLReader.getDescription(INPUT_DIR),
+        CAS.NAME_DEFAULT_SOFA,
+        GOLD_VIEW_NAME);
+
+    // write out the CAS after all the above annotations
+    builder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        OUTPUT_DIR));
+
+    return builder;
+  }
+
+  /**
+   * The following class overrides a ClearTK utility annotator class for reading
+   * a text file into a JCas. The code is copy/pasted so that one tiny modification
+   * can be made for this corpus -- replace a single odd character (0xc) with a
+   * space since it trips up xml output.
+   */
+  public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    /**
+     * Reads the content behind the CAS URI, replaces every 0xc character with a space and
+     * stores the result as the document text.
+     *
+     * @param jCas the CAS whose URI is read and whose document text is set
+     * @throws AnalysisEngineProcessException if the URI cannot be opened or read
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      URI uri = ViewUriUtil.getURI(jCas);
+      String content;
+
+      // NOTE(review): no charset is given, so the platform default is used; confirm the
+      // corpus encoding
+      // try-with-resources closes the reader, and with it the URL stream, on every path
+      try (InputStreamReader reader = new InputStreamReader(uri.toURL().openStream())) {
+        content = CharStreams.toString(reader);
+      } catch (IOException e) {
+        // also covers MalformedURLException, which is a subclass of IOException
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      content = content.replace((char) 0xc, ' ');
+      jCas.setSofaDataString(content, "text/plain");
+    }
+  }
+
+  /**
+   * Adds a {@code LookupWindowAnnotation} over the span of every {@code Chunk} whose chunk
+   * type is "NP".
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    /**
+     * @param jCas the CAS whose chunks are read and which receives the new annotations
+     * @throws AnalysisEngineProcessException declared by the overridden method; not thrown
+     *         directly by this body
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // constant-first comparison: a chunk with no chunk type set is skipped, not an NPE
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
+  /** Serializes each CAS it receives to an XMI file in a configured directory. */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    /** Name of the configuration parameter bound to {@code xmiDirectory}. */
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Creates the output directory if it does not exist yet.
+     *
+     * @param context the UIMA context passed on to the superclass
+     * @throws ResourceInitializationException if the superclass fails to initialize or the
+     *         output directory does not exist and cannot be created
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      // fail here rather than on the first document; the final isDirectory() check covers a
+      // directory created by someone else between the first check and mkdirs()
+      if (!this.xmiDirectory.isDirectory()
+          && !this.xmiDirectory.mkdirs()
+          && !this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(
+            new IOException("could not create XMI output directory " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the CAS to the XMI file that {@code getXMIFile} derives from the CAS URI.
+     *
+     * @param jCas the CAS to serialize
+     * @throws AnalysisEngineProcessException if serialization or file output fails
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try (FileOutputStream outputStream = new FileOutputStream(xmiFile)) {
+        XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+        ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+        serializer.serialize(jCas.getCas(), handler);
+      } catch (SAXException | IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException
{
+    return getXMIFile(xmiDirectory, new File(ViewUriUtil.getURI(jCas).getPath()));
+  }
+
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    return new File(xmiDirectory, textFile.getName() + ".xmi");
+  }
+}

Propchange: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/pipelines/AnaforaGoldStandardEvaluationPipeline.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message