ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1690743 - in /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes: pipelines/ClinicalConceptViewer.java semtype/ semtype/BasicPipeline.java semtype/ClinicalConceptViewer.java
Date Mon, 13 Jul 2015 16:13:39 GMT
Author: dligach
Date: Mon Jul 13 16:13:39 2015
New Revision: 1690743

URL: http://svn.apache.org/r1690743
Log:
new package for processing one-per-line word files

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java   (with props)
Modified:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java?rev=1690743&r1=1690742&r2=1690743&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java Mon Jul 13 16:13:39 2015
@@ -37,12 +37,8 @@ import com.lexicalscope.jewel.cli.CliFac
 import com.lexicalscope.jewel.cli.Option;
 
 /**
- * Print gold standard relations and their context.
- * Full set of relations annotated in SHARP is the following:
- * 
- * affects, causes/brings_about, complicates/disrupts, contraindicates, degree_of, diagnoses,
- * indicates, is_indicated_for, location_of, manages/treats, manifestation_of, result_of
- * 
+ * Read cTAKES annotations from XMI files.
+ *  
  * @author dmitriy dligach
  */
 public class ClinicalConceptViewer {
@@ -64,8 +60,8 @@ public class ClinicalConceptViewer {
 	}
 
   /**
-   * Print CUIs and their attributes.
-   * 
+   * Print events and entities.
+   *  
    * @author dmitriy dligach
    */
   public static class RelationContextPrinter extends JCasAnnotator_ImplBase {

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java?rev=1690743&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java Mon Jul 13 16:13:39 2015
@@ -0,0 +1,251 @@
+package org.apache.ctakes.semtype;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.TypePrioritiesFactory;
+import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.google.common.io.CharStreams;
+
+public class BasicPipeline {
+
+  public static File inputDirectory = new File("/Users/dima/Boston/Vectors/SemType/Text/");
+  public static String outputDirectory = "/Users/Dima/Boston/Out/";
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  /**
+   * Assembles the preprocessing pipeline as an {@link AggregateBuilder}. Components are
+   * added in this order: document text reader, segment annotator, sentence detector,
+   * tokenizer, context-dependent tokenizer, part-of-speech tagger, chunker, two chunk
+   * adjusters, lookup-window creation and pruning, dictionary lookup, LVG, and finally
+   * the XMI writer configured with {@link #outputDirectory}.
+   *
+   * @return the populated builder; the caller is responsible for calling
+   *         {@code createAggregate()} on it
+   * @throws Exception if a component description cannot be created or a model/resource
+   *         file cannot be located
+   */
+  protected static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    // load the document text for the URI set by the collection reader
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(UriToDocumentTextAnnotatorCtakes.class));
+
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
+
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        SentenceDetector.class,
+        SentenceDetector.SD_MODEL_FILE_PARAM,
+        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip"));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // identify UMLS named entities: the remaining steps up to the dictionary lookup
+    // prepare the chunks and lookup windows
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    // (presumably deletes windows enveloped by another window -- confirm against OverlapAnnotator)
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows
+    // NOTE(review): the window annotation configured below is Sentence, not
+    // LookupWindowAnnotation, so the lookup windows built above do not appear to be
+    // used by this annotator -- confirm this is intended.
+    try {
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DefaultJCasTermAnnotator.class,
+          AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
+          "org.apache.ctakes.typesystem.type.textspan.Sentence",
+          JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+          ExternalResourceFactory.createExternalResourceDescription(
+              FileResourceImpl.class,
+              FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml"))
+          ));
+    } catch (FileNotFoundException e) {
+      // the dictionary descriptor could not be located; report it as an initialization failure
+      e.printStackTrace();
+      throw new ResourceInitializationException(e);
+    }
+
+    // add LVG annotator
+    aggregateBuilder.add(LvgAnnotator.createAnnotatorDescription());
+
+    // The dependency parser and semantic role labeler are currently disabled:
+//    // add dependency parser
+//    aggregateBuilder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
+//    
+//    // add semantic role labeler
+//    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        outputDirectory));
+
+    return aggregateBuilder;
+  }
+  
+  /**
+   * Overrides a ClearTK utility annotator that reads a text file into a JCas. The
+   * code is copy/pasted so that one tiny modification can be made for this corpus:
+   * every occurrence of the character 0xc is replaced with a space, since that
+   * character trips up the XML output.
+   */
+  public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    /**
+     * Reads the document at the JCas's view URI, replaces each 0xc character with a
+     * space and stores the result as the sofa text with MIME type "text/plain".
+     *
+     * @param jCas the CAS whose URI identifies the document to load
+     * @throws AnalysisEngineProcessException if the URI cannot be converted to a URL
+     *         or the document cannot be read
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      URI uri = ViewUriUtil.getURI(jCas);
+      String content;
+
+      try {
+        // NOTE(review): no charset is given, so the platform default is used --
+        // confirm that matches the corpus encoding.
+        InputStreamReader reader = new InputStreamReader(uri.toURL().openStream());
+        try {
+          content = CharStreams.toString(reader);
+        } finally {
+          // release the underlying stream even when reading fails
+          reader.close();
+        }
+        content = content.replace((char) 0xc, ' ');
+        jCas.setSofaDataString(content, "text/plain");
+      } catch (MalformedURLException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+  
+  /**
+   * Creates a {@link LookupWindowAnnotation} with the same span as every chunk whose
+   * chunk type is "NP".
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    /**
+     * Adds one lookup window to the indexes for each NP chunk in the given JCas.
+     *
+     * @param jCas the CAS whose chunks are inspected and to which windows are added
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // constant-first comparison: a chunk with no type set is skipped instead of
+        // raising a NullPointerException
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+  
+  /**
+   * Serializes each processed CAS to an XMI file inside a configured directory. The
+   * output file name is derived from the source document via {@code getXMIFile}.
+   */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    /** Name of the mandatory configuration parameter holding the output directory. */
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Ensures the output directory exists, creating it (and missing parents) if needed.
+     *
+     * @throws ResourceInitializationException if the directory cannot be created or the
+     *         configured path exists but is not a directory
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      // mkdirs() reports failure through its return value, not an exception; the final
+      // isDirectory() re-check tolerates another process creating the directory first.
+      if (!this.xmiDirectory.isDirectory()
+          && !this.xmiDirectory.mkdirs()
+          && !this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(
+            new IOException("Cannot create XMI output directory: " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the given CAS as XMI to the file chosen by {@code getXMIFile}.
+     *
+     * @param jCas the CAS to serialize
+     * @throws AnalysisEngineProcessException if the file cannot be written or
+     *         serialization fails
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          // always release the file handle, even if serialization fails
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+
+  /**
+   * Resolves the XMI output file for the document held in the given JCas.
+   *
+   * @param xmiDirectory directory that will contain the XMI file
+   * @param jCas CAS whose view URI identifies the source document
+   * @return the file inside {@code xmiDirectory} named after the source document
+   */
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    String sourcePath = ViewUriUtil.getURI(jCas).getPath();
+    File sourceFile = new File(sourcePath);
+    return getXMIFile(xmiDirectory, sourceFile);
+  }
+  
+  /**
+   * Resolves the XMI output file for a source text file: the source file's name with
+   * ".xmi" appended, placed inside the given directory.
+   *
+   * @param xmiDirectory directory that will contain the XMI file
+   * @param textFile source text file; only its name is used
+   * @return the file inside {@code xmiDirectory} named after {@code textFile}
+   */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    String xmiName = textFile.getName() + ".xmi";
+    return new File(xmiDirectory, xmiName);
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/BasicPipeline.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java?rev=1690743&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java Mon Jul 13 16:13:39 2015
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.semtype;
+
+import java.io.File;
+
+import org.apache.ctakes.pipelines.Utils;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Read cTAKES annotations from XMI files and print every event and entity mention
+ * to standard output, one {@code text|type} pair per line.
+ *
+ * @author dmitriy dligach
+ */
+public class ClinicalConceptViewer {
+
+  /** Command-line options for this tool. */
+  static interface Options {
+
+    @Option(
+        longName = "xmi-dir",
+        description = "path to xmi files")
+    public File getInputDirectory();
+  }
+
+  /**
+   * Parses the command line, reads the XMI files from the given directory and runs
+   * {@link RelationContextPrinter} over each of them.
+   *
+   * @param args command-line arguments; see {@link Options}
+   * @throws Exception if argument parsing, reader creation or the pipeline fails
+   */
+  public static void main(String[] args) throws Exception {
+
+    Options options = CliFactory.parseArguments(Options.class, args);
+    CollectionReader collectionReader = Utils.getCollectionReader(options.getInputDirectory());
+    AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(RelationContextPrinter.class);
+    SimplePipeline.runPipeline(collectionReader, annotationConsumer);
+  }
+
+  /**
+   * Print events and entities.
+   *
+   * @author dmitriy dligach
+   */
+  public static class RelationContextPrinter extends JCasAnnotator_ImplBase {
+
+    /**
+     * Prints all event mentions, then all entity mentions, found in the
+     * "_InitialView" view of the given JCas.
+     *
+     * @param jCas the CAS to read mentions from
+     * @throws AnalysisEngineProcessException if the view cannot be obtained
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      JCas systemView;
+      try {
+        systemView = jCas.getView("_InitialView");
+      } catch (CASException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+
+      printMentions(JCasUtil.select(systemView, EventMention.class));
+      printMentions(JCasUtil.select(systemView, EntityMention.class));
+    }
+
+    /** Prints each mention as lower-cased covered text, a '|' and the mention's simple class name. */
+    private static void printMentions(Iterable<? extends IdentifiedAnnotation> mentions) {
+      for (IdentifiedAnnotation mention : mentions) {
+        // Locale.ROOT keeps the output identical regardless of the JVM's default locale
+        String text = mention.getCoveredText().toLowerCase(java.util.Locale.ROOT);
+        String semanticType = mention.getClass().getSimpleName();
+        System.out.format("%s|%s\n", text, semanticType);
+      }
+    }
+  }
+}
+
+  
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/ClinicalConceptViewer.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message