ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1693617 - in /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes: pipelines/ pipelines/nicu/ semtype/ utils/
Date Fri, 31 Jul 2015 15:35:31 GMT
Author: dligach
Date: Fri Jul 31 15:35:30 2015
New Revision: 1693617

URL: http://svn.apache.org/r1693617
Log:
refactoring + added polarity and uncertainty 

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java
  (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java
  (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/Utils.java
      - copied, changed from r1689954, ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/Utils.java
Removed:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/Utils.java
Modified:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/SemanticTypePrinter.java
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/DictionaryLookupPipeline.java

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java?rev=1693617&r1=1693616&r2=1693617&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java
(original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/ClinicalConceptViewer.java
Fri Jul 31 15:35:30 2015
@@ -23,6 +23,7 @@ import java.io.File;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.Utils;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CASException;

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/SemanticTypePrinter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/SemanticTypePrinter.java?rev=1693617&r1=1693616&r2=1693617&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/SemanticTypePrinter.java
(original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/SemanticTypePrinter.java
Fri Jul 31 15:35:30 2015
@@ -26,6 +26,7 @@ import org.apache.ctakes.typesystem.type
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
 import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.utils.Utils;
 import org.apache.uima.analysis_engine.AnalysisEngine;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CASException;

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java?rev=1693617&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java
(added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java
Fri Jul 31 15:35:30 2015
@@ -0,0 +1,247 @@
+package org.apache.ctakes.pipelines.nicu;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine;
+import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine;
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.TypePrioritiesFactory;
+import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.google.common.io.CharStreams;
+
+public class BasicAnnotations {
+
+  public static File inputDirectory = new File("/Users/dima/Boston/Nicu/Text/");
+  public static String outputDirectory = "/Users/dima/Boston/Nicu/Xmi/";
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  /**
+   * Builds the preprocessing aggregate. Engines are added in this order: document text
+   * loading, segments, sentences, tokens, context-dependent token merging, part-of-speech
+   * tags, chunks, chunk adjustment, lookup windows, dictionary lookup, polarity,
+   * uncertainty and, last, XMI serialization into {@link #outputDirectory}.
+   *
+   * @return the builder holding the engine descriptions in the order listed above
+   * @throws ResourceInitializationException if the dictionary descriptor file cannot be located
+   * @throws Exception if any other engine description cannot be created
+   */
+  protected static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(UriToDocumentTextAnnotatorCtakes.class));
+
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
+
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        SentenceDetector.class,
+        SentenceDetector.SD_MODEL_FILE_PARAM,
+        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip"));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // --- named entity lookup: chunk adjustment, lookup windows, dictionary ---
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // run the dictionary lookup
+    // NOTE(review): the window annotation configured here is Sentence, not the
+    // LookupWindowAnnotation produced by the steps above -- confirm this is intended.
+    try {
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DefaultJCasTermAnnotator.class,
+          AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
+          "org.apache.ctakes.typesystem.type.textspan.Sentence",
+          JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+          ExternalResourceFactory.createExternalResourceDescription(
+              FileResourceImpl.class,
+              FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml"))
+          ));
+    } catch (FileNotFoundException e) {
+      // The cause travels with the wrapping exception; printing the stack trace
+      // here as well would only report the same failure twice.
+      throw new ResourceInitializationException(e);
+    }
+
+    aggregateBuilder.add(PolarityCleartkAnalysisEngine.createAnnotatorDescription());
+    aggregateBuilder.add(UncertaintyCleartkAnalysisEngine.createAnnotatorDescription());
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        outputDirectory));
+
+    return aggregateBuilder;
+  }
+  
+  /**
+   * Extends the ClearTK utility annotator that reads a text file into a JCas. The code is
+   * copy/pasted so that one tiny modification can be made for this corpus: every form feed
+   * character (0x0C) is replaced with a space, since it trips up xml output.
+   */
+  public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    /**
+     * Reads the document at the view URI, replaces form feeds with spaces and stores the
+     * result as the plain-text sofa of the JCas.
+     *
+     * @param jCas the CAS whose view URI names the document to load
+     * @throws AnalysisEngineProcessException if the URI is not a valid URL or reading fails
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      URI uri = ViewUriUtil.getURI(jCas);
+      String content;
+
+      try {
+        // NOTE(review): no charset is passed, so the platform default encoding is used.
+        InputStreamReader reader = new InputStreamReader(uri.toURL().openStream());
+        try {
+          content = CharStreams.toString(reader);
+        } finally {
+          // Closing the reader also closes the underlying URL stream, which
+          // was previously left open for every document.
+          reader.close();
+        }
+        content = content.replace((char) 0xc, ' ');
+        jCas.setSofaDataString(content, "text/plain");
+      } catch (MalformedURLException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+  
+  /**
+   * Adds a {@link LookupWindowAnnotation} with the same span as every chunk whose chunk
+   * type is "NP".
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    /**
+     * Creates and indexes one lookup window per NP chunk in the given JCas.
+     *
+     * @param jCas the CAS whose chunks are copied
+     * @throws AnalysisEngineProcessException declared by the overridden method; not thrown here
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // Constant-first comparison: a chunk with no chunk type set is skipped
+        // instead of causing a NullPointerException.
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+  
+  /**
+   * Serializes each processed CAS to an XMI file inside the directory given by
+   * {@link #PARAM_XMI_DIRECTORY}.
+   */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    /** Name of the mandatory configuration parameter holding the output directory. */
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Creates the output directory when it does not exist yet.
+     *
+     * @param context the UIMA context passed on to the superclass
+     * @throws ResourceInitializationException if the superclass fails to initialize or the
+     *           output directory cannot be created
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      // mkdirs() reports failure only through its return value. The final
+      // isDirectory() check tolerates the directory being created concurrently.
+      if (!this.xmiDirectory.isDirectory()
+          && !this.xmiDirectory.mkdirs()
+          && !this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(
+            new IOException("Cannot create XMI output directory: " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the given CAS to the XMI file chosen by {@code getXMIFile}.
+     *
+     * @param jCas the CAS to serialize
+     * @throws AnalysisEngineProcessException if serialization or file output fails
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+
+  /**
+   * Resolves the XMI output file for the document held in a JCas, using the path component
+   * of the view URI as the source text file.
+   *
+   * @param xmiDirectory directory that will contain the XMI file
+   * @param jCas the CAS whose view URI identifies the source document
+   * @return the XMI file for that document inside {@code xmiDirectory}
+   * @throws AnalysisEngineProcessException if the view URI cannot be obtained
+   */
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    URI documentUri = ViewUriUtil.getURI(jCas);
+    File textFile = new File(documentUri.getPath());
+    return getXMIFile(xmiDirectory, textFile);
+  }
+  
+  /**
+   * Maps a text file to its XMI counterpart: same file name with ".xmi" appended, placed
+   * in the given directory.
+   *
+   * @param xmiDirectory directory that will contain the XMI file
+   * @param textFile the source text file; only its name is used
+   * @return the XMI file inside {@code xmiDirectory}
+   */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    String xmiName = textFile.getName() + ".xmi";
+    return new File(xmiDirectory, xmiName);
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/BasicAnnotations.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java?rev=1693617&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java
(added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java
Fri Jul 31 15:35:30 2015
@@ -0,0 +1,87 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.pipelines.nicu;
+
+import java.io.File;
+
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.utils.Utils;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+
+import com.google.common.collect.Lists;
+import com.lexicalscope.jewel.cli.CliFactory;
+import com.lexicalscope.jewel.cli.Option;
+
+/**
+ * Command-line tool that prints the event and entity mentions of every document supplied
+ * by the collection reader built over the directory given with {@code --xmi-dir}.
+ *
+ * @author dmitriy dligach
+ */
+public class PrintEntitiesAndEvents {
+
+  /** Command-line options, parsed by JewelCli. */
+  interface Options {
+
+    @Option(
+        longName = "xmi-dir",
+        description = "path to xmi files")
+    File getInputDirectory();
+  }
+
+  /**
+   * Parses the options, then runs the printing annotator over all documents.
+   *
+   * @param args command-line arguments; see {@link Options}
+   * @throws Exception if argument parsing, pipeline construction or processing fails
+   */
+  public static void main(String[] args) throws Exception {
+    Options options = CliFactory.parseArguments(Options.class, args);
+    File xmiDirectory = options.getInputDirectory();
+
+    CollectionReader collectionReader = Utils.getCollectionReader(xmiDirectory);
+    AnalysisEngine annotationConsumer = AnalysisEngineFactory.createEngine(RelationContextPrinter.class);
+
+    SimplePipeline.runPipeline(collectionReader, annotationConsumer);
+  }
+
+  /**
+   * Writes one {@code text|type} line to standard output per mention: first all event
+   * mentions, then all entity mentions.
+   *
+   * @author dmitriy dligach
+   */
+  public static class RelationContextPrinter extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      for (EventMention event : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
+        printMention(event.getCoveredText(), event.getClass());
+      }
+
+      for (EntityMention entity : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
+        printMention(entity.getCoveredText(), entity.getClass());
+      }
+    }
+
+    /** Prints the lower-cased covered text and the simple class name, separated by '|'. */
+    private static void printMention(String coveredText, Class<?> mentionClass) {
+      String text = coveredText.toLowerCase();
+      String semanticType = mentionClass.getSimpleName();
+      System.out.format("%s|%s\n", text, semanticType);
+    }
+  }
+}
+
+  
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/nicu/PrintEntitiesAndEvents.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/DictionaryLookupPipeline.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/DictionaryLookupPipeline.java?rev=1693617&r1=1693616&r2=1693617&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/DictionaryLookupPipeline.java
(original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/semtype/DictionaryLookupPipeline.java
Fri Jul 31 15:35:30 2015
@@ -56,8 +56,8 @@ import com.google.common.io.CharStreams;
 
 public class DictionaryLookupPipeline {
 
-  public static File inputDirectory = new File("/Users/dima/Boston/Vectors/SemType/Text/");
-  public static String outputDirectory = "/Users/Dima/Boston/Out/";
+  public static File inputDirectory = new File("/Users/dima/Boston/Nicu/Text/");
+  public static String outputDirectory = "/Users/dima/Boston/Nicu/Xmi/";
 
   public static void main(String[] args) throws Exception {
 

Copied: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/Utils.java (from r1689954,
ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/Utils.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/Utils.java?p2=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/Utils.java&p1=ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/Utils.java&r1=1689954&r2=1693617&rev=1693617&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/Utils.java (original)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/utils/Utils.java Fri Jul 31
15:35:30 2015
@@ -16,7 +16,7 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.ctakes.pipelines;
+package org.apache.ctakes.utils;
 
 import java.io.File;
 import java.util.ArrayList;



Mime
View raw message