ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1689886 - in /ctakes/sandbox/ctakes-wsd: ./ .settings/ src/main/java/org/apache/ctakes/sample/ src/main/java/org/apache/ctakes/sample/pipeline/ src/main/java/org/apache/ctakes/wsd/pipelines/
Date Wed, 08 Jul 2015 14:28:36 GMT
Author: dligach
Date: Wed Jul  8 14:28:35 2015
New Revision: 1689886

URL: http://svn.apache.org/r1689886
Log:
added a few more useful pipelines and fixed the old ones that were broken

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java   (with props)
Removed:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/
Modified:
    ctakes/sandbox/ctakes-wsd/.classpath
    ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs
    ctakes/sandbox/ctakes-wsd/pom.xml

Modified: ctakes/sandbox/ctakes-wsd/.classpath
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.classpath?rev=1689886&r1=1689885&r2=1689886&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.classpath (original)
+++ ctakes/sandbox/ctakes-wsd/.classpath Wed Jul  8 14:28:35 2015
@@ -22,12 +22,12 @@
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
-	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>

Modified: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs?rev=1689886&r1=1689885&r2=1689886&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs (original)
+++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs Wed Jul  8 14:28:35 2015
@@ -1,5 +1,5 @@
 eclipse.preferences.version=1
-org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
-org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7
+org.eclipse.jdt.core.compiler.compliance=1.7
 org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
-org.eclipse.jdt.core.compiler.source=1.6
+org.eclipse.jdt.core.compiler.source=1.7

Modified: ctakes/sandbox/ctakes-wsd/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/pom.xml?rev=1689886&r1=1689885&r2=1689886&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-wsd/pom.xml (original)
+++ ctakes/sandbox/ctakes-wsd/pom.xml Wed Jul  8 14:28:35 2015
@@ -1,36 +1,15 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
-
-    Licensed to the Apache Software Foundation (ASF) under one
-    or more contributor license agreements.  See the NOTICE file
-    distributed with this work for additional information
-    regarding copyright ownership.  The ASF licenses this file
-    to you under the Apache License, Version 2.0 (the
-    "License"); you may not use this file except in compliance
-    with the License.  You may obtain a copy of the License at
-
-      http://www.apache.org/licenses/LICENSE-2.0
-
-    Unless required by applicable law or agreed to in writing,
-    software distributed under the License is distributed on an
-    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-    KIND, either express or implied.  See the License for the
-    specific language governing permissions and limitations
-    under the License.
-
--->
-<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
   <modelVersion>4.0.0</modelVersion>
-  <artifactId>ctakes-temporal</artifactId>
-    <packaging>jar</packaging>
-    <name>Apache cTAKES Temporal Information Extraction</name>
   <parent>
     <groupId>org.apache.ctakes</groupId>
     <artifactId>ctakes</artifactId>
-    <version>3.2.1-SNAPSHOT</version>
+    <version>3.2.3-SNAPSHOT</version>
   </parent>
-  <dependencies>
+  <artifactId>ctakes-wsd</artifactId>
+  <packaging>jar</packaging>
+  <description>Mostly written by Dmitriy (Dima) Dligach</description>
+  <name>Miscellaneous cTAKES code</name>
+    <dependencies>
     <dependency>
       <groupId>org.apache.ctakes</groupId>
       <artifactId>ctakes-temporal-res</artifactId>
@@ -84,18 +63,9 @@
       <artifactId>ctakes-constituency-parser</artifactId>
     </dependency>
     <dependency>
-      <groupId>net.sourceforge.ctakesresources</groupId>
-      <artifactId>ctakes-resources-umls2011ab</artifactId>
-      <version>3.1.1</version>
-    </dependency>
-    <dependency>
-      <groupId>org.jdom</groupId>
-      <artifactId>jdom2</artifactId>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-clinical-pipeline</artifactId>
     </dependency>
-        <dependency>
-            <groupId>com.lexicalscope.jewelcli</groupId>
-            <artifactId>jewelcli</artifactId>
-        </dependency>
     <dependency>
       <groupId>org.cleartk</groupId>
       <artifactId>cleartk-util</artifactId>
@@ -118,10 +88,6 @@
     </dependency>
     <dependency>
       <groupId>org.cleartk</groupId>
-      <artifactId>cleartk-syntax</artifactId>
-    </dependency>
-    <dependency>
-      <groupId>org.cleartk</groupId>
       <artifactId>cleartk-ml-libsvm</artifactId>
     </dependency>
     <dependency>
@@ -204,4 +170,4 @@
       </plugins>
     </pluginManagement>
   </build>
-</project>
+</project>
\ No newline at end of file

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java?rev=1689886&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java Wed Jul  8 14:28:35 2015
@@ -0,0 +1,141 @@
+package org.apache.ctakes.sample.pipeline;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.ctakes.core.cr.XMIReader;
+import org.apache.ctakes.typesystem.type.refsem.OntologyConcept;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Generate various data sets for analyzing polysemy.
+ * 
+ * @author dmitriy dligach
+ */
+public class Analyze {
+
+  public static final String GOLD_VIEW_NAME = "GoldView";
+
+  public static File inputDirectory = new File("/Users/dima/Boston/Data/Sharp/FromSameer/Xmi");
+  public static void main(String[] args) throws Exception {
+
+      List<File> xmiFiles = Arrays.asList(inputDirectory.listFiles());
+    String[] paths = new String[xmiFiles.size()];
+    for (int i = 0; i < paths.length; ++i) {
+      paths[i] = xmiFiles.get(i).getPath();
+    }
+
+    CollectionReader xmiCollectionReader = CollectionReaderFactory.createCollectionReader(
+        XMIReader.class,
+        XMIReader.PARAM_FILES,
+        paths);
+
+    AnalysisEngine consumer = AnalysisEngineFactory.createPrimitive(PrintCuis.class);
+
+    SimplePipeline.runPipeline(xmiCollectionReader, consumer);
+  }
+
+  public static class PrintConceptSemanticTypes extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
+        // for some reason in gold begin offset for some mentions is a huge number
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
+          continue;
+        }
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        System.out.format("%s|%s\n", text, semanticType);
+      }
+
+      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
+        // avoid weird crashes
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
+          continue;
+        }
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        System.out.format("%s|%s\n", text, semanticType);
+      }
+    }
+  }
+
+  public static class PrintCuis extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+
+      for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) {
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
+          continue;
+        }
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        for(String code : getOntologyConceptCodes(mention)) {
+          System.out.format("%s|%s|%s\n", text, semanticType, code);
+        }
+      }
+
+      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
+        if(mention.getBegin() > jCas.getDocumentText().length()) {
+          continue;
+        }
+        String text = mention.getCoveredText().toLowerCase();
+        String semanticType = mention.getClass().getSimpleName();
+        for(String code : getOntologyConceptCodes(mention)) {
+          System.out.format("%s|%s|%s\n", text, semanticType, code);
+        }
+      }
+    }
+  }
+
+  /**
+   * Get the CUIs, RxNorm codes, etc.
+   */
+  public static Set<String> getOntologyConceptCodes(IdentifiedAnnotation identifiedAnnotation) {
+
+    Set<String> codes = new HashSet<String>();
+
+    FSArray fsArray = identifiedAnnotation.getOntologyConceptArr();
+    if(fsArray == null) {
+      return codes;
+    }
+
+    for(FeatureStructure featureStructure : fsArray.toArray()) {
+      OntologyConcept ontologyConcept = (OntologyConcept) featureStructure;
+
+      if(ontologyConcept instanceof UmlsConcept) {
+        UmlsConcept umlsConcept = (UmlsConcept) ontologyConcept;
+        String code = umlsConcept.getCui();
+        codes.add(code);
+      } else { // SNOMED or RxNorm
+        String code = ontologyConcept.getCodingScheme() + ontologyConcept.getCode();
+        codes.add(code);
+      }
+    }
+
+    return codes;
+  }
+}
+

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Analyze.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java?rev=1689886&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java Wed Jul  8 14:28:35 2015
@@ -0,0 +1,253 @@
+package org.apache.ctakes.sample.pipeline;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup2.ae.AbstractJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator;
+import org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.TypePrioritiesFactory;
+import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.google.common.io.CharStreams;
+
+public class Basic {
+
+  public static File inputDirectory = new File("/Users/Dima/Boston/Data/Sharp/FromSameer/Text/");
+  public static String outputDirectory = "/Users/Dima/Boston/Out/";
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  protected static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(UriToDocumentTextAnnotatorCtakes.class));
+
+    // identify segments 
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
+
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        SentenceDetector.class,
+        SentenceDetector.SD_MODEL_FILE_PARAM,
+        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip"));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // identify UMLS named entities
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows
+    try {
+      aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(DefaultJCasTermAnnotator.class,
+          AbstractJCasTermAnnotator.PARAM_WINDOW_ANNOT_PRP,
+          "org.apache.ctakes.typesystem.type.textspan.Sentence",
+          JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
+          ExternalResourceFactory.createExternalResourceDescription(
+              FileResourceImpl.class,
+              FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml"))
+          ));
+    } catch (FileNotFoundException e) {
+      e.printStackTrace();
+      throw new ResourceInitializationException(e);
+    }
+
+    aggregateBuilder.add(LvgAnnotator.createAnnotatorDescription());
+
+    // add dependency parser
+    aggregateBuilder.add(ClearNLPDependencyParserAE.createAnnotatorDescription());
+    
+    // add semantic role labeler
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        outputDirectory));
+
+    return aggregateBuilder;
+  }
+  
+  /* 
+   * The following class overrides a ClearTK utility annotator class for reading
+   * a text file into a JCas. The code is copy/pasted so that one tiny modification
+   * can be made for this corpus -- replace a single odd character (0xc) with a 
+   * space since it trips up xml output.  
+   */
+  public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      URI uri = ViewUriUtil.getURI(jCas);
+      String content;
+
+      try {
+        content = CharStreams.toString(new InputStreamReader(uri.toURL().openStream()));
+        content = content.replace((char) 0xc, ' ');
+        jCas.setSofaDataString(content, "text/plain");
+      } catch (MalformedURLException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }  
+  }
+  
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        if (chunk.getChunkType().equals("NP")) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+  
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      if (!this.xmiDirectory.exists()) {
+        this.xmiDirectory.mkdirs();
+      }
+    }
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    return getXMIFile(xmiDirectory, new File(ViewUriUtil.getURI(jCas).getPath()));
+  }
+  
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    return new File(xmiDirectory, textFile.getName() + ".xmi");
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Basic.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java?rev=1689886&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java Wed Jul  8 14:28:35 2015
@@ -0,0 +1,42 @@
+package org.apache.ctakes.sample.pipeline;
+
+import java.io.File;
+
+import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory;
+import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes;
+import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.CollectionReaderFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+
+public class Experiment {
+
+  public static File inputDirectory = new File("/Users/dima/Boston/Data/Sharp/FromSameer/Text");
+  public static String outputDirectory = "/Users/dima/Boston/Out";
+
+  public static void main(String[] args) throws Exception {
+
+    String path = "/Users/dima/Boston/Workspaces/cTakes/ctakes/ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml";
+    CollectionReader collectionReader = CollectionReaderFactory.createReaderFromPath(
+        path,
+        FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
+        inputDirectory);
+
+    AggregateBuilder pipeline = new AggregateBuilder();
+    pipeline.add(ClinicalPipelineFactory.getDefaultPipeline());
+    
+    AnalysisEngine xmiWriter = AnalysisEngineFactory.createEngine(
+        XmiWriterCasConsumerCtakes.class,
+        XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR,
+        outputDirectory);
+
+    SimplePipeline.runPipeline(collectionReader,
+        pipeline.createAggregate(),
+        xmiWriter);
+  }
+}
+
+

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Experiment.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java?rev=1689886&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java Wed Jul  8 14:28:35 2015
@@ -0,0 +1,366 @@
+package org.apache.ctakes.sample.pipeline;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.relationextractor.eval.SHARPXMI.DocumentIDAnnotator;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.component.ViewCreatorAnnotator;
+import org.apache.uima.fit.component.ViewTextCopierAnnotator;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.factory.ExternalResourceFactory;
+import org.apache.uima.fit.factory.TypePrioritiesFactory;
+import org.apache.uima.fit.factory.TypeSystemDescriptionFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.fit.util.JCasUtil;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class Preprocess {
+
+  public static final String GOLD_VIEW_NAME = "GoldView"; // name of the CAS view created for the manual (gold) annotations
+
+  public static File inputDirectory = new File("/Users/Dima/Boston/Data/Sharp/FromSameer/Text/"); // main() feeds every entry of this directory to the pipeline
+  public static String outputDirectory = "/Users/Dima/Boston/Out"; // handed to XMIWriter as its XMI directory
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  /**
+   * Assembles the preprocessing pipeline. Components are added in this order: document text
+   * loading, gold view creation and manual annotation reading, segments, sentences, tokens,
+   * part-of-speech tags, chunks, chunk adjustment, lookup windows, UMLS dictionary lookup, LVG,
+   * dependency parsing, semantic role labeling and finally an {@link XMIWriter} that writes to
+   * {@link #outputDirectory}.
+   *
+   * @return the builder holding all component descriptions; call {@code createAggregate()} on it
+   *         to obtain a runnable engine
+   * @throws Exception if a model or resource cannot be located or a description cannot be created
+   */
+  public static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() throws Exception {
+
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+
+    // read manual annotations into gold view
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ViewCreatorAnnotator.class,
+        ViewCreatorAnnotator.PARAM_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ViewTextCopierAnnotator.class,
+        ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+        CAS.NAME_DEFAULT_SOFA,
+        ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    // need document ids so that SHARP XML reader can figure out the path to xml files
+    aggregateBuilder.add(
+        AnalysisEngineFactory.createEngineDescription(DocumentIDAnnotator.class),
+        CAS.NAME_DEFAULT_SOFA,
+        GOLD_VIEW_NAME);
+    aggregateBuilder.add(
+        AnalysisEngineFactory.createEngineDescription(SHARPKnowtatorXMLReader.class),
+        CAS.NAME_DEFAULT_SOFA,
+        GOLD_VIEW_NAME);
+
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        SentenceDetector.class,
+        SentenceDetector.SD_MODEL_FILE_PARAM,
+        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip"));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // identify UMLS named entities
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows; "LookupDescriptor", "DbConnection", "RxnormIndexReader"
+    // and "OrangeBookIndexReader" are sibling name/value pairs of the annotator description
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        UmlsDictionaryLookupAnnotator.class,
+        "ctakes.umlsaddr",
+        "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
+        "ctakes.umlsvendor",
+        "NLM-6515182895",
+        "LookupDescriptor",
+        ExternalResourceFactory.createExternalResourceDescription(
+            FileResourceImpl.class,
+            FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/LookupDesc_Db.xml")),
+        "DbConnection",
+        ExternalResourceFactory.createExternalResourceDescription(
+            JdbcConnectionResourceImpl.class,
+            "",
+            JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
+            "org.hsqldb.jdbcDriver",
+            JdbcConnectionResourceImpl.PARAM_URL,
+            // Should be the following but it's WAY too slow
+            // "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+            "jdbc:hsqldb:file:target/unpacked/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+        "RxnormIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/rxnorm_index").getAbsoluteFile()),
+        "OrangeBookIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/OrangeBook"))));
+
+    // add lvg annotator
+    String[] xeroxTreebankMap = {
+        "adj|JJ",
+        "adv|RB",
+        "aux|AUX",
+        "compl|CS",
+        "conj|CC",
+        "det|DET",
+        "modal|MD",
+        "noun|NN",
+        "prep|IN",
+        "pron|PRP",
+        "verb|VB" };
+    String[] exclusionSet = {
+        "and",
+        "And",
+        "by",
+        "By",
+        "for",
+        "For",
+        "in",
+        "In",
+        "of",
+        "Of",
+        "on",
+        "On",
+        "the",
+        "The",
+        "to",
+        "To",
+        "with",
+        "With" };
+    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createEngineDescription(
+        LvgAnnotator.class,
+        "UseSegments",
+        false,
+        "SegmentsToSkip",
+        new String[0],
+        "UseCmdCache",
+        false,
+        "CmdCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_norm.voc",
+        "CmdCacheFrequencyCutoff",
+        20,
+        "ExclusionSet",
+        exclusionSet,
+        "XeroxTreebankMap",
+        xeroxTreebankMap,
+        "LemmaCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_lemma.voc",
+        "UseLemmaCache",
+        false,
+        "LemmaCacheFrequencyCutoff",
+        20,
+        "PostLemmas",
+        true,
+        "LvgCmdApi",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LvgCmdApiResourceImpl.class,
+            new File(LvgCmdApiResourceImpl.class.getResource(
+                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
+    aggregateBuilder.add(lvgAnnotator);
+
+    // add dependency parser
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPDependencyParserAE.class));
+
+    // add semantic role labeler
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(ClearNLPSemanticRoleLabelerAE.class));
+
+//    // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
+//    if(this.treebankDirectory != null){
+//      aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory));
+//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
+//    }else{
+//      // add ctakes constituency parses to system view
+//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
+//    }
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createEngineDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        outputDirectory));
+
+    return aggregateBuilder;
+  }
+
+  /**
+   * Adds a {@link LookupWindowAnnotation} covering the span of every {@link Chunk} whose chunk
+   * type is "NP".
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // constant-first comparison: a chunk with no type set is skipped instead of
+        // raising a NullPointerException
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+  
+  // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
+  public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase
{
+    private static Pattern SECTION_PATTERN = Pattern.compile(
+        "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
+        Pattern.DOTALL);
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText());
+      while (matcher.find()) {
+        Segment segment = new Segment(jCas);
+        segment.setBegin(matcher.start() + matcher.group(1).length());
+        segment.setEnd(matcher.end() - matcher.group(3).length());
+        segment.setId(matcher.group(2));
+        segment.addToIndexes();
+      }
+    }
+  }
+  
+  /**
+   * Returns the file inside {@code xmiDirectory} whose name is the name of {@code textFile}
+   * with ".xmi" appended.
+   */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    String xmiFileName = textFile.getName() + ".xmi";
+    return new File(xmiDirectory, xmiFileName);
+  }
+
+  /**
+   * Returns the XMI file for the document held in {@code jCas}, derived from the path of the
+   * URI that {@code ViewUriUtil} reports for it.
+   */
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    String documentPath = ViewUriUtil.getURI(jCas).getPath();
+    File textFile = new File(documentPath);
+    return getXMIFile(xmiDirectory, textFile);
+  }
+
+  /**
+   * Serializes each processed CAS as XMI into the configured directory. The target file is
+   * chosen by {@code getXMIFile(File, JCas)}.
+   */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    /** Name of the mandatory configuration parameter holding the output directory. */
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Creates the output directory if it does not exist yet.
+     *
+     * @throws ResourceInitializationException if the output directory is missing and cannot be
+     *         created, or the configured path is not a directory
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      if (!this.xmiDirectory.exists()) {
+        this.xmiDirectory.mkdirs();
+      }
+      // mkdirs() reports failure only through its return value; fail here rather than on
+      // the first document that cannot be written
+      if (!this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(
+            new IOException("Cannot create XMI output directory: " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the CAS behind {@code jCas} to its XMI file.
+     *
+     * @throws AnalysisEngineProcessException wrapping any SAX or I/O failure
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          // always release the file handle, even when serialization fails
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/sample/pipeline/Preprocess.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message