ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1560454 - in /ctakes/sandbox/ctakes-wsd: ./ .settings/ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/wsd/ src/main/java/org/apache/ctakes/wsd/pip...
Date Wed, 22 Jan 2014 18:15:35 GMT
Author: dligach
Date: Wed Jan 22 18:15:34 2014
New Revision: 1560454

URL: http://svn.apache.org/r1560454
Log: (empty)

Added:
    ctakes/sandbox/ctakes-wsd/.classpath   (with props)
    ctakes/sandbox/ctakes-wsd/.project   (with props)
    ctakes/sandbox/ctakes-wsd/.settings/
    ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs   (with props)
    ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs   (with props)
    ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs   (with props)
    ctakes/sandbox/ctakes-wsd/pom.xml   (with props)
    ctakes/sandbox/ctakes-wsd/src/
    ctakes/sandbox/ctakes-wsd/src/main/
    ctakes/sandbox/ctakes-wsd/src/main/java/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java   (with props)
    ctakes/sandbox/ctakes-wsd/src/main/resources/
    ctakes/sandbox/ctakes-wsd/src/test/
    ctakes/sandbox/ctakes-wsd/src/test/java/
    ctakes/sandbox/ctakes-wsd/src/test/resources/

Added: ctakes/sandbox/ctakes-wsd/.classpath
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.classpath?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.classpath (added)
+++ ctakes/sandbox/ctakes-wsd/.classpath Wed Jan 22 18:15:34 2014
@@ -0,0 +1,36 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<classpath>
+	<classpathentry kind="src" output="target/classes" path="src/main/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="src" output="target/test-classes" path="src/test/java">
+		<attributes>
+			<attribute name="optional" value="true"/>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry excluding="**" kind="src" output="target/test-classes" path="src/test/resources">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.6">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
+		<attributes>
+			<attribute name="maven.pomderived" value="true"/>
+		</attributes>
+	</classpathentry>
+	<classpathentry kind="output" path="target/classes"/>
+</classpath>

Propchange: ctakes/sandbox/ctakes-wsd/.classpath
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/.project
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.project?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.project (added)
+++ ctakes/sandbox/ctakes-wsd/.project Wed Jan 22 18:15:34 2014
@@ -0,0 +1,23 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<projectDescription>
+	<name>ctakes-wsd</name>
+	<comment></comment>
+	<projects>
+	</projects>
+	<buildSpec>
+		<buildCommand>
+			<name>org.eclipse.jdt.core.javabuilder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+		<buildCommand>
+			<name>org.eclipse.m2e.core.maven2Builder</name>
+			<arguments>
+			</arguments>
+		</buildCommand>
+	</buildSpec>
+	<natures>
+		<nature>org.eclipse.jdt.core.javanature</nature>
+		<nature>org.eclipse.m2e.core.maven2Nature</nature>
+	</natures>
+</projectDescription>

Propchange: ctakes/sandbox/ctakes-wsd/.project
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs (added)
+++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs Wed Jan 22 18:15:34 2014
@@ -0,0 +1,6 @@
+eclipse.preferences.version=1
+encoding//src/main/java=UTF-8
+encoding//src/main/resources=UTF-8
+encoding//src/test/java=UTF-8
+encoding//src/test/resources=UTF-8
+encoding/<project>=UTF-8

Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs (added)
+++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs Wed Jan 22 18:15:34 2014
@@ -0,0 +1,5 @@
+eclipse.preferences.version=1
+org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6
+org.eclipse.jdt.core.compiler.compliance=1.6
+org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
+org.eclipse.jdt.core.compiler.source=1.6

Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs (added)
+++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs Wed Jan 22 18:15:34 2014
@@ -0,0 +1,4 @@
+activeProfiles=
+eclipse.preferences.version=1
+resolveWorkspaceProjects=true
+version=1

Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/pom.xml
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/pom.xml?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/pom.xml (added)
+++ ctakes/sandbox/ctakes-wsd/pom.xml Wed Jan 22 18:15:34 2014
@@ -0,0 +1,207 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <!-- Coordinates for the ctakes-wsd sandbox module; must not reuse ctakes-temporal's artifactId. -->
+  <artifactId>ctakes-wsd</artifactId>
+  <packaging>jar</packaging>
+  <name>Apache cTAKES Word Sense Disambiguation</name>
+  <parent>
+    <groupId>org.apache.ctakes</groupId>
+    <artifactId>ctakes</artifactId>
+    <version>3.1.2-SNAPSHOT</version>
+  </parent>
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-temporal-res</artifactId>
+    </dependency>     
+    <dependency>
+      <groupId>org.jdom</groupId>
+      <artifactId>jdom2</artifactId>
+    </dependency>
+        <dependency>
+            <groupId>com.lexicalscope.jewelcli</groupId>
+            <artifactId>jewelcli</artifactId>
+        </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-type-system</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-context-tokenizer</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-pos-tagger</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-chunker</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-dictionary-lookup</artifactId>
+    </dependency>
+      <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-lvg</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-dependency-parser</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-relation-extractor</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.ctakes</groupId>
+      <artifactId>ctakes-constituency-parser</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>net.sourceforge.ctakesresources</groupId>
+      <artifactId>ctakes-resources-umls2011ab</artifactId>
+      <version>3.1.1</version>
+    </dependency>
+    <!-- org.jdom:jdom2 and com.lexicalscope.jewelcli:jewelcli are declared once, earlier in
+         this list; Maven expects each groupId:artifactId pair to appear only once. -->
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-util</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-ml</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-eval</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-timeml</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-ml-svmlight</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-syntax</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-ml-libsvm</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-ml-tksvmlight</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-type-system</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.cleartk</groupId>
+      <artifactId>cleartk-ml-crfsuite</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>info.bethard</groupId>
+      <artifactId>timenorm</artifactId>
+      <version>0.9.0</version>
+    </dependency>
+    <dependency>
+      <groupId>com.googlecode.java-diff-utils</groupId>
+      <artifactId>diffutils</artifactId>
+      <version>1.3.0</version>
+    </dependency>
+  </dependencies>
+  <!-- The below is all necessary to unpack the UMLS resources since they 
+    can't be used from the classpath -->
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-dependency-plugin</artifactId>
+        <executions>
+          <execution>
+            <id>unpack</id>
+            <phase>initialize</phase>
+            <goals>
+              <goal>unpack</goal>
+            </goals>
+            <configuration>
+              <artifactItems>
+                <artifactItem>
+                  <groupId>net.sourceforge.ctakesresources</groupId>
+                  <artifactId>ctakes-resources-umls2011ab</artifactId>
+                  <version>3.1.1</version>
+                </artifactItem>
+              </artifactItems>
+              <outputDirectory>${project.build.directory}/unpacked</outputDirectory>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+    </plugins>
+    <pluginManagement>
+      <plugins>
+        <plugin>
+          <groupId>org.eclipse.m2e</groupId>
+          <artifactId>lifecycle-mapping</artifactId>
+          <version>1.0.0</version>
+          <configuration>
+            <lifecycleMappingMetadata>
+              <pluginExecutions>
+                <pluginExecution>
+                  <pluginExecutionFilter>
+                    <groupId>org.apache.maven.plugins</groupId>
+                    <artifactId>maven-dependency-plugin</artifactId>
+                    <versionRange>[2.0,)</versionRange>
+                    <goals>
+                      <goal>unpack</goal>
+                    </goals>
+                  </pluginExecutionFilter>
+                  <action>
+                    <execute />
+                  </action>
+                </pluginExecution>
+              </pluginExecutions>
+            </lifecycleMappingMetadata>
+          </configuration>
+        </plugin>
+      </plugins>
+    </pluginManagement>
+  </build>
+</project>

Propchange: ctakes/sandbox/ctakes-wsd/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Wed Jan 22 18:15:34 2014
@@ -0,0 +1,78 @@
+package org.apache.ctakes.wsd.pipelines;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.ctakes.core.cr.XMIReader;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.cleartk.util.Options_ImplBase;
+import org.kohsuke.args4j.Option;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+
+import com.google.common.collect.Lists;
+
+/**
+ * Reads XMI files from a directory and prints the covered text of every
+ * {@link EntityMention} found in each document to standard output.
+ *
+ * <p>Usage: {@code --input-dir <dir>} (required) and {@code --output-dir <dir>}
+ * (optional; parsed but not used by this class).
+ *
+ * @author dmitriy dligach
+ */
+public class Analyze {
+
+  /** Command-line options, filled in by {@code parseOptions(args)}. */
+  public static class Options extends Options_ImplBase {
+
+    @Option(
+        name = "--input-dir",
+        usage = "specify the path to the directory containing the XMI files",
+        required = true)
+    public File inputDirectory;
+
+    @Option(
+        name = "--output-dir",
+        usage = "specify the path to the directory where the training data will be placed",
+        required = false)
+    public File outputDirectory;
+  }
+  
+  /**
+   * Runs the XMI reader followed by {@link DoSomething} over every file in the input directory.
+   *
+   * @param args command-line arguments, see {@link Options}
+   * @throws IllegalArgumentException if {@code --input-dir} cannot be listed as a directory
+   * @throws Exception if option parsing, component creation or the pipeline itself fails
+   */
+  public static void main(String[] args) throws Exception {
+    
+    Options options = new Options();
+    options.parseOptions(args);
+
+    // File.listFiles() returns null (not an empty array) when the path is not a
+    // directory or cannot be read; fail with a clear message instead of an NPE.
+    File[] listing = options.inputDirectory.listFiles();
+    if (listing == null) {
+      throw new IllegalArgumentException(
+          "--input-dir is not a readable directory: " + options.inputDirectory);
+    }
+
+    List<File> trainFiles = Arrays.asList(listing);
+    String[] paths = new String[trainFiles.size()];
+    for (int i = 0; i < paths.length; ++i) {
+      paths[i] = trainFiles.get(i).getPath();
+    }
+
+    CollectionReader xmiCollectionReader = CollectionReaderFactory.createCollectionReader(
+        XMIReader.class,
+        XMIReader.PARAM_FILES,
+        paths);
+    
+    AnalysisEngine featureExtractorAe = AnalysisEngineFactory.createPrimitive(DoSomething.class);
+        
+    SimplePipeline.runPipeline(xmiCollectionReader, featureExtractorAe);
+  }
+  
+  /** Annotator that prints the covered text of each {@link EntityMention} in the CAS. */
+  public static class DoSomething extends JCasAnnotator_ImplBase{
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      // iterate over a copy of the selection, as the original code did
+      for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) {
+        System.out.println(mention.getCoveredText());
+      }
+    }
+    
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java?rev=1560454&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java Wed Jan 22 18:15:34 2014
@@ -0,0 +1,370 @@
+package org.apache.ctakes.wsd.pipelines;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
+import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewURIUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.component.ViewTextCopierAnnotator;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ExternalResourceFactory;
+import org.uimafit.factory.TypePrioritiesFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class Preprocess {
+
+  // Name of the additional CAS view that the pipeline creates and copies the document text into.
+  public static final String GOLD_VIEW_NAME = "GoldView";
+
+  // Directory whose files main() feeds to the pipeline; the default is a developer-machine path.
+  public static File inputDirectory = new File("/Users/dima/Boston/Data/Sharp/Cloud/sharp/text/train/");
+  // Directory handed to XMIWriter as its output location; the default is a developer-machine path.
+  public static String outputDirectory = "/Users/Dima/Temp/";
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  /**
+   * Assembles the preprocessing pipeline as an {@link AggregateBuilder}.
+   *
+   * <p>Components are added in this order: URI-to-text loading, creation of the
+   * {@link #GOLD_VIEW_NAME} view and a copy of the default view's text into it, segment,
+   * sentence and token annotators, the context-dependent tokenizer, POS tagger, chunker,
+   * two chunk adjusters, NP lookup windows, overlap pruning of lookup windows, UMLS
+   * dictionary lookup, the LVG annotator, the ClearNLP dependency parser and semantic
+   * role labeler, and finally an {@link XMIWriter} targeting {@link #outputDirectory}.
+   *
+   * @return the builder holding all component descriptions; the caller creates the aggregate
+   * @throws Exception if any component description or external resource cannot be created
+   */
+  public static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() throws Exception {
+    
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+
+    // read manual annotations into gold view
+    // NOTE: the annotation readers further down are commented out, so at present only the
+    // view is created and the default view's text is copied into it.
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ViewCreatorAnnotator.class,
+        ViewCreatorAnnotator.PARAM_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ViewTextCopierAnnotator.class,
+        ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+        CAS.NAME_DEFAULT_SOFA,
+        ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+        GOLD_VIEW_NAME));
+
+//    switch (this.xmlFormat) {
+//    case Anafora:
+//      aggregateBuilder.add(
+//          THYMEAnaforaXMLReader.getDescription(this.xmlDirectory),
+//          CAS.NAME_DEFAULT_SOFA,
+//          GOLD_VIEW_NAME);
+//      break;
+//    case Knowtator:
+//      aggregateBuilder.add(
+//          THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory),
+//          CAS.NAME_DEFAULT_SOFA,
+//          GOLD_VIEW_NAME);
+//      break;
+//    }
+
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        SentenceDetector.class,
+        SentenceDetector.SD_MODEL_FILE_PARAM,
+        "org/apache/ctakes/core/sentdetect/sd-med-model.zip"));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip"));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // identify UMLS named entities
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows
+    // NOTE: despite the staircase indentation below, "LookupDescriptor", "DbConnection",
+    // "RxnormIndexReader" and "OrangeBookIndexReader" are sibling name/value pairs of the
+    // same createPrimitiveDescription call; each createExternalResourceDescription(...) is
+    // closed before the next name starts.
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        UmlsDictionaryLookupAnnotator.class,
+        "ctakes.umlsaddr",
+        "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
+        "ctakes.umlsvendor",
+        "NLM-6515182895",
+        "LookupDescriptor",
+        ExternalResourceFactory.createExternalResourceDescription(
+            FileResourceImpl.class,
+            FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/LookupDesc_Db.xml")),
+            "DbConnection",
+            ExternalResourceFactory.createExternalResourceDescription(
+                JdbcConnectionResourceImpl.class,
+                "",
+                JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
+                "org.hsqldb.jdbcDriver",
+                JdbcConnectionResourceImpl.PARAM_URL,
+                // Should be the following but it's WAY too slow
+                // "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+                "jdbc:hsqldb:file:target/unpacked/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+                "RxnormIndexReader",
+                ExternalResourceFactory.createExternalResourceDescription(
+                    LuceneIndexReaderResourceImpl.class,
+                    "",
+                    "UseMemoryIndex",
+                    true,
+                    "IndexDirectory",
+                    new File("target/unpacked/org/apache/ctakes/dictionary/lookup/rxnorm_index").getAbsoluteFile()),
+                    "OrangeBookIndexReader",
+                    ExternalResourceFactory.createExternalResourceDescription(
+                        LuceneIndexReaderResourceImpl.class,
+                        "",
+                        "UseMemoryIndex",
+                        true,
+                        "IndexDirectory",
+                        FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/OrangeBook"))));
+
+    // add lvg annotator
+    String[] XeroxTreebankMap = {
+        "adj|JJ",
+        "adv|RB",
+        "aux|AUX",
+        "compl|CS",
+        "conj|CC",
+        "det|DET",
+        "modal|MD",
+        "noun|NN",
+        "prep|IN",
+        "pron|PRP",
+    "verb|VB" };
+    String[] ExclusionSet = {
+        "and",
+        "And",
+        "by",
+        "By",
+        "for",
+        "For",
+        "in",
+        "In",
+        "of",
+        "Of",
+        "on",
+        "On",
+        "the",
+        "The",
+        "to",
+        "To",
+        "with",
+    "With" };
+    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+        LvgAnnotator.class,
+        "UseSegments",
+        false,
+        "SegmentsToSkip",
+        new String[0],
+        "UseCmdCache",
+        false,
+        "CmdCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_norm.voc",
+        "CmdCacheFrequencyCutoff",
+        20,
+        "ExclusionSet",
+        ExclusionSet,
+        "XeroxTreebankMap",
+        XeroxTreebankMap,
+        "LemmaCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_lemma.voc",
+        "UseLemmaCache",
+        false,
+        "LemmaCacheFrequencyCutoff",
+        20,
+        "PostLemmas",
+        true,
+        "LvgCmdApi",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LvgCmdApiResourceImpl.class,
+            new File(LvgCmdApiResourceImpl.class.getResource(
+                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
+    aggregateBuilder.add(lvgAnnotator);
+
+    // add dependency parser
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class));
+
+    // add semantic role labeler
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class));
+
+//    // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps
+//    if(this.treebankDirectory != null){
+//      aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory));
+//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class));
+//    }else{
+//      // add ctakes constituency parses to system view
+//      aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
+//    }
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        outputDirectory));
+
+    return aggregateBuilder;
+  }
+
+  /**
+   * Adds a {@link LookupWindowAnnotation} with the same span as every {@link Chunk}
+   * whose chunk type is {@code "NP"}.
+   */
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        // constant-first comparison: a chunk whose type was never set is skipped, not an NPE
+        if ("NP".equals(chunk.getChunkType())) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+  
+  /**
+   * Creates one {@link Segment} for each text region enclosed by a
+   * {@code [start section id=...]} / {@code [end section id=...]} tag pair. The segment
+   * covers the text between the two tags and takes its id from the start tag.
+   *
+   * <p>replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed
+   */
+  public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase {
+    // group 1 = whole start tag, group 2 = start id, group 3 = whole end tag, group 4 = end id;
+    // DOTALL lets a section span multiple lines
+    private static final Pattern SECTION_PATTERN = Pattern.compile(
+        "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])",
+        Pattern.DOTALL);
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      String text = jCas.getDocumentText();
+      if (text == null) {
+        // no document text in this view, so there is nothing to segment
+        return;
+      }
+      Matcher matcher = SECTION_PATTERN.matcher(text);
+      while (matcher.find()) {
+        Segment segment = new Segment(jCas);
+        // group 1 starts at the match start and group 3 ends at the match end, so these
+        // offsets are the end of the start tag and the start of the end tag
+        segment.setBegin(matcher.end(1));
+        segment.setEnd(matcher.start(3));
+        segment.setId(matcher.group(2));
+        segment.addToIndexes();
+      }
+    }
+  }
+  
+  /**
+   * Maps a text file to its XMI counterpart.
+   *
+   * @param xmiDirectory directory that holds the XMI files
+   * @param textFile source file; only its name is used
+   * @return a file in {@code xmiDirectory} named after {@code textFile} with ".xmi" appended
+   */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    String xmiName = textFile.getName() + ".xmi";
+    return new File(xmiDirectory, xmiName);
+  }
+
+  /**
+   * Maps the document held in a CAS to its XMI file, using the path of the URI that
+   * {@link ViewURIUtil#getURI(JCas)} reports for it.
+   *
+   * @param xmiDirectory directory that holds the XMI files
+   * @param jCas CAS whose document URI identifies the source file
+   * @return the XMI file for that document inside {@code xmiDirectory}
+   * @throws AnalysisEngineProcessException as declared by the URI lookup
+   */
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    File sourceFile = new File(ViewURIUtil.getURI(jCas).getPath());
+    return getXMIFile(xmiDirectory, sourceFile);
+  }
+
+  /**
+   * Annotator that serializes each CAS it receives to an XMI file inside the directory
+   * given by {@link #PARAM_XMI_DIRECTORY}. The file name is derived from the document
+   * URI via {@code getXMIFile}.
+   */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Creates the output directory if needed.
+     *
+     * @throws ResourceInitializationException if the directory does not exist and cannot
+     *         be created (for example because a regular file occupies the path)
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      // mkdirs() reports false when creation fails, so fail here with the path in the
+      // message instead of once per document at write time. isDirectory() is re-checked
+      // in case the directory appeared between the first check and the mkdirs() call.
+      if (!this.xmiDirectory.isDirectory()
+          && !this.xmiDirectory.mkdirs()
+          && !this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(new IOException(
+            "cannot create XMI output directory: " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the CAS as XMI (unformatted) to its target file, overwriting any existing file.
+     *
+     * @throws AnalysisEngineProcessException wrapping any I/O or SAX failure
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          // always release the file handle, even when serialization fails
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+}

Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain



Mime
View raw message