ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From dlig...@apache.org
Subject svn commit: r1740378 - /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java
Date Thu, 21 Apr 2016 18:18:32 GMT
Author: dligach
Date: Thu Apr 21 18:18:32 2016
New Revision: 1740378

URL: http://svn.apache.org/viewvc?rev=1740378&view=rev
Log:
preprocessing pipeline for mimic iii corpus

Added:
    ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java

Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java?rev=1740378&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java (added)
+++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/pipelines/MimicWord2VecPreprocessing.java Thu Apr 21 18:18:32 2016
@@ -0,0 +1,146 @@
+package org.apache.ctakes.pipelines;
+
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.cleartk.ae.SentenceDetectorAnnotator;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.impl.XmiCasSerializer;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
+import org.apache.uima.fit.descriptor.ConfigurationParameter;
+import org.apache.uima.fit.factory.AggregateBuilder;
+import org.apache.uima.fit.factory.AnalysisEngineFactory;
+import org.apache.uima.fit.pipeline.SimplePipeline;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.util.ViewUriUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.cleartk.util.cr.UriCollectionReader;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import com.google.common.io.CharStreams;
+
+public class MimicWord2VecPreprocessing {
+
+  public static File inputDirectory = new File("/Users/Dima/Loyola/Data/MimicIII/Text/0/"); // every entry listed in this directory is handed to the collection reader by main()
+  public static String outputDirectory = "/Users/Dima/Loyola/Data/MimicIII/Xmi/0/"; // passed to XMIWriter as the directory the .xmi files are written to
+
+  public static void main(String[] args) throws Exception {
+
+    List<File> files = new ArrayList<File>();
+    for(File file : inputDirectory.listFiles()) {
+      files.add(file);
+    }
+
+    CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
+    AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate();
+    SimplePipeline.runPipeline(reader, engine);
+  }
+
+  /**
+   * Assembles the preprocessing aggregate: document text loading, segment,
+   * sentence and token annotation, and finally XMI serialization into
+   * {@link #outputDirectory}.
+   *
+   * @return a builder holding the annotators in the order they were added
+   * @throws Exception if any of the engine descriptions cannot be created
+   */
+  protected static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+      throws Exception {
+    AggregateBuilder builder = new AggregateBuilder();
+
+    // read the document text for the URI stored in the CAS
+    builder.add(
+        AnalysisEngineFactory.createEngineDescription(UriToDocumentTextAnnotatorCtakes.class));
+
+    // segments
+    builder.add(
+        AnalysisEngineFactory.createEngineDescription(SimpleSegmentAnnotator.class));
+
+    // sentences (Tim's sentence segmenter, which handles line breaks correctly)
+    builder.add(
+        SentenceDetectorAnnotator.getDescription("/org/apache/ctakes/core/sentdetect/model.jar"));
+
+    // tokens, followed by the step that merges some of them
+    builder.add(
+        AnalysisEngineFactory.createEngineDescription(TokenizerAnnotatorPTB.class));
+    builder.add(
+        AnalysisEngineFactory.createEngineDescription(ContextDependentTokenizerAnnotator.class));
+
+    // last step: dump the fully annotated CAS as XMI
+    builder.add(
+        AnalysisEngineFactory.createEngineDescription(
+            XMIWriter.class,
+            XMIWriter.PARAM_XMI_DIRECTORY,
+            outputDirectory));
+
+    return builder;
+  }
+  
+  /**
+   * The following class overrides a ClearTK utility annotator class for reading
+   * a text file into a JCas. The code is copy/pasted so that one tiny modification
+   * can be made for this corpus -- every occurrence of one odd character (0xc) is
+   * replaced with a space since it trips up xml output.
+   */
+  public static class UriToDocumentTextAnnotatorCtakes extends UriToDocumentTextAnnotator {
+
+    /**
+     * Reads the content behind the URI stored in the JCas, replaces each 0xc
+     * character with a space, and sets the result as the "text/plain" sofa data.
+     *
+     * @param jCas the CAS whose URI is read and whose sofa data is set
+     * @throws AnalysisEngineProcessException if the URI cannot be converted to a
+     *         URL or the content cannot be read
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      URI uri = ViewUriUtil.getURI(jCas);
+      String content;
+
+      try {
+        // NOTE(review): no charset is given, so the platform default decides how
+        // the bytes are decoded -- confirm this is what the corpus needs.
+        InputStreamReader reader = new InputStreamReader(uri.toURL().openStream());
+        try {
+          content = CharStreams.toString(reader);
+        } finally {
+          // closing the reader also closes the underlying stream, which was
+          // previously left open for every document
+          reader.close();
+        }
+        content = content.replace((char) 0xc, ' ');
+        jCas.setSofaDataString(content, "text/plain");
+      } catch (MalformedURLException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+  
+  /**
+   * Annotator that serializes each processed CAS to an XMI file inside the
+   * configured directory. The file name is derived from the document URI via
+   * {@code getXMIFile}.
+   */
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    /**
+     * Makes sure the output directory exists, creating it if necessary.
+     *
+     * @param context the UIMA context passed on to the superclass
+     * @throws ResourceInitializationException if superclass initialization fails
+     *         or the output directory is missing and cannot be created
+     */
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      // mkdirs() also returns false when the directory appeared in the meantime,
+      // so re-check isDirectory() before treating it as a failure. Failing here
+      // is clearer than a FileNotFoundException on the first document.
+      if (!this.xmiDirectory.isDirectory()
+          && !this.xmiDirectory.mkdirs()
+          && !this.xmiDirectory.isDirectory()) {
+        throw new ResourceInitializationException(
+            new IOException("Cannot create XMI output directory: " + this.xmiDirectory));
+      }
+    }
+
+    /**
+     * Writes the given CAS as XMI to the file chosen by {@code getXMIFile}.
+     *
+     * @param jCas the CAS to serialize
+     * @throws AnalysisEngineProcessException if serialization or file I/O fails
+     */
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          // always release the file handle, even when serialization fails
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+
+  /**
+   * Resolves the XMI output file for the document whose URI is stored in the CAS.
+   *
+   * @param xmiDirectory directory the XMI file is placed in
+   * @param jCas the CAS whose URI identifies the source document
+   * @return the XMI file corresponding to the source document
+   * @throws AnalysisEngineProcessException declared for the URI lookup on the CAS
+   */
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    URI sourceUri = ViewUriUtil.getURI(jCas);
+    File sourceFile = new File(sourceUri.getPath());
+    return getXMIFile(xmiDirectory, sourceFile);
+  }
+  
+  /**
+   * Builds the XMI output file for a source text file: the source file's name
+   * with ".xmi" appended, located in the given directory.
+   *
+   * @param xmiDirectory directory the XMI file is placed in
+   * @param textFile the source file; only its name is used
+   * @return the XMI file inside {@code xmiDirectory}
+   */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    String xmiName = textFile.getName() + ".xmi";
+    return new File(xmiDirectory, xmiName);
+  }
+}



Mime
View raw message