ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1627157 - in /ctakes/trunk/ctakes-core: desc/cas_consumer/SentenceTokensPrinter.xml src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java
Date Tue, 23 Sep 2014 20:43:37 GMT
Author: seanfinan
Date: Tue Sep 23 20:43:37 2014
New Revision: 1627157

URL: http://svn.apache.org/r1627157
Log:
Adding new cas consumer that prints space-separated tokens, one sentence per line.
Output is saved to file (by documentId) if parameter <OutputDirectory> is specified,
otherwise output is printed on standard out.

Added:
    ctakes/trunk/ctakes-core/desc/cas_consumer/SentenceTokensPrinter.xml
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java

Added: ctakes/trunk/ctakes-core/desc/cas_consumer/SentenceTokensPrinter.xml
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/desc/cas_consumer/SentenceTokensPrinter.xml?rev=1627157&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/desc/cas_consumer/SentenceTokensPrinter.xml (added)
+++ ctakes/trunk/ctakes-core/desc/cas_consumer/SentenceTokensPrinter.xml Tue Sep 23 20:43:37 2014
@@ -0,0 +1,62 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+
+    Licensed to the Apache Software Foundation (ASF) under one
+    or more contributor license agreements.  See the NOTICE file
+    distributed with this work for additional information
+    regarding copyright ownership.  The ASF licenses this file
+    to you under the Apache License, Version 2.0 (the
+    "License"); you may not use this file except in compliance
+    with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing,
+    software distributed under the License is distributed on an
+    "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+    KIND, either express or implied.  See the License for the
+    specific language governing permissions and limitations
+    under the License.
+
+-->
+<casConsumerDescription xmlns="http://uima.apache.org/resourceSpecifier">
+   <frameworkImplementation>org.apache.uima.java</frameworkImplementation>
+   <implementationName>org.apache.ctakes.core.cc.SentenceTokensPrinter</implementationName>
+   <processingResourceMetaData>
+      <name>SentenceTokensPrinter</name>
+      <description>For each CAS the tokens are printed per sentence to a line.
+         If the parameter OutputDirectory is provided then tokenized sentences are saved in files named after document
+         IDs.
+         If no OutputDirectory is specified then tokenized sentences are printed to standard output.
+      </description>
+      <version>1.0</version>
+      <vendor>TCH</vendor>
+      <configurationParameters>
+         <configurationParameter>
+            <name>OutputDirectory</name>
+            <description>The directory name where the plain text tokenized sentences will be written to.</description>
+            <type>String</type>
+            <multiValued>false</multiValued>
+            <mandatory>false</mandatory>
+         </configurationParameter>
+      </configurationParameters>
+      <configurationParameterSettings>
+         <nameValuePair>
+            <name>OutputDirectory</name>
+            <value>
+               <string>CHANGE_ME</string>
+            </value>
+         </nameValuePair>
+      </configurationParameterSettings>
+      <typeSystemDescription>
+         <imports>
+         </imports>
+      </typeSystemDescription>
+      <typePriorities/>
+      <capabilities/>
+      <operationalProperties>
+         <modifiesCas>false</modifiesCas>
+         <multipleDeploymentAllowed>false</multipleDeploymentAllowed>
+      </operationalProperties>
+   </processingResourceMetaData>
+</casConsumerDescription>

Added: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java?rev=1627157&view=auto
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java (added)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/SentenceTokensPrinter.java Tue Sep 23 20:43:37 2014
@@ -0,0 +1,166 @@
+package org.apache.ctakes.core.cc;
+
+import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.ctakes.core.util.JCasUtil;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JFSIndexRepository;
+import org.apache.uima.jcas.tcas.Annotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceProcessException;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Saves the (base) tokens of each sentence on a separate line, separated by spaces
+ *
+ * @author SPF , chip-nlp
+ * @version %I%
+ * @since 9/22/2014
+ */
+public class SentenceTokensPrinter extends CasConsumer_ImplBase {
+
+   // log4j logger registered under the literal name "SentenceTokensPrinter"
+   final static private Logger LOGGER = Logger.getLogger( "SentenceTokensPrinter" );
+
+   // Name of the optional configuration parameter that holds the output directory path
+   public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+
+
+   // Output directory path assigned by initialize(); remains null when the parameter is absent or empty,
+   // in which case the tokenized sentences are printed to standard out instead of being saved
+   private String _outputDirPath;
+
+   /**
+    * Reads the optional parameter <code>OutputDirectory</code>.  If it is present and non-empty the directory
+    * is created when necessary and files will be saved in it; if not, stdout is used.
+    * {@inheritDoc}
+    *
+    * @throws ResourceInitializationException if parameter <code>OutputDirectory</code> names an existing
+    *                                         non-directory file or a directory that cannot be created
+    */
+   @Override
+   public void initialize() throws ResourceInitializationException {
+      super.initialize();
+      final String outputDirPath = (String)getConfigParameterValue( PARAM_OUTPUTDIR );
+      if ( outputDirPath == null || outputDirPath.isEmpty() ) {
+         // no directory configured: _outputDirPath stays null and output goes to standard out
+         return;
+      }
+      final File outputDirectory = new File( outputDirPath );
+      // isDirectory() rather than exists(): an existing regular file at this path must be rejected here,
+      // otherwise every later attempt to save a document inside it would fail
+      if ( !outputDirectory.isDirectory() && !outputDirectory.mkdirs() ) {
+         throw new ResourceInitializationException(
+               new IOException( "Parameter setting 'OutputDirectory' does not point to an existing directory" +
+                                " or one that could be created." ) );
+      }
+      _outputDirPath = outputDirPath;
+   }
+
+
+   /**
+    * Collects the space-separated (base) tokens of every sentence in the cas, one line per sentence,
+    * and passes them on to be saved to file or printed on standard out.
+    * {@inheritDoc}
+    */
+   @Override
+   public void processCas( final CAS cas ) throws ResourceProcessException {
+      final JCas jcas;
+      try {
+         jcas = cas.getJCas();
+      } catch ( CASException casE ) {
+         // no JCas means nothing can be read from this cas: log the problem and skip it
+         LOGGER.error( casE.getMessage() );
+         return;
+      }
+      final int sentenceType = JCasUtil.getType( "org.apache.ctakes.typesystem.type.textspan.Sentence" );
+      final JFSIndexRepository indexRepository = jcas.getJFSIndexRepository();
+      final AnnotationIndex<Annotation> sentenceIndex = indexRepository.getAnnotationIndex( sentenceType );
+      // only a missing index ends processing; an index reporting size zero is still iterated
+      if ( sentenceIndex == null ) {
+         return;
+      }
+      final Collection<String> sentenceLines = new ArrayList<>( sentenceIndex.size() );
+      try {
+         for ( Object indexed : sentenceIndex ) {
+            final Annotation sentence = (Annotation)indexed;
+            sentenceLines.add( getSentenceTokens( jcas, sentence ) );
+         }
+      } catch ( ArrayIndexOutOfBoundsException iobE ) {
+         // JCasHashMap will throw this every once in a while.  Assume the sentences are done and move on
+         LOGGER.warn( iobE.getMessage() );
+      }
+      outputSentenceTokens( DocumentIDAnnotationUtil.getDocumentID( jcas ), sentenceLines );
+   }
+
+   /**
+    * @param jcas     cas view from which the covered tokens are selected
+    * @param sentence sentence annotation whose covered (base) tokens are wanted
+    * @return the (base) tokens of the sentence, each one followed by a single space
+    */
+   static private String getSentenceTokens( final JCas jcas, final Annotation sentence ) {
+      final List<BaseToken> coveredTokens
+            = org.apache.uima.fit.util.JCasUtil.selectCovered( jcas, BaseToken.class, sentence );
+      final StringBuilder line = new StringBuilder();
+      for ( BaseToken token : coveredTokens ) {
+         // mid-sentence newlines are ignored - this honors the newline behavior of the selected Sentence Detector
+         if ( !(token instanceof NewlineToken) ) {
+            line.append( token.getCoveredText() );
+            line.append( ' ' );
+         }
+      }
+      return line.toString();
+   }
+
+   /**
+    * Routes the tokenized sentences to a file when an output directory was configured,
+    * otherwise to standard output.
+    *
+    * @param documentId         id of the document, used for output identification of the analyzed document
+    * @param tokenizedSentences space-separated sentence tokens
+    */
+   private void outputSentenceTokens( final String documentId, final Iterable<String> tokenizedSentences ) {
+      if ( _outputDirPath != null ) {
+         saveSentenceTokens( _outputDirPath, documentId, tokenizedSentences );
+         return;
+      }
+      printSentenceTokens( documentId, tokenizedSentences );
+   }
+
+   /**
+    * Prints the (base) tokens of each sentence on a separate line, separated by spaces,
on standard output
+    *
+    * @param documentId         id of the document, used for output identification of the
analyzed document
+    * @param tokenizedSentences space-separated sentence tokens
+    */
+   static private void printSentenceTokens( final String documentId, final Iterable<String>
tokenizedSentences ) {
+      System.out.println( "===========================   " + documentId + "   ==========================="
);
+      for ( String tokenizedSentence : tokenizedSentences ) {
+         System.out.println( tokenizedSentence );
+      }
+   }
+
+   /**
+    * Saves the (base) tokens of each sentence on a separate line, separated by spaces.
+    * Failures are logged and the document is skipped; nothing is thrown to the caller.
+    *
+    * @param outputDirPath      root output directory specified by parameter <code>OutputDirectory</code>
+    * @param documentId         id of the document, used for the output file name of the analyzed document
+    * @param tokenizedSentences space-separated sentence tokens
+    */
+   static private void saveSentenceTokens( final String outputDirPath,
+                                           final String documentId, final Iterable<String> tokenizedSentences ) {
+      if ( documentId == null || documentId.isEmpty() ) {
+         // without an id there is no file name to write to
+         LOGGER.error( "Cannot save sentence tokens without a document id" );
+         return;
+      }
+      // Resolve documentId as a child of the output directory.  File.pathSeparator is the PATH-list
+      // delimiter (':' or ';'), not the directory separator, so it must not be used to join the two
+      final File outputFile = new File( outputDirPath, documentId );
+      // Be prepared for documentId that contains directory segments, some of which may not exist
+      final File parentDirectory = outputFile.getParentFile();
+      if ( parentDirectory != null && !parentDirectory.isDirectory() && !parentDirectory.mkdirs() ) {
+         LOGGER.error( outputFile.getPath() + " is an invalid output file path" );
+         return;
+      }
+      try ( final BufferedWriter writer = new BufferedWriter( new FileWriter( outputFile ) ) ) {
+         for ( String tokenizedSentence : tokenizedSentences ) {
+            writer.write( tokenizedSentence );
+            writer.newLine();
+         }
+      } catch ( IOException ioE ) {
+         // pass the exception itself so the stack trace reaches the log
+         LOGGER.error( ioE.getMessage(), ioE );
+      }
+   }
+
+}



Mime
View raw message