ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1689882 - in /ctakes/trunk: ctakes-core/src/main/java/org/apache/ctakes/core/cc/ ctakes-core/src/main/java/org/apache/ctakes/core/util/ ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/ ctakes-dependency-parser/src/main/ja...
Date Wed, 08 Jul 2015 14:23:12 GMT
Author: seanfinan
Date: Wed Jul  8 14:23:11 2015
New Revision: 1689882

URL: http://svn.apache.org/r1689882
Log:
CTAKES-365 Added return constant "UnknownDocument" when the document id cannot be determined -- replacing old null return
Main code change is in DocumentIDAnnotationUtil
All other files (mostly cas consumers) are updated to reflect the change

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/CasConsumer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FilesInDirectoryCasConsumer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XmiWriterCasConsumerCtakes.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/CtakesFileNamer.java
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DocumentIDAnnotationUtil.java
    ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java
    ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/DependencyNodeWriter.java
    ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/CasConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/CasConsumer.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/CasConsumer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/CasConsumer.java Wed Jul  8 14:23:11 2015
@@ -18,13 +18,8 @@
  */
 package org.apache.ctakes.core.cc;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.OutputStream;
-
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.log4j.Logger;
-
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.impl.XCASSerializer;
 import org.apache.uima.collection.CasConsumer_ImplBase;
@@ -32,89 +27,89 @@ import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.ResourceProcessException;
 
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.OutputStream;
 
 
 /**
  * CasConsumer that writes a JCas (the current view) to an xml file
- * 
- * @author Mayo Clinic 
+ *
+ * @author Mayo Clinic
  */
-public class CasConsumer extends CasConsumer_ImplBase
-{
-    // LOG4J logger based on class name
-    private Logger iv_logger = Logger.getLogger(getClass().getName());
-
-    private String iv_outputDir = null;
-    
-    // iv_procCount is used to name the output files sequentially if there 
-    // is a problem with naming based on source names
-    private int iv_procCount = 0; 
-
-    
-    /**
-     * Read in configuration parameters
-     */
-    public void initialize() throws ResourceInitializationException {
-        iv_outputDir = (String) getConfigParameterValue("outputDir");
-    }
-
-
-    /**
-     * Write a formatted xml file containing data from the view.
-     * The file name will come from the DocumentID annotation,
-     * which is associated with a view.
-     * We append .xml to the DocumentID/filename 
-     */
-    private void processView(JCas view) throws Exception {
-        // String docText = view.getDocumentText();
-
-        String docName = DocumentIDAnnotationUtil.getDocumentID(view);
-
-        File outputFile;
-        if (docName==null) {
-        	docName = "doc" + iv_procCount + ".xml";
-        }
-        else {
-        	docName = docName + ".xml";        		
-			//	if (!docName.endsWith(".xml")) {
-			//    	docName = docName + ".xml";        		
-			//	}
-        }
-        
-        OutputStream out=null;
-        try {
-        	File outputDir = new File(iv_outputDir);
-        	outputDir.mkdirs();
-            outputFile = new File(iv_outputDir + File.separatorChar + docName);
-            out = new FileOutputStream(outputFile);
-            XCASSerializer.serialize(view.getCas(), out, true); // true -> formats the output
-        } 
-        finally {
-	        iv_procCount++;
-	        if (out != null) {
-	        	out.close();
-	        }
-        }
-
-    }
-
-    
-    /**
-     * Create an xml file from the data in the cas.
-     */
-    public void processCas(CAS cas) throws ResourceProcessException {
-
-    	iv_logger.info("Started");
-    	
-        try { 
-
-        	JCas currentView = cas.getCurrentView().getJCas();
-            processView(currentView);
-            
-        } catch (Exception e) {
-        	throw new ResourceProcessException(e);
-        }
+public class CasConsumer extends CasConsumer_ImplBase {
+   // LOG4J logger based on class name
+   private Logger iv_logger = Logger.getLogger( getClass().getName() );
+
+   private String iv_outputDir = null;
+
+   // iv_procCount is used to name the output files sequentially if there
+   // is a problem with naming based on source names
+   private int iv_procCount = 0;
+
+
+   /**
+    * Read in configuration parameters
+    */
+   public void initialize() throws ResourceInitializationException {
+      iv_outputDir = (String)getConfigParameterValue( "outputDir" );
+   }
+
+
+   /**
+    * Write a formatted xml file containing data from the view.
+    * The file name will come from the DocumentID annotation,
+    * which is associated with a view.
+    * We append .xml to the DocumentID/filename
+    */
+   private void processView( JCas view ) throws Exception {
+      // String docText = view.getDocumentText();
+
+      String docName = DocumentIDAnnotationUtil.getDocumentID( view );
+
+      File outputFile;
+      if ( docName == null || docName.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+         docName = "doc" + iv_procCount + ".xml";
+      } else {
+         docName = docName + ".xml";
+         //	if (!docName.endsWith(".xml")) {
+         //    	docName = docName + ".xml";
+         //	}
+      }
+
+      OutputStream out = null;
+      try {
+         File outputDir = new File( iv_outputDir );
+         outputDir.mkdirs();
+         outputFile = new File( iv_outputDir + File.separatorChar + docName );
+         out = new FileOutputStream( outputFile );
+         XCASSerializer.serialize( view.getCas(), out, true ); // true -> formats the output
+      } finally {
+         iv_procCount++;
+         if ( out != null ) {
+            out.close();
+         }
+      }
+
+   }
+
+
+   /**
+    * Create an xml file from the data in the cas.
+    */
+   public void processCas( CAS cas ) throws ResourceProcessException {
+
+      iv_logger.info( "Started" );
+
+      try {
+
+         JCas currentView = cas.getCurrentView().getJCas();
+         processView( currentView );
+
+      } catch ( Exception e ) {
+         throw new ResourceProcessException( e );
+      }
 
-    }
+   }
 
 }
\ No newline at end of file

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FilesInDirectoryCasConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FilesInDirectoryCasConsumer.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FilesInDirectoryCasConsumer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/FilesInDirectoryCasConsumer.java Wed Jul  8 14:23:11 2015
@@ -18,93 +18,74 @@
  */
 package org.apache.ctakes.core.cc;
 
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
+import org.apache.log4j.Logger;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.collection.CasConsumer_ImplBase;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.ResourceProcessException;
 
+import java.io.*;
+
 
 /**
- * For each CAS a local file with the document text is written to a directory specifed by a parameter.  
- * This CAS consumer does not make use of any annotation information in the cas except for the document 
- * id specified the CommonTypeSystem.xml descriptor.  The document id will be the name of the file written 
- * for each CAS.  
- * 
- * This CAS consumer may be useful if you want to write the results of a collection reader and/or CAS 
- * initializer to the local file system.  For example, a JDBC Collection Reader may read XML documents 
- * from a database and a specialized cas initializer may convert the XML to plain text.  The 
+ * For each CAS a local file with the document text is written to a directory specifed by a parameter.
+ * This CAS consumer does not make use of any annotation information in the cas except for the document
+ * id specified the CommonTypeSystem.xml descriptor.  The document id will be the name of the file written
+ * for each CAS.
+ * <p/>
+ * This CAS consumer may be useful if you want to write the results of a collection reader and/or CAS
+ * initializer to the local file system.  For example, a JDBC Collection Reader may read XML documents
+ * from a database and a specialized cas initializer may convert the XML to plain text.  The
  * FilesInDirectoryCasConsumer can now be used to write the plain text to local plain text files.
  */
 
 public class FilesInDirectoryCasConsumer extends CasConsumer_ImplBase {
 
-	public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+   public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+
+   static private final Logger LOGGER = Logger.getLogger( "FilesInDirectoryCasConsumer" );
+
+   File iv_outputDirectory;
 
-	File iv_outputDirectory;
-	
-	public void initialize() throws ResourceInitializationException 
-	{
-	    String outputDirectoryName = (String)getConfigParameterValue(PARAM_OUTPUTDIR);
-	    iv_outputDirectory = new File(outputDirectoryName);
-	    if(!iv_outputDirectory.exists() || !iv_outputDirectory.isDirectory())
-	    	throw new ResourceInitializationException(
-	    			new Exception("Parameter setting 'OutputDirectory' does not point to an existing directory."));
-	}
-	
-	public void processCas(CAS cas) throws ResourceProcessException 
-	{
-		try 
-		{
-			JCas jcas;
-			jcas = cas.getJCas();
-			//	jcas = cas.getJCas().getView("_InitialView");
-			//	jcas = cas.getJCas().getView("plaintext");
-		
-			String documentID = DocumentIDAnnotationUtil.getDocumentID(jcas);
-			String documentText = jcas.getDocumentText();
-
-			if (documentID==null) {
-
-				jcas = cas.getJCas().getView("_InitialView");
-				documentID = DocumentIDAnnotationUtil.getDocumentID(jcas);
-
-				if (documentID==null) {
-				
-					jcas = cas.getJCas().getView("plaintext");
-					documentID = DocumentIDAnnotationUtil.getDocumentID(jcas);
-					
-					if (documentID==null) {
-						documentID = "doc_"+new java.util.Date().getTime()+".xml"; // use timestamp in name: doc_TIMESTAMP.xml 
-						System.err.println("Unable to find DocumentIDAnnotation, using " + documentID);
-					}
-				}
-				
-			}
-
-			writeToFile(documentID, documentText);
-			
-		}
-		catch(Exception e)
-		{
-			throw new ResourceProcessException(e);
-		}
-	}
-	
-	private void writeToFile(String documentID, String documentText) throws IOException
-	{
-		File outputFile = new File(iv_outputDirectory, documentID);
-		outputFile.createNewFile();
-		OutputStream out = new BufferedOutputStream(new FileOutputStream(outputFile));
-		out.write(documentText.getBytes());
-		out.flush();
-		out.close();
-	}
+   public void initialize() throws ResourceInitializationException {
+      String outputDirectoryName = (String)getConfigParameterValue( PARAM_OUTPUTDIR );
+      iv_outputDirectory = new File( outputDirectoryName );
+      if ( !iv_outputDirectory.exists() || !iv_outputDirectory.isDirectory() ) {
+         throw new ResourceInitializationException(
+               new Exception( "Parameter setting 'OutputDirectory' does not point to an existing directory." ) );
+      }
+   }
+
+   public void processCas( CAS cas ) throws ResourceProcessException {
+      try {
+         JCas jcas;
+         jcas = cas.getJCas();
+         //	jcas = cas.getJCas().getView("_InitialView");
+         //	jcas = cas.getJCas().getView("plaintext");
+
+         String documentText = jcas.getDocumentText();
+
+         String documentID = DocumentIDAnnotationUtil.getDeepDocumentId( jcas );
+         if ( documentID == null || documentID.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+            documentID = "doc_" + new java.util.Date().getTime() + ".xml"; // use timestamp in name: doc_TIMESTAMP.xml
+            LOGGER.warn( "Unable to find DocumentIDAnnotation, using " + documentID );
+         }
+
+         writeToFile( documentID, documentText );
+
+      } catch ( Exception e ) {
+         throw new ResourceProcessException( e );
+      }
+   }
+
+   private void writeToFile( String documentID, String documentText ) throws IOException {
+      File outputFile = new File( iv_outputDirectory, documentID );
+      outputFile.createNewFile();
+      OutputStream out = new BufferedOutputStream( new FileOutputStream( outputFile ) );
+      out.write( documentText.getBytes() );
+      out.flush();
+      out.close();
+   }
 }

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XmiWriterCasConsumerCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XmiWriterCasConsumerCtakes.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XmiWriterCasConsumerCtakes.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/cc/XmiWriterCasConsumerCtakes.java Wed Jul  8 14:23:11 2015
@@ -37,14 +37,6 @@ package org.apache.ctakes.core.cc;
  * under the License.
  */
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URI;
-import java.net.URISyntaxException;
-import java.net.URL;
-
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.uima.UIMAFramework;
 import org.apache.uima.UimaContext;
@@ -64,168 +56,163 @@ import org.apache.uima.util.XMLInputSour
 import org.apache.uima.util.XMLSerializer;
 import org.xml.sax.SAXException;
 
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URI;
+import java.net.URISyntaxException;
+import java.net.URL;
+
 /**
  * A simple CAS consumer that writes the CAS to XMI format.
- * <p>
+ * <p/>
  * This CAS Consumer takes one parameter:
  * <ul>
  * <li><code>OutputDirectory</code> - path to directory into which output files will be written</li>
  * </ul>
  */
 public class XmiWriterCasConsumerCtakes extends CasConsumer_ImplBase {
-  /**
-   * Name of configuration parameter that must be set to the path of a directory into which the
-   * output files will be written.
-   */
-  public static final String PARAM_OUTPUTDIR = "OutputDirectory";
-  @ConfigurationParameter(name = PARAM_OUTPUTDIR, description = "Output directory to write xmi files", mandatory = true)
-  private File mOutputDir;
-
-  private int mDocNum;
-
-  @Override
-  public void initialize(UimaContext context) throws ResourceInitializationException{
-    super.initialize(context);
-    mDocNum = 0;
-    if (!mOutputDir.exists()) {
-      mOutputDir.mkdirs();
-    }
-  }
-
-  /**
-   * Processes the CAS which was populated by the TextAnalysisEngines. <br>
-   * In this case, the CAS is converted to XMI and written into the output file .
-   * 
-   * @param aCAS
-   *          a CAS which has been populated by the TAEs
- * @throws AnalysisEngineProcessException 
-   * 
-   * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
-   */
-  @Override
-  public void process(CAS aCAS) throws AnalysisEngineProcessException {
-    String modelFileName = null;
-
-    JCas jcas;
-    try {
-		jcas = aCAS.getJCas();
-	} catch (CASException e1) {
-		e1.printStackTrace();
-		throw new AnalysisEngineProcessException(e1);
-	}
-    
-    String originalFileName = DocumentIDAnnotationUtil.getDocumentID(jcas);
-    File outFile = null;
-    if (originalFileName != null && !originalFileName.isEmpty())
-    {
-      File inFile;
-      try
-      {
-    	String outFileName = null;
-    	if (originalFileName.contains("/"))
-    	{
-	        URI uri = UriUtils.quote(originalFileName);
-	        inFile = new File(uri);
-	        outFileName = inFile.getName();
-    	} else
-    	{
-    		outFileName = originalFileName;
-    	}
-        outFileName += ".xmi";
-        outFile = new File(mOutputDir, outFileName);
-        
-      } catch (URISyntaxException e)
-      {
-    	// bad URI, use default processing below
-      }
-      
-    }
-    if (outFile == null) {
-        outFile = new File(mOutputDir, "doc" + mDocNum++ + ".xmi"); // Jira UIMA-629
-     }
-    // serialize XCAS and write to output file
-    try {
-      writeXmi(jcas.getCas(), outFile, modelFileName);
-    } catch (IOException e) {
-      throw new AnalysisEngineProcessException(e);
-    } catch (SAXException e) {
-      throw new AnalysisEngineProcessException(e);
-    }
-  }
-
-  /**
-   * Serialize a CAS to a file in XMI format
-   * 
-   * @param aCas
-   *          CAS to serialize
-   * @param name
-   *          output file
-   * @throws SAXException
-   * @throws Exception
-   * 
-   * @throws ResourceProcessException
-   */
-  private void writeXmi(CAS aCas, File name) throws IOException, SAXException {
-    FileOutputStream out = null;
-
-    try {
-      // write XMI
-      out = new FileOutputStream(name);
-      XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
-      XMLSerializer xmlSer = new XMLSerializer(out, false);
-      ser.serialize(aCas, xmlSer.getContentHandler());
-    } finally {
-      if (out != null) {
-        out.close();
-      }
-    }
-  }
-
-  /**
-   * Serialize a CAS to a file in XMI format
-   * 
-   * @param aCas
-   *          CAS to serialize
-   * @param name
-   *          output file
-   * @throws SAXException
-   * @throws Exception
-   * 
-   * @throws ResourceProcessException
-   */
-  private void writeXmi(CAS aCas, File name, String modelFileName) throws IOException, SAXException {
-    FileOutputStream out = null;
-
-    try {
-      // write XMI
-      out = new FileOutputStream(name);
-      XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
-      XMLSerializer xmlSer = new XMLSerializer(out, false);
-      ser.serialize(aCas, xmlSer.getContentHandler());
-    } finally {
-      if (out != null) {
-        out.close();
-      }
-    }
-  }
-
-  /**
-   * Parses and returns the descriptor for this collection reader. The descriptor is stored in the
-   * uima.jar file and located using the ClassLoader.
-   * 
-   * @return an object containing all of the information parsed from the descriptor.
-   * 
-   * @throws InvalidXMLException
-   *           if the descriptor is invalid or missing
-   */
-  public static CasConsumerDescription getDescription() throws InvalidXMLException {
-    InputStream descStream = XmiWriterCasConsumerCtakes.class
-            .getResourceAsStream("XmiWriterCasConsumerCtakes.xml");
-    return UIMAFramework.getXMLParser().parseCasConsumerDescription(
-            new XMLInputSource(descStream, null));
-  }
-  
-  public static URL getDescriptorURL() {
-    return XmiWriterCasConsumerCtakes.class.getResource("XmiWriterCasConsumerCtakes.xml");
-  }  
+   /**
+    * Name of configuration parameter that must be set to the path of a directory into which the
+    * output files will be written.
+    */
+   public static final String PARAM_OUTPUTDIR = "OutputDirectory";
+   @ConfigurationParameter(name = PARAM_OUTPUTDIR, description = "Output directory to write xmi files", mandatory = true)
+   private File mOutputDir;
+
+   private int mDocNum;
+
+   @Override
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      super.initialize( context );
+      mDocNum = 0;
+      if ( !mOutputDir.exists() ) {
+         mOutputDir.mkdirs();
+      }
+   }
+
+   /**
+    * Processes the CAS which was populated by the TextAnalysisEngines. <br>
+    * In this case, the CAS is converted to XMI and written into the output file .
+    *
+    * @param aCAS a CAS which has been populated by the TAEs
+    * @throws AnalysisEngineProcessException
+    * @see org.apache.uima.collection.base_cpm.CasObjectProcessor#processCas(org.apache.uima.cas.CAS)
+    */
+   @Override
+   public void process( CAS aCAS ) throws AnalysisEngineProcessException {
+      String modelFileName = null;
+
+      JCas jcas;
+      try {
+         jcas = aCAS.getJCas();
+      } catch ( CASException e1 ) {
+         e1.printStackTrace();
+         throw new AnalysisEngineProcessException( e1 );
+      }
+
+      String originalFileName = DocumentIDAnnotationUtil.getDocumentID( jcas );
+      File outFile = null;
+      if ( originalFileName != null
+           && !originalFileName.isEmpty()
+           && !originalFileName.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+         File inFile;
+         try {
+            String outFileName = null;
+            if ( originalFileName.contains( "/" ) ) {
+               URI uri = UriUtils.quote( originalFileName );
+               inFile = new File( uri );
+               outFileName = inFile.getName();
+            } else {
+               outFileName = originalFileName;
+            }
+            outFileName += ".xmi";
+            outFile = new File( mOutputDir, outFileName );
+
+         } catch ( URISyntaxException e ) {
+            // bad URI, use default processing below
+         }
+
+      }
+      if ( outFile == null ) {
+         outFile = new File( mOutputDir, "doc" + mDocNum++ + ".xmi" ); // Jira UIMA-629
+      }
+      // serialize XCAS and write to output file
+      try {
+         writeXmi( jcas.getCas(), outFile, modelFileName );
+      } catch ( IOException e ) {
+         throw new AnalysisEngineProcessException( e );
+      } catch ( SAXException e ) {
+         throw new AnalysisEngineProcessException( e );
+      }
+   }
+
+   /**
+    * Serialize a CAS to a file in XMI format
+    *
+    * @param aCas CAS to serialize
+    * @param name output file
+    * @throws SAXException
+    * @throws Exception
+    * @throws ResourceProcessException
+    */
+   private void writeXmi( CAS aCas, File name ) throws IOException, SAXException {
+      FileOutputStream out = null;
+
+      try {
+         // write XMI
+         out = new FileOutputStream( name );
+         XmiCasSerializer ser = new XmiCasSerializer( aCas.getTypeSystem() );
+         XMLSerializer xmlSer = new XMLSerializer( out, false );
+         ser.serialize( aCas, xmlSer.getContentHandler() );
+      } finally {
+         if ( out != null ) {
+            out.close();
+         }
+      }
+   }
+
+   /**
+    * Serialize a CAS to a file in XMI format
+    *
+    * @param aCas CAS to serialize
+    * @param name output file
+    * @throws SAXException
+    * @throws Exception
+    * @throws ResourceProcessException
+    */
+   private void writeXmi( CAS aCas, File name, String modelFileName ) throws IOException, SAXException {
+      FileOutputStream out = null;
+
+      try {
+         // write XMI
+         out = new FileOutputStream( name );
+         XmiCasSerializer ser = new XmiCasSerializer( aCas.getTypeSystem() );
+         XMLSerializer xmlSer = new XMLSerializer( out, false );
+         ser.serialize( aCas, xmlSer.getContentHandler() );
+      } finally {
+         if ( out != null ) {
+            out.close();
+         }
+      }
+   }
+
+   /**
+    * Parses and returns the descriptor for this collection reader. The descriptor is stored in the
+    * uima.jar file and located using the ClassLoader.
+    *
+    * @return an object containing all of the information parsed from the descriptor.
+    * @throws InvalidXMLException if the descriptor is invalid or missing
+    */
+   public static CasConsumerDescription getDescription() throws InvalidXMLException {
+      InputStream descStream = XmiWriterCasConsumerCtakes.class
+            .getResourceAsStream( "XmiWriterCasConsumerCtakes.xml" );
+      return UIMAFramework.getXMLParser().parseCasConsumerDescription(
+            new XMLInputSource( descStream, null ) );
+   }
+
+   public static URL getDescriptorURL() {
+      return XmiWriterCasConsumerCtakes.class.getResource( "XmiWriterCasConsumerCtakes.xml" );
+   }
 }

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/CtakesFileNamer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/CtakesFileNamer.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/CtakesFileNamer.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/CtakesFileNamer.java Wed Jul  8 14:23:11 2015
@@ -22,7 +22,6 @@ package org.apache.ctakes.core.util;
 import org.apache.uima.UimaContext;
 import org.apache.uima.fit.component.initialize.ConfigurationParameterInitializer;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
-import org.apache.uima.fit.factory.ConfigurationParameterFactory;
 import org.apache.uima.fit.factory.initializable.Initializable;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
@@ -30,54 +29,55 @@ import org.apache.uima.resource.Resource
 /**
  * This is a very simple implementation of XWriterFileNamer that generates file names based on a
  * prefix string and a incrementing counter.
- * 
+ *
  * @author Philip Ogren
  */
 
 public class CtakesFileNamer implements Initializable {
 
-        /**
-         * The parameter name for the configuration parameter that specifies a fixed prefix for all
-         * returned file names.
-         */
-        public static final String PARAM_PREFIX = "prefix";
-        @ConfigurationParameter(name = PARAM_PREFIX, description = "specify a prefix that is prepended to all returned file names", defaultValue="")
-        private String prefix;
-
-        /**
-         * The parameter name for the configuration parameter that specifies a fixed suffix for all
-         * returned file names.
-         */
-        public static final String PARAM_SUFFIX = "suffix";
-        @ConfigurationParameter(name = PARAM_SUFFIX, description = "specify a suffix that is appended to all returned file names", defaultValue="")
-        private String suffix;
-
-        int i = 1;
-
-        public String nameFile(JCas jcas)
-        {
-          String sourceFileName = DocumentIDAnnotationUtil.getDocumentID(jcas);
-          StringBuilder b = new StringBuilder();
-          if (prefix != null && !prefix.isEmpty())
-          { b.append(prefix); }
-          
-          if (sourceFileName != null && !sourceFileName.isEmpty())
-          {
-        	  b.append(sourceFileName);
-          } else
-          {
-        	  b.append(i++);
-          }
-          
-          if (suffix != null && !suffix.isEmpty())
-          { b.append(suffix); }
-          
-          String calculatedFilename = b.toString();
-          
-          return calculatedFilename;
-        }
-
-        public void initialize(UimaContext context) throws ResourceInitializationException {
-                ConfigurationParameterInitializer.initialize(this, context);
-        }
+   /**
+    * The parameter name for the configuration parameter that specifies a fixed prefix for all
+    * returned file names.
+    */
+   public static final String PARAM_PREFIX = "prefix";
+   @ConfigurationParameter(name = PARAM_PREFIX, description = "specify a prefix that is prepended to all returned file names", defaultValue = "")
+   private String prefix;
+
+   /**
+    * The parameter name for the configuration parameter that specifies a fixed suffix for all
+    * returned file names.
+    */
+   public static final String PARAM_SUFFIX = "suffix";
+   @ConfigurationParameter(name = PARAM_SUFFIX, description = "specify a suffix that is appended to all returned file names", defaultValue = "")
+   private String suffix;
+
+   int i = 1;
+
+   public String nameFile( JCas jcas ) {
+      String sourceFileName = DocumentIDAnnotationUtil.getDocumentID( jcas );
+      StringBuilder b = new StringBuilder();
+      if ( prefix != null && !prefix.isEmpty() ) {
+         b.append( prefix );
+      }
+
+      if ( sourceFileName != null
+           && !sourceFileName.isEmpty()
+           && !sourceFileName.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+         b.append( sourceFileName );
+      } else {
+         b.append( i++ );
+      }
+
+      if ( suffix != null && !suffix.isEmpty() ) {
+         b.append( suffix );
+      }
+
+      String calculatedFilename = b.toString();
+
+      return calculatedFilename;
+   }
+
+   public void initialize( UimaContext context ) throws ResourceInitializationException {
+      ConfigurationParameterInitializer.initialize( this, context );
+   }
 }
\ No newline at end of file

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DocumentIDAnnotationUtil.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DocumentIDAnnotationUtil.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DocumentIDAnnotationUtil.java (original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/util/DocumentIDAnnotationUtil.java Wed Jul  8 14:23:11 2015
@@ -18,26 +18,116 @@
  */
 package org.apache.ctakes.core.util;
 
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.jcas.JFSIndexRepository;
-import org.apache.uima.jcas.cas.TOP;
-
-import org.apache.ctakes.typesystem.type.structured.DocumentID;
-
-public class DocumentIDAnnotationUtil 
-{
-	public static String getDocumentID(JCas jcas)
-	{
-		try
-		{
-		 	JFSIndexRepository indexes = jcas.getJFSIndexRepository();
-		 	FSIterator<TOP> documentIDIterator = indexes.getAllIndexedFS(DocumentID.type);
-		 	DocumentID documentIDAnnotation = (DocumentID) documentIDIterator.next();
-		 	String documentID = documentIDAnnotation.getDocumentID();
-		 	return documentID;
-		}
-		catch(Exception e) { return null;}
-	}
+import org.apache.ctakes.typesystem.type.structured.DocumentID;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.cas.CASRuntimeException;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.JFSIndexRepository;
+import org.apache.uima.jcas.cas.TOP;
+
+/**
+ * Utility class for fetching document id
+ */
+final public class DocumentIDAnnotationUtil {
+
+   // Added for CTAKES-365
+   static public final String NO_DOCUMENT_ID = "UnknownDocument";
+
+   static private final Logger LOGGER = Logger.getLogger( "DocumentIDAnnotationUtil" );
+
+
+   // Utility classes should be final and have only a private constructor
+   private DocumentIDAnnotationUtil() {
+   }
+
+   /**
+    * Check the jcas for a document id.  Unlike {@link #getDeepDocumentId(org.apache.uima.jcas.JCas)},
+    * this method does not progress into deeper jcas layers/views.
+    *
+    * @param jcas the JCas (view) to check; if null, {@link #NO_DOCUMENT_ID} is returned
+    * @return the document id contained in the type "DocumentID" or {@link #NO_DOCUMENT_ID}
+    */
+   public static String getDocumentID( final JCas jcas ) {
+      if ( jcas == null ) {
+         // could throw an IllegalArgumentException,
+         // but a caller might be providing a null view, so a graceful handling is better
+         LOGGER.debug( "NULL CAS" );
+         return NO_DOCUMENT_ID;
+      }
+      // todo - improve the FS handling
+      final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+      final FSIterator<TOP> documentIDIterator = indexes.getAllIndexedFS( DocumentID.type );
+      if ( documentIDIterator == null || !documentIDIterator.hasNext() ) {
+         LOGGER.debug( "Could not find document Id Annotation" );
+         return NO_DOCUMENT_ID;
+      }
+      final DocumentID documentIDAnnotation = (DocumentID)documentIDIterator.next();
+      try {
+         return documentIDAnnotation.getDocumentID();
+      } catch ( CASRuntimeException casRTE ) {
+         LOGGER.warn( "document Id Annotation does not have the id feature set", casRTE );
+         return NO_DOCUMENT_ID;
+      }
+   }
+
+
+   /**
+    * Gets the document Id by progressing through 3 layers until an Id is found: starting JCas, Initial View, Plaintext View
+    *
+    * @param startingJcas initial JCas to start the checking
+    * @return Document Id from the starting JCas, the Initial View, the Plaintext View, or {@link #NO_DOCUMENT_ID}
+    */
+   static public String getDeepDocumentId( final JCas startingJcas ) {
+      String documentID = getDocumentID( startingJcas );
+      if ( documentID == null || documentID.equals( NO_DOCUMENT_ID ) ) {
+         try {
+            LOGGER.debug( "Checking document Id for initial view" );
+            final JCas viewJcas = startingJcas.getView( "_InitialView" );
+            documentID = DocumentIDAnnotationUtil.getDocumentID( viewJcas );
+            if ( documentID == null || documentID.equals( NO_DOCUMENT_ID ) ) {
+               LOGGER.debug( "Checking document Id for plaintext view" );
+               final JCas plaintextJcas = startingJcas.getView( "plaintext" );
+               documentID = DocumentIDAnnotationUtil.getDocumentID( plaintextJcas );
+               if ( documentID == null || documentID.equals( NO_DOCUMENT_ID ) ) {
+                  LOGGER.warn( "Unable to find DocumentIDAnnotation" );
+                  return NO_DOCUMENT_ID;
+               }
+            }
+         } catch ( CASException casE ) {
+            LOGGER.warn( "Unable to find DocumentIDAnnotation", casE );
+            return NO_DOCUMENT_ID;
+         }
+      }
+      return documentID;
+   }
+
+   /**
+    * Create a unique id for the document that can be used for an output filename or url.
+    * Will be the id found by {@link #getDeepDocumentId(org.apache.uima.jcas.JCas)} (possibly {@link #NO_DOCUMENT_ID});
+    * if that id is null or empty, the first 10 characters of the trimmed text plus "_" and the text hashcode,
+    * or "Unknown_" and the current millis if there is no text.
+    * Characters other than ASCII letters, digits and '.' are replaced with '_'.
+    *
+    * @param jcas the JCas from which to derive a file-safe document id
+    * @return an ok document id
+    */
+   static public String getDocumentIdForFile( final JCas jcas ) {
+      String docId = getDeepDocumentId( jcas );
+      if ( docId == null || docId.isEmpty() ) {
+         String casDocText = jcas.getDocumentText();
+         if ( casDocText != null ) {
+            casDocText = casDocText.trim();
+            if ( !casDocText.isEmpty() ) {
+               docId = casDocText.substring( 0, Math.min( casDocText.length(), 10 ) ) + "_" + casDocText.hashCode();
+            }
+         }
+      }
+      if ( docId == null || docId.isEmpty() ) {
+         docId = "Unknown_" + System.currentTimeMillis();
+      }
+      return docId.replaceAll( "[^A-Za-z0-9\\.]", "_" );
+   }
 
 }

Modified: ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java (original)
+++ ctakes/trunk/ctakes-coreference/src/main/java/org/apache/ctakes/coreference/cc/PreprocessAndWriteXmi.java Wed Jul  8 14:23:11 2015
@@ -18,11 +18,6 @@
  */
 package org.apache.ctakes.coreference.cc;
 
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
-
 import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.uima.UIMAException;
@@ -36,7 +31,6 @@ import org.apache.uima.fit.component.JCa
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
 import org.apache.uima.fit.factory.CollectionReaderFactory;
-import org.apache.uima.fit.factory.ConfigurationParameterFactory;
 import org.apache.uima.fit.pipeline.SimplePipeline;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
@@ -47,101 +41,106 @@ import org.kohsuke.args4j.Option;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+
 public class PreprocessAndWriteXmi {
-	public static class Options {
+   public static class Options {
 
-		@Option(name = "-t", 
-				aliases = "--textRoot", 
-				usage = "specify the directory contraining the textFiles (for example /NLP/Corpus/Relations/mipacq/text/train",
-				required = true)
-				public String textRoot;
-
-		// TODO - fix to use an xml collection reader instead of the hacky way it's done now...
-		//		@Option(name = "-x",
-		//				aliases = "--xmlRoot",
-		//				usage = "specify the directory containing the knowtator xml files (for example: /NLP/Corpus/Relations/mipacq/xml/train",
-		//        required = true)
-		//		public File xmlRoot;
-
-		@Option(name = "-o",
-				aliases = "--outputRoot",
-				usage = "specify the directory to write out CAS XMI files",
-				required = true)
-				public File outputRoot;
-	}
-
-	/**
-	 * @param args
-	 * @throws IOException 
-	 * @throws UIMAException 
-	 * @throws CmdLineException 
-	 */
-	public static void main(String[] args) throws UIMAException, IOException, CmdLineException {
-		Options options = new Options();
-		CmdLineParser parser = new CmdLineParser(options);
-		parser.parseArgument(args);
+      @Option(name = "-t",
+            aliases = "--textRoot",
+            usage = "specify the directory contraining the textFiles (for example /NLP/Corpus/Relations/mipacq/text/train",
+            required = true)
+      public String textRoot;
+
+      // TODO - fix to use an xml collection reader instead of the hacky way it's done now...
+      //		@Option(name = "-x",
+      //				aliases = "--xmlRoot",
+      //				usage = "specify the directory containing the knowtator xml files (for example: /NLP/Corpus/Relations/mipacq/xml/train",
+      //        required = true)
+      //		public File xmlRoot;
+
+      @Option(name = "-o",
+            aliases = "--outputRoot",
+            usage = "specify the directory to write out CAS XMI files",
+            required = true)
+      public File outputRoot;
+   }
+
+   /**
+    * @param args
+    * @throws IOException
+    * @throws UIMAException
+    * @throws CmdLineException
+    */
+   public static void main( String[] args ) throws UIMAException, IOException, CmdLineException {
+      Options options = new Options();
+      CmdLineParser parser = new CmdLineParser( options );
+      parser.parseArgument( args );
 
-		File outputRoot = options.outputRoot;
-		String inputRoot = options.textRoot;
+      File outputRoot = options.outputRoot;
+      String inputRoot = options.textRoot;
 //		TypeSystemDescription typeSystem = 
 //			TypeSystemDescriptionFactory.createTypeSystemDescriptionFromPath("../common-type-system/desc/common_type_system.xml", 
 //																			 "../assertion/desc/medfactsTypeSystem.xml");
 
-		AnalysisEngine ae = AnalysisEngineFactory.createEngineFromPath("desc/analysis_engine/ODIESvmVectorCreator.xml");
+      AnalysisEngine ae = AnalysisEngineFactory.createEngineFromPath( "desc/analysis_engine/ODIESvmVectorCreator.xml" );
 
-		CollectionReader reader = CollectionReaderFactory.createReaderFromPath(
-				"../ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml",
-				FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
-				inputRoot);
+      CollectionReader reader = CollectionReaderFactory.createReaderFromPath(
+            "../ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml",
+            FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
+            inputRoot );
 
-		AnalysisEngine serializer = AnalysisEngineFactory.createEngine(
-				PreprocessAndWriteXmi.SerializeDocumentToXMI.class,
+      AnalysisEngine serializer = AnalysisEngineFactory.createEngine(
+            PreprocessAndWriteXmi.SerializeDocumentToXMI.class,
 //				typeSystem,
-				PreprocessAndWriteXmi.SerializeDocumentToXMI.PARAM_OUTPUT_DIRECTORY, 
-				outputRoot.getPath());
+            PreprocessAndWriteXmi.SerializeDocumentToXMI.PARAM_OUTPUT_DIRECTORY,
+            outputRoot.getPath() );
 
-		SimplePipeline.runPipeline(reader, ae, serializer);	    
-	}
+      SimplePipeline.runPipeline( reader, ae, serializer );
+   }
 
-	public static class SerializeDocumentToXMI extends JCasAnnotator_ImplBase {
-		public static final String PARAM_OUTPUT_DIRECTORY = "OutputDirectory";
+   public static class SerializeDocumentToXMI extends JCasAnnotator_ImplBase {
+      public static final String PARAM_OUTPUT_DIRECTORY = "OutputDirectory";
 
-		@ConfigurationParameter(name = PARAM_OUTPUT_DIRECTORY, mandatory = true, description = "Specifies the output directory in which to write xmi files")
-		private File outputDirectory;
-
-		@Override
-		public void initialize(UimaContext context) throws ResourceInitializationException {
-			super.initialize(context);
-			if (!this.outputDirectory.exists()) {
-				this.outputDirectory.mkdirs();
-			}
-		}
-
-		@Override
-		public void process(JCas jCas) throws AnalysisEngineProcessException {
-			try {
-				// FIXME - not using this right now, just use default jcas
+      @ConfigurationParameter(name = PARAM_OUTPUT_DIRECTORY, mandatory = true, description = "Specifies the output directory in which to write xmi files")
+      private File outputDirectory;
+
+      @Override
+      public void initialize( UimaContext context ) throws ResourceInitializationException {
+         super.initialize( context );
+         if ( !this.outputDirectory.exists() ) {
+            this.outputDirectory.mkdirs();
+         }
+      }
+
+      @Override
+      public void process( JCas jCas ) throws AnalysisEngineProcessException {
+         try {
+            // FIXME - not using this right now, just use default jcas
 //				JCas goldView = jCas.getView(RelationExtractorEvaluation.GOLD_VIEW_NAME);
-				JCas goldView = jCas;
-				String documentID = DocumentIDAnnotationUtil.getDocumentID(goldView);
-				if (documentID == null) {
-					throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
-				}
-				File outFile = new File(this.outputDirectory, documentID + ".xmi");
-				ContentHandler handler = new XMLSerializer(new FileOutputStream(outFile)).getContentHandler();
-				new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
-			} catch (CASRuntimeException e) {
-				throw new AnalysisEngineProcessException(e);
-			} catch (SAXException e) {
-				throw new AnalysisEngineProcessException(e);
-			} catch (FileNotFoundException e) {
-				throw new AnalysisEngineProcessException(e);
+            JCas goldView = jCas;
+            String documentID = DocumentIDAnnotationUtil.getDocumentID( goldView );
+            if ( documentID == null || documentID.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+               throw new IllegalArgumentException( "No documentID for CAS:\n" + jCas );
+            }
+            File outFile = new File( this.outputDirectory, documentID + ".xmi" );
+            ContentHandler handler = new XMLSerializer( new FileOutputStream( outFile ) ).getContentHandler();
+            new XmiCasSerializer( jCas.getTypeSystem() ).serialize( jCas.getCas(), handler );
+         } catch ( CASRuntimeException e ) {
+            throw new AnalysisEngineProcessException( e );
+         } catch ( SAXException e ) {
+            throw new AnalysisEngineProcessException( e );
+         } catch ( FileNotFoundException e ) {
+            throw new AnalysisEngineProcessException( e );
 //			} catch (CASException e) {
 //				throw new AnalysisEngineProcessException(e);
-			}	
-		}
+         }
+      }
 
-	}
+   }
 
 }
 

Modified: ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/DependencyNodeWriter.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/DependencyNodeWriter.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/DependencyNodeWriter.java (original)
+++ ctakes/trunk/ctakes-dependency-parser/src/main/java/org/apache/ctakes/dependency/parser/DependencyNodeWriter.java Wed Jul  8 14:23:11 2015
@@ -16,195 +16,191 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.ctakes.dependency.parser;
-/*
- * Copyright: (c) 2010   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
- * http://www.apache.org/licenses/LICENSE-2.0 
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and 
- * limitations under the License. 
- */
-
-
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileWriter;
-
-import org.apache.log4j.Logger;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.FSIterator;
-import org.apache.uima.cas.text.AnnotationIndex;
-import org.apache.uima.collection.CasConsumer_ImplBase;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.resource.ResourceProcessException;
-
+package org.apache.ctakes.dependency.parser;
+/*
+ * Copyright: (c) 2010   Mayo Foundation for Medical Education and 
+ * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
+ * triple-shield Mayo logo are trademarks and service marks of MFMER.
+ *
+ * Except as contained in the copyright notice above, or as used to identify 
+ * MFMER as the author of this software, the trade names, trademarks, service
+ * marks, or product names of the copyright holder shall not be used in
+ * advertising, promotion or otherwise in connection with this software without
+ * prior written authorization of the copyright holder.
+ * 
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ * http://www.apache.org/licenses/LICENSE-2.0 
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and 
+ * limitations under the License. 
+ */
+
+
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
-import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.typesystem.type.syntax.ConllDependencyNode;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.log4j.Logger;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
+import org.apache.uima.cas.text.AnnotationIndex;
+import org.apache.uima.collection.CasConsumer_ImplBase;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.resource.ResourceProcessException;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileWriter;
 
 
 /**
  * CasConsumer that writes a JCas (the current view) to an xml file
- * 
- * @author Mayo Clinic 
+ *
+ * @author Mayo Clinic
  */
-public class DependencyNodeWriter extends CasConsumer_ImplBase
-{
-    // LOG4J logger based on class name
-    private Logger iv_logger = Logger.getLogger(getClass().getName());
-
-    private String iv_outputDir = null;
-    private String iv_outputFormat = null;
-    
-    // iv_procCount is used to name the output files sequentially if there 
-    // is a problem with naming based on source names
-    private int iv_procCount = 0; 
-
-    
-    /**
-     * Read in configuration parameters
-     */
-    public void initialize() throws ResourceInitializationException {
-        iv_outputDir = (String) getConfigParameterValue("outputDir");
-        iv_outputFormat = (String) getConfigParameterValue("outputFormat");
-    }
-
-
-    /**
-     * Write a tab-delimited file containing data from the view.
-     * The file name will come from the DocumentID annotation,
-     * which is associated with a view.
-     */
-    private void processView(JCas jCas) throws Exception {
-        // String docText = view.getDocumentText();
-
-        String docName = DocumentIDAnnotationUtil.getDocumentID(jCas);
-
-        File outputFile;
-        if (docName==null) {
-        	docName = "doc" + iv_procCount + "." + iv_outputFormat.toLowerCase();
-        }
-        else {
-        	docName = docName + "." + iv_outputFormat.toLowerCase();        		
-			//	if (!docName.endsWith(".xml")) {
-			//    	docName = docName + ".xml";        		
-			//	}
-        }
-        
+public class DependencyNodeWriter extends CasConsumer_ImplBase {
+   // LOG4J logger based on class name
+   private Logger iv_logger = Logger.getLogger( getClass().getName() );
+
+   private String iv_outputDir = null;
+   private String iv_outputFormat = null;
+
+   // iv_procCount is used to name the output files sequentially if there
+   // is a problem with naming based on source names
+   private int iv_procCount = 0;
+
+
+   /**
+    * Read in configuration parameters
+    */
+   public void initialize() throws ResourceInitializationException {
+      iv_outputDir = (String)getConfigParameterValue( "outputDir" );
+      iv_outputFormat = (String)getConfigParameterValue( "outputFormat" );
+   }
+
+
+   /**
+    * Write a tab-delimited file containing data from the view.
+    * The file name will come from the DocumentID annotation,
+    * which is associated with a view.
+    */
+   private void processView( JCas jCas ) throws Exception {
+      // String docText = view.getDocumentText();
+
+      String docName = DocumentIDAnnotationUtil.getDocumentID( jCas );
+
+      File outputFile;
+      if ( docName == null || docName.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+         docName = "doc" + iv_procCount + "." + iv_outputFormat.toLowerCase();
+      } else {
+         docName = docName + "." + iv_outputFormat.toLowerCase();
+         //	if (!docName.endsWith(".xml")) {
+         //    	docName = docName + ".xml";
+         //	}
+      }
+
 //        OutputStream out=null;
-        try {
-        	File outputDir = new File(iv_outputDir);
-        	outputDir.mkdirs();
-            outputFile = new File(iv_outputDir + File.separatorChar + docName);
+      try {
+         File outputDir = new File( iv_outputDir );
+         outputDir.mkdirs();
+         outputFile = new File( iv_outputDir + File.separatorChar + docName );
 //            out = new FileOutputStream(outputFile);
 //            XCASSerializer.serialize(view.getCas(), out, true); // true -> formats the output
-            outputFile.createNewFile();
-            BufferedWriter bw = new BufferedWriter(new FileWriter(outputFile));
-
-            AnnotationIndex nodeIndex = jCas.getAnnotationIndex(ConllDependencyNode.type);
-            FSIterator sentences = jCas.getAnnotationIndex(Sentence.type).iterator();
-
-            while (sentences.hasNext()) {
-                Sentence sentence = (Sentence) sentences.next();
-
-
-                ConllDependencyNode node = null;
-                FSIterator nodeIterator = nodeIndex.subiterator(sentence);
-                while (nodeIterator.hasNext()) {
-//                    int pID = (node==null)? 0 : node.getID();
-                    node = (ConllDependencyNode) nodeIterator.next();
-                    
-                    if (node.getId()!=0 ) { // && node.getID() !=pID) {
-
-                        if (iv_outputFormat.toLowerCase().contains("min")) {
-                            bw.write(node.getId()+"\t");
-                            bw.write(node.getForm()+"\t");
-                            bw.write( (node.getHead()==null ? "_" : node.getHead().getId()) + "\t");
-                            bw.write(node.getDeprel()+"\n");
-                        } else if (iv_outputFormat.toLowerCase().contains("mpos")) {
-                            bw.write(node.getId()+"\t");
-                            bw.write(node.getForm()+"\t");
-                            bw.write(node.getPostag()+"\t");
-                            bw.write( (node.getHead()==null ? "_" : node.getHead().getId()) + "\t");
-                            bw.write(node.getDeprel()+"\n");
-                        } else if (iv_outputFormat.toLowerCase().contains("mlem")) {
-                            bw.write(node.getId()+"\t");
-                            bw.write(node.getForm()+"\t");
-                            bw.write(node.getLemma()+"\t");
-                            bw.write( (node.getHead()==null ? "_" : node.getHead().getId()) + "\t");
-                            bw.write(node.getDeprel()+"\n");
-                        } else if (iv_outputFormat.toLowerCase().contains("dep")) {
-                            bw.write(node.getId()+"\t");
-                            bw.write(node.getForm()+"\t");
-                            bw.write(node.getLemma()+"\t");
-                            bw.write(node.getPostag()+"\t");
-                            bw.write( (node.getHead()==null ? "_" : node.getHead().getId()) + "\t");
-                            bw.write(node.getDeprel()+"\n");
-                        } else { //if (iv_outputFormat.toLowerCase().contains("conll")) {
-                            bw.write(node.getId()+"\t");
-                            bw.write(node.getForm()+"\t");
-                            bw.write(node.getLemma()+"\t");
-                            bw.write(node.getCpostag()+"\t");
-                            bw.write(node.getPostag()+"\t");
-                            bw.write(node.getFeats()+"\t");
-                            bw.write( (node.getHead()==null ? "_" : node.getHead().getId()) + "\t");
-                            bw.write(node.getDeprel()+"\t");
-                            bw.write( (node.getPhead()==null ? "_" : node.getPhead().getId()) + "\t");
-                            bw.write(node.getPdeprel()+"\n");
-                        }
-                    }
-                    
-                }
-                bw.write("\n");
-
-
-            }
-            bw.flush();
-        
-        } 
-        finally {
-	        iv_procCount++;
+         outputFile.createNewFile();
+         BufferedWriter bw = new BufferedWriter( new FileWriter( outputFile ) );
+
+         AnnotationIndex nodeIndex = jCas.getAnnotationIndex( ConllDependencyNode.type );
+         FSIterator sentences = jCas.getAnnotationIndex( Sentence.type ).iterator();
+
+         while ( sentences.hasNext() ) {
+            Sentence sentence = (Sentence)sentences.next();
+
+
+            ConllDependencyNode node = null;
+            FSIterator nodeIterator = nodeIndex.subiterator( sentence );
+            while ( nodeIterator.hasNext() ) {
+//                    int pID = (node==null)? 0 : node.getID();
+               node = (ConllDependencyNode)nodeIterator.next();
+
+               if ( node.getId() != 0 ) { // && node.getID() !=pID) {
+
+                  if ( iv_outputFormat.toLowerCase().contains( "min" ) ) {
+                     bw.write( node.getId() + "\t" );
+                     bw.write( node.getForm() + "\t" );
+                     bw.write( (node.getHead() == null ? "_" : node.getHead().getId()) + "\t" );
+                     bw.write( node.getDeprel() + "\n" );
+                  } else if ( iv_outputFormat.toLowerCase().contains( "mpos" ) ) {
+                     bw.write( node.getId() + "\t" );
+                     bw.write( node.getForm() + "\t" );
+                     bw.write( node.getPostag() + "\t" );
+                     bw.write( (node.getHead() == null ? "_" : node.getHead().getId()) + "\t" );
+                     bw.write( node.getDeprel() + "\n" );
+                  } else if ( iv_outputFormat.toLowerCase().contains( "mlem" ) ) {
+                     bw.write( node.getId() + "\t" );
+                     bw.write( node.getForm() + "\t" );
+                     bw.write( node.getLemma() + "\t" );
+                     bw.write( (node.getHead() == null ? "_" : node.getHead().getId()) + "\t" );
+                     bw.write( node.getDeprel() + "\n" );
+                  } else if ( iv_outputFormat.toLowerCase().contains( "dep" ) ) {
+                     bw.write( node.getId() + "\t" );
+                     bw.write( node.getForm() + "\t" );
+                     bw.write( node.getLemma() + "\t" );
+                     bw.write( node.getPostag() + "\t" );
+                     bw.write( (node.getHead() == null ? "_" : node.getHead().getId()) + "\t" );
+                     bw.write( node.getDeprel() + "\n" );
+                  } else { //if (iv_outputFormat.toLowerCase().contains("conll")) {
+                     bw.write( node.getId() + "\t" );
+                     bw.write( node.getForm() + "\t" );
+                     bw.write( node.getLemma() + "\t" );
+                     bw.write( node.getCpostag() + "\t" );
+                     bw.write( node.getPostag() + "\t" );
+                     bw.write( node.getFeats() + "\t" );
+                     bw.write( (node.getHead() == null ? "_" : node.getHead().getId()) + "\t" );
+                     bw.write( node.getDeprel() + "\t" );
+                     bw.write( (node.getPhead() == null ? "_" : node.getPhead().getId()) + "\t" );
+                     bw.write( node.getPdeprel() + "\n" );
+                  }
+               }
+
+            }
+            bw.write( "\n" );
+
+
+         }
+         bw.flush();
+
+      } finally {
+         iv_procCount++;
 //	        if (out != null) {
 //	        	out.close();
 //	        }
-        }
+      }
+
+   }
+
+
+   /**
+    * Create an xml file from the data in the cas.
+    */
+   public void processCas( CAS cas ) throws ResourceProcessException {
+
+      iv_logger.info( "Started" );
+
+      try {
 
-    }
+         JCas currentView = cas.getCurrentView().getJCas();
+         processView( currentView );
 
-    
-    /**
-     * Create an xml file from the data in the cas.
-     */
-    public void processCas(CAS cas) throws ResourceProcessException {
-
-    	iv_logger.info("Started");
-    	
-        try { 
-
-        	JCas currentView = cas.getCurrentView().getJCas();
-            processView(currentView);
-            
-        } catch (Exception e) {
-        	throw new ResourceProcessException(e);
-        }
+      } catch ( Exception e ) {
+         throw new ResourceProcessException( e );
+      }
 
-    }
+   }
 
 }
\ No newline at end of file

Modified: ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java?rev=1689882&r1=1689881&r2=1689882&view=diff
==============================================================================
--- ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java (original)
+++ ctakes/trunk/ctakes-relation-extractor/src/main/java/org/apache/ctakes/relationextractor/eval/SHARPXMI.java Wed Jul  8 14:23:11 2015
@@ -18,16 +18,11 @@
  */
 package org.apache.ctakes.relationextractor.eval;
 
-import java.io.File;
-import java.io.FileOutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
+import com.google.common.base.Function;
+import com.google.common.base.Functions;
+import com.google.common.collect.Lists;
+import com.google.common.collect.Ordering;
+import com.lexicalscope.jewel.cli.Option;
 import org.apache.ctakes.core.ae.SHARPKnowtatorXMLReader;
 import org.apache.ctakes.core.util.DocumentIDAnnotationUtil;
 import org.apache.ctakes.typesystem.type.structured.DocumentID;
@@ -54,296 +49,296 @@ import org.cleartk.util.ae.UriToDocument
 import org.cleartk.util.cr.UriCollectionReader;
 import org.xml.sax.ContentHandler;
 
-import com.google.common.base.Function;
-import com.google.common.base.Functions;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Ordering;
-import com.lexicalscope.jewel.cli.Option;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.util.*;
+import java.util.regex.Pattern;
 
 public class SHARPXMI {
 
-  public static List<File> getTrainTextFiles(File batchesDirectory) {
-    // seed_set1: batches 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 18, 19
-    // seed_set2: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 18, 19
-    // seed_set3: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19
-    // seed_set4: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19
-    return getTextFilesFor(
-        batchesDirectory,
-        Pattern.compile("^(ss[1234]_batch0[2-9]|ss[1234]_batch1[56]"
-            + "|ss[1234]_batch1[89]|ss[123]_batch01"
-            + "|ss[12]_batch1[34]|ss[34]_batch1[12])$"));
-  }
-
-  public static List<File> getDevTextFiles(File batchesDirectory) {
-    // seed_set1: batches 10, 17
-    // seed_set2: batches 10, 17
-    // seed_set3: batches 10, 17
-    // seed_set4: batches 10, 17
-    return getTextFilesFor(batchesDirectory, Pattern.compile("^(ss[1234]_batch1[07])$"));
-  }
-
-  public static List<File> getTestTextFiles(File batchesDirectory) {
-    // seed_set1: batches 11, 12
-    // seed_set2: batches 11, 12
-    // seed_set3: batches 13, 14
-    // seed_set4: batches 13, 14
-    return getTextFilesFor(
-        batchesDirectory,
-        Pattern.compile("^(ss[12]_batch1[12]|ss[34]_batch1[34])$"));
-  }
-
-  public static List<File> getAllTextFiles(File batchesDirectory) {
-    return getTextFilesFor(batchesDirectory, Pattern.compile(""));
-  }
-
-  private static List<File> getTextFilesFor(File batchesDirectory, Pattern pattern) {
-    List<File> files = Lists.newArrayList();
-    for (File batchDir : batchesDirectory.listFiles()) {
-      if (batchDir.isDirectory() && !batchDir.isHidden()) {
-        if (pattern.matcher(batchDir.getName()).find()) {
-          File textDirectory = new File(batchDir, "Knowtator/text");
-          for (File textFile : textDirectory.listFiles()) {
-            if (textFile.isFile() && !textFile.isHidden()) {
-              files.add(textFile);
+   public static List<File> getTrainTextFiles( File batchesDirectory ) {
+      // seed_set1: batches 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 18, 19
+      // seed_set2: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 14, 15, 16, 18, 19
+      // seed_set3: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19
+      // seed_set4: batches 1, 2, 3, 4, 5, 6, 7, 8, 9, 11, 12, 15, 16, 18, 19
+      return getTextFilesFor(
+            batchesDirectory,
+            Pattern.compile( "^(ss[1234]_batch0[2-9]|ss[1234]_batch1[56]"
+                             + "|ss[1234]_batch1[89]|ss[123]_batch01"
+                             + "|ss[12]_batch1[34]|ss[34]_batch1[12])$" ) );
+   }
+
+   public static List<File> getDevTextFiles( File batchesDirectory ) {
+      // seed_set1: batches 10, 17
+      // seed_set2: batches 10, 17
+      // seed_set3: batches 10, 17
+      // seed_set4: batches 10, 17
+      return getTextFilesFor( batchesDirectory, Pattern.compile( "^(ss[1234]_batch1[07])$" ) );
+   }
+
+   public static List<File> getTestTextFiles( File batchesDirectory ) {
+      // seed_set1: batches 11, 12
+      // seed_set2: batches 11, 12
+      // seed_set3: batches 13, 14
+      // seed_set4: batches 13, 14
+      return getTextFilesFor(
+            batchesDirectory,
+            Pattern.compile( "^(ss[12]_batch1[12]|ss[34]_batch1[34])$" ) );
+   }
+
+   public static List<File> getAllTextFiles( File batchesDirectory ) {
+      return getTextFilesFor( batchesDirectory, Pattern.compile( "" ) );
+   }
+
+   private static List<File> getTextFilesFor( File batchesDirectory, Pattern pattern ) {
+      List<File> files = Lists.newArrayList();
+      for ( File batchDir : batchesDirectory.listFiles() ) {
+         if ( batchDir.isDirectory() && !batchDir.isHidden() ) {
+            if ( pattern.matcher( batchDir.getName() ).find() ) {
+               File textDirectory = new File( batchDir, "Knowtator/text" );
+               for ( File textFile : textDirectory.listFiles() ) {
+                  if ( textFile.isFile() && !textFile.isHidden() ) {
+                     files.add( textFile );
+                  }
+               }
             }
-          }
-        }
+         }
       }
-    }
-    return files;
-  }
-
-  public static List<File> toXMIFiles(Options options, List<File> textFiles) {
-    List<File> xmiFiles = Lists.newArrayList();
-    for (File textFile : textFiles) {
-      xmiFiles.add(toXMIFile(options, textFile));
-    }
-    return xmiFiles;
-  }
-
-  private static File toXMIFile(Options options, File textFile) {
-    return new File(options.getXMIDirectory(), textFile.getName() + ".xmi");
-  }
-
-  public static interface Options {
-    @Option(
-        longName = "batches-dir",
-        description = "directory containing ssN_batchNN directories, each of which should contain "
-            + "a Knowtator directory and a Knowtator_XML directory")
-    public File getBatchesDirectory();
-
-    @Option(
-        longName = "xmi-dir",
-        defaultValue = "target/xmi",
-        description = "directory to store and load XMI serialization of annotations")
-    public File getXMIDirectory();
-
-    @Option(
-        longName = "generate-xmi",
-        description = "read in the gold annotations and serialize them as XMI")
-    public boolean getGenerateXMI();
-  }
-
-  public static final String GOLD_VIEW_NAME = "GoldView";
-
-  public static void generateXMI(Options options) throws Exception {
-    // if necessary, write the XMIs first
-    if (options.getGenerateXMI()) {
-      if (!options.getXMIDirectory().exists()) {
-        options.getXMIDirectory().mkdirs();
+      return files;
+   }
+
+   public static List<File> toXMIFiles( Options options, List<File> textFiles ) {
+      List<File> xmiFiles = Lists.newArrayList();
+      for ( File textFile : textFiles ) {
+         xmiFiles.add( toXMIFile( options, textFile ) );
       }
+      return xmiFiles;
+   }
 
-      // create a collection reader that loads URIs for all Knowtator text files
-      List<File> files = Lists.newArrayList();
-      files.addAll(getTrainTextFiles(options.getBatchesDirectory()));
-      files.addAll(getDevTextFiles(options.getBatchesDirectory()));
-      files.addAll(getTestTextFiles(options.getBatchesDirectory()));
-      CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files);
-
-      // load the text from the URI, run the preprocessor, then run the
-      // Knowtator XML reader
-      AggregateBuilder builder = new AggregateBuilder();
-      builder.add(UriToDocumentTextAnnotator.getDescription());
-      File preprocessDescFile = new File("desc/analysis_engine/RelationExtractorPreprocessor.xml");
-      XMLParser parser = UIMAFramework.getXMLParser();
-      XMLInputSource source = new XMLInputSource(preprocessDescFile);
-      builder.add(parser.parseAnalysisEngineDescription(source));
-      builder.add(AnalysisEngineFactory.createEngineDescription(
-          ViewCreatorAnnotator.class,
-          ViewCreatorAnnotator.PARAM_VIEW_NAME,
-          GOLD_VIEW_NAME));
-      builder.add(AnalysisEngineFactory.createEngineDescription(CopyDocumentTextToGoldView.class));
-      builder.add(
-          AnalysisEngineFactory.createEngineDescription(DocumentIDAnnotator.class),
-          CAS.NAME_DEFAULT_SOFA,
-          GOLD_VIEW_NAME);
-      builder.add(
-          AnalysisEngineFactory.createEngineDescription(SHARPKnowtatorXMLReader.class,
-        		  SHARPKnowtatorXMLReader.PARAM_SET_DEFAULTS,
-        		  true),
-          CAS.NAME_DEFAULT_SOFA,
-          GOLD_VIEW_NAME);
-
-      // write out an XMI for each file
-      for (Iterator<JCas> casIter = new JCasIterator(reader, builder.createAggregate()); casIter.hasNext();) {
-    	JCas jCas = casIter.next();
-        JCas goldView = jCas.getView(GOLD_VIEW_NAME);
-        String documentID = DocumentIDAnnotationUtil.getDocumentID(goldView);
-        if (documentID == null) {
-          throw new IllegalArgumentException("No documentID for CAS:\n" + jCas);
-        }
-        File outFile = toXMIFile(options, new File(documentID));
-        FileOutputStream stream = new FileOutputStream(outFile);
-        ContentHandler handler = new XMLSerializer(stream).getContentHandler();
-        new XmiCasSerializer(jCas.getTypeSystem()).serialize(jCas.getCas(), handler);
-        stream.close();
-      }
-    }
-  }
-
-  public enum EvaluateOn {
-    TRAIN, DEV, TEST
-  }
-
-  public static interface EvaluationOptions extends Options {
-    @Option(
-        longName = "evaluate-on",
-        defaultValue = "DEV",
-        description = "perform evaluation using the training (TRAIN), development (DEV) or test "
-            + "(TEST) data.")
-    public EvaluateOn getEvaluteOn();
-
-    @Option(
-        longName = "grid-search",
-        description = "run a grid search to select the best parameters")
-    public boolean getGridSearch();
-  }
-  
-  public static abstract class Evaluation_ImplBase extends org.cleartk.eval.Evaluation_ImplBase<File, AnnotationStatistics<String>> {
-
-    public Evaluation_ImplBase(File baseDirectory) {
-      super(baseDirectory);
-    }
-
-    @Override
-    public CollectionReader getCollectionReader(List<File> items) throws Exception {
-      return CollectionReaderFactory.createReader(
-          XMIReader.class,
-          TypeSystemDescriptionFactory.createTypeSystemDescription(),
-          XMIReader.PARAM_FILES,
-          items);
-    }
-
-  }
-
-  public static void validate(EvaluationOptions options) throws Exception {
-    // error on invalid option combinations
-    if (options.getEvaluteOn().equals(EvaluateOn.TEST) && options.getGridSearch()) {
-      throw new IllegalArgumentException("grid search can only be run on the train or dev sets");
-    }
-  }
-
-  public static <T extends Evaluation_ImplBase> void evaluate(
-      EvaluationOptions options,
-      ParameterSettings bestSettings,
-      List<ParameterSettings> gridOfSettings,
-      Function<ParameterSettings, T> getEvaluation) throws Exception {
-    // define the set of possible training parameters
-    List<ParameterSettings> possibleParams;
-    if (options.getGridSearch()) {
-      possibleParams = gridOfSettings;
-    } else {
-      possibleParams = Lists.newArrayList(bestSettings);
-    }
-
-    // run an evaluation for each set of parameters
-    Map<ParameterSettings, Double> scoredParams = new HashMap<ParameterSettings, Double>();
-    for (ParameterSettings params : possibleParams) {
-      Evaluation_ImplBase evaluation = getEvaluation.apply(params);
-
-      List<File> trainFiles, devFiles, testFiles;
-      switch (options.getEvaluteOn()) {
-      case TRAIN:
-        // run n-fold cross-validation on the training set
-        trainFiles = getTrainTextFiles(options.getBatchesDirectory());
-        trainFiles = toXMIFiles(options, trainFiles);
-        List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation(trainFiles, 2);
-        params.stats = AnnotationStatistics.addAll(foldStats);
-        break;
-      case DEV:
-        // train on the training set and evaluate on the dev set
-        trainFiles = getTrainTextFiles(options.getBatchesDirectory());
-        trainFiles = toXMIFiles(options, trainFiles);
-        devFiles = getDevTextFiles(options.getBatchesDirectory());
-        devFiles = toXMIFiles(options, devFiles);
-        params.stats = evaluation.trainAndTest(trainFiles, devFiles);
-        break;
-      case TEST:
-        // train on the training set + dev set and evaluate on the test set
-        List<File> allTrainFiles = new ArrayList<File>();
-        allTrainFiles.addAll(getTrainTextFiles(options.getBatchesDirectory()));
-        allTrainFiles.addAll(getDevTextFiles(options.getBatchesDirectory()));
-        allTrainFiles = toXMIFiles(options, allTrainFiles);
-        testFiles = getTestTextFiles(options.getBatchesDirectory());
-        testFiles = toXMIFiles(options, testFiles);
-        params.stats = evaluation.trainAndTest(allTrainFiles, testFiles);
-        break;
-      default:
-        throw new IllegalArgumentException("Invalid EvaluateOn: " + options.getEvaluteOn());
-      }
-      scoredParams.put(params, params.stats.f1());
-    }
-
-    // print parameters sorted by F1
-    List<ParameterSettings> list = new ArrayList<ParameterSettings>(scoredParams.keySet());
-    Function<ParameterSettings, Double> getCount = Functions.forMap(scoredParams);
-    Collections.sort(list, Ordering.natural().onResultOf(getCount));
-
-    // print performance of each set of parameters
-    if (list.size() > 1) {
-      System.err.println("Summary");
-      for (ParameterSettings params : list) {
-        System.err.printf(
-            "F1=%.3f P=%.3f R=%.3f %s\n",
-            params.stats.f1(),
-            params.stats.precision(),
-            params.stats.recall(),
-            params);
-      }
-      System.err.println();
-    }
-
-    // print overall best model
-    if (!list.isEmpty()) {
-      ParameterSettings lastParams = list.get(list.size() - 1);
-      System.err.println("Best model:");
-      System.err.print(lastParams.stats);
-      System.err.println(lastParams);
-      System.err.println(lastParams.stats.confusions());
-      System.err.println();
-    }
-  }
-
-  public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {
-
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      String documentID = new File(ViewUriUtil.getURI(jCas)).getPath();
-      DocumentID documentIDAnnotation = new DocumentID(jCas);
-      documentIDAnnotation.setDocumentID(documentID);
-      documentIDAnnotation.addToIndexes();
-    }
-  }
-
-  public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
-    @Override
-    public void process(JCas jCas) throws AnalysisEngineProcessException {
-      try {
-        JCas goldView = jCas.getView(GOLD_VIEW_NAME);
-        goldView.setDocumentText(jCas.getDocumentText());
-      } catch (CASException e) {
-        throw new AnalysisEngineProcessException(e);
+   private static File toXMIFile( Options options, File textFile ) {
+      return new File( options.getXMIDirectory(), textFile.getName() + ".xmi" );
+   }
+
+   public static interface Options {
+      @Option(
+            longName = "batches-dir",
+            description = "directory containing ssN_batchNN directories, each of which should contain "
+                          + "a Knowtator directory and a Knowtator_XML directory")
+      public File getBatchesDirectory();
+
+      @Option(
+            longName = "xmi-dir",
+            defaultValue = "target/xmi",
+            description = "directory to store and load XMI serialization of annotations")
+      public File getXMIDirectory();
+
+      @Option(
+            longName = "generate-xmi",
+            description = "read in the gold annotations and serialize them as XMI")
+      public boolean getGenerateXMI();
+   }
+
+   public static final String GOLD_VIEW_NAME = "GoldView";
+
+   public static void generateXMI( Options options ) throws Exception {
+      // if necessary, write the XMIs first
+      if ( options.getGenerateXMI() ) {
+         if ( !options.getXMIDirectory().exists() ) {
+            options.getXMIDirectory().mkdirs();
+         }
+
+         // create a collection reader that loads URIs for all Knowtator text files
+         List<File> files = Lists.newArrayList();
+         files.addAll( getTrainTextFiles( options.getBatchesDirectory() ) );
+         files.addAll( getDevTextFiles( options.getBatchesDirectory() ) );
+         files.addAll( getTestTextFiles( options.getBatchesDirectory() ) );
+         CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles( files );
+
+         // load the text from the URI, run the preprocessor, then run the
+         // Knowtator XML reader
+         AggregateBuilder builder = new AggregateBuilder();
+         builder.add( UriToDocumentTextAnnotator.getDescription() );
+         File preprocessDescFile = new File( "desc/analysis_engine/RelationExtractorPreprocessor.xml" );
+         XMLParser parser = UIMAFramework.getXMLParser();
+         XMLInputSource source = new XMLInputSource( preprocessDescFile );
+         builder.add( parser.parseAnalysisEngineDescription( source ) );
+         builder.add( AnalysisEngineFactory.createEngineDescription(
+               ViewCreatorAnnotator.class,
+               ViewCreatorAnnotator.PARAM_VIEW_NAME,
+               GOLD_VIEW_NAME ) );
+         builder.add( AnalysisEngineFactory.createEngineDescription( CopyDocumentTextToGoldView.class ) );
+         builder.add(
+               AnalysisEngineFactory.createEngineDescription( DocumentIDAnnotator.class ),
+               CAS.NAME_DEFAULT_SOFA,
+               GOLD_VIEW_NAME );
+         builder.add(
+               AnalysisEngineFactory.createEngineDescription( SHARPKnowtatorXMLReader.class,
+                     SHARPKnowtatorXMLReader.PARAM_SET_DEFAULTS,
+                     true ),
+               CAS.NAME_DEFAULT_SOFA,
+               GOLD_VIEW_NAME );
+
+         // write out an XMI for each file
+         for ( Iterator<JCas> casIter = new JCasIterator( reader, builder.createAggregate() ); casIter.hasNext(); ) {
+            JCas jCas = casIter.next();
+            JCas goldView = jCas.getView( GOLD_VIEW_NAME );
+            String documentID = DocumentIDAnnotationUtil.getDocumentID( goldView );
+            if ( documentID == null || documentID.equals( DocumentIDAnnotationUtil.NO_DOCUMENT_ID ) ) {
+               throw new IllegalArgumentException( "No documentID for CAS:\n" + jCas );
+            }
+            File outFile = toXMIFile( options, new File( documentID ) );
+            FileOutputStream stream = new FileOutputStream( outFile );
+            ContentHandler handler = new XMLSerializer( stream ).getContentHandler();
+            new XmiCasSerializer( jCas.getTypeSystem() ).serialize( jCas.getCas(), handler );
+            stream.close();
+         }
+      }
+   }
+
+   public enum EvaluateOn {
+      TRAIN, DEV, TEST
+   }
+
+   public static interface EvaluationOptions extends Options {
+      @Option(
+            longName = "evaluate-on",
+            defaultValue = "DEV",
+            description = "perform evaluation using the training (TRAIN), development (DEV) or test "
+                          + "(TEST) data.")
+      public EvaluateOn getEvaluteOn();
+
+      @Option(
+            longName = "grid-search",
+            description = "run a grid search to select the best parameters")
+      public boolean getGridSearch();
+   }
+
+   public static abstract class Evaluation_ImplBase
+         extends org.cleartk.eval.Evaluation_ImplBase<File, AnnotationStatistics<String>> {
+
+      public Evaluation_ImplBase( File baseDirectory ) {
+         super( baseDirectory );
+      }
+
+      @Override
+      public CollectionReader getCollectionReader( List<File> items ) throws Exception {
+         return CollectionReaderFactory.createReader(
+               XMIReader.class,
+               TypeSystemDescriptionFactory.createTypeSystemDescription(),
+               XMIReader.PARAM_FILES,
+               items );
+      }
+
+   }
+
+   public static void validate( EvaluationOptions options ) throws Exception {
+      // error on invalid option combinations
+      if ( options.getEvaluteOn().equals( EvaluateOn.TEST ) && options.getGridSearch() ) {
+         throw new IllegalArgumentException( "grid search can only be run on the train or dev sets" );
+      }
+   }
+
+   public static <T extends Evaluation_ImplBase> void evaluate(
+         EvaluationOptions options,
+         ParameterSettings bestSettings,
+         List<ParameterSettings> gridOfSettings,
+         Function<ParameterSettings, T> getEvaluation ) throws Exception {
+      // define the set of possible training parameters
+      List<ParameterSettings> possibleParams;
+      if ( options.getGridSearch() ) {
+         possibleParams = gridOfSettings;
+      } else {
+         possibleParams = Lists.newArrayList( bestSettings );
+      }
+
+      // run an evaluation for each set of parameters
+      Map<ParameterSettings, Double> scoredParams = new HashMap<ParameterSettings, Double>();
+      for ( ParameterSettings params : possibleParams ) {
+         Evaluation_ImplBase evaluation = getEvaluation.apply( params );
+
+         List<File> trainFiles, devFiles, testFiles;
+         switch ( options.getEvaluteOn() ) {
+            case TRAIN:
+               // run n-fold cross-validation on the training set
+               trainFiles = getTrainTextFiles( options.getBatchesDirectory() );
+               trainFiles = toXMIFiles( options, trainFiles );
+               List<AnnotationStatistics<String>> foldStats = evaluation.crossValidation( trainFiles, 2 );
+               params.stats = AnnotationStatistics.addAll( foldStats );
+               break;
+            case DEV:
+               // train on the training set and evaluate on the dev set
+               trainFiles = getTrainTextFiles( options.getBatchesDirectory() );
+               trainFiles = toXMIFiles( options, trainFiles );
+               devFiles = getDevTextFiles( options.getBatchesDirectory() );
+               devFiles = toXMIFiles( options, devFiles );
+               params.stats = evaluation.trainAndTest( trainFiles, devFiles );
+               break;
+            case TEST:
+               // train on the training set + dev set and evaluate on the test set
+               List<File> allTrainFiles = new ArrayList<File>();
+               allTrainFiles.addAll( getTrainTextFiles( options.getBatchesDirectory() ) );
+               allTrainFiles.addAll( getDevTextFiles( options.getBatchesDirectory() ) );
+               allTrainFiles = toXMIFiles( options, allTrainFiles );
+               testFiles = getTestTextFiles( options.getBatchesDirectory() );
+               testFiles = toXMIFiles( options, testFiles );
+               params.stats = evaluation.trainAndTest( allTrainFiles, testFiles );
+               break;
+            default:
+               throw new IllegalArgumentException( "Invalid EvaluateOn: " + options.getEvaluteOn() );
+         }
+         scoredParams.put( params, params.stats.f1() );
+      }
+
+      // print parameters sorted by F1
+      List<ParameterSettings> list = new ArrayList<ParameterSettings>( scoredParams.keySet() );
+      Function<ParameterSettings, Double> getCount = Functions.forMap( scoredParams );
+      Collections.sort( list, Ordering.natural().onResultOf( getCount ) );
+
+      // print performance of each set of parameters
+      if ( list.size() > 1 ) {
+         System.err.println( "Summary" );
+         for ( ParameterSettings params : list ) {
+            System.err.printf(
+                  "F1=%.3f P=%.3f R=%.3f %s\n",
+                  params.stats.f1(),
+                  params.stats.precision(),
+                  params.stats.recall(),
+                  params );
+         }
+         System.err.println();
+      }
+
+      // print overall best model
+      if ( !list.isEmpty() ) {
+         ParameterSettings lastParams = list.get( list.size() - 1 );
+         System.err.println( "Best model:" );
+         System.err.print( lastParams.stats );
+         System.err.println( lastParams );
+         System.err.println( lastParams.stats.confusions() );
+         System.err.println();
+      }
+   }
+
+   public static class DocumentIDAnnotator extends JCasAnnotator_ImplBase {
+
+      @Override
+      public void process( JCas jCas ) throws AnalysisEngineProcessException {
+         String documentID = new File( ViewUriUtil.getURI( jCas ) ).getPath();
+         DocumentID documentIDAnnotation = new DocumentID( jCas );
+         documentIDAnnotation.setDocumentID( documentID );
+         documentIDAnnotation.addToIndexes();
+      }
+   }
+
+   public static class CopyDocumentTextToGoldView extends JCasAnnotator_ImplBase {
+      @Override
+      public void process( JCas jCas ) throws AnalysisEngineProcessException {
+         try {
+            JCas goldView = jCas.getView( GOLD_VIEW_NAME );
+            goldView.setDocumentText( jCas.getDocumentText() );
+         } catch ( CASException e ) {
+            throw new AnalysisEngineProcessException( e );
+         }
       }
-    }
-  }
+   }
 }



Mime
View raw message