ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1482808 - in /ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference: ae/ ae/features/ eval/
Date Wed, 15 May 2013 13:14:22 GMT
Author: tmill
Date: Wed May 15 13:14:21 2013
New Revision: 1482808

URL: http://svn.apache.org/r1482808
Log:
Added descriptor methods to coref annotator. Added some eval boilerplate to eval code; it is not correct yet, but it compiles, so checking in to sync up.

Modified:
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
    ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java?rev=1482808&r1=1482807&r2=1482808&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java	(original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/NamedEntityCoreferenceResolver.java	Wed May 15 13:14:21 2013
@@ -1,5 +1,6 @@
 package org.apache.ctakes.coreference.ae;
 
+import java.io.File;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
@@ -12,15 +13,46 @@ import org.apache.ctakes.coreference.uti
 import org.apache.ctakes.relationextractor.ae.RelationExtractorAnnotator;
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;
 import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.EventMention;
 import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.cleartk.classifier.CleartkAnnotator;
+import org.cleartk.classifier.DataWriter;
+import org.cleartk.classifier.jar.DefaultDataWriterFactory;
+import org.cleartk.classifier.jar.GenericJarClassifierFactory;
+import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.util.JCasUtil;
 
 public class NamedEntityCoreferenceResolver extends RelationExtractorAnnotator {
 	
+  public static AnalysisEngineDescription createDataWriterDescription(
+      Class<? extends DataWriter<String>> dataWriterClass,
+      File outputDirectory) throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        NamedEntityCoreferenceResolver.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        true,
+        DefaultDataWriterFactory.PARAM_DATA_WRITER_CLASS_NAME,
+        dataWriterClass,
+        DefaultDataWriterFactory.PARAM_OUTPUT_DIRECTORY,
+        outputDirectory);
+  }
+  
+  public static AnalysisEngineDescription createAnnotatorDescription(File modelDirectory)
+      throws ResourceInitializationException {
+    return AnalysisEngineFactory.createPrimitiveDescription(
+        NamedEntityCoreferenceResolver.class,
+        CleartkAnnotator.PARAM_IS_TRAINING,
+        false,
+        GenericJarClassifierFactory.PARAM_CLASSIFIER_JAR_PATH,
+        new File(modelDirectory, "model.jar"));
+  }
+
 	@Override
 	protected List<RelationFeaturesExtractor> getFeatureExtractors() {
 		List<RelationFeaturesExtractor> extractors = new ArrayList<RelationFeaturesExtractor>();
@@ -66,11 +98,15 @@ public class NamedEntityCoreferenceResol
 	}
 
 	private List<IdentifiedAnnotation> getDocumentMarkables(JCas jcas, Annotation coveringAnnotation) {
-		Collection<EntityMention> mentions = (JCasUtil.select(jcas, EntityMention.class));
+	  List<IdentifiedAnnotation> mentions = new ArrayList<IdentifiedAnnotation>();
+		Collection<EntityMention> entityMentions = (JCasUtil.select(jcas, EntityMention.class));
+		Collection<EventMention> eventMentions = JCasUtil.select(jcas, EventMention.class);
+		mentions.addAll(entityMentions);
+		mentions.addAll(eventMentions);
 //		expandToNP(mentions);
 //		mergeNP(mentions);
 //		elevateAdjectives(mentions);
-		return new ArrayList<IdentifiedAnnotation>(mentions);
+		return mentions;
 	}
 
 	@Override

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java?rev=1482808&r1=1482807&r2=1482808&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java	(original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/StringMatchingFeatureExtractor.java	Wed May 15 13:14:21 2013
@@ -1,7 +1,6 @@
 package org.apache.ctakes.coreference.ae.features;
 
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java?rev=1482808&r1=1482807&r2=1482808&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java	(original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/ae/features/TokenFeatureExtractor.java	Wed May 15 13:14:21 2013
@@ -1,7 +1,6 @@
 package org.apache.ctakes.coreference.ae.features;
 
 import java.util.ArrayList;
-import java.util.Collection;
 import java.util.List;
 
 import org.apache.ctakes.relationextractor.ae.features.RelationFeaturesExtractor;

Modified: ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java?rev=1482808&r1=1482807&r2=1482808&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java	(original)
+++ ctakes/sandbox/ctakes-coref-cleartk/src/main/java/org/apache/ctakes/coreference/eval/EvaluationOfCoreferencePairs.java	Wed May 15 13:14:21 2013
@@ -1,23 +1,75 @@
 package org.apache.ctakes.coreference.eval;
 
 import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
 import java.util.List;
 
+import org.apache.ctakes.chunker.ae.Chunker;
+import org.apache.ctakes.chunker.ae.DefaultChunkCreator;
+import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster;
+import org.apache.ctakes.constituency.parser.ae.ConstituencyParser;
+import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator;
+import org.apache.ctakes.core.ae.OverlapAnnotator;
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.resource.FileResourceImpl;
+import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl;
+import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl;
+import org.apache.ctakes.core.resource.SuffixMaxentModelResourceImpl;
+import org.apache.ctakes.coreference.ae.NamedEntityCoreferenceResolver;
+import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
+import org.apache.ctakes.postagger.POSTagger;
 import org.apache.ctakes.relationextractor.eval.XMIReader;
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.Chunk;
+import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation;
+import org.apache.ctakes.typesystem.type.textspan.Segment;
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.UIMAException;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XmiCasSerializer;
 import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.XMLSerializer;
+import org.cleartk.classifier.jar.JarClassifierBuilder;
 import org.cleartk.eval.AnnotationStatistics;
 import org.cleartk.eval.Evaluation_ImplBase;
+import org.cleartk.util.ViewURIUtil;
+import org.cleartk.util.ae.UriToDocumentTextAnnotator;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.component.ViewCreatorAnnotator;
+import org.uimafit.component.ViewTextCopierAnnotator;
+import org.uimafit.descriptor.ConfigurationParameter;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.AnalysisEngineFactory;
 import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.ExternalResourceFactory;
+import org.uimafit.factory.TypePrioritiesFactory;
 import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.testing.util.HideOutput;
+import org.uimafit.util.JCasUtil;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
 
 public class EvaluationOfCoreferencePairs extends
 		Evaluation_ImplBase<File, AnnotationStatistics<String>> {
 
 	public static final String GOLD_VIEW_NAME = "GOLD_VIEW";
-
+  private boolean xmiExists = false;
+  private File xmiDirectory = null;
+  
 	public EvaluationOfCoreferencePairs(File baseDirectory) {
 		super(baseDirectory);
-		// TODO Auto-generated constructor stub
 	}
 
 	@Override
@@ -34,14 +86,25 @@ public class EvaluationOfCoreferencePair
 				XMIReader.class,
 				TypeSystemDescriptionFactory.createTypeSystemDescription(),
 				XMIReader.PARAM_FILES,
-				paths);
+				items);
 	}
 
 	@Override
 	protected void train(CollectionReader collectionReader, File directory)
 			throws Exception {
-		// TODO Auto-generated method stub
-		
+	  AggregateBuilder aggregateBuilder = this.getPreprocessorAggregateBuilder();
+ 
+	  aggregateBuilder.add(NamedEntityCoreferenceResolver.createAnnotatorDescription(directory));
+    SimplePipeline.runPipeline(collectionReader, aggregateBuilder.createAggregate());
+
+    HideOutput hider = new HideOutput();
+    // libsvm:
+    JarClassifierBuilder.trainAndPackage(directory,  "-t", "0", "-c", "10");
+    // tksvmlight with no tk features:
+//    JarClassifierBuilder.trainAndPackage(directory, "-t", "0", "-c", "10", "-N", "0");
+//    JarClassifierBuilder.trainAndPackage(directory,  "-t", "5", "-S", "0", "-N", "3", "-C", "+", "-T", "1.0");
+    hider.restoreOutput();
+    hider.close();
 	}
 
 	@Override
@@ -51,6 +114,307 @@ public class EvaluationOfCoreferencePair
 		return null;
 	}
 
+  protected AggregateBuilder getPreprocessorAggregateBuilder() throws Exception {
+    return this.xmiExists
+        ? this.getXMIReadingPreprocessorAggregateBuilder()
+        : this.getXMIWritingPreprocessorAggregateBuilder();
+  }
+
+  protected AggregateBuilder getXMIReadingPreprocessorAggregateBuilder() throws UIMAException {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+//    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+//        XMIReader.class,
+//        XMIReader.PARAM_XMI_DIRECTORY,
+//        this.xmiDirectory));
+    return aggregateBuilder;
+  }
+
+  protected AggregateBuilder getXMIWritingPreprocessorAggregateBuilder()
+      throws Exception {
+    AggregateBuilder aggregateBuilder = new AggregateBuilder();
+    aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription());
+
+    // read manual annotations into gold view
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ViewCreatorAnnotator.class,
+        ViewCreatorAnnotator.PARAM_VIEW_NAME,
+        GOLD_VIEW_NAME));
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ViewTextCopierAnnotator.class,
+        ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME,
+        CAS.NAME_DEFAULT_SOFA,
+        ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME,
+        GOLD_VIEW_NAME));
+//    aggregateBuilder.add(
+//        THYMEKnowtatorXMLReader.getDescription(this.knowtatorXMLDirectory),
+//        CAS.NAME_DEFAULT_SOFA,
+//        GOLD_VIEW_NAME);
+
+    // identify segments
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class));
+    // identify sentences
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        SentenceDetector.class,
+        "MaxentModel",
+        ExternalResourceFactory.createExternalResourceDescription(
+            SuffixMaxentModelResourceImpl.class,
+            FileLocator.locateFile("org/apache/ctakes/core/sentdetect/sdmed.mod").toURI().toURL())));
+    // identify tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class));
+    // merge some tokens
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class));
+
+    // identify part-of-speech tags
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        POSTagger.class,
+        TypeSystemDescriptionFactory.createTypeSystemDescription(),
+        TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class),
+        POSTagger.POS_MODEL_FILE_PARAM,
+        "org/apache/ctakes/postagger/models/mayo-pos.zip",
+        POSTagger.TAG_DICTIONARY_PARAM,
+        "org/apache/ctakes/postagger/models/tag.dictionary.txt",
+        POSTagger.CASE_SENSITIVE_PARAM,
+        true));
+
+    // identify chunks
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        Chunker.class,
+        Chunker.CHUNKER_MODEL_FILE_PARAM,
+        FileLocator.locateFile("org/apache/ctakes/chunker/models/chunk-model.claims-1.5.zip"),
+        Chunker.CHUNKER_CREATOR_CLASS_PARAM,
+        DefaultChunkCreator.class));
+
+    // identify UMLS named entities
+
+    // adjust NP in NP NP to span both
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        1));
+    // adjust NP in NP PP NP to span all three
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        ChunkAdjuster.class,
+        ChunkAdjuster.PARAM_CHUNK_PATTERN,
+        new String[] { "NP", "PP", "NP" },
+        ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN,
+        2));
+    // add lookup windows for each NP
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class));
+    // maximize lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        OverlapAnnotator.class,
+        "A_ObjectClass",
+        LookupWindowAnnotation.class,
+        "B_ObjectClass",
+        LookupWindowAnnotation.class,
+        "OverlapType",
+        "A_ENV_B",
+        "ActionType",
+        "DELETE",
+        "DeleteAction",
+        new String[] { "selector=B" }));
+    // add UMLS on top of lookup windows
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        UmlsDictionaryLookupAnnotator.class,
+        "ctakes.umlsaddr",
+        "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser",
+        "ctakes.umlsvendor",
+        "NLM-6515182895",
+        "LookupDescriptor",
+        ExternalResourceFactory.createExternalResourceDescription(
+            FileResourceImpl.class,
+            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/LookupDesc_Db.xml").getAbsoluteFile()),
+        "DbConnection",
+        ExternalResourceFactory.createExternalResourceDescription(
+            JdbcConnectionResourceImpl.class,
+            "",
+            JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS,
+            "org.hsqldb.jdbcDriver",
+            JdbcConnectionResourceImpl.PARAM_URL,
+            // Should be the following but it's WAY too slow
+            // "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+            "jdbc:hsqldb:file:target/unpacked/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"),
+        "RxnormIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/rxnorm_index").getAbsoluteFile()),
+        "OrangeBookIndexReader",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LuceneIndexReaderResourceImpl.class,
+            "",
+            "UseMemoryIndex",
+            true,
+            "IndexDirectory",
+            new File("target/unpacked/org/apache/ctakes/dictionary/lookup/OrangeBook").getAbsoluteFile())));
+
+    // add lvg annotator
+    String[] XeroxTreebankMap = {
+        "adj|JJ",
+        "adv|RB",
+        "aux|AUX",
+        "compl|CS",
+        "conj|CC",
+        "det|DET",
+        "modal|MD",
+        "noun|NN",
+        "prep|IN",
+        "pron|PRP",
+        "verb|VB" };
+    String[] ExclusionSet = {
+        "and",
+        "And",
+        "by",
+        "By",
+        "for",
+        "For",
+        "in",
+        "In",
+        "of",
+        "Of",
+        "on",
+        "On",
+        "the",
+        "The",
+        "to",
+        "To",
+        "with",
+        "With" };
+    AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription(
+        LvgAnnotator.class,
+        "UseSegments",
+        false,
+        "SegmentsToSkip",
+        new String[0],
+        "UseCmdCache",
+        false,
+        "CmdCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_norm.voc",
+        "CmdCacheFrequencyCutoff",
+        20,
+        "ExclusionSet",
+        ExclusionSet,
+        "XeroxTreebankMap",
+        XeroxTreebankMap,
+        "LemmaCacheFileLocation",
+        "/org/apache/ctakes/lvg/2005_lemma.voc",
+        "UseLemmaCache",
+        false,
+        "LemmaCacheFrequencyCutoff",
+        20,
+        "PostLemmas",
+        true,
+        "LvgCmdApi",
+        ExternalResourceFactory.createExternalResourceDescription(
+            LvgCmdApiResourceImpl.class,
+            new File(LvgCmdApiResourceImpl.class.getResource(
+                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI())));
+    aggregateBuilder.add(lvgAnnotator);
+
+    // add dependency parser
+//    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class));
+
+    // add semantic role labeler
+//    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class));
+
+    // add constituency parser
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class));
+
+    // write out the CAS after all the above annotations
+    aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(
+        XMIWriter.class,
+        XMIWriter.PARAM_XMI_DIRECTORY,
+        this.xmiDirectory));
+
+    return aggregateBuilder;
+  }
+
+  public static class XMIWriter extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    @Override
+    public void initialize(UimaContext context) throws ResourceInitializationException {
+      super.initialize(context);
+      if (!this.xmiDirectory.exists()) {
+        this.xmiDirectory.mkdirs();
+      }
+    }
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileOutputStream outputStream = new FileOutputStream(xmiFile);
+        try {
+          XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem());
+          ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler();
+          serializer.serialize(jCas.getCas(), handler);
+        } finally {
+          outputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+/*
+  public static class XMIReader extends JCasAnnotator_ImplBase {
+
+    public static final String PARAM_XMI_DIRECTORY = "XMIDirectory";
+
+    @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true)
+    private File xmiDirectory;
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      File xmiFile = getXMIFile(this.xmiDirectory, jCas);
+      try {
+        FileInputStream inputStream = new FileInputStream(xmiFile);
+        try {
+          XmiCasDeserializer.deserialize(inputStream, jCas.getCas());
+        } finally {
+          inputStream.close();
+        }
+      } catch (SAXException e) {
+        throw new AnalysisEngineProcessException(e);
+      } catch (IOException e) {
+        throw new AnalysisEngineProcessException(e);
+      }
+    }
+  }
+  */
+  static File getXMIFile(File xmiDirectory, File textFile) {
+    return new File(xmiDirectory, textFile.getName() + ".xmi");
+  }
+
+  static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException {
+    return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath()));
+  }
+
+  public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase {
+
+    @Override
+    public void process(JCas jCas) throws AnalysisEngineProcessException {
+      for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) {
+        if (chunk.getChunkType().equals("NP")) {
+          new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes();
+        }
+      }
+    }
+  }
+
 	/**
 	 * @param args
 	 */



Mime
View raw message