Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 350A810615 for ; Wed, 22 Jan 2014 18:15:59 +0000 (UTC) Received: (qmail 68935 invoked by uid 500); 22 Jan 2014 18:15:58 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 68894 invoked by uid 500); 22 Jan 2014 18:15:58 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 68886 invoked by uid 99); 22 Jan 2014 18:15:58 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 22 Jan 2014 18:15:58 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 22 Jan 2014 18:15:55 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 5B0AB23888E2; Wed, 22 Jan 2014 18:15:35 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1560454 - in /ctakes/sandbox/ctakes-wsd: ./ .settings/ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/wsd/ src/main/java/org/apache/ctakes/wsd/pip... Date: Wed, 22 Jan 2014 18:15:35 -0000 To: commits@ctakes.apache.org From: dligach@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140122181535.5B0AB23888E2@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: dligach Date: Wed Jan 22 18:15:34 2014 New Revision: 1560454 URL: http://svn.apache.org/r1560454 Log: (empty) Added: ctakes/sandbox/ctakes-wsd/.classpath (with props) ctakes/sandbox/ctakes-wsd/.project (with props) ctakes/sandbox/ctakes-wsd/.settings/ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs (with props) ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs (with props) ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs (with props) ctakes/sandbox/ctakes-wsd/pom.xml (with props) ctakes/sandbox/ctakes-wsd/src/ ctakes/sandbox/ctakes-wsd/src/main/ ctakes/sandbox/ctakes-wsd/src/main/java/ ctakes/sandbox/ctakes-wsd/src/main/java/org/ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java (with props) ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java (with props) ctakes/sandbox/ctakes-wsd/src/main/resources/ ctakes/sandbox/ctakes-wsd/src/test/ ctakes/sandbox/ctakes-wsd/src/test/java/ ctakes/sandbox/ctakes-wsd/src/test/resources/ Added: ctakes/sandbox/ctakes-wsd/.classpath URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.classpath?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/.classpath (added) +++ ctakes/sandbox/ctakes-wsd/.classpath Wed Jan 22 18:15:34 2014 @@ -0,0 +1,36 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Propchange: ctakes/sandbox/ctakes-wsd/.classpath ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/.project URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.project?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/.project (added) +++ ctakes/sandbox/ctakes-wsd/.project Wed Jan 22 18:15:34 2014 @@ -0,0 +1,23 @@ + + + ctakes-wsd + + + + + + org.eclipse.jdt.core.javabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.jdt.core.javanature + org.eclipse.m2e.core.maven2Nature + + Propchange: ctakes/sandbox/ctakes-wsd/.project ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs (added) +++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs Wed Jan 22 18:15:34 2014 @@ -0,0 +1,6 @@ +eclipse.preferences.version=1 +encoding//src/main/java=UTF-8 +encoding//src/main/resources=UTF-8 +encoding//src/test/java=UTF-8 +encoding//src/test/resources=UTF-8 +encoding/=UTF-8 Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.core.resources.prefs ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs (added) +++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs Wed Jan 22 18:15:34 2014 @@ -0,0 +1,5 @@ +eclipse.preferences.version=1 +org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.6 +org.eclipse.jdt.core.compiler.compliance=1.6 +org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning +org.eclipse.jdt.core.compiler.source=1.6 Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.jdt.core.prefs ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs (added) +++ ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs Wed Jan 22 18:15:34 2014 @@ -0,0 +1,4 @@ +activeProfiles= +eclipse.preferences.version=1 +resolveWorkspaceProjects=true +version=1 Propchange: ctakes/sandbox/ctakes-wsd/.settings/org.eclipse.m2e.core.prefs ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/pom.xml URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/pom.xml?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/pom.xml (added) +++ ctakes/sandbox/ctakes-wsd/pom.xml Wed Jan 22 18:15:34 2014 @@ -0,0 +1,207 @@ + + + + 4.0.0 + ctakes-temporal + jar + Apache cTAKES Temporal Information Extraction + + org.apache.ctakes + ctakes + 3.1.2-SNAPSHOT + + + + org.apache.ctakes + ctakes-temporal-res + + + org.jdom + jdom2 + + + com.lexicalscope.jewelcli + jewelcli + + + org.apache.ctakes + ctakes-type-system + + + org.apache.ctakes + ctakes-core + + + org.apache.ctakes + ctakes-context-tokenizer + + + org.apache.ctakes + ctakes-pos-tagger + + + org.apache.ctakes + ctakes-chunker + + + org.apache.ctakes + ctakes-dictionary-lookup + + + org.apache.ctakes + ctakes-lvg + + + org.apache.ctakes + ctakes-dependency-parser + + + org.apache.ctakes + ctakes-relation-extractor + + + org.apache.ctakes + ctakes-constituency-parser + + + net.sourceforge.ctakesresources + ctakes-resources-umls2011ab + 3.1.1 + + + org.jdom + jdom2 + + + com.lexicalscope.jewelcli + jewelcli + + + org.cleartk + cleartk-util + + + org.cleartk + cleartk-ml + + + org.cleartk + cleartk-eval + + + org.cleartk + cleartk-timeml + + + org.cleartk + cleartk-ml-svmlight + + + org.cleartk + cleartk-syntax + + + org.cleartk + cleartk-ml-libsvm + + + org.cleartk + cleartk-ml-tksvmlight + + + org.cleartk + cleartk-type-system + + + org.cleartk + cleartk-ml-crfsuite + + + info.bethard + timenorm + 0.9.0 + + + com.googlecode.java-diff-utils + diffutils + 1.3.0 + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack + initialize + + unpack + + + + + net.sourceforge.ctakesresources + ctakes-resources-umls2011ab + 3.1.1 + + + ${project.build.directory}/unpacked + + + + + + + + + org.eclipse.m2e + lifecycle-mapping + 1.0.0 + + + + + + org.apache.maven.plugins + maven-dependency-plugin + [2.0,) + + unpack + + + + + + + + + + + + + + Propchange: ctakes/sandbox/ctakes-wsd/pom.xml ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java (added) +++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Wed Jan 22 18:15:34 2014 @@ -0,0 +1,78 @@ +package org.apache.ctakes.wsd.pipelines; + +import java.io.File; +import java.util.Arrays; +import java.util.List; + +import org.apache.ctakes.core.cr.XMIReader; +import org.apache.ctakes.typesystem.type.textsem.EntityMention; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.jcas.JCas; +import org.cleartk.util.Options_ImplBase; +import org.kohsuke.args4j.Option; +import org.uimafit.component.JCasAnnotator_ImplBase; +import org.uimafit.factory.AnalysisEngineFactory; +import org.uimafit.factory.CollectionReaderFactory; +import org.uimafit.pipeline.SimplePipeline; +import org.uimafit.util.JCasUtil; + +import com.google.common.collect.Lists; + +/** + * + * Read XMI files and apply a consumer that extracts relation features for a downstream component. + * + * @author dmitriy dligach + * + */ +public class Analyze { + + public static class Options extends Options_ImplBase { + + @Option( + name = "--input-dir", + usage = "specify the path to the directory containing the XMI files", + required = true) + public File inputDirectory; + + @Option( + name = "--output-dir", + usage = "specify the path to the directory where the training data will be placed", + required = false) + public File outputDirectory; + } + + public static void main(String[] args) throws Exception { + + Options options = new Options(); + options.parseOptions(args); + + List trainFiles = Arrays.asList(options.inputDirectory.listFiles()); + String[] paths = new String[trainFiles.size()]; + for (int i = 0; i < paths.length; ++i) { + paths[i] = trainFiles.get(i).getPath(); + } + + CollectionReader xmiCollectionReader = CollectionReaderFactory.createCollectionReader( + XMIReader.class, + XMIReader.PARAM_FILES, + paths); + + AnalysisEngine featureExtractorAe = AnalysisEngineFactory.createPrimitive(DoSomething.class); + + SimplePipeline.runPipeline(xmiCollectionReader, featureExtractorAe); + } + + public static class DoSomething extends JCasAnnotator_ImplBase{ + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) { + System.out.println(mention.getCoveredText()); + } + } + + } +} Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java?rev=1560454&view=auto ============================================================================== --- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java (added) +++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java Wed Jan 22 18:15:34 2014 @@ -0,0 +1,370 @@ +package org.apache.ctakes.wsd.pipelines; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.apache.ctakes.chunker.ae.Chunker; +import org.apache.ctakes.chunker.ae.DefaultChunkCreator; +import org.apache.ctakes.chunker.ae.adjuster.ChunkAdjuster; +import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator; +import org.apache.ctakes.core.ae.OverlapAnnotator; +import org.apache.ctakes.core.ae.SentenceDetector; +import org.apache.ctakes.core.ae.SimpleSegmentAnnotator; +import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB; +import org.apache.ctakes.core.resource.FileLocator; +import org.apache.ctakes.core.resource.FileResourceImpl; +import org.apache.ctakes.core.resource.JdbcConnectionResourceImpl; +import org.apache.ctakes.core.resource.LuceneIndexReaderResourceImpl; +import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE; +import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE; +import org.apache.ctakes.dictionary.lookup.ae.UmlsDictionaryLookupAnnotator; +import org.apache.ctakes.lvg.ae.LvgAnnotator; +import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl; +import org.apache.ctakes.postagger.POSTagger; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.ctakes.typesystem.type.syntax.Chunk; +import org.apache.ctakes.typesystem.type.textspan.LookupWindowAnnotation; +import org.apache.ctakes.typesystem.type.textspan.Segment; +import org.apache.ctakes.typesystem.type.textspan.Sentence; +import org.apache.uima.UimaContext; +import org.apache.uima.analysis_engine.AnalysisEngine; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.impl.XmiCasSerializer; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.jcas.JCas; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.XMLSerializer; +import org.cleartk.util.ViewURIUtil; +import org.cleartk.util.ae.UriToDocumentTextAnnotator; +import org.cleartk.util.cr.UriCollectionReader; +import org.uimafit.component.JCasAnnotator_ImplBase; +import org.uimafit.component.ViewCreatorAnnotator; +import org.uimafit.component.ViewTextCopierAnnotator; +import org.uimafit.descriptor.ConfigurationParameter; +import org.uimafit.factory.AggregateBuilder; +import org.uimafit.factory.AnalysisEngineFactory; +import org.uimafit.factory.ExternalResourceFactory; +import org.uimafit.factory.TypePrioritiesFactory; +import org.uimafit.factory.TypeSystemDescriptionFactory; +import org.uimafit.pipeline.SimplePipeline; +import org.uimafit.util.JCasUtil; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; + +public class Preprocess { + + public static final String GOLD_VIEW_NAME = "GoldView"; + + public static File inputDirectory = new File("/Users/dima/Boston/Data/Sharp/Cloud/sharp/text/train/"); + public static String outputDirectory = "/Users/Dima/Temp/"; + + public static void main(String[] args) throws Exception { + + List files = new ArrayList(); + for(File file : inputDirectory.listFiles()) { + files.add(file); + } + + CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files); + AnalysisEngine engine = getXMIWritingPreprocessorAggregateBuilder().createAggregate(); + SimplePipeline.runPipeline(reader, engine); + } + + public static AggregateBuilder getXMIWritingPreprocessorAggregateBuilder() throws Exception { + + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + + aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); + + // read manual annotations into gold view + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ViewCreatorAnnotator.class, + ViewCreatorAnnotator.PARAM_VIEW_NAME, + GOLD_VIEW_NAME)); + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ViewTextCopierAnnotator.class, + ViewTextCopierAnnotator.PARAM_SOURCE_VIEW_NAME, + CAS.NAME_DEFAULT_SOFA, + ViewTextCopierAnnotator.PARAM_DESTINATION_VIEW_NAME, + GOLD_VIEW_NAME)); + +// switch (this.xmlFormat) { +// case Anafora: +// aggregateBuilder.add( +// THYMEAnaforaXMLReader.getDescription(this.xmlDirectory), +// CAS.NAME_DEFAULT_SOFA, +// GOLD_VIEW_NAME); +// break; +// case Knowtator: +// aggregateBuilder.add( +// THYMEKnowtatorXMLReader.getDescription(this.xmlDirectory), +// CAS.NAME_DEFAULT_SOFA, +// GOLD_VIEW_NAME); +// break; +// } + + // identify segments + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(SimpleSegmentAnnotator.class)); + // identify sentences + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + SentenceDetector.class, + SentenceDetector.SD_MODEL_FILE_PARAM, + "org/apache/ctakes/core/sentdetect/sd-med-model.zip")); + // identify tokens + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TokenizerAnnotatorPTB.class)); + // merge some tokens + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ContextDependentTokenizerAnnotator.class)); + + // identify part-of-speech tags + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + POSTagger.class, + TypeSystemDescriptionFactory.createTypeSystemDescription(), + TypePrioritiesFactory.createTypePriorities(Segment.class, Sentence.class, BaseToken.class), + POSTagger.POS_MODEL_FILE_PARAM, + "org/apache/ctakes/postagger/models/mayo-pos.zip")); + + // identify chunks + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + Chunker.class, + Chunker.CHUNKER_MODEL_FILE_PARAM, + FileLocator.locateFile("org/apache/ctakes/chunker/models/chunker-model.zip"), + Chunker.CHUNKER_CREATOR_CLASS_PARAM, + DefaultChunkCreator.class)); + + // identify UMLS named entities + + // adjust NP in NP NP to span both + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 1)); + // adjust NP in NP PP NP to span all three + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + ChunkAdjuster.class, + ChunkAdjuster.PARAM_CHUNK_PATTERN, + new String[] { "NP", "PP", "NP" }, + ChunkAdjuster.PARAM_EXTEND_TO_INCLUDE_TOKEN, + 2)); + // add lookup windows for each NP + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(CopyNPChunksToLookupWindowAnnotations.class)); + // maximize lookup windows + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + OverlapAnnotator.class, + "A_ObjectClass", + LookupWindowAnnotation.class, + "B_ObjectClass", + LookupWindowAnnotation.class, + "OverlapType", + "A_ENV_B", + "ActionType", + "DELETE", + "DeleteAction", + new String[] { "selector=B" })); + // add UMLS on top of lookup windows + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + UmlsDictionaryLookupAnnotator.class, + "ctakes.umlsaddr", + "https://uts-ws.nlm.nih.gov/restful/isValidUMLSUser", + "ctakes.umlsvendor", + "NLM-6515182895", + "LookupDescriptor", + ExternalResourceFactory.createExternalResourceDescription( + FileResourceImpl.class, + FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/LookupDesc_Db.xml")), + "DbConnection", + ExternalResourceFactory.createExternalResourceDescription( + JdbcConnectionResourceImpl.class, + "", + JdbcConnectionResourceImpl.PARAM_DRIVER_CLASS, + "org.hsqldb.jdbcDriver", + JdbcConnectionResourceImpl.PARAM_URL, + // Should be the following but it's WAY too slow + // "jdbc:hsqldb:res:/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"), + "jdbc:hsqldb:file:target/unpacked/org/apache/ctakes/dictionary/lookup/umls2011ab/umls"), + "RxnormIndexReader", + ExternalResourceFactory.createExternalResourceDescription( + LuceneIndexReaderResourceImpl.class, + "", + "UseMemoryIndex", + true, + "IndexDirectory", + new File("target/unpacked/org/apache/ctakes/dictionary/lookup/rxnorm_index").getAbsoluteFile()), + "OrangeBookIndexReader", + ExternalResourceFactory.createExternalResourceDescription( + LuceneIndexReaderResourceImpl.class, + "", + "UseMemoryIndex", + true, + "IndexDirectory", + FileLocator.locateFile("org/apache/ctakes/dictionary/lookup/OrangeBook")))); + + // add lvg annotator + String[] XeroxTreebankMap = { + "adj|JJ", + "adv|RB", + "aux|AUX", + "compl|CS", + "conj|CC", + "det|DET", + "modal|MD", + "noun|NN", + "prep|IN", + "pron|PRP", + "verb|VB" }; + String[] ExclusionSet = { + "and", + "And", + "by", + "By", + "for", + "For", + "in", + "In", + "of", + "Of", + "on", + "On", + "the", + "The", + "to", + "To", + "with", + "With" }; + AnalysisEngineDescription lvgAnnotator = AnalysisEngineFactory.createPrimitiveDescription( + LvgAnnotator.class, + "UseSegments", + false, + "SegmentsToSkip", + new String[0], + "UseCmdCache", + false, + "CmdCacheFileLocation", + "/org/apache/ctakes/lvg/2005_norm.voc", + "CmdCacheFrequencyCutoff", + 20, + "ExclusionSet", + ExclusionSet, + "XeroxTreebankMap", + XeroxTreebankMap, + "LemmaCacheFileLocation", + "/org/apache/ctakes/lvg/2005_lemma.voc", + "UseLemmaCache", + false, + "LemmaCacheFrequencyCutoff", + 20, + "PostLemmas", + true, + "LvgCmdApi", + ExternalResourceFactory.createExternalResourceDescription( + LvgCmdApiResourceImpl.class, + new File(LvgCmdApiResourceImpl.class.getResource( + "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI()))); + aggregateBuilder.add(lvgAnnotator); + + // add dependency parser + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPDependencyParserAE.class)); + + // add semantic role labeler + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ClearNLPSemanticRoleLabelerAE.class)); + +// // add gold standard parses to gold view, and adjust gold view to correct a few annotation mis-steps +// if(this.treebankDirectory != null){ +// aggregateBuilder.add(THYMETreebankReader.getDescription(this.treebankDirectory)); +// aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(TimexAnnotationCorrector.class)); +// }else{ +// // add ctakes constituency parses to system view +// aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription(ConstituencyParser.class)); +// } + + // write out the CAS after all the above annotations + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + XMIWriter.class, + XMIWriter.PARAM_XMI_DIRECTORY, + outputDirectory)); + + return aggregateBuilder; + } + + public static class CopyNPChunksToLookupWindowAnnotations extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + for (Chunk chunk : JCasUtil.select(jCas, Chunk.class)) { + if (chunk.getChunkType().equals("NP")) { + new LookupWindowAnnotation(jCas, chunk.getBegin(), chunk.getEnd()).addToIndexes(); + } + } + } + } + + // replace this with SimpleSegmentWithTagsAnnotator if that code ever gets fixed + public static class SegmentsFromBracketedSectionTagsAnnotator extends JCasAnnotator_ImplBase { + private static Pattern SECTION_PATTERN = Pattern.compile( + "(\\[start section id=\"?(.*?)\"?\\]).*?(\\[end section id=\"?(.*?)\"?\\])", + Pattern.DOTALL); + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + Matcher matcher = SECTION_PATTERN.matcher(jCas.getDocumentText()); + while (matcher.find()) { + Segment segment = new Segment(jCas); + segment.setBegin(matcher.start() + matcher.group(1).length()); + segment.setEnd(matcher.end() - matcher.group(3).length()); + segment.setId(matcher.group(2)); + segment.addToIndexes(); + } + } + } + + static File getXMIFile(File xmiDirectory, File textFile) { + return new File(xmiDirectory, textFile.getName() + ".xmi"); + } + + static File getXMIFile(File xmiDirectory, JCas jCas) throws AnalysisEngineProcessException { + return getXMIFile(xmiDirectory, new File(ViewURIUtil.getURI(jCas).getPath())); + } + + public static class XMIWriter extends JCasAnnotator_ImplBase { + + public static final String PARAM_XMI_DIRECTORY = "XMIDirectory"; + + @ConfigurationParameter(name = PARAM_XMI_DIRECTORY, mandatory = true) + private File xmiDirectory; + + @Override + public void initialize(UimaContext context) throws ResourceInitializationException { + super.initialize(context); + if (!this.xmiDirectory.exists()) { + this.xmiDirectory.mkdirs(); + } + } + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + File xmiFile = getXMIFile(this.xmiDirectory, jCas); + try { + FileOutputStream outputStream = new FileOutputStream(xmiFile); + try { + XmiCasSerializer serializer = new XmiCasSerializer(jCas.getTypeSystem()); + ContentHandler handler = new XMLSerializer(outputStream, false).getContentHandler(); + serializer.serialize(jCas.getCas(), handler); + } finally { + outputStream.close(); + } + } catch (SAXException e) { + throw new AnalysisEngineProcessException(e); + } catch (IOException e) { + throw new AnalysisEngineProcessException(e); + } + } + } +} Propchange: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Preprocess.java ------------------------------------------------------------------------------ svn:mime-type = text/plain