Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 5BB3518722 for ; Sat, 11 Jul 2015 01:08:23 +0000 (UTC) Received: (qmail 10594 invoked by uid 500); 11 Jul 2015 01:08:23 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 10561 invoked by uid 500); 11 Jul 2015 01:08:23 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 10552 invoked by uid 99); 11 Jul 2015 01:08:23 -0000 Received: from eris.apache.org (HELO hades.apache.org) (140.211.11.105) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 11 Jul 2015 01:08:23 +0000 Received: from hades.apache.org (localhost [127.0.0.1]) by hades.apache.org (ASF Mail Server at hades.apache.org) with ESMTP id 01400AC0051 for ; Sat, 11 Jul 2015 01:08:22 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1690329 - in /ctakes/sandbox/ctakes-allergy: ./ src/ src/main/ src/main/java/ src/main/java/org/ src/main/java/org/apache/ src/main/java/org/apache/ctakes/ src/main/java/org/apache/ctakes/allergy/ src/main/java/org/apache/ctakes/allergy/ae... 
Date: Sat, 11 Jul 2015 01:08:22 -0000 To: commits@ctakes.apache.org From: seanfinan@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20150711010823.01400AC0051@hades.apache.org> Author: seanfinan Date: Sat Jul 11 01:08:21 2015 New Revision: 1690329 URL: http://svn.apache.org/r1690329 Log: Checkin of simple regex allergy span detector Added: ctakes/sandbox/ctakes-allergy/ ctakes/sandbox/ctakes-allergy/pom.xml ctakes/sandbox/ctakes-allergy/src/ ctakes/sandbox/ctakes-allergy/src/main/ ctakes/sandbox/ctakes-allergy/src/main/java/ ctakes/sandbox/ctakes-allergy/src/main/java/org/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/util/ Added: ctakes/sandbox/ctakes-allergy/pom.xml URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/pom.xml?rev=1690329&view=auto ============================================================================== --- ctakes/sandbox/ctakes-allergy/pom.xml (added) +++ ctakes/sandbox/ctakes-allergy/pom.xml Sat Jul 11 01:08:21 2015 @@ -0,0 +1,113 @@ + + 4.0.0 + + org.apache.ctakes + ctakes + 3.2.3-SNAPSHOT + + ctakes-allergy + ctakes-allergy + Allergy Identification Prototype + + + org.apache.ctakes + ctakes-clinical-pipeline + + + + + + org.apache.uima + jcasgen-maven-plugin + 2.5.0 + + + generate + + + src/main/resources/org/apache/ctakes/**/types/TypeSystem.xml + + false + + + + + + + + + runAllergyCVD + + + runAllergyCVD + + + + + + org.codehaus.mojo + exec-maven-plugin + 1.2.1 + 
+ + + compile + + java + + + + + true + true + org.apache.uima.tools.cvd.CVD + + + -lookandfeel + javax.swing.plaf.metal.MetalLookAndFeel + + + + + + org.mitre.medfacts + medfacts-i2b2 + 1.2 + system + ${project.basedir}/../ctakes-assertion/lib/med-facts-i2b2-1.2-SNAPSHOT.jar + + + org.mitre.medfacts + medfacts-zoner + 1.1 + system + ${project.basedir}/../ctakes-assertion/lib/med-facts-zoner-1.1.jar + + + org.mitre.jcarafe.core + jcarafe.core + 2.9.1 + system + ${project.basedir}/../ctakes-assertion/lib/jcarafe-core_2.9.1-0.9.8.3.RC4.jar + + + org.mitre.jcarafe.ext + jcarafe.ext + 2.9.1 + system + ${project.basedir}/../ctakes-assertion/lib/jcarafe-ext_2.9.1-0.9.8.3.RC4.jar + + + gov.nih.nlm.nls.lvg + lvg2010dist + 0.0.1 + + + + + + + + \ No newline at end of file Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java?rev=1690329&view=auto ============================================================================== --- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java (added) +++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/ae/AllergyAnnotator.java Sat Jul 11 01:08:21 2015 @@ -0,0 +1,116 @@ +package org.apache.ctakes.allergy.ae; + +import org.apache.ctakes.typesystem.type.constants.CONST; +import org.apache.ctakes.typesystem.type.refsem.UmlsConcept; +import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation; +import org.apache.ctakes.typesystem.type.textsem.MedicationMention; +import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention; +import org.apache.log4j.Logger; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; +import org.apache.uima.analysis_engine.AnalysisEngineProcessException; +import org.apache.uima.fit.component.JCasAnnotator_ImplBase; +import 
org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.cas.FSArray;
import org.apache.uima.resource.ResourceInitializationException;

import java.util.Collection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Prototype annotator that marks allergy spans around medication mentions with regular expressions.
 * <p>
 * For every {@link MedicationMention} in the CAS, two character windows of the document text are inspected:
 * <ul>
 * <li>a "pre" window that starts up to {@value #PRE_WINDOW_CHARS} characters before the medication and ends
 * at the medication end, searched with the {@link AllergyPreExpression} patterns;</li>
 * <li>a "post" window that starts at the medication begin and ends up to {@value #POST_WINDOW_CHARS}
 * characters after the medication end, searched with the {@link AllergyPostExpression} patterns.</li>
 * </ul>
 * Each regex hit is stored as a {@link SignSymptomMention} carrying a generic hypersensitivity concept.
 *
 * @author SPF , chip-nlp
 * @version %I%
 * @since 7/10/2015
 */
final public class AllergyAnnotator extends JCasAnnotator_ImplBase {

   static private final Logger LOGGER = Logger.getLogger( "AllergyAnnotator" );

   /** Number of characters before a medication mention that are searched for an allergy lead-in. */
   static private final int PRE_WINDOW_CHARS = 40;

   /** Number of characters after a medication mention that are searched for an allergy trailer. */
   static private final int POST_WINDOW_CHARS = 20;


   /**
    * Expressions that introduce an allergy before the medication text, e.g. "allergies: ...".
    * Patterns are compiled case-insensitively so they can be applied to the unmodified document text;
    * that keeps matcher offsets aligned with document offsets and avoids locale-dependent lowercasing.
    */
   static private enum AllergyPreExpression {
      COLON_LIST( "\\ballergies:\\s++[a-z,'\"\\t ]*" ),
      ALLERGIC_TO( "\\ballergic( reaction)? to:?\\s++[a-z\\,'\"\\t ]*" );
      final private Pattern __pattern;

      private AllergyPreExpression( final String regex ) {
         __pattern = Pattern.compile( regex, Pattern.CASE_INSENSITIVE );
      }

      /**
       * @param windowText text of the window preceding (and including) the medication
       * @return a matcher of this expression over the given text
       */
      private Matcher getMatcher( final CharSequence windowText ) {
         return __pattern.matcher( windowText );
      }
   }

   /**
    * Expressions that follow the medication text, e.g. "... allergy".
    * Compiled case-insensitively for the same reason as {@link AllergyPreExpression}.
    */
   static private enum AllergyPostExpression {
      ALLERGY( "[a-z]* allergy" ),
      HYPERSENSITIVITY( "[a-z]* (hyper)?sensitivity" );
      final private Pattern __pattern;

      private AllergyPostExpression( final String regex ) {
         __pattern = Pattern.compile( regex, Pattern.CASE_INSENSITIVE );
      }

      /**
       * @param windowText text of the window starting at the medication and extending past it
       * @return a matcher of this expression over the given text
       */
      private Matcher getMatcher( final CharSequence windowText ) {
         return __pattern.matcher( windowText );
      }
   }

   /**
    * {@inheritDoc}
    * <p>
    * Searches the text around every medication mention and stores an allergy annotation for each regex hit.
    */
   @Override
   public void process( final JCas jcas ) throws AnalysisEngineProcessException {
      LOGGER.info( "Starting processing" );

      final String docText = jcas.getDocumentText();
      final Collection<MedicationMention> medications = JCasUtil.select( jcas, MedicationMention.class );

      for ( MedicationMention medication : medications ) {
         final int medicationBegin = medication.getBegin();
         final int medicationEnd = medication.getEnd();

         // Pre window: [medicationBegin - PRE_WINDOW_CHARS, medicationEnd), clamped to the document start.
         final int windowBegin = Math.max( 0, medicationBegin - PRE_WINDOW_CHARS );
         final String preWindowText = docText.substring( windowBegin, medicationEnd );
         for ( AllergyPreExpression preExpression : AllergyPreExpression.values() ) {
            final Matcher matcher = preExpression.getMatcher( preWindowText );
            while ( matcher.find() ) {
               // matcher offsets are relative to the pre window, which starts at windowBegin
               storeAllergy( jcas, windowBegin + matcher.start(), medicationEnd );
               // could break from loop but there may be a wider context
            }
         }

         // Post window: [medicationBegin, medicationEnd + POST_WINDOW_CHARS), clamped to the document end.
         final int windowEnd = Math.min( docText.length(), medicationEnd + POST_WINDOW_CHARS );
         final String postWindowText = docText.substring( medicationBegin, windowEnd );
         for ( AllergyPostExpression postExpression : AllergyPostExpression.values() ) {
            final Matcher matcher = postExpression.getMatcher( postWindowText );
            while ( matcher.find() ) {
               // matcher offsets are relative to the post window, which starts at medicationBegin
               // (not at the pre window begin), so the end must be rebased on medicationBegin
               storeAllergy( jcas, medicationBegin, medicationBegin + matcher.end() );
               // could break from loop but there may be a wider context
            }
         }
      }
      LOGGER.info( "Finished processing" );
   }


   /**
    * Creates a {@link SignSymptomMention} over the given span, attaches a single generic
    * hypersensitivity {@link UmlsConcept} to it and adds it to the CAS indexes.
    *
    * @param jcas       CAS that receives the new annotation
    * @param matchBegin document offset at which the allergy span begins
    * @param matchEnd   document offset at which the allergy span ends
    */
   static private void storeAllergy( final JCas jcas, final int matchBegin, final int matchEnd ) {
      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
      umlsConcept.setCodingScheme( "AllergyPrototype" );
      // C0020517 is a generic CUI for hypersensitivity / allergy
      umlsConcept.setCui( "C0020517" );
      umlsConcept.setTui( "T046" );
      umlsConcept.setPreferredText( "Hypersensitivity" );
      final FSArray conceptArr = new FSArray( jcas, 1 );
      conceptArr.set( 0, umlsConcept );

      final IdentifiedAnnotation annotation = new SignSymptomMention( jcas );
      annotation.setTypeID( CONST.NE_TYPE_ID_FINDING );
      annotation.setBegin( matchBegin );
      annotation.setEnd( matchEnd );
      annotation.setOntologyConceptArr( conceptArr );
//      annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
      annotation.addToIndexes();
   }


   /**
    * @return an engine description for this annotator, built without configuration parameters
    * @throws ResourceInitializationException if the description cannot be created
    */
   static public AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException {
      return AnalysisEngineFactory.createEngineDescription( AllergyAnnotator.class );
   }

}

Added: ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java URL:
http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java?rev=1690329&view=auto ============================================================================== --- ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java (added) +++ ctakes/sandbox/ctakes-allergy/src/main/java/org/apache/ctakes/allergy/pipeline/AllergyPipelineRunner.java Sat Jul 11 01:08:21 2015 @@ -0,0 +1,155 @@ +package org.apache.ctakes.allergy.pipeline; + +import com.lexicalscope.jewel.cli.CliFactory; +import com.lexicalscope.jewel.cli.Option; +import org.apache.ctakes.allergy.ae.AllergyAnnotator; +import org.apache.ctakes.assertion.medfacts.cleartk.PolarityCleartkAnalysisEngine; +import org.apache.ctakes.assertion.medfacts.cleartk.UncertaintyCleartkAnalysisEngine; +import org.apache.ctakes.chunker.ae.Chunker; +import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory; +import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.CopyNPChunksToLookupWindowAnnotations; +import org.apache.ctakes.clinicalpipeline.ClinicalPipelineFactory.RemoveEnclosedLookupWindows; +import org.apache.ctakes.constituency.parser.ae.ConstituencyParser; +import org.apache.ctakes.contexttokenizer.ae.ContextDependentTokenizerAnnotator; +import org.apache.ctakes.core.ae.SentenceDetector; +import org.apache.ctakes.core.ae.SimpleSegmentAnnotator; +import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB; +import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes; +import org.apache.ctakes.core.cr.FilesInDirectoryCollectionReader; +import org.apache.ctakes.core.resource.FileLocator; +import org.apache.ctakes.core.resource.FileResourceImpl; +import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE; +import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE; +import org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator; +import 
org.apache.ctakes.dictionary.lookup2.ae.JCasTermAnnotator;
import org.apache.ctakes.lvg.ae.LvgAnnotator;
import org.apache.ctakes.postagger.POSTagger;
import org.apache.uima.UIMAException;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.factory.AggregateBuilder;
import org.apache.uima.fit.factory.AnalysisEngineFactory;
import org.apache.uima.fit.factory.CollectionReaderFactory;
import org.apache.uima.fit.factory.ExternalResourceFactory;
import org.apache.uima.fit.pipeline.SimplePipeline;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.InvalidXMLException;

import javax.annotation.concurrent.Immutable;
import java.io.FileNotFoundException;
import java.io.IOException;


/**
 * Command-line entry point and factory methods for the allergy prototype pipeline.
 * <p>
 * The pipeline reads notes from an input directory, runs an aggregate of annotators that includes
 * {@link AllergyAnnotator}, and writes one XMI file per note to an output directory.
 * This class only holds static methods and cannot be instantiated.
 */
@Immutable
final public class AllergyPipelineRunner {
   private AllergyPipelineRunner() {
   }

   /**
    * Command-line options, parsed by {@link CliFactory}.
    */
   static interface AllergyPipelineOptions {
      @Option(
            shortName = "i",
            description = "specify the path to the directory containing the clinical notes to be processed" )
      public String getInputDirectory();

      @Option(
            shortName = "o",
            description = "specify the path to the directory where the output xmi files are to be saved" )
      public String getOutputDirectory();
   }

   static private final String CTAKES_DIR_PREFIX = "/org/apache/ctakes/";

   /**
    * @return the aggregate description of the allergy pipeline, without an output writer
    * @throws ResourceInitializationException if a component description cannot be created
    * @throws InvalidXMLException             declared by the component factories
    * @throws IOException                     declared by the component factories
    */
   public static AnalysisEngineDescription getPipelineDescription()
         throws ResourceInitializationException, InvalidXMLException, IOException {
      return getPipelineDescription( "" );
   }

   /**
    * @param options parsed command-line options; may be null, in which case an empty output directory is used
    * @return the aggregate description of the allergy pipeline, without an output writer
    * @throws ResourceInitializationException if a component description cannot be created
    * @throws InvalidXMLException             declared by the component factories
    * @throws IOException                     declared by the component factories
    */
   public static AnalysisEngineDescription getPipelineDescription( final AllergyPipelineOptions options )
         throws ResourceInitializationException, InvalidXMLException, IOException {
      // Forward the configured directory instead of discarding the options.
      // The String overload does not currently read the value, so the built description is unchanged.
      final String outputDirectory = options == null ? "" : options.getOutputDirectory();
      return getPipelineDescription( outputDirectory );
   }

   /**
    * Builds the aggregate: core components, dictionary lookup, the allergy annotator, dependency parser,
    * polarity, uncertainty, semantic role labeler and constituency parser, added in that order.
    *
    * @param outputDirectory not used when building the description; the XMI writer is created separately
    * @return the aggregate description of the allergy pipeline, without an output writer
    * @throws ResourceInitializationException if a component description cannot be created or the
    *                                         dictionary descriptor file cannot be located
    * @throws InvalidXMLException             declared by the component factories
    * @throws IOException                     declared by the component factories
    */
   public static AnalysisEngineDescription getPipelineDescription( final String outputDirectory )
         throws ResourceInitializationException, InvalidXMLException, IOException {
      final AggregateBuilder builder = new AggregateBuilder();
      // core components, dictionary, dependency parser, polarity, uncertainty
      builder.add( SimpleSegmentAnnotator.createAnnotatorDescription() );
      builder.add( SentenceDetector.createAnnotatorDescription() );
      builder.add( TokenizerAnnotatorPTB.createAnnotatorDescription() );
      builder.add( LvgAnnotator.createAnnotatorDescription() );
      builder.add( ContextDependentTokenizerAnnotator.createAnnotatorDescription() );
      builder.add( POSTagger.createAnnotatorDescription() );
      builder.add( Chunker.createAnnotatorDescription() );
      builder.add( ClinicalPipelineFactory.getStandardChunkAdjusterAnnotator() );

      builder.add( AnalysisEngineFactory.createEngineDescription( CopyNPChunksToLookupWindowAnnotations.class ) );
      builder.add( AnalysisEngineFactory.createEngineDescription( RemoveEnclosedLookupWindows.class ) );
      try {
         builder.add( AnalysisEngineFactory.createEngineDescription( DefaultJCasTermAnnotator.class,
               JCasTermAnnotator.DICTIONARY_DESCRIPTOR_KEY,
               ExternalResourceFactory.createExternalResourceDescription(
                     FileResourceImpl.class,
                     FileLocator.locateFile( "org/apache/ctakes/dictionary/lookup/fast/cTakesHsql.xml" ) )
         ) );
      } catch ( FileNotFoundException e ) {
         // The cause is preserved in the rethrown exception, so no separate stack trace is printed here.
         throw new ResourceInitializationException( e );
      }

      // The allergy annotator is added after dictionary lookup and before the assertion engines.
      builder.add( AllergyAnnotator.createAnnotatorDescription() );

      builder.add( ClearNLPDependencyParserAE.createAnnotatorDescription() );
      builder.add( PolarityCleartkAnalysisEngine.createAnnotatorDescription() );
      builder.add( UncertaintyCleartkAnalysisEngine.createAnnotatorDescription() );
      builder.add( AnalysisEngineFactory.createEngineDescription( ClearNLPSemanticRoleLabelerAE.class ) );
      builder.add( AnalysisEngineFactory.createEngineDescription( ConstituencyParser.class ) );

      return builder.createAggregateDescription();
   }

   /**
    * @param inputDirectory directory containing the notes to read
    * @return a collection reader created from the FilesInDirectoryCollectionReader descriptor
    * @throws UIMAException if the reader cannot be created
    * @throws IOException   if the descriptor cannot be read
    */
   private static CollectionReader createFilesInDirectoryReader( final String inputDirectory )
         throws UIMAException, IOException {
      final String descriptorPath
            = FileLocator.getFullPath( "ctakes-core/desc/collection_reader/FilesInDirectoryCollectionReader.xml" );
      return CollectionReaderFactory.createReaderFromPath( descriptorPath,
            FilesInDirectoryCollectionReader.PARAM_INPUTDIR,
            inputDirectory );
   }

   /**
    * @param outputDirectory directory in which the XMI files are written
    * @return an XMI-writing engine configured with the given output directory
    * @throws ResourceInitializationException if the engine cannot be created
    */
   private static AnalysisEngine createXMIWriter( final String outputDirectory )
         throws ResourceInitializationException {
      return AnalysisEngineFactory.createEngine( XmiWriterCasConsumerCtakes.class,
            XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR,
            outputDirectory );
   }

   /**
    * Runs the allergy pipeline over all notes in a directory and writes XMI output.
    *
    * @param inputDirectory  directory containing the notes to process
    * @param outputDirectory directory in which the XMI files are written
    * @throws UIMAException if a component cannot be created or fails while processing
    * @throws IOException   if a descriptor or input cannot be read
    */
   public static void runAllergyPipeline( final String inputDirectory,
                                          final String outputDirectory ) throws UIMAException, IOException {
      final CollectionReader collectionReader = createFilesInDirectoryReader( inputDirectory );
      final AnalysisEngineDescription analysisEngineDescription = getPipelineDescription( outputDirectory );
      final AnalysisEngine xmiWriter = createXMIWriter( outputDirectory );
      runAllergyPipeline( collectionReader, analysisEngineDescription, xmiWriter );
   }

   /**
    * Runs the given reader through the described aggregate and then through the output writer.
    *
    * @param collectionReader          source of the documents
    * @param analysisEngineDescription description of the aggregate to instantiate and run
    * @param outputWriter              engine that receives each processed CAS after the aggregate
    * @throws UIMAException if the aggregate cannot be created or a component fails while processing
    * @throws IOException   if reading a document fails
    */
   public static void runAllergyPipeline( final CollectionReader collectionReader,
                                          final AnalysisEngineDescription analysisEngineDescription,
                                          final AnalysisEngine outputWriter ) throws UIMAException, IOException {
      SimplePipeline.runPipeline( collectionReader,
            AnalysisEngineFactory.createEngine( analysisEngineDescription ),
            outputWriter );
   }

   /**
    * NOTE(review): not called anywhere in the visible code; kept as in the original check-in.
    *
    * @param moduleDirectory module directory name below {@link #CTAKES_DIR_PREFIX}
    * @return {@code CTAKES_DIR_PREFIX + moduleDirectory + "/model.jar"}
    */
   static private String getStandardModelPath( final String moduleDirectory ) {
      return CTAKES_DIR_PREFIX + moduleDirectory + "/model.jar";
   }


   /**
    * Parses the {@code -i} and {@code -o} options and runs the pipeline.
    *
    * @param args command-line arguments
    * @throws UIMAException if a component cannot be created or fails while processing
    * @throws IOException   if a descriptor or input cannot be read
    */
   public static void main( final String... args ) throws UIMAException, IOException {
      final AllergyPipelineOptions options = CliFactory.parseArguments( AllergyPipelineOptions.class, args );
      runAllergyPipeline( options.getInputDirectory(), options.getOutputDirectory() );
   }

}