Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 5B34110940 for ; Wed, 29 Jan 2014 21:21:38 +0000 (UTC) Received: (qmail 6500 invoked by uid 500); 29 Jan 2014 21:21:37 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 6469 invoked by uid 500); 29 Jan 2014 21:21:36 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 6462 invoked by uid 99); 29 Jan 2014 21:21:36 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 29 Jan 2014 21:21:36 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 29 Jan 2014 21:21:33 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id D675E23888E2; Wed, 29 Jan 2014 21:21:11 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1562598 - /ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Date: Wed, 29 Jan 2014 21:21:11 -0000 To: commits@ctakes.apache.org From: dligach@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140129212111.D675E23888E2@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: dligach Date: Wed Jan 29 21:21:11 2014 New Revision: 1562598 URL: http://svn.apache.org/r1562598 Log: added cui printer for cui-polysemy analysis Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Modified: ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java?rev=1562598&r1=1562597&r2=1562598&view=diff ============================================================================== --- ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java (original) +++ ctakes/sandbox/ctakes-wsd/src/main/java/org/apache/ctakes/wsd/pipelines/Analyze.java Wed Jan 29 21:21:11 2014 @@ -30,13 +30,14 @@ import org.uimafit.util.JCasUtil; import com.google.common.collect.Lists; /** + * Generate various data sets for analyzing polysemy. * * @author dmitriy dligach */ public class Analyze { public static final String GOLD_VIEW_NAME = "GoldView"; - + public static class Options extends Options_ImplBase { @Option( @@ -44,16 +45,10 @@ public class Analyze { usage = "specify the path to the directory containing the XMI files", required = true) public File inputDirectory; - - @Option( - name = "--output-dir", - usage = "specify the path to the output directory", - required = false) - public File outputDirectory; } - + public static void main(String[] args) throws Exception { - + Options options = new Options(); options.parseOptions(args); @@ -67,73 +62,103 @@ public class Analyze { XMIReader.class, XMIReader.PARAM_FILES, paths); - - AnalysisEngine consumer = AnalysisEngineFactory.createPrimitive(Consumer.class); - + + AnalysisEngine consumer = AnalysisEngineFactory.createPrimitive(PrintCuis.class); + SimplePipeline.runPipeline(xmiCollectionReader, consumer); } - - public static class Consumer extends JCasAnnotator_ImplBase { + + public static class PrintConceptSemanticTypes extends JCasAnnotator_ImplBase { @Override public void process(JCas jCas) throws AnalysisEngineProcessException { - + JCas goldView; try { goldView = jCas.getView(GOLD_VIEW_NAME); } catch (CASException e) { throw new AnalysisEngineProcessException(e); } - + for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) { // for some reason in gold begin offset for some mentions is a huge number if(mention.getBegin() > jCas.getDocumentText().length()) { continue; } - + String text = mention.getCoveredText().toLowerCase(); String semanticType = mention.getClass().getSimpleName(); System.out.format("%s|%s\n", text, semanticType); } - + for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) { // avoid weird crashes if(mention.getBegin() > jCas.getDocumentText().length()) { continue; } - + String text = mention.getCoveredText().toLowerCase(); String semanticType = mention.getClass().getSimpleName(); System.out.format("%s|%s\n", text, semanticType); } } - - /** - * Get the CUIs, RxNorm codes, etc. - */ - public static Set getOntologyConceptCodes(IdentifiedAnnotation identifiedAnnotation) { - - Set codes = new HashSet(); - - FSArray fsArray = identifiedAnnotation.getOntologyConceptArr(); - if(fsArray == null) { - return codes; + } + + public static class PrintCuis extends JCasAnnotator_ImplBase { + + @Override + public void process(JCas jCas) throws AnalysisEngineProcessException { + + for (EventMention mention : Lists.newArrayList(JCasUtil.select(jCas, EventMention.class))) { + if(mention.getBegin() > jCas.getDocumentText().length()) { + continue; + } + + String text = mention.getCoveredText().toLowerCase(); + for(String code : getOntologyConceptCodes(mention)) { + System.out.format("%s|%s\n", text, code); + } } - - for(FeatureStructure featureStructure : fsArray.toArray()) { - OntologyConcept ontologyConcept = (OntologyConcept) featureStructure; - - if(ontologyConcept instanceof UmlsConcept) { - UmlsConcept umlsConcept = (UmlsConcept) ontologyConcept; - String code = umlsConcept.getCui(); - codes.add(code); - } else { // SNOMED or RxNorm - String code = ontologyConcept.getCodingScheme() + ontologyConcept.getCode(); - codes.add(code); + + for (EntityMention mention : Lists.newArrayList(JCasUtil.select(jCas, EntityMention.class))) { + if(mention.getBegin() > jCas.getDocumentText().length()) { + continue; + } + + String text = mention.getCoveredText().toLowerCase(); + for(String code : getOntologyConceptCodes(mention)) { + System.out.format("%s|%s\n", text, code); } } - + } + } + + /** + * Get the CUIs, RxNorm codes, etc. + */ + public static Set getOntologyConceptCodes(IdentifiedAnnotation identifiedAnnotation) { + + Set codes = new HashSet(); + + FSArray fsArray = identifiedAnnotation.getOntologyConceptArr(); + if(fsArray == null) { return codes; } + + for(FeatureStructure featureStructure : fsArray.toArray()) { + OntologyConcept ontologyConcept = (OntologyConcept) featureStructure; + + if(ontologyConcept instanceof UmlsConcept) { + UmlsConcept umlsConcept = (UmlsConcept) ontologyConcept; + String code = umlsConcept.getCui(); + codes.add(code); + } else { // SNOMED or RxNorm + String code = ontologyConcept.getCodingScheme() + ontologyConcept.getCode(); + codes.add(code); + } + } + + return codes; } } +