Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 1763B10891 for ; Tue, 9 Jul 2013 18:14:39 +0000 (UTC) Received: (qmail 85850 invoked by uid 500); 9 Jul 2013 18:14:39 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 85814 invoked by uid 500); 9 Jul 2013 18:14:38 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 85801 invoked by uid 99); 9 Jul 2013 18:14:37 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 09 Jul 2013 18:14:37 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Tue, 09 Jul 2013 18:14:34 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 372D22388980; Tue, 9 Jul 2013 18:14:13 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1501418 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java Date: Tue, 09 Jul 2013 18:14:13 -0000 To: commits@ctakes.apache.org From: stevenbethard@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20130709181413.372D22388980@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: stevenbethard Date: Tue Jul 9 18:14:12 2013 New Revision: 1501418 URL: http://svn.apache.org/r1501418 Log: Adds script for detecting inconsistent DocTimeRels within a narrative container. Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java Added: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java?rev=1501418&view=auto ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java (added) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/PrintInconsistentAnnotations.java Tue Jul 9 18:14:12 2013 @@ -0,0 +1,128 @@ +package org.apache.ctakes.temporal.data.analysis; + +import java.io.File; +import java.util.Collections; +import java.util.List; + +import org.apache.ctakes.temporal.eval.CommandLine; +import org.apache.ctakes.temporal.eval.Evaluation_ImplBase.XMIReader; +import org.apache.ctakes.temporal.eval.THYMEData; +import org.apache.ctakes.typesystem.type.relation.TemporalTextRelation; +import org.apache.ctakes.typesystem.type.textsem.EventMention; +import org.apache.uima.collection.CollectionReader; +import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; +import org.cleartk.util.ViewURIUtil; +import org.cleartk.util.ae.UriToDocumentTextAnnotator; +import org.cleartk.util.cr.UriCollectionReader; +import org.uimafit.factory.AggregateBuilder; +import org.uimafit.factory.AnalysisEngineFactory; +import org.uimafit.pipeline.JCasIterable; +import org.uimafit.util.JCasUtil; + +import com.google.common.collect.HashMultimap; +import com.google.common.collect.Lists; +import com.google.common.collect.Multimap; +import com.lexicalscope.jewel.cli.CliFactory; +import com.lexicalscope.jewel.cli.Option; + +public class PrintInconsistentAnnotations { + static interface Options { + @Option(longName = "xmi") + public File getXMIDirectory(); + + @Option(longName = "patients") + public CommandLine.IntegerRanges getPatients(); + + @Option(longName = "text") + public File getRawTextDirectory(); + } + + public static void main(String[] args) throws Exception { + Options options = CliFactory.parseArguments(Options.class, args); + List patientSets = options.getPatients().getList(); + List trainItems = THYMEData.getTrainPatientSets(patientSets); + List files = THYMEData.getFilesFor(trainItems, options.getRawTextDirectory()); + + CollectionReader reader = UriCollectionReader.getCollectionReaderFromFiles(files); + AggregateBuilder aggregateBuilder = new AggregateBuilder(); + aggregateBuilder.add(UriToDocumentTextAnnotator.getDescription()); + aggregateBuilder.add(AnalysisEngineFactory.createPrimitiveDescription( + XMIReader.class, + XMIReader.PARAM_XMI_DIRECTORY, + options.getXMIDirectory())); + + for (JCas jCas : new JCasIterable(reader, aggregateBuilder.createAggregate())) { + String text = jCas.getDocumentText(); + JCas goldView = jCas.getView("GoldView"); + + // group events by their narrative container + Multimap containers = HashMultimap.create(); + for (TemporalTextRelation relation : JCasUtil.select(goldView, TemporalTextRelation.class)) { + if (relation.getCategory().equals("CONTAINS")) { + Annotation arg1 = relation.getArg1().getArgument(); + Annotation arg2 = relation.getArg2().getArgument(); + if (arg2 instanceof EventMention) { + EventMention event = (EventMention) arg2; + containers.put(arg1, event); + } + } + } + + // check each container for inconsistent DocTimeRels + for (Annotation container : containers.keySet()) { + String containerDocTimeRel = + container instanceof EventMention + ? ((EventMention) container).getEvent().getProperties().getDocTimeRel() + : null; + boolean inconsistentDocTimeRels = false; + String groupDocTimeRel = null; + for (EventMention event : containers.get(container)) { + String docTimeRel = event.getEvent().getProperties().getDocTimeRel(); + if (groupDocTimeRel == null) { + groupDocTimeRel = docTimeRel; + } else if (!docTimeRel.equals(groupDocTimeRel)) { + inconsistentDocTimeRels = true; + break; + } else if (containerDocTimeRel != null && !docTimeRel.equals(containerDocTimeRel)) { + inconsistentDocTimeRels = true; + break; + } + } + + // if inconsistent: print events, DocTimeRels and surrounding context + if (inconsistentDocTimeRels) { + List offsets = Lists.newArrayList(); + offsets.add(container.getBegin()); + offsets.add(container.getEnd()); + for (EventMention event : containers.get(container)) { + offsets.add(event.getBegin()); + offsets.add(event.getEnd()); + } + Collections.sort(offsets); + int begin = Math.max(offsets.get(0), 0); + int end = Math.min(offsets.get(offsets.size() - 1), text.length()); + System.err.printf( + "Inconsistent DocTimeRels in %s, ...%s...\n", + new File(ViewURIUtil.getURI(jCas)).getName(), + text.substring(begin, end)); + if (container instanceof EventMention) { + System.err.printf( + "Container: \"%s\" (docTimeRel=%s)\n", + container.getCoveredText(), + ((EventMention) container).getEvent().getProperties().getDocTimeRel()); + } else { + System.err.printf("Container: \"%s\"\n", container.getCoveredText()); + } + for (EventMention event : containers.get(container)) { + System.err.printf( + "* \"%s\" (docTimeRel=%s)\n", + event.getCoveredText(), + event.getEvent().getProperties().getDocTimeRel()); + } + System.err.println(); + } + } + } + } +}