Return-Path: X-Original-To: apmail-ctakes-commits-archive@www.apache.org Delivered-To: apmail-ctakes-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 98A20110D6 for ; Thu, 3 Apr 2014 19:45:24 +0000 (UTC) Received: (qmail 99594 invoked by uid 500); 3 Apr 2014 19:45:23 -0000 Delivered-To: apmail-ctakes-commits-archive@ctakes.apache.org Received: (qmail 99465 invoked by uid 500); 3 Apr 2014 19:45:22 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 99314 invoked by uid 99); 3 Apr 2014 19:45:19 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 03 Apr 2014 19:45:19 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Thu, 03 Apr 2014 19:45:16 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 1DAB72388A56; Thu, 3 Apr 2014 19:44:54 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1584351 - in /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal: data/analysis/SignSymptomDurations.java duration/PreserveCertainEventEventRelationsInGold.java duration/Utils.java Date: Thu, 03 Apr 2014 19:44:54 -0000 To: commits@ctakes.apache.org From: dligach@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140403194454.1DAB72388A56@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: dligach Date: Thu Apr 3 19:44:53 2014 New Revision: 1584351 URL: http://svn.apache.org/r1584351 Log: added code needed to filter out 
d/d, s/s, drugs, procedures, general events that have no duration data Removed: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/data/analysis/SignSymptomDurations.java Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/PreserveCertainEventEventRelationsInGold.java ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/PreserveCertainEventEventRelationsInGold.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/PreserveCertainEventEventRelationsInGold.java?rev=1584351&r1=1584350&r2=1584351&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/PreserveCertainEventEventRelationsInGold.java (original) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/PreserveCertainEventEventRelationsInGold.java Thu Apr 3 19:44:53 2014 @@ -4,13 +4,13 @@ import java.io.File; import java.io.IOException; import java.util.Map; -import org.apache.ctakes.temporal.duration.Utils.Callback; import org.apache.ctakes.typesystem.type.relation.BinaryTextRelation; import org.apache.ctakes.typesystem.type.relation.RelationArgument; import org.apache.ctakes.typesystem.type.textsem.EventMention; import org.apache.uima.analysis_engine.AnalysisEngineProcessException; import org.apache.uima.cas.CASException; import org.apache.uima.jcas.JCas; +import org.apache.uima.jcas.tcas.Annotation; import org.uimafit.component.JCasAnnotator_ImplBase; import org.uimafit.util.JCasUtil; @@ -22,7 +22,7 @@ import com.google.common.io.Files; * Preserve only those event-event relations whose both event arguments have duration data. 
*/ public class PreserveCertainEventEventRelationsInGold extends JCasAnnotator_ImplBase { - + public static final String GOLD_VIEW_NAME = "GoldView"; @Override @@ -30,34 +30,34 @@ public class PreserveCertainEventEventRe Map> textToDistribution = null; try { - textToDistribution = Files.readLines(new File(Utils.durationDistributionPath), Charsets.UTF_8, new Callback()); + textToDistribution = Files.readLines(new File(Utils.durationDistributionPath), Charsets.UTF_8, new Utils.Callback()); } catch(IOException e) { e.printStackTrace(); return; } - + JCas goldView; try { goldView = jCas.getView(GOLD_VIEW_NAME); } catch (CASException e) { throw new AnalysisEngineProcessException(e); } - + // remove relations where one or both arguments have no duration data for(BinaryTextRelation relation : Lists.newArrayList(JCasUtil.select(goldView, BinaryTextRelation.class))) { RelationArgument arg1 = relation.getArg1(); RelationArgument arg2 = relation.getArg2(); - String event2Text; String event1Text; + String event2Text; if(arg1.getArgument() instanceof EventMention && arg2.getArgument() instanceof EventMention) { - event1Text = arg1.getArgument().getCoveredText().toLowerCase(); - event2Text = arg2.getArgument().getCoveredText().toLowerCase(); + event1Text = getText(jCas, arg1.getArgument()); + event2Text = getText(jCas, arg2.getArgument()); } else { // this is not an event-event relation continue; } - + if(textToDistribution.containsKey(event1Text) && textToDistribution.containsKey(event2Text)) { // we have duration distributions for both arguments, so keep it continue; @@ -67,15 +67,53 @@ public class PreserveCertainEventEventRe arg2.removeFromIndexes(); relation.removeFromIndexes(); } - + // remove events (that didn't participate in relations) that have no data for(EventMention mention : Lists.newArrayList(JCasUtil.select(goldView, EventMention.class))) { - if(textToDistribution.containsKey(mention.getCoveredText().toLowerCase())) { + String mentionText = getText(jCas, 
mention); + if(textToDistribution.containsKey(mentionText)) { // these are the kind we keep continue; } - + mention.removeFromIndexes(); } - } + } + + /** + * Lemmatize this annotation if this is a verb. + * Otherwise return as is. Lowercase before returning. + * + * TODO: check if there's a covering UMLS concept before lemmatizing + */ + public static String getText(JCas jCas, Annotation annotation) + throws AnalysisEngineProcessException { + + JCas systemView; + try { + systemView = jCas.getView("_InitialView"); + } catch (CASException e) { + throw new AnalysisEngineProcessException(e); + } + + String pos = Utils.getPosTag(systemView, annotation); + if(pos == null) { + return annotation.getCoveredText().toLowerCase(); + } + + String text; + if(pos.startsWith("V")) { + try { + text = Utils.lemmatize(annotation.getCoveredText(), pos); + } catch (IOException e) { + System.out.println("couldn't lemmatize: " + annotation.getCoveredText()); + e.printStackTrace(); + return annotation.getCoveredText().toLowerCase(); + } + } else { + text = annotation.getCoveredText(); + } + + return text.toLowerCase(); + } } \ No newline at end of file Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java?rev=1584351&r1=1584350&r2=1584351&view=diff ============================================================================== --- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java (original) +++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/duration/Utils.java Thu Apr 3 19:44:53 2014 @@ -19,8 +19,12 @@ import java.util.Map; import org.apache.ctakes.core.resource.FileLocator; import org.apache.ctakes.temporal.ae.feature.duration.DurationEventTimeFeatureExtractor; +import org.apache.ctakes.typesystem.type.syntax.BaseToken; +import org.apache.uima.jcas.JCas; 
+import org.apache.uima.jcas.tcas.Annotation; import org.threeten.bp.temporal.TemporalField; import org.threeten.bp.temporal.TemporalUnit; +import org.uimafit.util.JCasUtil; import scala.collection.immutable.Set; import scala.util.Try; @@ -39,7 +43,7 @@ import com.googlecode.clearnlp.reader.Ab public class Utils { // events and their duration distributions - public static final String durationDistributionPath = "/Users/dima/Boston/Thyme/Duration/Data/Combined/Distribution/mimic.txt"; + public static final String durationDistributionPath = "/Users/dima/Boston/Thyme/Duration/Data/Combined/Distribution/all.txt"; // eight bins over which we define a duration distribution public static final String[] bins = {"second", "minute", "hour", "day", "week", "month", "year", "decade"}; @@ -230,6 +234,24 @@ public class Utils { } /** + * Return system generated POS tag or null if none available. + */ + public static String getPosTag(JCas systemView, Annotation annotation) { + + List<BaseToken> coveringBaseTokens = JCasUtil.selectCovered( + systemView, + BaseToken.class, + annotation.getBegin(), + annotation.getEnd()); + + if(coveringBaseTokens.size() < 1) { + return null; + } + + return coveringBaseTokens.get(0).getPartOfSpeech(); + } + + /** * Read event duration distributions from file. */ public static class Callback implements LineProcessor >> {