Return-Path: X-Original-To: archive-asf-public-internal@cust-asf2.ponee.io Delivered-To: archive-asf-public-internal@cust-asf2.ponee.io Received: from cust-asf.ponee.io (cust-asf.ponee.io [163.172.22.183]) by cust-asf2.ponee.io (Postfix) with ESMTP id 55933200AE1 for ; Mon, 6 Jun 2016 11:12:11 +0200 (CEST) Received: by cust-asf.ponee.io (Postfix) id 5429E160A24; Mon, 6 Jun 2016 09:12:11 +0000 (UTC) Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by cust-asf.ponee.io (Postfix) with SMTP id 514C8160A0E for ; Mon, 6 Jun 2016 11:12:10 +0200 (CEST) Received: (qmail 66754 invoked by uid 500); 6 Jun 2016 09:12:09 -0000 Mailing-List: contact commits-help@ctakes.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@ctakes.apache.org Delivered-To: mailing list commits@ctakes.apache.org Received: (qmail 66745 invoked by uid 99); 6 Jun 2016 09:12:09 -0000 Received: from pnap-us-west-generic-nat.apache.org (HELO spamd3-us-west.apache.org) (209.188.14.142) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 06 Jun 2016 09:12:09 +0000 Received: from localhost (localhost [127.0.0.1]) by spamd3-us-west.apache.org (ASF Mail Server at spamd3-us-west.apache.org) with ESMTP id 16BC71804C3 for ; Mon, 6 Jun 2016 09:12:09 +0000 (UTC) X-Virus-Scanned: Debian amavisd-new at spamd3-us-west.apache.org X-Spam-Flag: NO X-Spam-Score: 0.374 X-Spam-Level: X-Spam-Status: No, score=0.374 tagged_above=-999 required=6.31 tests=[KAM_ASCII_DIVIDERS=0.8, KAM_LAZY_DOMAIN_SECURITY=1, RP_MATCHES_RCVD=-1.426] autolearn=disabled Received: from mx2-lw-eu.apache.org ([10.40.0.8]) by localhost (spamd3-us-west.apache.org [10.40.0.10]) (amavisd-new, port 10024) with ESMTP id r2v0NNZhiGHN for ; Mon, 6 Jun 2016 09:12:06 +0000 (UTC) Received: from mailrelay1-us-west.apache.org (mailrelay1-us-west.apache.org [209.188.14.139]) by mx2-lw-eu.apache.org (ASF Mail Server at mx2-lw-eu.apache.org) with ESMTP id B26825F3DA for ; Mon, 6 Jun 2016 09:12:05 +0000 (UTC) Received: from svn01-us-west.apache.org (svn.apache.org [10.41.0.6]) by mailrelay1-us-west.apache.org (ASF Mail Server at mailrelay1-us-west.apache.org) with ESMTP id D1341E017A for ; Mon, 6 Jun 2016 09:12:04 +0000 (UTC) Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id CF7553A0336 for ; Mon, 6 Jun 2016 09:12:04 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1746977 - in /ctakes/sandbox/ctakes-clinical-deid/src: main/java/org/apache/ctakes/deid/ main/resources/wordlists/ main/ruta/org/apache/ctakes/deid/ test/java/org/apache/ctakes/deid/ Date: Mon, 06 Jun 2016 09:12:04 -0000 To: commits@ctakes.apache.org From: pkluegl@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20160606091204.CF7553A0336@svn01-us-west.apache.org> archived-at: Mon, 06 Jun 2016 09:12:11 -0000 Author: pkluegl Date: Mon Jun 6 09:12:04 2016 New Revision: 1746977 URL: http://svn.apache.org/viewvc?rev=1746977&view=rev Log: CTAKES-384 - extended doctor rules Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt (with props) ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt (with props) ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt (with props) Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java Mon Jun 6 09:12:04 2016 @@ -48,7 +48,7 @@ public class SimpleDeidEntityComparator public static final String PARAM_CREATE_RUTA_EVAL_ANNOTATIONS = "createRutaEvalAnnotations"; - private static final boolean PRINT_ANNOTATIONS = false; + private static final boolean PRINT_ANNOTATIONS = true; @ConfigurationParameter(name = PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, mandatory = true, defaultValue = "false") private Boolean createRutaEvalAnnotations; Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt?rev=1746977&view=auto ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt (added) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt Mon Jun 6 09:12:04 2016 @@ -0,0 +1,12 @@ +MD +NP +PA-C +MDA +MD-Attending +MSN +ANP +NP +MD. +PhD. +PhD +NP \ No newline at end of file Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt?rev=1746977&view=auto ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt (added) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt Mon Jun 6 09:12:04 2016 @@ -0,0 +1,8 @@ +PCP +PA +PRS +PCP +Transcribed +Dictated +electronically +signed recommended \ No newline at end of file Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt ------------------------------------------------------------------------------ svn:eol-style = native Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt?rev=1746977&view=auto ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt (added) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt Mon Jun 6 09:12:04 2016 @@ -0,0 +1,10 @@ +Attending +Resident +Residents +Provider +Intern +Att +Surgeon +Cardiologist +MD +Staff \ No newline at end of file Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt ------------------------------------------------------------------------------ svn:eol-style = native Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt Mon Jun 6 09:12:04 2016 @@ -1,5 +1,4 @@ Name -Dr Mr Mrs Ms Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta Mon Jun 6 09:12:04 2016 @@ -12,4 +12,4 @@ RETAINTYPE; (MonthInd{-PARTOF(deid.Date)} Num4{-PARTOF(deid.Date),REGEXP("19..|20..")}){-> deid.Date}; Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date}; -MonthInd{-PARTOF(deid.Date)-> deid.Date}; +MonthInd{-PARTOF(deid.Date), -PARTOF(deid.DeidEntity)-> deid.Date}; Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Mon Jun 6 09:12:04 2016 @@ -18,6 +18,7 @@ TYPESYSTEM org.apache.ctakes.deid.FaxRut TYPESYSTEM org.apache.ctakes.deid.PatientRutaTypeSystem; TYPESYSTEM org.apache.ctakes.deid.IDNumRutaTypeSystem; TYPESYSTEM org.apache.ctakes.deid.MedicalRecNumRutaTypeSystem; +TYPESYSTEM org.apache.ctakes.deid.DateRutaTypeSystem; SCRIPT org.apache.ctakes.deid.Dictionaries; SCRIPT org.apache.ctakes.deid.Age; @@ -50,6 +51,9 @@ CALL(UserName); CALL(Zip); CALL(Date); + +Age{PARTOF(deid.Date) -> UNMARK(Age)}; + Email{-> Contact, Contact.entityType = "EMAIL"}; Url{-> Contact, Contact.entityType = "URL"}; Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Mon Jun 6 09:12:04 2016 @@ -7,6 +7,7 @@ DECLARE KeywordInd; DECLARE KeywordInd ProfessionInd, StateContext, DeceasedInd, FamilyInd, MonthInd; DECLARE KeywordInd StreetInd, StreetFullInd, AgePostInd, AgePreInd, PhonePreInd; DECLARE KeywordInd NationalityInd, SpokenLanguageInd, CountryInd, NamePrefixInd, PatientPrefixInd; +DECLARE KeywordInd DrPostfixInd, DrPrefixInd1, DrPrefixInd2; TRIE( "profession.txt" = ProfessionInd, @@ -25,6 +26,9 @@ TRIE( "country.txt" = CountryInd, "name_prefix.txt" = NamePrefixInd, "patient_prefix.txt" = PatientPrefixInd, + "dr_postfix.txt" = DrPostfixInd, + "dr_prefix1.txt" = DrPrefixInd1, + "dr_prefix2.txt" = DrPrefixInd2, trie, true, 4, false, 0, "-"); DECLARE Url, Email; @@ -59,13 +63,14 @@ NUM->{ Document{REGEXP("\\d{8,12}")-> Num812}; }; -DECLARE LParen, RParen, Dash, Slash, Hash; +DECLARE LParen, RParen, Dash, Slash, Hash, Bar; SPECIAL-> { Document.ct=="("{-> LParen}; Document.ct==")"{-> RParen}; Document.ct=="-"{-> Dash}; Document.ct=="/"{-> Slash}; Document.ct=="#"{-> Hash}; + Document.ct=="|"{-> Bar}; }; DECLARE ApoInd; Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta Mon Jun 6 09:12:04 2016 @@ -12,9 +12,29 @@ RETAINTYPE; SPECIAL.ct=="^" (CAP COMMA? CAP{ENDSWITH(Split)}){-> Doctor}; -//W{REGEXP("Drs?", true)} PERIOD? -// @CW{-REGEXP("Done|Take|PO", true)} -// CW CW? -// ; +// TODO simplified? +W{REGEXP("Drs?", true)} PERIOD? + (@CW{-REGEXP("Done|Take|PO|dr", true)} + PERIOD?{-ENDSWITH(Split)} + CW? CW? CW?){-> Doctor}; + +(CW{-REGEXP("Dr|Name")} COMMA? CW CW? CW?){-> Doctor} COMMA? DrPostfixInd; +(CW{-REGEXP("Dr|Name")} CW? PERIOD{-ENDSWITH(Split)} + CW (PERIOD{-ENDSWITH(Split)} CW)?){-> Doctor} COMMA? DrPostfixInd; +(CW{-REGEXP("Dr")} PERIOD{-ENDSWITH(Split)} CAP CAP){-> Doctor} COMMA? DrPostfixInd; +(CAP{-REGEXP("DR")} COMMA? CAP (CW PERIOD)?){-> Doctor} COMMA? DrPostfixInd; +(CAP{-REGEXP("DR")} (COMMA| (CW PERIOD))? CAP CAP? CW?){-> Doctor} COMMA? DrPostfixInd; + +DrPrefixInd1{-> SHIFT(DrPrefixInd1,1,4)} W{REGEXP("(?i)by|for")} W{REGEXP("(?i)physician")} COLON?; +DrPrefixInd1 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor}; +DrPrefixInd1 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor}; +DrPrefixInd1 (CAP{-REGEXP("DR")} ANY{PARTOF({COMMA,CW,PERIOD})} CAP PERIOD? CAP CAP?){-> Doctor}; + +Split (W{-> Doctor} ANY?{PARTOF({Slash,COLON,Bar})})+ SEMICOLON? (Num2 Slash)? Num69 PERIOD "doc"; + +DrPrefixInd2{-> SHIFT(DrPrefixInd2,1,3)} W?{REGEXP("(?i)physician")} COLON; +DrPrefixInd2 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor}; +DrPrefixInd2 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor}; +DrPrefixInd1 (CAP{-REGEXP("DR")} COMMA? CAP){-> Doctor}; +DrPrefixInd1 (CW{-REGEXP("Dr")}){-> Doctor}; -//TODO \ No newline at end of file Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta Mon Jun 6 09:12:04 2016 @@ -56,6 +56,13 @@ PatientPrefixInd COLON CAP COMMA? CAP (CW{REGEXP(".")} PERIOD?)? ){-> Patient}; - - - \ No newline at end of file +W.ct=="seeing" + (CW{-REGEXP("(?i)Done|Dr|Mr|Mrs|Miss|Ms|Pt|Patients")} + CW?{-REGEXP("Done")}){-> Patient}; + +RETAINTYPE(BREAK); +BREAK BREAK (@CAP COMMA? CAP ((CW PERIOD)|(CW COMMA? CW))?){-> Patient} + BREAK BREAK? Num78 BREAK; +RETAINTYPE; + +W{REGEXP("Mr|Mrs|Ms|Miss")} PERIOD? W{-REGEXP("(?i)take|pt") -> Patient}; \ No newline at end of file Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java?rev=1746977&r1=1746976&r2=1746977&view=diff ============================================================================== --- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java (original) +++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Mon Jun 6 09:12:04 2016 @@ -23,13 +23,17 @@ import java.io.IOException; import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes; import org.apache.uima.UIMAException; +import org.apache.uima.analysis_engine.AnalysisEngineDescription; import org.apache.uima.fit.factory.AnalysisEngineFactory; import org.apache.uima.fit.factory.CollectionReaderFactory; import org.apache.uima.fit.pipeline.SimplePipeline; import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.ruta.engine.RutaEngine; public class I2B2Evaluation { + private static final boolean DEBUG = true; + public static void main(String[] args) throws ResourceInitializationException, UIMAException, IOException { @@ -38,15 +42,25 @@ public class I2B2Evaluation { testData = args[0]; } + AnalysisEngineDescription ruta = null; + if(DEBUG) { + ruta = AnalysisEngineFactory.createEngineDescription("org.apache.ctakes.deid.DeidRutaAnnotator", + RutaEngine.PARAM_DEBUG, true, RutaEngine.PARAM_DEBUG_WITH_MATCHES, true, + RutaEngine.PARAM_PROFILE, true, RutaEngine.PARAM_STATISTICS, true, + RutaEngine.PARAM_CREATED_BY, true); + } else { + ruta = AnalysisEngineFactory.createEngineDescription("org.apache.ctakes.deid.DeidRutaAnnotator"); + } + SimplePipeline.runPipeline( - CollectionReaderFactory.createReader(I2B2DeidCollectionReader.class, + CollectionReaderFactory.createReaderDescription(I2B2DeidCollectionReader.class, I2B2DeidCollectionReader.PARAM_INPUT_DIRECTORY, testData, I2B2DeidCollectionReader.PARAM_GOLD_VIEW, "gold"), - AnalysisEngineFactory.createEngine("org.apache.ctakes.deid.DeidRutaAnnotator"), - AnalysisEngineFactory.createEngine(SimpleDeidEntityComparator.class, + ruta, + AnalysisEngineFactory.createEngineDescription(SimpleDeidEntityComparator.class, SimpleDeidEntityComparator.PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, true), - AnalysisEngineFactory.createEngine(XmiWriterCasConsumerCtakes.class, + AnalysisEngineFactory.createEngineDescription(XmiWriterCasConsumerCtakes.class, XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR, "target/xmis")); }