ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pklu...@apache.org
Subject svn commit: r1746977 - in /ctakes/sandbox/ctakes-clinical-deid/src: main/java/org/apache/ctakes/deid/ main/resources/wordlists/ main/ruta/org/apache/ctakes/deid/ test/java/org/apache/ctakes/deid/
Date Mon, 06 Jun 2016 09:12:04 GMT
Author: pkluegl
Date: Mon Jun  6 09:12:04 2016
New Revision: 1746977

URL: http://svn.apache.org/viewvc?rev=1746977&view=rev
Log:
CTAKES-384
- extended doctor rules

Added:
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt   (with props)
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt   (with props)
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt   (with props)
Modified:
    ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
    ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
    ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/java/org/apache/ctakes/deid/SimpleDeidEntityComparator.java Mon Jun  6 09:12:04 2016
@@ -48,7 +48,7 @@ public class SimpleDeidEntityComparator
 
   public static final String PARAM_CREATE_RUTA_EVAL_ANNOTATIONS = "createRutaEvalAnnotations";
 
-  private static final boolean PRINT_ANNOTATIONS = false;
+  private static final boolean PRINT_ANNOTATIONS = true;
 
   @ConfigurationParameter(name = PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, mandatory = true, defaultValue = "false")
   private Boolean createRutaEvalAnnotations;

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt?rev=1746977&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt Mon Jun  6 09:12:04 2016
@@ -0,0 +1,12 @@
+MD
+NP
+PA-C
+MDA
+MD-Attending
+MSN
+ANP
+NP
+MD.
+PhD.
+PhD
+NP
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_postfix.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt?rev=1746977&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt Mon Jun  6 09:12:04 2016
@@ -0,0 +1,8 @@
+PCP
+PA
+PRS
+PCP
+Transcribed
+Dictated
+electronically
+signed recommended
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix1.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt?rev=1746977&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt (added)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt Mon Jun  6 09:12:04 2016
@@ -0,0 +1,10 @@
+Attending
+Resident
+Residents
+Provider
+Intern
+Att
+Surgeon
+Cardiologist
+MD
+Staff
\ No newline at end of file

Propchange: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/dr_prefix2.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/resources/wordlists/name_prefix.txt Mon Jun  6 09:12:04 2016
@@ -1,5 +1,4 @@
 Name
-Dr
 Mr
 Mrs
 Ms

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Date.ruta Mon Jun  6 09:12:04 2016
@@ -12,4 +12,4 @@ RETAINTYPE;
 
 (MonthInd{-PARTOF(deid.Date)} Num4{-PARTOF(deid.Date),REGEXP("19..|20..")}){-> deid.Date};
 Num4{-PARTOF(deid.Date),REGEXP("19..|20..")-> deid.Date};
-MonthInd{-PARTOF(deid.Date)-> deid.Date};
+MonthInd{-PARTOF(deid.Date), -PARTOF(deid.DeidEntity)-> deid.Date};

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Deid.ruta Mon Jun  6 09:12:04 2016
@@ -18,6 +18,7 @@ TYPESYSTEM org.apache.ctakes.deid.FaxRut
 TYPESYSTEM org.apache.ctakes.deid.PatientRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.IDNumRutaTypeSystem;
 TYPESYSTEM org.apache.ctakes.deid.MedicalRecNumRutaTypeSystem;
+TYPESYSTEM org.apache.ctakes.deid.DateRutaTypeSystem;
 
 SCRIPT org.apache.ctakes.deid.Dictionaries;
 SCRIPT org.apache.ctakes.deid.Age;
@@ -50,6 +51,9 @@ CALL(UserName);
 CALL(Zip);
 CALL(Date);
 
+
+Age{PARTOF(deid.Date) -> UNMARK(Age)};
+
 Email{-> Contact, Contact.entityType = "EMAIL"};
 Url{-> Contact, Contact.entityType = "URL"};
 

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Dictionaries.ruta Mon Jun  6 09:12:04 2016
@@ -7,6 +7,7 @@ DECLARE KeywordInd;
 DECLARE KeywordInd ProfessionInd, StateContext, DeceasedInd, FamilyInd, MonthInd;
 DECLARE KeywordInd StreetInd, StreetFullInd, AgePostInd, AgePreInd, PhonePreInd;
 DECLARE KeywordInd NationalityInd, SpokenLanguageInd, CountryInd, NamePrefixInd, PatientPrefixInd;
+DECLARE KeywordInd DrPostfixInd, DrPrefixInd1, DrPrefixInd2;
 
 TRIE(
     "profession.txt" = ProfessionInd,
@@ -25,6 +26,9 @@ TRIE(
     "country.txt" = CountryInd,
     "name_prefix.txt" = NamePrefixInd,
     "patient_prefix.txt" = PatientPrefixInd,
+    "dr_postfix.txt" = DrPostfixInd,
+    "dr_prefix1.txt" = DrPrefixInd1,
+    "dr_prefix2.txt" = DrPrefixInd2,
     trie, true, 4, false, 0, "-");
 
 DECLARE Url, Email;
@@ -59,13 +63,14 @@ NUM->{
 	Document{REGEXP("\\d{8,12}")-> Num812};
 };
 
-DECLARE LParen, RParen, Dash, Slash, Hash;
+DECLARE LParen, RParen, Dash, Slash, Hash, Bar;
 SPECIAL-> {
     Document.ct=="("{-> LParen};
     Document.ct==")"{-> RParen};
     Document.ct=="-"{-> Dash};
     Document.ct=="/"{-> Slash};
     Document.ct=="#"{-> Hash};
+    Document.ct=="|"{-> Bar};
 };
 
 DECLARE ApoInd;

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Doctor.ruta Mon Jun  6 09:12:04 2016
@@ -12,9 +12,29 @@ RETAINTYPE;
 
 SPECIAL.ct=="^" (CAP COMMA? CAP{ENDSWITH(Split)}){-> Doctor};
 
-//W{REGEXP("Drs?", true)} PERIOD? 
-//    @CW{-REGEXP("Done|Take|PO", true)}
-//    CW CW?
-//    ;
+// TODO simplified?
+W{REGEXP("Drs?", true)} PERIOD? 
+    (@CW{-REGEXP("Done|Take|PO|dr", true)}
+    PERIOD?{-ENDSWITH(Split)}
+    CW? CW? CW?){-> Doctor};
+
+(CW{-REGEXP("Dr|Name")} COMMA? CW CW? CW?){-> Doctor} COMMA? DrPostfixInd;
+(CW{-REGEXP("Dr|Name")} CW? PERIOD{-ENDSWITH(Split)} 
+    CW (PERIOD{-ENDSWITH(Split)} CW)?){-> Doctor} COMMA? DrPostfixInd;
+(CW{-REGEXP("Dr")} PERIOD{-ENDSWITH(Split)} CAP CAP){-> Doctor} COMMA? DrPostfixInd;
+(CAP{-REGEXP("DR")} COMMA? CAP (CW PERIOD)?){-> Doctor} COMMA? DrPostfixInd;
+(CAP{-REGEXP("DR")} (COMMA| (CW PERIOD))? CAP CAP? CW?){-> Doctor} COMMA? DrPostfixInd;
+
+DrPrefixInd1{-> SHIFT(DrPrefixInd1,1,4)} W{REGEXP("(?i)by|for")} W{REGEXP("(?i)physician")} COLON?;
+DrPrefixInd1 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor};
+DrPrefixInd1 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor};
+DrPrefixInd1 (CAP{-REGEXP("DR")} ANY{PARTOF({COMMA,CW,PERIOD})} CAP PERIOD? CAP CAP?){-> Doctor};
+
+Split (W{-> Doctor} ANY?{PARTOF({Slash,COLON,Bar})})+ SEMICOLON? (Num2 Slash)? Num69 PERIOD "doc";
+
+DrPrefixInd2{-> SHIFT(DrPrefixInd2,1,3)} W?{REGEXP("(?i)physician")} COLON;
+DrPrefixInd2 (CW{-REGEXP("Dr")} CW PERIOD? CW){-> Doctor};
+DrPrefixInd2 (CW{-REGEXP("Dr")} COMMA? CW CW?){-> Doctor};
+DrPrefixInd1 (CAP{-REGEXP("DR")} COMMA? CAP){-> Doctor};
+DrPrefixInd1 (CW{-REGEXP("Dr")}){-> Doctor};
 
-//TODO
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/main/ruta/org/apache/ctakes/deid/Patient.ruta Mon Jun  6 09:12:04 2016
@@ -56,6 +56,13 @@ PatientPrefixInd COLON
     CAP COMMA? CAP (CW{REGEXP(".")} PERIOD?)?
     ){-> Patient};
 
-    
-    
-    
\ No newline at end of file
+W.ct=="seeing" 
+    (CW{-REGEXP("(?i)Done|Dr|Mr|Mrs|Miss|Ms|Pt|Patients")}  
+    CW?{-REGEXP("Done")}){-> Patient};
+
+RETAINTYPE(BREAK);
+BREAK BREAK (@CAP COMMA? CAP ((CW PERIOD)|(CW COMMA? CW))?){-> Patient}
+    BREAK BREAK? Num78 BREAK;  
+RETAINTYPE;  
+
+W{REGEXP("Mr|Mrs|Ms|Miss")} PERIOD? W{-REGEXP("(?i)take|pt") -> Patient};
\ No newline at end of file

Modified: ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java?rev=1746977&r1=1746976&r2=1746977&view=diff
==============================================================================
--- ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java (original)
+++ ctakes/sandbox/ctakes-clinical-deid/src/test/java/org/apache/ctakes/deid/I2B2Evaluation.java Mon Jun  6 09:12:04 2016
@@ -23,13 +23,17 @@ import java.io.IOException;
 
 import org.apache.ctakes.core.cc.XmiWriterCasConsumerCtakes;
 import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
 import org.apache.uima.fit.factory.CollectionReaderFactory;
 import org.apache.uima.fit.pipeline.SimplePipeline;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.ruta.engine.RutaEngine;
 
 public class I2B2Evaluation {
 
+  private static final boolean DEBUG = true;
+
   public static void main(String[] args)
           throws ResourceInitializationException, UIMAException, IOException {
 
@@ -38,15 +42,25 @@ public class I2B2Evaluation {
       testData = args[0];
     }
     
+    AnalysisEngineDescription ruta = null;
+    if(DEBUG) {
+      ruta = AnalysisEngineFactory.createEngineDescription("org.apache.ctakes.deid.DeidRutaAnnotator",
+              RutaEngine.PARAM_DEBUG, true, RutaEngine.PARAM_DEBUG_WITH_MATCHES, true,
+              RutaEngine.PARAM_PROFILE, true, RutaEngine.PARAM_STATISTICS, true, 
+              RutaEngine.PARAM_CREATED_BY, true);
+    } else {
+      ruta = AnalysisEngineFactory.createEngineDescription("org.apache.ctakes.deid.DeidRutaAnnotator");
+    }
+    
     SimplePipeline.runPipeline(
-            CollectionReaderFactory.createReader(I2B2DeidCollectionReader.class,
+            CollectionReaderFactory.createReaderDescription(I2B2DeidCollectionReader.class,
                     I2B2DeidCollectionReader.PARAM_INPUT_DIRECTORY,
                     testData,
                     I2B2DeidCollectionReader.PARAM_GOLD_VIEW, "gold"),
-            AnalysisEngineFactory.createEngine("org.apache.ctakes.deid.DeidRutaAnnotator"),
-            AnalysisEngineFactory.createEngine(SimpleDeidEntityComparator.class,
+            ruta,
+            AnalysisEngineFactory.createEngineDescription(SimpleDeidEntityComparator.class,
                     SimpleDeidEntityComparator.PARAM_CREATE_RUTA_EVAL_ANNOTATIONS, true),
-            AnalysisEngineFactory.createEngine(XmiWriterCasConsumerCtakes.class,
+            AnalysisEngineFactory.createEngineDescription(XmiWriterCasConsumerCtakes.class,
                     XmiWriterCasConsumerCtakes.PARAM_OUTPUTDIR, "target/xmis"));
 
   }



Mime
View raw message