ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1624062 [1/4] - in /ctakes/sandbox/dictionarytool: data/default/ src/org/apache/ctakes/dictionarytool/ src/org/apache/ctakes/dictionarytool/reader/ src/org/apache/ctakes/dictionarytool/util/ src/org/apache/ctakes/dictionarytool/util/collec...
Date Wed, 10 Sep 2014 17:30:43 GMT
Author: seanfinan
Date: Wed Sep 10 17:30:42 2014
New Revision: 1624062

URL: http://svn.apache.org/r1624062
Log:
Updates to create the new dictionary format.
Rough: this is a developer-only utility and is not production quality by any means.
Some day I may have time to make it pretty, but for now it is simply public and usable.

Added:
    ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt   (with props)
    ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DoseUtil.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java   (with props)
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java   (with props)
Modified:
    ctakes/sandbox/dictionarytool/data/default/CtakesAllTuis.txt
    ctakes/sandbox/dictionarytool/data/default/ModifierSuffixes.txt
    ctakes/sandbox/dictionarytool/data/default/RemovalPrefixTriggers.txt
    ctakes/sandbox/dictionarytool/data/default/UnwantedPrefixes.txt
    ctakes/sandbox/dictionarytool/data/default/UnwantedSuffixes.txt
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiRelationsMapReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTextsMapReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTuiMapReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/TuiListReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCodesForCuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTextsReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsRelationsForCuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsSemanticTypeTuiReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsFileName.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/index/MrconsoIndex.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java

Modified: ctakes/sandbox/dictionarytool/data/default/CtakesAllTuis.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/CtakesAllTuis.txt?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/CtakesAllTuis.txt (original)
+++ ctakes/sandbox/dictionarytool/data/default/CtakesAllTuis.txt Wed Sep 10 17:30:42 2014
@@ -35,3 +35,25 @@ T059
 T060
 T061
 // DRUG
+T109
+T110
+T114
+T115
+T116
+T118
+T119
+T121
+T122
+T123
+T124
+T125
+T126
+T127
+T129
+T130
+T131
+T195
+T196
+T197
+T200
+T203

Added: ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt (added)
+++ ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt Wed Sep 10 17:30:42 2014
@@ -0,0 +1,22 @@
+T109
+T110
+T114
+T115
+T116
+T118
+T119
+T121
+T122
+T123
+T124
+T125
+T126
+T127
+T129
+T130
+T131
+T195
+T196
+T197
+T200
+T203

Propchange: ctakes/sandbox/dictionarytool/data/default/CtakesDrugTuis.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt (added)
+++ ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt Wed Sep 10 17:30:42 2014
@@ -0,0 +1,36 @@
+// ANAT
+T021
+T022
+T023
+T024
+T025
+T026
+T029
+T030
+// DISO
+T019
+T020
+T037
+T047
+T048
+T049
+T050
+T190
+T191
+// FIND
+T033
+T034
+T040
+T041
+T042
+T043
+T044
+T045
+T046
+T056
+T057
+T184
+// PROC
+T059
+T060
+T061

Propchange: ctakes/sandbox/dictionarytool/data/default/CtakesSnomedTuis.txt
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/dictionarytool/data/default/ModifierSuffixes.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/ModifierSuffixes.txt?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/ModifierSuffixes.txt (original)
+++ ctakes/sandbox/dictionarytool/data/default/ModifierSuffixes.txt Wed Sep 10 17:30:42 2014
@@ -4,3 +4,12 @@
 ( clinical )
 ( malignant )
 ( partial )
+- acute
+- chronic
+, acute
+, bilateral
+, chronic
+, multiple
+, single
+ acute
+ chronic

Modified: ctakes/sandbox/dictionarytool/data/default/RemovalPrefixTriggers.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/RemovalPrefixTriggers.txt?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/RemovalPrefixTriggers.txt (original)
+++ ctakes/sandbox/dictionarytool/data/default/RemovalPrefixTriggers.txt Wed Sep 10 17:30:42 2014
@@ -1,2 +1,3 @@
 deprecated 
-retired procedure 
+retired procedure
+

Modified: ctakes/sandbox/dictionarytool/data/default/UnwantedPrefixes.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/UnwantedPrefixes.txt?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/UnwantedPrefixes.txt (original)
+++ ctakes/sandbox/dictionarytool/data/default/UnwantedPrefixes.txt Wed Sep 10 17:30:42 2014
@@ -14,7 +14,10 @@
 [ q ]
 [ v ]
 [ x ]
-activities involving 
+[ so ]
+activities involving
+and unspecified
+( non-specific )
 child :
 college of american pathologists cancer checklist ;
 engaged in 

Modified: ctakes/sandbox/dictionarytool/data/default/UnwantedSuffixes.txt
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/data/default/UnwantedSuffixes.txt?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/data/default/UnwantedSuffixes.txt (original)
+++ ctakes/sandbox/dictionarytool/data/default/UnwantedSuffixes.txt Wed Sep 10 17:30:42 2014
@@ -1,3 +1,4 @@
+/
 '
 -
 - ( other )
@@ -117,6 +118,7 @@
 ( & c / o ) 
 ( & c / s )
 ( & certificate )
+( & chronic )
 ( & claim )
 ( & claim gp / ms )
 ( & congenital )
@@ -906,6 +908,7 @@
 , nos
 , not elsewhere classified
 , services & procedure codes ( category iii , temporary )
+, unclassified
 , united states
 , unspecified
 , unspecified part

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java Wed Sep 10 17:30:42 2014
@@ -1,23 +1,21 @@
 package org.apache.ctakes.dictionarytool;
 
-import org.apache.ctakes.dictionarytool.reader.OrangebookReader;
 import org.apache.ctakes.dictionarytool.reader.SourceTypeListReader;
 import org.apache.ctakes.dictionarytool.reader.TuiListReader;
 import org.apache.ctakes.dictionarytool.reader.UmlsCodesForCuisReader;
-import org.apache.ctakes.dictionarytool.reader.UmlsCuisForTextsReader;
 import org.apache.ctakes.dictionarytool.util.CreatorProperties;
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
-import org.apache.ctakes.dictionarytool.util.UmlsSourceTypeCuiValidator;
-import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
-import org.apache.ctakes.dictionarytool.writer.CuiCodesMapWriter;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+import org.apache.ctakes.dictionarytool.writer.CuiCodesDbWriter;
+import org.apache.ctakes.dictionarytool.writer.CuiCodesWriter;
 
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.Map;
 import java.util.logging.Logger;
 
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.ORANGE_BOOK;
+import static org.apache.ctakes.dictionarytool.reader.UmlsCodesForCuisReader.CuiCodeInfo;
+import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_BASE;
+import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.MED_TUI_LIST;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
@@ -34,88 +32,71 @@ public class CodeMapCreator {
    static private final Logger LOGGER = Logger.getLogger( "CodeMapCreator" );
 
 
-//   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
-//                                               "-ob",
-//                                               "C:\\Spiffy\\App\\umls\\2013AA\\2013AA\\Sean\\OrangeBook\\EOBZIP_2013_07_08-30_Fixed_PE\\Products.txt",
-//                                               "-ol", "C:/Spiffy/Output/DictionaryToolTest/SnomedCodeMap.bsv"
-//   };
+   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+                                               "-db",
+                                               "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+                                               "-tbl", "kludge"
+         //                                               "-ol", "C:/Spiffy/Output/DictionaryToolTest/CodeMap_sno_rx.bsv"
+   };
 
 
    public static void main( final String[] args ) {
       // Set properties (user options) used to create the dictionary
-      final CreatorProperties properties = new CreatorProperties( args );
-//            final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
+            final CreatorProperties properties = new CreatorProperties( args );
+//      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
       // Set up the term utility
-      final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
       // Write the non-medication terms
       codeMapSemanticGroups();
       // Write the medication terms
-      if ( ORANGE_BOOK.hasValue() ) {
-         codeMapOrangebook( umlsTermUtil );
-      }
+      codeMapRxNorm();
    }
 
    static private void codeMapSemanticGroups() {
       // Read wanted Sources
       final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes( SOURCE.getValue() );
       // Read wanted Tuis
-      final Collection<String> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
+      final Collection<Integer> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
          LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
          System.exit( 1 );
       }
       // get the valid Cuis for all wanted Tuis
-      final Map<String, Collection<String>> validCuisAndTuis
+      final HashSetMap<Long, Integer> validCuisAndTuis
             = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis );
       // Get the codes for all cuis
-      final Map<String, Map<String, Collection<String>>> cuisAndCodes
-            = UmlsCodesForCuisReader.readCodesForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       validCuisAndTuis.keySet() );
-      // Output
-      if ( TERM_LIST.hasValue() ) {
-         CuiCodesMapWriter.writeCuiCodes( TERM_LIST.getValue(), cuisAndCodes );
-         //      } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
-         //         if ( isRareWordIndex ) {
-         //            RareWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-         //                                             DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         //         } else {
-         //            FirstWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-         //                                              DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         //         }
-      }
+      final Collection<CuiCodeInfo> cuiCodeInfo
+            = UmlsCodesForCuisReader.readCuiCodeInfo( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+                                                      validCuisAndTuis );
+      writeOutput( cuiCodeInfo );
       LOGGER.info( "Done Code-Mapping Non-Medication Cuis" );
    }
 
 
-   static private void codeMapOrangebook( final UmlsTermUtil umlsTermUtil ) {
-      final Collection<String> orangeBookTexts = OrangebookReader.readOrangeBookTexts( ORANGE_BOOK.getValue() );
-      final Map<String, Collection<String>> cuiTexts
-            = UmlsCuisForTextsReader.readCuisForTexts( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       orangeBookTexts, umlsTermUtil );
-      final Collection<String> validCuis
-            = UmlsSourceTypeCuiValidator.getSourceTypeValidCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                                 Arrays.asList( "RXNORM" ),
-                                                                 cuiTexts.keySet() );
+   static private void codeMapRxNorm() {
+      // Read wanted Tuis
+      final Collection<Integer> wantedTuis = TuiListReader.readTuiList( MED_TUI_LIST.getValue() );
+      if ( wantedTuis == null || wantedTuis.isEmpty() ) {
+         LOGGER.severe( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
+         System.exit( 1 );
+      }
+      // get the valid Cuis for all wanted Tuis, INgredient PreciseINgredient MultipleINgredient BrandName
+      final HashSetMap<Long, Integer> validCuisAndTuis
+            = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), Arrays.asList( "RXNORM" ),
+                                              wantedTuis, Arrays.asList( "IN", "PIN", "MIN", "BN" ) );
       // Get the codes for all cuis
-      final Map<String, Map<String, Collection<String>>> cuisAndCodes
-            = UmlsCodesForCuisReader.readCodesForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       validCuis );
-      // Output
+      final Collection<CuiCodeInfo> cuiCodeInfo
+            = UmlsCodesForCuisReader.readCuiCodeInfo( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+                                                      validCuisAndTuis );
+      writeOutput( cuiCodeInfo );
+      LOGGER.info( "Done Code-Mapping Medication Cuis" );
+   }
+
+   static private void writeOutput( final Collection<CuiCodeInfo> cuiCodeInfo ) {
       if ( TERM_LIST.hasValue() ) {
-         CuiCodesMapWriter.writeCuiCodes( TERM_LIST.getValue(), cuisAndCodes );
-         //      } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
-         //         final Map<String,Collection<String>> cuiTuis
-         //               = UmlsTuisForCuisReader.readUmlsTuisForCuis( UMLS_ROOT.getValue() + '/' + CUI_TUI_MAP._filename,
-         //                                                            validCuis );
-         //         if ( isRareWordIndex ) {
-         //            RareWordDbWriter.writeTermsToDb( cuiTuis, validCuiTexts,
-         //                                             DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         //         } else {
-         //            FirstWordDbWriter.writeTermsToDb( cuiTuis, validCuiTexts,
-         //                                              DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         //         }
+         CuiCodesWriter.writeCuiCodeInfo( TERM_LIST.getValue(), cuiCodeInfo );
+      } else if ( DATA_BASE.hasValue() ) {
+         CuiCodesDbWriter.writeCuiCodeInfo( cuiCodeInfo, DATA_BASE.getValue(), "sa", "" );
       }
-      LOGGER.info( "Done Code-Mapping Medication Cuis" );
    }
 
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java Wed Sep 10 17:30:42 2014
@@ -1,35 +1,29 @@
 package org.apache.ctakes.dictionarytool;
 
-import org.apache.ctakes.dictionarytool.reader.OrangebookReader;
 import org.apache.ctakes.dictionarytool.reader.SourceTypeListReader;
 import org.apache.ctakes.dictionarytool.reader.TuiListReader;
-import org.apache.ctakes.dictionarytool.reader.UmlsCuisForTextsReader;
 import org.apache.ctakes.dictionarytool.reader.UmlsTextsForCuisReader;
-import org.apache.ctakes.dictionarytool.reader.UmlsTuisForCuisReader;
 import org.apache.ctakes.dictionarytool.util.CreatorProperties;
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
-import org.apache.ctakes.dictionarytool.util.UmlsSourceTypeCuiValidator;
 import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 import org.apache.ctakes.dictionarytool.writer.CuiTextsMapWriter;
 import org.apache.ctakes.dictionarytool.writer.FirstWordDbWriter;
 import org.apache.ctakes.dictionarytool.writer.RareWordDbWriter;
 
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
 import java.util.logging.Logger;
 
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_BASE;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_TABLE;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.ORANGE_BOOK;
+import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.MED_TUI_LIST;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
 import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.UMLS_ROOT;
 import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
-import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TUI_MAP;
 
 /**
  * Author: SPF
@@ -41,94 +35,96 @@ public class DictionaryCreator {
    static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator" );
 
 
-   //   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
-   //                               "-ob", "C:\\Spiffy\\App\\umls\\2013AA\\2013AA\\Sean\\OrangeBook\\EOBZIP_2013_07_08-30_Fixed_PE\\Products.txt",
-   //                               "-ol", "C:/Spiffy/Output/DictionaryToolTest/defaults.bsv"
-   //                               "-db", "jdbc:hsqldb:file:C:/Spiffy/Projects/RareWordDict/Sno2011ab_ctakes_Mem/cTakesUmls",
-   //                               "-tbl", "CTAKES_UMLS"
-   //                            };
+   static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf - clears ~850 2 char terms
+   static private final int MIN_RXNORM_TERM_LENGTH = 1;
+
+
+
+   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+                                               "-db",
+                                               "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+                                               "-tbl", "CUI_TERMS",
+         //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/Terms_sno_rx.bsv",
+         //                                  "-fw",
+         //                                  "-mtui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesDrugTuis.txt"
+         //                                  "-tui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesSnomedTuis.txt"
+   };
 
 
    public static void main( final String[] args ) {
       // Set properties (user options) used to create the dictionary
-      final CreatorProperties properties = new CreatorProperties( args );
-      //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
+            final CreatorProperties properties = new CreatorProperties( args );
+//      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
       // Set up the term utility
       final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
       // Write the non-medication terms
-      writeSemanticGroups( umlsTermUtil, properties.isRareWordIndex() );
+      writeSnomed( umlsTermUtil, properties.isRareWordIndex() );
       // Write the medication terms
-      if ( ORANGE_BOOK.hasValue() ) {
-         writeOrangebook( umlsTermUtil, properties.isRareWordIndex() );
-      }
+      writeRxNorm( umlsTermUtil, properties.isRareWordIndex() );
    }
 
 
-   static private void writeSemanticGroups( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
+   static private void writeSnomed( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
       // Read wanted Sources
       final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes( SOURCE.getValue() );
       // Read wanted Tuis
-      final Collection<String> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
+      final Collection<Integer> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
          LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
          System.exit( 1 );
       }
       // get the valid Cuis for all wanted Tuis
-      final Map<String, Collection<String>> validCuisAndTuis
+      final HashSetMap<Long, Integer> validCuisAndTuis
             = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis );
       // Get the texts for all cuis
-      final Map<String, Collection<String>> cuiTexts
+      // Term Types are not usable for Snomed.  ObsoletePreferredname IS (obsolete Synonym) PreferredTerm SYnonym
+      // PreferredTermGreatBritain  SYnonymGreatBritain OB (spelling variation?)   MTH_* MTH version
+      final HashSetMap<Long, String> cuiTexts
             = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       validCuisAndTuis.keySet(), umlsTermUtil );
-      if ( TERM_LIST.hasValue() ) {
-         CuiTextsMapWriter.writeCuiTexts( TERM_LIST.getValue(), cuiTexts );
-      } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
-         if ( isRareWordIndex ) {
-            RareWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-                                             DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         } else {
-            FirstWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-                                              DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
-         }
-      }
+                                                       validCuisAndTuis.keySet(), umlsTermUtil, false, true,
+                                                       MIN_SNOMED_TERM_LENGTH, 7 );
+      writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
       LOGGER.info( "Done Writing Non-Medication Terms" );
    }
 
-
-   static private void writeOrangebook( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
-      final Collection<String> orangeBookTexts = OrangebookReader.readOrangeBookTexts( ORANGE_BOOK.getValue() );
-      final Map<String, Collection<String>> cuiTexts
-            = UmlsCuisForTextsReader.readCuisForTexts( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       orangeBookTexts, umlsTermUtil );
-      final Collection<String> validCuis
-            = UmlsSourceTypeCuiValidator.getSourceTypeValidCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                                 Arrays.asList( "RXNORM" ),
-                                                                 cuiTexts.keySet() );
-      final Map<String, Collection<String>> validCuiTexts = new HashMap<String, Collection<String>>( cuiTexts.size() );
-      for ( String cui : validCuis ) {
-         final Collection<String> texts = cuiTexts.get( cui );
-         if ( cui.equals( "C0028128" ) ) {
-            // special case for nitric oxide
-            texts.remove( "no" );
-         }
-         validCuiTexts.put( cui, texts );
+   static private void writeRxNorm( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex ) {
+      // Read wanted Tuis
+      final Collection<Integer> wantedTuis = TuiListReader.readTuiList( MED_TUI_LIST.getValue() );
+      if ( wantedTuis == null || wantedTuis.isEmpty() ) {
+         LOGGER.severe( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
+         System.exit( 1 );
       }
+      // get the valid Cuis for all wanted Tuis, INgredient PreciseINgredient MultipleINgredient BrandName
+      final HashSetMap<Long, Integer> validCuisAndTuis
+            = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), Arrays.asList( "RXNORM" ),
+                                              wantedTuis, Arrays.asList( "IN", "PIN", "MIN", "BN" ) );
+      // Get the texts for all cuis
+      final HashSetMap<Long, String> cuiTexts
+            = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+                                                       validCuisAndTuis.keySet(), umlsTermUtil,
+                                                       false, true, MIN_RXNORM_TERM_LENGTH, 11 );
+      //      final HashSetMap<Long, String> wantedCuiTexts = RxNormTermUtil.getDeliveryFreeTerms( cuiTexts );
+      // special case for nitric oxide "no"
+      cuiTexts.remove( 28128l, "no" );
+      writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
+      LOGGER.info( "Done Writing RxNorm Terms" );
+   }
+
+
+   static private void writeOutput( final HashSetMap<Long, Integer> validCuisAndTuis,
+                                    final HashSetMap<Long, String> cuiTexts,
+                                    final boolean isRareWordIndex ) {
       if ( TERM_LIST.hasValue() ) {
-         CuiTextsMapWriter.writeCuiTexts( TERM_LIST.getValue(), validCuiTexts );
+         CuiTextsMapWriter.writeCuiTexts( TERM_LIST.getValue(), cuiTexts );
       } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
-         final Map<String, Collection<String>> cuiTuis
-               = UmlsTuisForCuisReader.readUmlsTuisForCuis( UMLS_ROOT.getValue() + '/' + CUI_TUI_MAP._filename,
-                                                            validCuis );
          if ( isRareWordIndex ) {
-            RareWordDbWriter.writeTermsToDb( cuiTuis, validCuiTexts,
-                                             DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
+            RareWordDbWriter.writeCuiTerms( validCuisAndTuis, cuiTexts,
+                                            DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
          } else {
-            FirstWordDbWriter.writeTermsToDb( cuiTuis, validCuiTexts,
+            FirstWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
                                               DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue() );
          }
       }
-      LOGGER.info( "Done Writing Medication Terms" );
    }
 
-
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiRelationsMapReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiRelationsMapReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiRelationsMapReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiRelationsMapReader.java Wed Sep 10 17:30:42 2014
@@ -28,20 +28,20 @@ final public class CuiRelationsMapReader
    }
 
    final static public class CuiRelations {
-      final public String __cui;
+      final public Long __cuiCode;
       final public String __text;
-      final public Collection<String> __synonyms;
-      final public Collection<String> __parents;
-      final public Collection<String> __children;
-      final public Collection<String> __broadeners;
-      final public Collection<String> __narrowers;
-      final public Collection<String> __similars;
+      final public Collection<Long> __synonyms;
+      final public Collection<Long> __parents;
+      final public Collection<Long> __children;
+      final public Collection<Long> __broadeners;
+      final public Collection<Long> __narrowers;
+      final public Collection<Long> __similars;
 
-      private CuiRelations( final String cui, final String text, final Collection<String> synonyms,
-                            final Collection<String> parents, final Collection<String> children,
-                            final Collection<String> broadeners, final Collection<String> narrowers,
-                            final Collection<String> similars ) {
-         __cui = cui;
+      private CuiRelations( final Long cuiCode, final String text, final Collection<Long> synonyms,
+                            final Collection<Long> parents, final Collection<Long> children,
+                            final Collection<Long> broadeners, final Collection<Long> narrowers,
+                            final Collection<Long> similars ) {
+         __cuiCode = cuiCode;
          __text = text;
          __synonyms = synonyms;
          __parents = parents;
@@ -55,9 +55,9 @@ final public class CuiRelationsMapReader
    private CuiRelationsMapReader() {
    }
 
-   static public Map<String, CuiRelations> readCuiRelationsMap( final String relationsPath ) {
+   static public Map<Long, CuiRelations> readCuiRelationsMap( final String relationsPath ) {
       System.out.println( "Compiling map of Cuis and Relations using " + relationsPath );
-      final Map<String, CuiRelations> cuiRelationsMap = new HashMap<String, CuiRelations>();
+      final Map<Long, CuiRelations> cuiRelationsMap = new HashMap<>();
       long lineCount = 0;
       try {
          final BufferedReader reader = FileUtil.createReader( relationsPath );
@@ -65,17 +65,23 @@ final public class CuiRelationsMapReader
          while ( tokens != null ) {
             lineCount++;
             if ( tokens.size() > CuiRelationsIndex.SYNONYMS._index ) {
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CuiRelationsIndex.CUI._index ) );
+               final Long cuiCode = CuiTuiUtil.getCuiCode( tokens.get( CuiRelationsIndex.CUI._index ) );
                final String text = tokens.get( CuiRelationsIndex.TEXT._index );
-               final Collection<String> synonyms = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.SYNONYMS._index ) );
-               final Collection<String> parents = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.PARENTS._index ) );
-               final Collection<String> children = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.CHILDREN._index ) );
-               final Collection<String> broadeners = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.BROADER._index ) );
-               final Collection<String> narrowers = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.NARROWER._index ) );
-               final Collection<String> similars = TokenUtil.getCsvItems( tokens.get( CuiRelationsIndex.SIMILAR._index ) );
-               final CuiRelations cuiRelations = new CuiRelations( cui, text, synonyms, parents, children,
+               final Collection<Long> synonyms = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.SYNONYMS._index ) ) );
+               final Collection<Long> parents = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.PARENTS._index ) ) );
+               final Collection<Long> children = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.CHILDREN._index ) ) );
+               final Collection<Long> broadeners = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.BROADER._index ) ) );
+               final Collection<Long> narrowers = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.NARROWER._index ) ) );
+               final Collection<Long> similars = CuiTuiUtil.getCuiCodes( TokenUtil.getCsvItems( tokens.get(
+                     CuiRelationsIndex.SIMILAR._index ) ) );
+               final CuiRelations cuiRelations = new CuiRelations( cuiCode, text, synonyms, parents, children,
                                                                    broadeners, narrowers, similars );
-               cuiRelationsMap.put( cui, cuiRelations );
+               cuiRelationsMap.put( cuiCode, cuiRelations );
             }
             if ( lineCount % 100000 == 0 ) {
                System.out.println( "File Line " + lineCount + "\t Cuis " + cuiRelationsMap.size() );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTextsMapReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTextsMapReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTextsMapReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTextsMapReader.java Wed Sep 10 17:30:42 2014
@@ -2,14 +2,11 @@ package org.apache.ctakes.dictionarytool
 
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.io.BufferedReader;
 import java.io.IOException;
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 
 /**
  * Author: SPF
@@ -30,9 +27,9 @@ final public class CuiTextsMapReader {
       }
    }
 
-   private Map<String, Collection<String>> readCuiTexts( final String termsPath ) {
+   private HashSetMap<String, String> readCuiTexts( final String termsPath ) {
       System.out.println( "Compiling map of Cuis and Texts using " + termsPath );
-      final Map<String, Collection<String>> cuiTexts = new HashMap<String, Collection<String>>();
+      final HashSetMap<String, String> cuiTexts = new HashSetMap<>();
       long lineCount = 0;
       try {
          final BufferedReader reader = FileUtil.createReader( termsPath );
@@ -42,12 +39,7 @@ final public class CuiTextsMapReader {
             if ( tokens.size() > CuiTextIndex.TEXT._index ) {
                final String cui = CuiTuiUtil.getAsCui( tokens.get( CuiTextIndex.CUI._index ) );
                final String text = tokens.get( CuiTextIndex.TEXT._index );
-               Collection<String> textsForCui = cuiTexts.get( cui );
-               if ( textsForCui == null ) {
-                  textsForCui = new HashSet<String>( 1 );
-                  cuiTexts.put( cui, textsForCui );
-               }
-               textsForCui.add( text );
+               cuiTexts.place( cui, text );
             }
             if ( lineCount % 100000 == 0 ) {
                System.out.println( "File Line " + lineCount + "\t Cuis " + cuiTexts.size() );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTuiMapReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTuiMapReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTuiMapReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/CuiTuiMapReader.java Wed Sep 10 17:30:42 2014
@@ -1,9 +1,7 @@
 package org.apache.ctakes.dictionarytool.reader;
 
 import org.apache.ctakes.dictionarytool.util.FileUtil;
-
-import java.util.Collection;
-import java.util.Map;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 /**
  * Author: SPF
@@ -15,7 +13,7 @@ final public class CuiTuiMapReader {
    private CuiTuiMapReader() {
    }
 
-   static private Map<String, Collection<String>> readCuiTuiMap( final String cuiTuiMapPath ) {
+   static private HashSetMap<String, String> readCuiTuiMap( final String cuiTuiMapPath ) {
       return FileUtil.readNamedSets( cuiTuiMapPath, "map of Cuis and Tuis" );
    }
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java Wed Sep 10 17:30:42 2014
@@ -24,8 +24,8 @@ final public class OrangebookReader {
 
 
    static public Collection<String> readOrangeBookTexts( final String orangebookPath ) {
-      final Collection<String> ingredients = new HashSet<String>( 1000 );
-      final Collection<String> tradeNames = new HashSet<String>( 1000 );
+      final Collection<String> ingredients = new HashSet<>( 1000 );
+      final Collection<String> tradeNames = new HashSet<>( 1000 );
       long lineCount = 0;
       try {
          final BufferedReader reader = FileUtil.createReader( orangebookPath );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/TuiListReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/TuiListReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/TuiListReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/TuiListReader.java Wed Sep 10 17:30:42 2014
@@ -17,11 +17,11 @@ final public class TuiListReader {
    }
 
 
-   static public Collection<String> readTuiList( final String tuiListPath ) {
+   static public Collection<Integer> readTuiList( final String tuiListPath ) {
       final Collection<String> wantedTuis = FileUtil.readOneColumn( tuiListPath, "list of Tuis" );
-      final Collection<String> addedTlist = new HashSet<String>( wantedTuis.size() );
-      for ( String code : wantedTuis ) {
-         addedTlist.add( CuiTuiUtil.getAsTui( code ) );
+      final Collection<Integer> addedTlist = new HashSet<>( wantedTuis.size() );
+      for ( String tui : wantedTuis ) {
+         addedTlist.add( CuiTuiUtil.getTuiCode( tui ) );
       }
       return addedTlist;
    }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCodesForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCodesForCuisReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCodesForCuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCodesForCuisReader.java Wed Sep 10 17:30:42 2014
@@ -2,6 +2,8 @@ package org.apache.ctakes.dictionarytool
 
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+import org.apache.ctakes.dictionarytool.util.index.MrconsoIndex;
 
 import java.io.BufferedReader;
 import java.io.IOException;
@@ -13,9 +15,11 @@ import java.util.List;
 import java.util.Map;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
+import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.FORM;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.LANGUAGE;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.SOURCE;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.SOURCE_CODE;
+import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.STATUS;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.TEXT;
 
 /**
@@ -28,36 +32,34 @@ final public class UmlsCodesForCuisReade
    private UmlsCodesForCuisReader() {
    }
 
-   static public Map<String, Map<String, Collection<String>>> readCodesForCuis( final String rrfPath,
-                                                                                final Collection<String> wantedCuis ) {
+   static public Collection<CuiCodeInfo> readCuiCodeInfo( final String rrfPath,
+                                                          final HashSetMap<Long, Integer> validCuisAndTuis ) {
       final Collection<String> codeSources
-            = new HashSet<String>( Arrays.asList( "ICD10PCS", "ICD9CM", "RXNORM", "SNOMEDCT" ) );
+            = new HashSet<>( Arrays.asList( "ICD10PCS", "ICD9CM", "RXNORM", "SNOMEDCT" ) );
       long lineCount = 0;
       long codeCount = 0;
-      final Map<String, Map<String, Collection<String>>> cuisAndCodes
-            = new HashMap<String, Map<String, Collection<String>>>( wantedCuis.size() );
+      final Map<Long, CuiCodeInfo> cuisAndCodes = new HashMap<>( validCuisAndTuis.size() );
       try {
          final BufferedReader reader = FileUtil.createReader( rrfPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
          while ( tokens != null ) {
             lineCount++;
-            if ( tokens.size() > TEXT._index
-                  && tokens.get( LANGUAGE._index ).equals( "ENG" )
-                  && codeSources.contains( tokens.get( SOURCE._index ) ) ) {
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI._index ) );
-               if ( wantedCuis.contains( cui ) ) {
-                  Map<String, Collection<String>> codeMap = cuisAndCodes.get( cui );
-                  if ( codeMap == null ) {
-                     codeMap = new HashMap<String, Collection<String>>( 1 );
-                     cuisAndCodes.put( cui, codeMap );
+            if ( tokens.size() > TEXT._index && getToken( tokens, LANGUAGE ).equals( "ENG" ) ) {
+               final Long cuiCode = CuiTuiUtil.getCuiCode( getToken( tokens, CUI ) );
+               final Collection<Integer> tuiCodes = validCuisAndTuis.get( cuiCode );
+               if ( tuiCodes != null ) {
+                  CuiCodeInfo cuiCodeInfo = cuisAndCodes.get( cuiCode );
+                  if ( cuiCodeInfo == null ) {
+                     cuiCodeInfo = new CuiCodeInfo( cuiCode, tuiCodes );
+                     cuisAndCodes.put( cuiCode, cuiCodeInfo );
                   }
-                  Collection<String> codes = codeMap.get( tokens.get( SOURCE._index ) );
-                  if ( codes == null ) {
-                     codes = new HashSet<String>( 1 );
-                     codeMap.put( tokens.get( SOURCE._index ), codes );
+                  if ( codeSources.contains( getToken( tokens, SOURCE ) ) ) {
+                     if ( cuiCodeInfo.place( getToken( tokens, SOURCE ), getToken( tokens, SOURCE_CODE ) ) ) {
+                        codeCount++;
+                     }
                   }
-                  if ( codes.add( tokens.get( SOURCE_CODE._index ) ) ) {
-                     codeCount++;
+                  if ( getToken( tokens, STATUS ).equals( "P" ) && getToken( tokens, FORM ).equals( "PF" ) ) {
+                     cuiCodeInfo.place( "PREFTERM", getToken( tokens, TEXT ) );
                   }
                }
             }
@@ -74,7 +76,39 @@ final public class UmlsCodesForCuisReade
          System.err.println( ioE.getMessage() );
       }
       System.out.println( "File Line " + lineCount + "\t Codes " + codeCount );
-      return cuisAndCodes;
+      return cuisAndCodes.values();
+   }
+
+   static private String getToken( final List<String> tokens, final MrconsoIndex mrconsoIndex ) {
+      return tokens.get( mrconsoIndex._index );
+   }
+
+   static public final class CuiCodeInfo {
+      final private Long __cuiCode;
+      final public HashSetMap<String, String> __codes = new HashSetMap<>();
+
+      private CuiCodeInfo( final Long cuiCode, final Collection<Integer> tuiCodes ) {
+         __cuiCode = cuiCode;
+         for ( Integer tuiCode : tuiCodes ) {
+            __codes.place( "TUI", tuiCode.toString() );
+         }
+      }
+
+      public Long getCuiCode() {
+         return __cuiCode;
+      }
+
+      public String getCui() {
+         return CuiTuiUtil.getAsCui( __cuiCode );
+      }
+
+      private boolean place( final String key, final String value ) {
+         return __codes.place( key, value );
+      }
+
+      public Collection<String> obtain( final String key ) {
+         return __codes.obtain( key );
+      }
    }
 
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTextsReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTextsReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTextsReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTextsReader.java Wed Sep 10 17:30:42 2014
@@ -3,13 +3,12 @@ package org.apache.ctakes.dictionarytool
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
 import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.LANGUAGE;
@@ -26,13 +25,13 @@ final public class UmlsCuisForTextsReade
    }
 
 
-   static public Map<String, Collection<String>> readCuisForTexts( final String rrfPath,
-                                                                   final Collection<String> wantedTexts,
-                                                                   final UmlsTermUtil umlsTermUtil ) {
+   static public HashSetMap<Long, String> readCuisForTexts( final String rrfPath,
+                                                            final Collection<String> wantedTexts,
+                                                            final UmlsTermUtil umlsTermUtil ) {
       System.out.println( "Compiling map of Umls Cuis and Texts" );
       long lineCount = 0;
       long textCount = 0;
-      final Map<String, Collection<String>> cuisAndText = new HashMap<String, Collection<String>>( 1000 );
+      final HashSetMap<Long, String> cuisAndText = new HashSetMap<>( 1000 );
       try {
          final BufferedReader reader = FileUtil.createReader( rrfPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
@@ -63,16 +62,8 @@ final public class UmlsCuisForTextsReade
                   tokens = FileUtil.readBsvTokens( reader, rrfPath );
                   continue;
                }
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI._index ) );
-               Collection<String> textsForCui = cuisAndText.get( cui );
-               if ( textsForCui == null ) {
-                  cuisAndText.put( cui, formattedTexts );
-                  textCount += formattedTexts.size();
-               } else {
-                  final int oldSize = textsForCui.size();
-                  textsForCui.addAll( formattedTexts );
-                  textCount += textsForCui.size() - oldSize;
-               }
+               final Long cuiCode = CuiTuiUtil.getCuiCode( tokens.get( CUI._index ) );
+               textCount += cuisAndText.addAll( cuiCode, formattedTexts );
             }
             if ( lineCount % 2000 == 0 ) {
                System.out.print( "." );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTuisReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsCuisForTuisReader.java Wed Sep 10 17:30:42 2014
@@ -2,14 +2,13 @@ package org.apache.ctakes.dictionarytool
 
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrstyIndex.CUI;
 import static org.apache.ctakes.dictionarytool.util.index.MrstyIndex.TUI;
@@ -24,31 +23,26 @@ final public class UmlsCuisForTuisReader
    private UmlsCuisForTuisReader() {
    }
 
-   static public Map<String, Collection<String>> readUmlsCuisForTuis( final String cuiTuiMapPath,
-                                                                      final Collection<String> wantedTuis ) {
+   static public HashSetMap<Long, Integer> readUmlsCuisForTuis( final String cuiTuiMapPath,
+                                                                final Collection<Integer> wantedTuis ) {
       System.out.println( "Compiling list of Cuis with wanted Tuis using " + cuiTuiMapPath );
       long lineCount = 0;
-      final Map<String, Collection<String>> wantedCuisAndTuis = new HashMap<String, Collection<String>>();
-      final Collection<String> usedTuis = new HashSet<String>( wantedTuis.size() );
+      final HashSetMap<Long, Integer> wantedCuisAndTuis = new HashSetMap<>();
+      final Collection<Integer> usedTuis = new HashSet<>( wantedTuis.size() );
       try {
          final BufferedReader reader = FileUtil.createReader( cuiTuiMapPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
          while ( tokens != null ) {
             lineCount++;
             if ( tokens.size() > TUI._index ) {
-               final String tui = CuiTuiUtil.getAsTui( tokens.get( TUI._index ) );
-               if ( !wantedTuis.contains( tui ) ) {
+               final Integer tuiCode = CuiTuiUtil.getTuiCode( tokens.get( TUI._index ) );
+               if ( !wantedTuis.contains( tuiCode ) ) {
                   tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
                   continue;
                }
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI._index ) );
-               Collection<String> tuis = wantedCuisAndTuis.get( cui );
-               if ( tuis == null ) {
-                  tuis = new HashSet<String>( 1 );
-                  wantedCuisAndTuis.put( cui, tuis );
-               }
-               tuis.add( tui );
-               usedTuis.add( tui );
+               final Long cuiCode = CuiTuiUtil.getCuiCode( tokens.get( CUI._index ) );
+               wantedCuisAndTuis.place( cuiCode, tuiCode );
+               usedTuis.add( tuiCode );
             }
             if ( lineCount % 100000 == 0 ) {
                System.out.println( "File Line " + lineCount + "\t Cuis " + wantedCuisAndTuis.size() );
@@ -62,7 +56,7 @@ final public class UmlsCuisForTuisReader
       System.out.println( "File Lines " + lineCount + "\t Cuis " + wantedCuisAndTuis.size() );
       if ( usedTuis.size() != wantedTuis.size() ) {
          wantedTuis.removeAll( usedTuis );
-         for ( String missingTui : wantedTuis ) {
+         for ( Integer missingTui : wantedTuis ) {
             System.out.println( "Could not find Cuis for Tui " + missingTui );
          }
       }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsRelationsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsRelationsForCuisReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsRelationsForCuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsRelationsForCuisReader.java Wed Sep 10 17:30:42 2014
@@ -143,13 +143,13 @@ final public class UmlsRelationsForCuisR
                                           final Map<String, Collection<String>> cuiChildren ) {
       Collection<String> children = cuiChildren.get( parentCui );
       if ( children == null ) {
-         children = new HashSet<String>( 1 );
+         children = new HashSet<>( 1 );
          cuiChildren.put( parentCui, children );
       }
       final boolean addedChild = children.add( childCui );
       Collection<String> parents = cuiParents.get( childCui );
       if ( parents == null ) {
-         parents = new HashSet<String>( 1 );
+         parents = new HashSet<>( 1 );
          cuiParents.put( childCui, parents );
       }
       final boolean addedParent = parents.add( parentCui );
@@ -160,7 +160,7 @@ final public class UmlsRelationsForCuisR
                                       final Map<String, Collection<String>> cuiSynonyms ) {
       Collection<String> synonyms = cuiSynonyms.get( cui1 );
       if ( synonyms == null ) {
-         synonyms = new HashSet<String>( 1 );
+         synonyms = new HashSet<>( 1 );
          cuiSynonyms.put( cui1, synonyms );
       }
       return synonyms.add( cui2 );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsSemanticTypeTuiReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsSemanticTypeTuiReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsSemanticTypeTuiReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsSemanticTypeTuiReader.java Wed Sep 10 17:30:42 2014
@@ -29,8 +29,8 @@ final public class UmlsSemanticTypeTuiRe
    static public Collection<String> readSemanticTypeTuis( final String typeTuisPath,
                                                           final Collection<String> semanticTypes ) {
       System.out.println( "Reading Tuis for Semantic Types from " + typeTuisPath );
-      final Collection<String> typeTuis = new HashSet<String>();
-      final Collection<String> usedTypes = new HashSet<String>( semanticTypes.size() );
+      final Collection<String> typeTuis = new HashSet<>();
+      final Collection<String> usedTypes = new HashSet<>( semanticTypes.size() );
       long lineCount = 0;
       try {
          final BufferedReader reader = FileUtil.createReader( typeTuisPath );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java Wed Sep 10 17:30:42 2014
@@ -3,15 +3,16 @@ package org.apache.ctakes.dictionarytool
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
 import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+import org.apache.ctakes.dictionarytool.util.index.MrconsoIndex;
 
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
+import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.FORM;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.LANGUAGE;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.STATUS;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.TEXT;
@@ -27,46 +28,42 @@ final public class UmlsTextsForCuisReade
    private UmlsTextsForCuisReader() {
    }
 
-   static public Map<String, Collection<String>> readTextsForCuis( final String rrfPath,
-                                                                   final Collection<String> wantedCuis,
-                                                                   final UmlsTermUtil umlsTermUtil ) {
-      return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, false, true );
+   static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
+                                                            final Collection<Long> wantedCuis,
+                                                            final UmlsTermUtil umlsTermUtil ) {
+      return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, false, true, 1, Integer.MAX_VALUE );
    }
 
-   static public Map<String, Collection<String>> readTextsForCuis( final String rrfPath,
-                                                                   final Collection<String> wantedCuis,
-                                                                   final UmlsTermUtil umlsTermUtil,
-                                                                   final boolean preferredOnly,
-                                                                   final boolean extractAbbreviations ) {
+   static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
+                                                            final Collection<Long> wantedCuis,
+                                                            final UmlsTermUtil umlsTermUtil,
+                                                            final boolean preferredOnly,
+                                                            final boolean extractAbbreviations,
+                                                            final int minWordLength,
+                                                            final int maxWordCount ) {
       System.out.println( "Compiling map of Umls Cuis and Texts" );
       long lineCount = 0;
       long textCount = 0;
-      final Map<String, Collection<String>> cuisAndText = new HashMap<String, Collection<String>>( wantedCuis.size() );
+      final HashSetMap<Long, String> cuisAndText = new HashSetMap<>( wantedCuis.size() );
       try {
          final BufferedReader reader = FileUtil.createReader( rrfPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
          while ( tokens != null ) {
             lineCount++;
             if ( tokens.size() > TEXT._index
-                  && tokens.get( LANGUAGE._index ).equals( "ENG" )
-                  && (!preferredOnly || tokens.get( STATUS._index ).equals( "P" )) ) {
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI._index ) );
-               if ( wantedCuis.contains( cui ) ) {
-                  String text = tokens.get( TEXT._index );
-                  Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( text, extractAbbreviations );
+                  && getToken( tokens, LANGUAGE ).equals( "ENG" )
+                  && (!preferredOnly
+                  || (getToken( tokens, STATUS ).equals( "P" ) && getToken( tokens, FORM ).equals( "PF" ))) ) {
+               final Long cuiCode = CuiTuiUtil.getCuiCode( getToken( tokens, CUI ) );
+               if ( wantedCuis.contains( cuiCode ) ) {
+                  String text = getToken( tokens, TEXT );
+                  Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( text, extractAbbreviations,
+                                                                                      minWordLength, maxWordCount );
                   if ( formattedTexts == null || formattedTexts.isEmpty() ) {
                      tokens = FileUtil.readBsvTokens( reader, rrfPath );
                      continue;
                   }
-                  Collection<String> textsForCui = cuisAndText.get( cui );
-                  if ( textsForCui == null ) {
-                     cuisAndText.put( cui, formattedTexts );
-                     textCount += formattedTexts.size();
-                  } else {
-                     final int oldSize = textsForCui.size();
-                     textsForCui.addAll( formattedTexts );
-                     textCount += textsForCui.size() - oldSize;
-                  }
+                  textCount += cuisAndText.addAll( cuiCode, formattedTexts );
                }
             }
             if ( lineCount % 2000 == 0 ) {
@@ -85,4 +82,10 @@ final public class UmlsTextsForCuisReade
       return cuisAndText;
    }
 
+
+   static private String getToken( final List<String> tokens, final MrconsoIndex mrconsoIndex ) {
+      return tokens.get( mrconsoIndex._index );
+   }
+
+
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java Wed Sep 10 17:30:42 2014
@@ -2,14 +2,13 @@ package org.apache.ctakes.dictionarytool
 
 import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrstyIndex.CUI;
 import static org.apache.ctakes.dictionarytool.util.index.MrstyIndex.TUI;
@@ -24,12 +23,12 @@ final public class UmlsTuisForCuisReader
    private UmlsTuisForCuisReader() {
    }
 
-   static public Map<String, Collection<String>> readUmlsTuisForCuis( final String cuiTuiMapPath,
-                                                                      final Collection<String> cuis ) {
+   static public HashSetMap<String, String> readUmlsTuisForCuis( final String cuiTuiMapPath,
+                                                                 final Collection<String> cuis ) {
       System.out.println( "Compiling list of Tuis for wanted Cuis using " + cuiTuiMapPath );
       long lineCount = 0;
-      final Map<String, Collection<String>> cuisAndTuis = new HashMap<String, Collection<String>>( cuis.size() );
-      final Collection<String> usedCuis = new HashSet<String>( cuis.size() );
+      final HashSetMap<String, String> cuisAndTuis = new HashSetMap<>( cuis.size() );
+      final Collection<String> usedCuis = new HashSet<>( cuis.size() );
       try {
          final BufferedReader reader = FileUtil.createReader( cuiTuiMapPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
@@ -42,12 +41,7 @@ final public class UmlsTuisForCuisReader
                   continue;
                }
                final String tui = CuiTuiUtil.getAsTui( tokens.get( TUI._index ) );
-               Collection<String> tuis = cuisAndTuis.get( cui );
-               if ( tuis == null ) {
-                  tuis = new HashSet<String>( 1 );
-                  cuisAndTuis.put( cui, tuis );
-               }
-               tuis.add( tui );
+               cuisAndTuis.place( cui, tui );
                usedCuis.add( cui );
             }
             if ( lineCount % 100000 == 0 ) {

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java Wed Sep 10 17:30:42 2014
@@ -12,10 +12,10 @@ final public class CreatorProperties {
    // Could have made this a singleton with defaults set on creation and a "setOptions", but that is kind of bad form
 
    static private final String DEFAULT_DATA_DIR = "./data/default";
-   static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesAllTuis.txt";
+   static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesSnomedTuis.txt";
+   static private final String DEFAULT_MED_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesDrugTuis.txt";
    static private final String DEFAULT_SOURCE_FILE = DEFAULT_DATA_DIR + "/CtakesSources.txt";
 
-
    private boolean _rareWordIndex = true;
 
    public CreatorProperties( final String... args ) {
@@ -56,10 +56,13 @@ final public class CreatorProperties {
       System.out.println( "The UMLS Root Directory must be specified" );
       System.out.println( "One form of output must be specified using either -ol or -db and -tbl" );
       System.out.println( "The default index type for databases is Rare Word Index" );
-      System.out.println( "If an Orangebook Path is not specified then (orangebook) medication terms are not written" );
+      //      System.out.println( "If an Orangebook Path is not specified then (orangebook) medication terms are not written" );
       System.out.println( "If a Format Data Directory is not specified then the default is used: " + DEFAULT_DATA_DIR );
       System.out.println(
-            "If an Input Tui List Path is not specified then the cTakes Tuis are used: " + DEFAULT_TUI_FILE );
+            "If an Input Tui List Path is not specified then the cTakes Snomed Tuis are used: " + DEFAULT_TUI_FILE );
+      System.out.println(
+            "If an Input Drug Tui List Path is not specified then the cTakes Medication Tuis are used: "
+                  + DEFAULT_MED_TUI_FILE );
       System.out.println( "If a Source Type List Path is not specified then Snomed is used: " + DEFAULT_SOURCE_FILE );
    }
 
@@ -83,6 +86,9 @@ final public class CreatorProperties {
       if ( !Option.TUI_LIST.hasValue() ) {
          Option.TUI_LIST.parseValue( Option.TUI_LIST.__key, DEFAULT_TUI_FILE );
       }
+      if ( !Option.MED_TUI_LIST.hasValue() ) {
+         Option.MED_TUI_LIST.parseValue( Option.MED_TUI_LIST.__key, DEFAULT_MED_TUI_FILE );
+      }
       if ( !Option.SOURCE.hasValue() ) {
          Option.SOURCE.parseValue( Option.SOURCE.__key, DEFAULT_SOURCE_FILE );
       }
@@ -100,6 +106,7 @@ final public class CreatorProperties {
       ORANGE_BOOK( "Orangebook Path", "-ob" ),
       FORMAT_DATA( "Format Data Directory", "-fd" ),
       TUI_LIST( "Input Tui List Path", "-tui" ),
+      MED_TUI_LIST( "Medication Tui List Path", "-mtui" ),
       //      SEM_LIST( "Input Semantic Group List Path", "-sem" ),
       SOURCE( "Source Type List Path", "-src" ),
       TERM_LIST( "Output Cui and Term List Path", "-ol" ),

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java Wed Sep 10 17:30:42 2014
@@ -1,10 +1,10 @@
 package org.apache.ctakes.dictionarytool.util;
 
 import org.apache.ctakes.dictionarytool.reader.UmlsCuisForTuisReader;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
+import java.util.HashSet;
 
 import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
 import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TUI_MAP;
@@ -19,41 +19,144 @@ final public class CuiTuiUtil {
    private CuiTuiUtil() {
    }
 
+   static public String getAsCui( final Long code ) {
+      final StringBuilder sb = new StringBuilder( 8 );
+      sb.append( code );
+      return getAsCui( sb );
+   }
+
    static public String getAsCui( final String code ) {
-      final String cui = code.trim().toUpperCase();
-      if ( cui.startsWith( "C" ) ) {
-         return cui;
+      if ( code.length() == 8 && code.startsWith( "C" ) ) {
+         return code;
+      }
+      final StringBuilder sb = new StringBuilder( 8 );
+      sb.append( code.replace( "C", "" ) );
+      return getAsCui( sb );
+   }
+
+   static private String getAsCui( final StringBuilder sb ) {
+      while ( sb.length() < 7 ) {
+         sb.insert( 0, '0' );
       }
-      return "C" + cui;
+      sb.insert( 0, 'C' );
+      return sb.toString();
+   }
+
+
+   static public Long getCuiCode( final String cui ) {
+      final String cuiText = getAsCui( cui );
+      final String cuiNum = cuiText.substring( 1, cuiText.length() );
+      try {
+         return Long.parseLong( cuiNum );
+      } catch ( NumberFormatException nfE ) {
+         System.err.println( "Could not create Cui Code for " + cui );
+      }
+      return -1l;
+   }
+
+   static public Collection<Long> getCuiCodes( final Collection<String> cuis ) {
+      final Collection<Long> cuiCodes = new HashSet<>( cuis.size() );
+      for ( String cui : cuis ) {
+         cuiCodes.add( getCuiCode( cui ) );
+      }
+      return cuiCodes;
+   }
+
+   static public String getAsTui( final Integer code ) {
+      final StringBuilder sb = new StringBuilder( 4 );
+      sb.append( code );
+      return getAsTui( sb );
    }
 
    static public String getAsTui( final String code ) {
-      final String tui = code.trim().toUpperCase();
-      if ( tui.startsWith( "T" ) ) {
-         return tui;
+      if ( code.length() == 4 && code.startsWith( "T" ) ) {
+         return code;
+      }
+      final StringBuilder sb = new StringBuilder( 4 );
+      sb.append( code.replace( "T", "" ) );
+      return getAsTui( sb );
+   }
+
+   static private String getAsTui( final StringBuilder sb ) {
+      while ( sb.length() < 3 ) {
+         sb.insert( 0, '0' );
+      }
+      sb.insert( 0, 'T' );
+      return sb.toString();
+   }
+
+
+   static public Collection<String> getIntAsTuis( final Collection<Integer> tuiCodes ) {
+      final Collection<String> tuis = new HashSet<>( tuiCodes.size() );
+      for ( Integer tuiCode : tuiCodes ) {
+         tuis.add( getAsTui( tuiCode ) );
       }
-      return "T" + tui;
+      return tuis;
    }
 
-   static public Map<String, Collection<String>> getValidCuisAndTuis( final String umlsPath,
-                                                                      final Collection<String> wantedSources,
-                                                                      final Collection<String> wantedTuis ) {
+   static public Collection<String> getStringAsTuis( final Collection<String> tuiNums ) {
+      final Collection<String> tuis = new HashSet<>( tuiNums.size() );
+      for ( String tuiNum : tuiNums ) {
+         tuis.add( getAsTui( tuiNum ) );
+      }
+      return tuis;
+   }
+
+   static public Integer getTuiCode( final String tui ) {
+      final String tuiText = getAsTui( tui );
+      final String tuiNum = tuiText.substring( 1, tuiText.length() );
+      try {
+         return Integer.parseInt( tuiNum );
+      } catch ( NumberFormatException nfE ) {
+         System.err.println( "Could not create Tui Code for " + tui );
+      }
+      return -1;
+   }
+
+   static public HashSetMap<Long, Integer> getValidCuisAndTuis( final String umlsPath,
+                                                                final Collection<String> wantedSources,
+                                                                final Collection<Integer> wantedTuis ) {
       // get all the Cuis for the wanted Tuis.  Key = Cui, Value = Tuis to which the Cui belongs
-      final Map<String, Collection<String>> wantedCuisAndTuis
+      final HashSetMap<Long, Integer> wantedCuisAndTuis
             = UmlsCuisForTuisReader.readUmlsCuisForTuis( umlsPath + '/' + CUI_TUI_MAP._filename, wantedTuis );
       if ( wantedSources.isEmpty() ) {
          // No specified source types, assume that all sources are valid
          return wantedCuisAndTuis;
       }
       // filter out the Cuis that do not belong to the given sources
-      final Collection<String> validCuis
+      final Collection<Long> validCuis
             = UmlsSourceTypeCuiValidator.getSourceTypeValidCuis( umlsPath + '/' + CUI_TERM_MAP._filename,
                                                                  wantedSources,
                                                                  wantedCuisAndTuis.keySet() );
       // Key = Cui, Value = Tuis to which the Cui belongs
-      final Map<String, Collection<String>> validCuisAndTuis = new HashMap<String, Collection<String>>();
-      for ( String validCui : validCuis ) {
-         validCuisAndTuis.put( validCui, wantedCuisAndTuis.get( validCui ) );
+      final HashSetMap<Long, Integer> validCuisAndTuis = new HashSetMap<>();
+      for ( Long validCui : validCuis ) {
+         validCuisAndTuis.addAll( validCui, wantedCuisAndTuis.get( validCui ) );
+      }
+      return validCuisAndTuis;
+   }
+
+   static public HashSetMap<Long, Integer> getValidCuisAndTuis( final String umlsPath,
+                                                                final Collection<String> wantedSources,
+                                                                final Collection<Integer> wantedTuis,
+                                                                final Collection<String> termTypes ) {
+      // get all the Cuis for the wanted Tuis.  Key = Cui, Value = Tuis to which the Cui belongs
+      final HashSetMap<Long, Integer> wantedCuisAndTuis
+            = UmlsCuisForTuisReader.readUmlsCuisForTuis( umlsPath + '/' + CUI_TUI_MAP._filename, wantedTuis );
+      if ( wantedSources.isEmpty() ) {
+         // No specified source types, assume that all sources are valid
+         return wantedCuisAndTuis;
+      }
+      // filter out the Cuis that do not belong to the given sources
+      final Collection<Long> validCuis
+            = UmlsSourceTypeCuiValidator.getSourceTypeValidCuis( umlsPath + '/' + CUI_TERM_MAP._filename,
+                                                                 wantedSources,
+                                                                 wantedCuisAndTuis.keySet(),
+                                                                 termTypes );
+      // Key = Cui, Value = Tuis to which the Cui belongs
+      final HashSetMap<Long, Integer> validCuisAndTuis = new HashSetMap<>();
+      for ( Long validCui : validCuis ) {
+         validCuisAndTuis.addAll( validCui, wantedCuisAndTuis.get( validCui ) );
       }
       return validCuisAndTuis;
    }



Mime
View raw message