ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1703438 [1/2] - in /ctakes/sandbox/dictionarytool/src: META-INF/ org/apache/ctakes/dictionarytool/ org/apache/ctakes/dictionarytool/reader/ org/apache/ctakes/dictionarytool/util/ org/apache/ctakes/dictionarytool/util/token/ org/apache/ctak...
Date Wed, 16 Sep 2015 17:43:50 GMT
Author: seanfinan
Date: Wed Sep 16 17:43:48 2015
New Revision: 1703438

URL: http://svn.apache.org/r1703438
Log:
Added cui|tui|text bsv writing
Made slight changes to TextTokenizer

Added:
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizer.java
      - copied, changed from r1667128, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizerCtakesPTB.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TokenIndexHolder.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiTextsMapWriter.java
Removed:
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
Modified:
    ctakes/sandbox/dictionarytool/src/META-INF/MANIFEST.MF
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/RelationsCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java

Modified: ctakes/sandbox/dictionarytool/src/META-INF/MANIFEST.MF
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/META-INF/MANIFEST.MF?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/META-INF/MANIFEST.MF (original)
+++ ctakes/sandbox/dictionarytool/src/META-INF/MANIFEST.MF Wed Sep 16 17:43:48 2015
@@ -1,3 +1,3 @@
 Manifest-Version: 1.0
-Main-Class: org.apache.ctakes.dictionarytool.DictionaryCreator
+Main-Class: org.apache.ctakes.dictionarytool.DictionaryCreator2
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java Wed Sep 16 17:43:48 2015
@@ -40,11 +40,24 @@ public class CodeMapCreator {
                                                 "-fd", "./data/tiny"
    };
 
+   static private final String[] TIM_OTHER_ARGS = { "-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
+                                                    "-db",
+                                                    "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/timother/timother",
+                                                    "-tbl", "kludge",
+//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/TimOtherMap.bsv",
+                                                    //                                  "-fw",
+                                                    "-fd", "./data/tim",
+                                                    "-atui", "./data/tim/emptyAnatTuis.txt",
+                                                    "-mtui", "./data/tim/TimDrugTuis.txt",
+                                                    "-tui", "./data/tim/TimSpecificTuis.txt",
+                                                    "-src", "./data/tim/TimNonDrugSources.txt"
+   };
 
    public static void main( final String[] args ) {
       // Set properties (user options) used to create the dictionary
             final CreatorProperties properties = new CreatorProperties( args );
 //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
+//      final CreatorProperties properties = new CreatorProperties( TIM_OTHER_ARGS );
       // Set up the term utility
       // Write the non-medication terms
       codeMapSemanticGroups();
@@ -58,8 +71,8 @@ public class CodeMapCreator {
       // Read wanted Tuis
       final Collection<Integer> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
-         System.exit( 1 );
+         LOGGER.warning( "No valid TUI codes found in " + TUI_LIST.getValue() );
+         return;
       }
       // get the valid Cuis for all wanted Tuis
       final HashSetMap<Long, Integer> validCuisAndTuis
@@ -77,8 +90,8 @@ public class CodeMapCreator {
       // Read wanted Tuis
       final Collection<Integer> wantedTuis = TuiListReader.readTuiList( MED_TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
-         System.exit( 1 );
+         LOGGER.warning( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
+         return;
       }
       // get the valid Cuis for all wanted Tuis, INgredient PreciseINgredient MultipleINgredient BrandName
       final HashSetMap<Long, Integer> validCuisAndTuis

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java Wed Sep 16 17:43:48 2015
@@ -8,11 +8,13 @@ import org.apache.ctakes.dictionarytool.
 import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
 import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 import org.apache.ctakes.dictionarytool.writer.CuiTextsMapWriter;
+import org.apache.ctakes.dictionarytool.writer.CuiTuiTextsMapWriter;
 import org.apache.ctakes.dictionarytool.writer.FirstWordDbWriter;
 import org.apache.ctakes.dictionarytool.writer.RareWordDbWriter;
 
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.HashSet;
 import java.util.logging.Logger;
 
@@ -43,22 +45,39 @@ public class DictionaryCreator2 {
 
 
    static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
-                                               "-db",
-                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
-                                               "-tbl", "CUI_TERMS",
-//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx.bsv",
-         //                                  "-fw",
+//                                               "-db",
+//                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/output/temp/fword_sno_rx_mem/fword_sno_rx_mem",
+//                                               "-tbl", "CUI_TERMS",
+//                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
+//                                               "-tbl", "CUI_TERMS",
+//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx_ptb.bsv",
+                                               "-bsv", "C:/Spiffy/rword_dict/output/umls2011_bsv/Umls2011.bsv",
+//                                           "-fw",
                                              "-fd", "./data/tiny",
                                              "-atui", "./data/tiny/CtakesAnatTuis.txt",
                                              "-tui", "./data/tiny/CtakesSnomedTuis.txt"
    };
 
+   static private final String[] TIM_OTHER_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
+                                               "-db",
+                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/timother/timother",
+                                               "-tbl", "CUI_TERMS",
+//                                                   "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/TimOther.bsv",
+//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/TimSpecific.bsv",
+                                               //                                  "-fw",
+                                               "-fd", "./data/tim",
+                                               "-atui", "./data/tim/emptyAnatTuis.txt",
+                                               "-mtui", "./data/tim/TimDrugTuis.txt",
+                                               "-tui", "./data/tim/TimSpecificTuis.txt",
+                                                "-src", "./data/tim/TimNonDrugSources.txt"
+   };
 
    public static void main( final String[] args ) {
       // Set properties (user options) used to create the dictionary
             final CreatorProperties properties = new CreatorProperties( args );
 //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
-      // Set up the term utility
+//      final CreatorProperties properties = new CreatorProperties( TIM_OTHER_ARGS );
+// Set up the term utility
       final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
       // Write the anatomical site terms
       final Collection<String> anats = writeAnat( umlsTermUtil, properties.isRareWordIndex() );
@@ -74,8 +93,8 @@ public class DictionaryCreator2 {
       // Read wanted Tuis
       final Collection<Integer> wantedTuis = TuiListReader.readTuiList( ANAT_TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + ANAT_TUI_LIST.getValue() );
-         System.exit( 1 );
+         LOGGER.warning( "No valid TUI codes found in " + ANAT_TUI_LIST.getValue() );
+         return Collections.emptyList();
       }
       // get the valid Cuis for all wanted Tuis
       final HashSetMap<Long, Integer> validCuisAndTuis
@@ -103,8 +122,8 @@ public class DictionaryCreator2 {
       // Read wanted Tuis
       final Collection<Integer> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
-         System.exit( 1 );
+         LOGGER.warning( "No valid TUI codes found in " + TUI_LIST.getValue() );
+         return;
       }
       // get the valid Cuis for all wanted Tuis
       final HashSetMap<Long, Integer> validCuisAndTuis
@@ -124,8 +143,8 @@ public class DictionaryCreator2 {
       // Read wanted Tuis
       final Collection<Integer> wantedTuis = TuiListReader.readTuiList( MED_TUI_LIST.getValue() );
       if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
-         System.exit( 1 );
+         LOGGER.warning( "No valid TUI codes found in " + MED_TUI_LIST.getValue() );
+         return;
       }
       // get the valid Cuis for all wanted Tuis, INgredient PreciseINgredient MultipleINgredient BrandName
       final HashSetMap<Long, Integer> validCuisAndTuis
@@ -149,6 +168,8 @@ public class DictionaryCreator2 {
                                     final boolean isRareWordIndex ) {
       if ( TERM_LIST.hasValue() ) {
          CuiTextsMapWriter.writeCuiTexts( TERM_LIST.getValue(), cuiTexts );
+      } else if ( BSV_LIST.hasValue() ) {
+         CuiTuiTextsMapWriter.writeCuiTuiTexts( BSV_LIST.getValue(), validCuisAndTuis, cuiTexts );
       } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
          if ( isRareWordIndex ) {
             RareWordDbWriter.writeCuiTerms( validCuisAndTuis, cuiTexts,

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/RelationsCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/RelationsCreator.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/RelationsCreator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/RelationsCreator.java Wed Sep 16 17:43:48 2015
@@ -1,99 +1,99 @@
-package org.apache.ctakes.dictionarytool;
-
-import org.apache.ctakes.dictionarytool.reader.SourceTypeListReader;
-import org.apache.ctakes.dictionarytool.reader.TuiListReader;
-import org.apache.ctakes.dictionarytool.reader.UmlsRelationsForCuisReader;
-import org.apache.ctakes.dictionarytool.reader.UmlsTextsForCuisReader;
-import org.apache.ctakes.dictionarytool.util.CreatorProperties;
-import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
-import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
-import org.apache.ctakes.dictionarytool.writer.CuiRelationsMapWriter;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Map;
-import java.util.logging.Logger;
-
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.UMLS_ROOT;
-import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
-import static org.apache.ctakes.dictionarytool.util.UmlsFileName.RELATION_LIST;
-
-/**
- * Author: SPF
- * Affiliation: CHIP-NLP
- * Date: 3/26/14
- */
-public class RelationsCreator {
-
-   static private final Logger LOGGER = Logger.getLogger( "RelationsCreator" );
-
-
-   //      static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
-   //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/SnomedRelations.bsv",
-   //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/DisoRelations.bsv",
-   //                                  "-tui", "data/default/CtakesDisoTuis.txt",
-   //      };
-
-
-   public static void main( final String[] args ) {
-      // Set properties (user options) used to create the dictionary
-      final CreatorProperties properties = new CreatorProperties( args );
-      //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
-      // Set up the term utility
-      final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
-      // Write the non-medication relations
-      writeRelations( umlsTermUtil );
-   }
-
-
-   // according to http://aclweb.org/anthology/N/N13/N13-3007.pdf  the only difference between
parent/child and
-   // broader/narrower is that parent/child came from the original source, whereas broader/narrower
was added by
-   // the UMLS editors.  Still, it may be prudent to separate the two.
-   static private void writeRelations( final UmlsTermUtil umlsTermUtil ) {
-      // Read wanted Sources
-      final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes(
SOURCE.getValue() );
-      // Read wanted Tuis
-      final Collection<String> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue()
);
-      if ( wantedTuis == null || wantedTuis.isEmpty() ) {
-         LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
-         System.exit( 1 );
-      }
-      // get the valid Cuis for all wanted Tuis
-      final Map<String, Collection<String>> validCuisAndTuis
-            = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis
);
-      // Get the preferred texts for all cuis
-      final Map<String, Collection<String>> cuiTexts
-            = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       validCuisAndTuis.keySet(), umlsTermUtil,
true, false );
-      final Map<String, Collection<String>> cuiSynonyms = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      final Map<String, Collection<String>> cuiParents = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      final Map<String, Collection<String>> cuiChildren = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      final Map<String, Collection<String>> cuiBroadeners = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      final Map<String, Collection<String>> cuiNarrowers = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      final Map<String, Collection<String>> cuiSimilars = new HashMap<String,
Collection<String>>( cuiTexts.size() );
-      UmlsRelationsForCuisReader.readRelationsForCuis( UMLS_ROOT.getValue() + '/' + RELATION_LIST._filename,
-                                                       validCuisAndTuis.keySet(),
-                                                       cuiSynonyms, cuiParents, cuiChildren,
-                                                       cuiBroadeners, cuiNarrowers, cuiSimilars
);
-      if ( TERM_LIST.hasValue() ) {
-         CuiRelationsMapWriter.writeCuiRelations( TERM_LIST.getValue(),
-                                                  cuiTexts, cuiSynonyms, cuiParents, cuiChildren,
-                                                  cuiBroadeners, cuiNarrowers, cuiSimilars
);
-         //      } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
-         //         if ( isRareWordIndex ) {
-         //            RareWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-         //                                             DATA_BASE.getValue(), "sa", "", DATA_TABLE.getValue()
);
-         //         } else {
-         //            FirstWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
-         //                                              DATA_BASE.getValue(), "sa", "",
DATA_TABLE.getValue() );
-         //         }
-      }
-      LOGGER.info( "Done Writing Non-Medication Relations" );
-   }
-
-
-}
+//package org.apache.ctakes.dictionarytool;
+//
+//import org.apache.ctakes.dictionarytool.reader.SourceTypeListReader;
+//import org.apache.ctakes.dictionarytool.reader.TuiListReader;
+//import org.apache.ctakes.dictionarytool.reader.UmlsRelationsForCuisReader;
+//import org.apache.ctakes.dictionarytool.reader.UmlsTextsForCuisReader;
+//import org.apache.ctakes.dictionarytool.util.CreatorProperties;
+//import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+//import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+//import org.apache.ctakes.dictionarytool.writer.CuiRelationsMapWriter;
+//
+//import java.util.Collection;
+//import java.util.HashMap;
+//import java.util.Map;
+//import java.util.logging.Logger;
+//
+//import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
+//import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
+//import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
+//import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
+//import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.UMLS_ROOT;
+//import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
+//import static org.apache.ctakes.dictionarytool.util.UmlsFileName.RELATION_LIST;
+//
+///**
+// * Author: SPF
+// * Affiliation: CHIP-NLP
+// * Date: 3/26/14
+// */
+//public class RelationsCreator {
+//
+//   static private final Logger LOGGER = Logger.getLogger( "RelationsCreator" );
+//
+//
+//   //      static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+//   //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/SnomedRelations.bsv",
+//   //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/DisoRelations.bsv",
+//   //                                  "-tui", "data/default/CtakesDisoTuis.txt",
+//   //      };
+//
+//
+//   public static void main( final String[] args ) {
+//      // Set properties (user options) used to create the dictionary
+//      final CreatorProperties properties = new CreatorProperties( args );
+//      //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
+//      // Set up the term utility
+//      final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
+//      // Write the non-medication relations
+//      writeRelations( umlsTermUtil );
+//   }
+//
+//
+//   // according to http://aclweb.org/anthology/N/N13/N13-3007.pdf  the only difference
between parent/child and
+//   // broader/narrower is that parent/child came from the original source, whereas broader/narrower
was added by
+//   // the UMLS editors.  Still, it may be prudent to separate the two.
+//   static private void writeRelations( final UmlsTermUtil umlsTermUtil ) {
+//      // Read wanted Sources
+//      final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes(
SOURCE.getValue() );
+//      // Read wanted Tuis
+//      final Collection<String> wantedTuis = TuiListReader.readTuiList( TUI_LIST.getValue()
);
+//      if ( wantedTuis == null || wantedTuis.isEmpty() ) {
+//         LOGGER.severe( "No valid TUI codes found in " + TUI_LIST.getValue() );
+//         System.exit( 1 );
+//      }
+//      // get the valid Cuis for all wanted Tuis
+//      final Map<String, Collection<String>> validCuisAndTuis
+//            = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis
);
+//      // Get the preferred texts for all cuis
+//      final Map<String, Collection<String>> cuiTexts
+//            = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+//                                                       validCuisAndTuis.keySet(), umlsTermUtil,
true, false );
+//      final Map<String, Collection<String>> cuiSynonyms = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      final Map<String, Collection<String>> cuiParents = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      final Map<String, Collection<String>> cuiChildren = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      final Map<String, Collection<String>> cuiBroadeners = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      final Map<String, Collection<String>> cuiNarrowers = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      final Map<String, Collection<String>> cuiSimilars = new HashMap<String,
Collection<String>>( cuiTexts.size() );
+//      UmlsRelationsForCuisReader.readRelationsForCuis( UMLS_ROOT.getValue() + '/' + RELATION_LIST._filename,
+//                                                       validCuisAndTuis.keySet(),
+//                                                       cuiSynonyms, cuiParents, cuiChildren,
+//                                                       cuiBroadeners, cuiNarrowers, cuiSimilars
);
+//      if ( TERM_LIST.hasValue() ) {
+//         CuiRelationsMapWriter.writeCuiRelations( TERM_LIST.getValue(),
+//                                                  cuiTexts, cuiSynonyms, cuiParents, cuiChildren,
+//                                                  cuiBroadeners, cuiNarrowers, cuiSimilars
);
+//         //      } else if ( DATA_BASE.hasValue() && DATA_TABLE.hasValue() ) {
+//         //         if ( isRareWordIndex ) {
+//         //            RareWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
+//         //                                             DATA_BASE.getValue(), "sa", "",
DATA_TABLE.getValue() );
+//         //         } else {
+//         //            FirstWordDbWriter.writeTermsToDb( validCuisAndTuis, cuiTexts,
+//         //                                              DATA_BASE.getValue(), "sa", "",
DATA_TABLE.getValue() );
+//         //         }
+//      }
+//      LOGGER.info( "Done Writing Non-Medication Relations" );
+//   }
+//
+//
+//}

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/OrangebookReader.java Wed Sep 16 17:43:48 2015
@@ -1,7 +1,7 @@
 package org.apache.ctakes.dictionarytool.reader;
 
 import org.apache.ctakes.dictionarytool.util.FileUtil;
-import org.apache.ctakes.dictionarytool.util.TextTokenizer;
+import org.apache.ctakes.dictionarytool.util.token.TextTokenizer;
 
 import java.io.BufferedReader;
 import java.io.IOException;

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java Wed Sep 16 17:43:48 2015
@@ -55,7 +55,7 @@ final public class CreatorProperties {
          System.out.println( option.getHelp() );
       }
       System.out.println( "The UMLS Root Directory must be specified" );
-      System.out.println( "One form of output must be specified using either -ol or -db and -tbl" );
+      System.out.println( "One form of output must be specified using either -ol or -bsv or -db and -tbl" );
       System.out.println( "The default index type for databases is Rare Word Index" );
       //      System.out.println( "If an Orangebook Path is not specified then (orangebook) medication terms are not written" );
       System.out.println( "If a Format Data Directory is not specified then the default is used: " + DEFAULT_DATA_DIR );
@@ -72,7 +72,7 @@ final public class CreatorProperties {
 
    private boolean ensurePropertiesOk() {
       boolean ok = true;
-      if ( !Option.TERM_LIST.hasValue()
+      if ( !Option.TERM_LIST.hasValue() && !Option.BSV_LIST.hasValue()
             && (!Option.DATA_BASE.hasValue() || !Option.DATA_TABLE.hasValue()) ) {
          System.err.println( "Need an output location" );
          ok = false;
@@ -118,6 +118,7 @@ final public class CreatorProperties {
       //      SEM_LIST( "Input Semantic Group List Path", "-sem" ),
       SOURCE( "Source Type List Path", "-src" ),
       TERM_LIST( "Output Cui and Term List Path", "-ol" ),
+      BSV_LIST( "Output CUI, TUI and Term BSV Path", "-bsv" ),
       DATA_BASE( "Output Database Url", "-db" ),
       DATA_TABLE( "Output Database Table", "-tbl" );
       final private String __name;

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/DeliveryUtil.java Wed Sep 16 17:43:48 2015
@@ -1,6 +1,8 @@
 package org.apache.ctakes.dictionarytool.util;
 
 
+import org.apache.ctakes.dictionarytool.util.token.TextTokenizer;
+
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1703438&r1=1703437&r2=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java Wed Sep 16 17:43:48 2015
@@ -1,5 +1,8 @@
 package org.apache.ctakes.dictionarytool.util;
 
+import org.apache.ctakes.dictionarytool.util.token.TextTokenizer;
+import org.apache.ctakes.dictionarytool.util.token.TextTokenizerCtakesPTB;
+
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashSet;
@@ -102,6 +105,7 @@ final public class UmlsTermUtil {
    public Collection<String> getFormattedTexts( final String text, final boolean extractAbbreviations,
                                                 final int minWordLength, final int maxWordCount ) {
       final String tokenizedText = TextTokenizer.getTokenizedText( text );
+//      final String tokenizedText = TextTokenizerCtakesPTB.getTokenizedText( text );  PTB is not worth the trouble
       if ( tokenizedText == null || tokenizedText.isEmpty() ) {
          return Collections.emptyList();
       }

Copied: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizer.java (from r1667128, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizer.java?p2=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizer.java&p1=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java&r1=1667128&r2=1703438&rev=1703438&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/token/TextTokenizer.java Wed Sep 16 17:43:48 2015
@@ -1,7 +1,6 @@
-package org.apache.ctakes.dictionarytool.util;
+package org.apache.ctakes.dictionarytool.util.token;
 
-import java.util.ArrayList;
-import java.util.List;
+import java.util.*;
 
 /**
  * Author: SPF
@@ -73,6 +72,8 @@ final public class TextTokenizer {
    static private final String[] SUFFIXES = {"-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
                                              "-o-torium", "-rama", "-wise"};
 
+   static private final Set<String> PREFIX_SET = new HashSet<>( Arrays.asList( PREFIXES ) );
+   static private final Set<String> SUFFIX_SET = new HashSet<>( Arrays.asList( SUFFIXES ) );
 
    static private String getNextCharTerm( final String word ) {
       final StringBuilder sb = new StringBuilder();
@@ -89,16 +90,11 @@ final public class TextTokenizer {
 
    static private boolean isPrefix( final String word ) {
       final String prefixQ = word + "-";
-      for ( String prefix : PREFIXES ) {
-         if ( prefix.equals( prefixQ ) ) {
-            return true;
-         }
-      }
-      return false;
+      return PREFIX_SET.contains( prefixQ );
    }
 
    static private boolean isSuffix( final String word, final int startIndex ) {
-      if ( word.length() >= startIndex ) {
+      if ( word.length() <= startIndex ) {
          return false;
       }
       final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
@@ -106,14 +102,17 @@ final public class TextTokenizer {
          return false;
       }
       final String suffixQ = "-" + nextCharTerm;
-      for ( String suffix : SUFFIXES ) {
-         if ( suffix.equals( suffixQ ) ) {
-            return true;
-         }
-      }
-      return false;
+      return SUFFIX_SET.contains( suffixQ );
    }
 
+   static private boolean isOwnerApostrophe( final CharSequence word, final int startIndex ) {
+      return word.length() == startIndex+1 && word.charAt( startIndex ) == 's';
+   }
+
+   static private boolean isNumberDecimal( final CharSequence word, final int startIndex ) {
+      // Bizarre scenario in which ctakes tokenizes ".2" as a fraction, but not ".22"
+      return word.length() == startIndex+1 && Character.isDigit( word.charAt( startIndex ) );
+   }
 
    static public List<String> getTokens( final String word ) {
       return getTokens( word, false );
@@ -138,33 +137,26 @@ final public class TextTokenizer {
             continue;
          }
          wasDigit = false;
-         if ( c != '-' ) {
-            // have a symbol other than dash
-            if ( sb.length() != 0 ) {
-               // add the current word
-               tokens.add( sb.toString() );
-               sb.setLength( 0 );
-            }
-            // add the symbol
-            tokens.add( "" + c );
-            continue;
-         }
-         final boolean isPrefix = isPrefix( sb.toString() );
-         if ( isPrefix ) {
-            // what precedes is a prefix, so append the dash to the current word and move on
-            sb.append( '-' );
+         if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i+1 )) ) {
+            // what precedes is a prefix or what follows is a suffix so append the dash to the current word and move on
+            sb.append( c );
             continue;
          }
-         final boolean isSuffix = isSuffix( word, i + 1 );
-         if ( !isSuffix ) {
-            // what follows is not a suffix, so add the current word, add the dash, and move on
+         if ( ( c == '\'' && isOwnerApostrophe( word, i+1 ) )
+              || ( c == '.' && isNumberDecimal( word, i+1 ) ) ) {
+            // what follows is an 's or .# so add the preceding and move on
             if ( sb.length() != 0 ) {
                tokens.add( sb.toString() );
                sb.setLength( 0 );
             }
-            tokens.add( "" + c );
+            sb.append( c );
             continue;
          }
+         // Wasn't a special symbol for consideration, so add the previous and symbol separately
+         if ( sb.length() != 0 ) {
+            tokens.add( sb.toString() );
+            sb.setLength( 0 );
+         }
          tokens.add( "" + c );
       }
       if ( sb.length() != 0 ) {



Mime
View raw message