ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1636633 - in /ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool: CodeMapCreator.java DictionaryCreator.java DictionaryCreator2.java reader/UmlsTextsForCuisReader.java util/CreatorProperties.java util/UmlsTermUtil.java
Date Tue, 04 Nov 2014 16:19:21 GMT
Author: seanfinan
Date: Tue Nov  4 16:19:20 2014
New Revision: 1636633

URL: http://svn.apache.org/r1636633
Log:
Added DictionaryCreator2.java, which can favor anatomical site term text over equal text for
signs/symptoms, disease/disorder, and procedure terms.  Medication term text is kept separate,
preserving drug names such as "liver" (live-er?).
Various static public methods added elsewhere to support DictionaryCreator2 functionality.
UmlsTermUtil is now less forgiving of things such as function texts ([liver disease]&/or[liver
toxicity]), utilizing the new RemovalFunctionTriggers.txt data list.
UmlsTermUtil also no longer handles template/form terms automatically in code, e.g. "heartbeat
(___bpm)"; they are instead handled according to the data/ list files.


Added:
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
      - copied, changed from r1625577, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
Modified:
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
    ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/CodeMapCreator.java
Tue Nov  4 16:19:20 2014
@@ -32,11 +32,12 @@ public class CodeMapCreator {
    static private final Logger LOGGER = Logger.getLogger( "CodeMapCreator" );
 
 
-   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
                                                "-db",
-                                               "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
-                                               "-tbl", "kludge"
+                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
+                                               "-tbl", "kludge",
          //                                               "-ol", "C:/Spiffy/Output/DictionaryToolTest/CodeMap_sno_rx.bsv"
+                                                "-fd", "./data/tiny"
    };
 
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
Tue Nov  4 16:19:20 2014
@@ -35,19 +35,19 @@ public class DictionaryCreator {
    static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator" );
 
 
-   static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf
- clears ~850 2 char terms
+//   static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014
spf - clears ~850 2 char terms
+   static private final int MIN_SNOMED_TERM_LENGTH = 2; // changed back to 2, let the dictionary
lookup module cull
    static private final int MIN_RXNORM_TERM_LENGTH = 1;
 
 
 
-   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
                                                "-db",
-                                               "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
                                                "-tbl", "CUI_TERMS",
-         //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/Terms_sno_rx.bsv",
+//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx.bsv",
          //                                  "-fw",
-         //                                  "-mtui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesDrugTuis.txt"
-         //                                  "-tui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesSnomedTuis.txt"
+                                             "-fd", "./data/tiny"
    };
 
 

Copied: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
(from r1625577, ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java)
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java?p2=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java&p1=ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java&r1=1625577&r2=1636633&rev=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/DictionaryCreator2.java
Tue Nov  4 16:19:20 2014
@@ -13,41 +13,44 @@ import org.apache.ctakes.dictionarytool.
 
 import java.util.Arrays;
 import java.util.Collection;
+import java.util.HashSet;
 import java.util.logging.Logger;
 
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_BASE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.DATA_TABLE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.FORMAT_DATA;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.MED_TUI_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.SOURCE;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TERM_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.TUI_LIST;
-import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.UMLS_ROOT;
+import static org.apache.ctakes.dictionarytool.util.CreatorProperties.Option.*;
 import static org.apache.ctakes.dictionarytool.util.UmlsFileName.CUI_TERM_MAP;
 
 /**
+ * Because of the manner in which different umls source text is structured, and
+ * because of the broad-based automatic cleanup routines,
+ * DictionaryCreator2 works better than the original DictionaryCreator in handling anatomical
sites and diseases.
+ *
+ * thanks go to Tim Miller for originally finding the bug and bringing it to my attention.
+ *
+ *
  * Author: SPF
  * Affiliation: CHIP-NLP
- * Date: 2/27/14
+ * Date: 11/3/14
  */
-public class DictionaryCreator {
+public class DictionaryCreator2 {
 
-   static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator" );
+   static private final Logger LOGGER = Logger.getLogger( "DictionaryCreator2" );
 
 
-   static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014 spf
- clears ~850 2 char terms
+//   static private final int MIN_SNOMED_TERM_LENGTH = 3; // changed from 2 to 3 9/8/2014
spf - clears ~850 2 char terms
+   static private final int MIN_SNOMED_TERM_LENGTH = 2; // changed back to 2, let the dictionary
lookup module cull
    static private final int MIN_RXNORM_TERM_LENGTH = 1;
 
 
 
-   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\Data\\UMLS\\2011AB\\META",
+   static private final String[] DEBUG_ARGS = {"-umls", "C:\\Spiffy\\umls\\data\\external\\2011AB\\META",
                                                "-db",
-                                               "jdbc:hsqldb:file:C:/Spiffy/Output/ctakes_sno_rx_mem_bin_3char/ctakes_sno_rx_mem",
+                                               "jdbc:hsqldb:file:C:/Spiffy/rword_dict/data/internal/ctakesnewsnorx/ctakesnewsnorx",
                                                "-tbl", "CUI_TERMS",
-         //                                  "-ol", "C:/Spiffy/Output/DictionaryToolTest/Terms_sno_rx.bsv",
+//                                           "-ol", "C:\\Spiffy\\rword_dict\\output\\temp/Terms_sno_rx.bsv",
          //                                  "-fw",
-         //                                  "-mtui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesDrugTuis.txt"
-         //                                  "-tui", "C:/Spiffy/Dev/Spiffy/Spiffy/DictionaryTool/data/default/CtakesSnomedTuis.txt"
+                                             "-fd", "./data/tiny",
+                                             "-atui", "./data/tiny/CtakesAnatTuis.txt",
+                                             "-tui", "./data/tiny/CtakesSnomedTuis.txt"
    };
 
 
@@ -57,14 +60,44 @@ public class DictionaryCreator {
 //      final CreatorProperties properties = new CreatorProperties( DEBUG_ARGS );
       // Set up the term utility
       final UmlsTermUtil umlsTermUtil = new UmlsTermUtil( FORMAT_DATA.getValue() );
+      // Write the anatomical site terms
+      final Collection<String> anats = writeAnat( umlsTermUtil, properties.isRareWordIndex()
);
       // Write the non-medication terms
-      writeSnomed( umlsTermUtil, properties.isRareWordIndex() );
+      writeSnomed( umlsTermUtil, anats, properties.isRareWordIndex() );
       // Write the medication terms
       writeRxNorm( umlsTermUtil, properties.isRareWordIndex() );
    }
 
+   static private Collection<String> writeAnat( final UmlsTermUtil umlsTermUtil, final
boolean isRareWordIndex ) {
+      // Read wanted Sources
+      final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes(
SOURCE.getValue() );
+      // Read wanted Tuis
+      final Collection<Integer> wantedTuis = TuiListReader.readTuiList( ANAT_TUI_LIST.getValue()
);
+      if ( wantedTuis == null || wantedTuis.isEmpty() ) {
+         LOGGER.severe( "No valid TUI codes found in " + ANAT_TUI_LIST.getValue() );
+         System.exit( 1 );
+      }
+      // get the valid Cuis for all wanted Tuis
+      final HashSetMap<Long, Integer> validCuisAndTuis
+            = CuiTuiUtil.getValidCuisAndTuis( UMLS_ROOT.getValue(), wantedSources, wantedTuis
);
+      // Get the texts for all cuis
+      // Term Types are not usable for Snomed.  ObsoletePreferredname IS (obsolete Synonym)
PreferredTerm SYnonym
+      // PreferredTermGreatBritain  SYnonymGreatBritain OB (spelling variation?)   MTH_*
MTH version
+      final HashSetMap<Long, String> cuiTexts
+            = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
+            validCuisAndTuis.keySet(), umlsTermUtil, false, true,
+            MIN_SNOMED_TERM_LENGTH, 7 );
+      writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
+      LOGGER.info( "Done Writing Non-Medication Terms" );
+      final Collection<String> allAnatTerms = new HashSet<>( 10000 );
+      for ( Collection<String> texts : cuiTexts.values() ) {
+         allAnatTerms.addAll( texts );
+      }
+      return allAnatTerms;
+   }
 
-   static private void writeSnomed( final UmlsTermUtil umlsTermUtil, final boolean isRareWordIndex
) {
+   static private void writeSnomed( final UmlsTermUtil umlsTermUtil, final Collection<String>
anats,
+                                    final boolean isRareWordIndex ) {
       // Read wanted Sources
       final Collection<String> wantedSources = SourceTypeListReader.readSourceTypes(
SOURCE.getValue() );
       // Read wanted Tuis
@@ -81,7 +114,7 @@ public class DictionaryCreator {
       // PreferredTermGreatBritain  SYnonymGreatBritain OB (spelling variation?)   MTH_*
MTH version
       final HashSetMap<Long, String> cuiTexts
             = UmlsTextsForCuisReader.readTextsForCuis( UMLS_ROOT.getValue() + '/' + CUI_TERM_MAP._filename,
-                                                       validCuisAndTuis.keySet(), umlsTermUtil,
false, true,
+                                                       validCuisAndTuis.keySet(), umlsTermUtil,
anats, false, true,
                                                        MIN_SNOMED_TERM_LENGTH, 7 );
       writeOutput( validCuisAndTuis, cuiTexts, isRareWordIndex );
       LOGGER.info( "Done Writing Non-Medication Terms" );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
Tue Nov  4 16:19:20 2014
@@ -8,7 +8,9 @@ import org.apache.ctakes.dictionarytool.
 
 import java.io.BufferedReader;
 import java.io.IOException;
+import java.util.ArrayList;
 import java.util.Collection;
+import java.util.Collections;
 import java.util.List;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
@@ -31,7 +33,8 @@ final public class UmlsTextsForCuisReade
    static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
                                                             final Collection<Long>
wantedCuis,
                                                             final UmlsTermUtil umlsTermUtil
) {
-      return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, false, true, 1, Integer.MAX_VALUE
);
+      return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil,
+            false, true, 1, Integer.MAX_VALUE );
    }
 
    static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
@@ -41,6 +44,19 @@ final public class UmlsTextsForCuisReade
                                                             final boolean extractAbbreviations,
                                                             final int minWordLength,
                                                             final int maxWordCount ) {
+      return readTextsForCuis( rrfPath, wantedCuis, umlsTermUtil, new ArrayList<String>(0),
+            preferredOnly, extractAbbreviations, minWordLength, maxWordCount );
+   }
+
+
+      static public HashSetMap<Long, String> readTextsForCuis( final String rrfPath,
+                                                            final Collection<Long>
wantedCuis,
+                                                            final UmlsTermUtil umlsTermUtil,
+                                                            final Collection<String>
unwantedTexts,
+                                                            final boolean preferredOnly,
+                                                            final boolean extractAbbreviations,
+                                                            final int minWordLength,
+                                                            final int maxWordCount ) {
       System.out.println( "Compiling map of Umls Cuis and Texts" );
       long lineCount = 0;
       long textCount = 0;
@@ -63,6 +79,7 @@ final public class UmlsTextsForCuisReade
                      tokens = FileUtil.readBsvTokens( reader, rrfPath );
                      continue;
                   }
+                  formattedTexts.removeAll( unwantedTexts );
                   textCount += cuisAndText.addAll( cuiCode, formattedTexts );
                }
             }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
Tue Nov  4 16:19:20 2014
@@ -13,6 +13,7 @@ final public class CreatorProperties {
 
    static private final String DEFAULT_DATA_DIR = "./data/default";
    static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesSnomedTuis.txt";
+   static private final String DEFAULT_ANAT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesAnatTuis.txt";
    static private final String DEFAULT_MED_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesDrugTuis.txt";
    static private final String DEFAULT_SOURCE_FILE = DEFAULT_DATA_DIR + "/CtakesSources.txt";
 
@@ -61,7 +62,10 @@ final public class CreatorProperties {
       System.out.println(
             "If an Input Tui List Path is not specified then the cTakes Snomed Tuis are used:
" + DEFAULT_TUI_FILE );
       System.out.println(
-            "If an Input Drug Tui List Path is not specified then the cTakes Medication Tuis
are used: "
+            "If an Input Anatomical Site Tui List Path is not specified then the defaults
are used: "
+            + DEFAULT_ANAT_TUI_FILE );
+      System.out.println(
+            "If an Input Drug Tui List Path is not specified then the defaults are used:
"
                   + DEFAULT_MED_TUI_FILE );
       System.out.println( "If a Source Type List Path is not specified then Snomed is used:
" + DEFAULT_SOURCE_FILE );
    }
@@ -86,6 +90,9 @@ final public class CreatorProperties {
       if ( !Option.TUI_LIST.hasValue() ) {
          Option.TUI_LIST.parseValue( Option.TUI_LIST.__key, DEFAULT_TUI_FILE );
       }
+      if ( !Option.ANAT_TUI_LIST.hasValue() ) {
+         Option.ANAT_TUI_LIST.parseValue( Option.ANAT_TUI_LIST.__key, DEFAULT_ANAT_TUI_FILE
);
+      }
       if ( !Option.MED_TUI_LIST.hasValue() ) {
          Option.MED_TUI_LIST.parseValue( Option.MED_TUI_LIST.__key, DEFAULT_MED_TUI_FILE
);
       }
@@ -106,6 +113,7 @@ final public class CreatorProperties {
       ORANGE_BOOK( "Orangebook Path", "-ob" ),
       FORMAT_DATA( "Format Data Directory", "-fd" ),
       TUI_LIST( "Input Tui List Path", "-tui" ),
+      ANAT_TUI_LIST( "Anatomical Site Tui List Path", "-atui" ),
       MED_TUI_LIST( "Medication Tui List Path", "-mtui" ),
       //      SEM_LIST( "Input Semantic Group List Path", "-sem" ),
       SOURCE( "Source Type List Path", "-src" ),

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1636633&r1=1636632&r2=1636633&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
(original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
Tue Nov  4 16:19:20 2014
@@ -18,6 +18,7 @@ final public class UmlsTermUtil {
    static private enum DATA_FILE {
       REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
       REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
+      REMOVAL_FUNCTION_TRIGGERS( "RemovalFunctionTriggers.txt" ),
       REMOVAL_COLON_TRIGGERS( "RemovalColonTriggers.txt" ),
       UNWANTED_PREFIXES( "UnwantedPrefixes.txt" ),
       UNWANTED_SUFFIXES( "UnwantedSuffixes.txt" ),
@@ -37,6 +38,7 @@ final public class UmlsTermUtil {
    final private Collection<String> _removalPrefixTriggers;
    final private Collection<String> _removalSuffixTriggers;
    final private Collection<String> _removalColonTriggers;
+   final private Collection<String> _removalFunctionTriggers;
    final private Collection<String> _unwantedPrefixes;
    final private Collection<String> _unwantedSuffixes;
    final private Collection<String> _modifierSuffixes;
@@ -46,6 +48,7 @@ final public class UmlsTermUtil {
       this( getDataPath( dataDir, DATA_FILE.REMOVAL_PREFIX_TRIGGERS ),
             getDataPath( dataDir, DATA_FILE.REMOVAL_SUFFIX_TRIGGERS ),
             getDataPath( dataDir, DATA_FILE.REMOVAL_COLON_TRIGGERS ),
+            getDataPath( dataDir, DATA_FILE.REMOVAL_FUNCTION_TRIGGERS ),
             getDataPath( dataDir, DATA_FILE.UNWANTED_PREFIXES ),
             getDataPath( dataDir, DATA_FILE.UNWANTED_SUFFIXES ),
             getDataPath( dataDir, DATA_FILE.MODIFIER_SUFFIXES ),
@@ -53,12 +56,13 @@ final public class UmlsTermUtil {
    }
 
    public UmlsTermUtil( final String removalPrefixTriggersPath, final String removalSuffixTriggersPath,
-                        final String removalColonTriggersPath,
+                        final String removalColonTriggersPath, final String removalFunctionTriggersPath,
                         final String unwantedPrefixesPath, final String unwantedSuffixesPath,
                         final String modifierSuffixesPath, final String abbreviationsPath
) {
       _removalPrefixTriggers = FileUtil.readOneColumn( removalPrefixTriggersPath, "term removal
Prefix Triggers" );
       _removalSuffixTriggers = FileUtil.readOneColumn( removalSuffixTriggersPath, "term removal
Suffix Triggers" );
       _removalColonTriggers = FileUtil.readOneColumn( removalColonTriggersPath, "term removal
Colon Triggers" );
+      _removalFunctionTriggers = FileUtil.readOneColumn( removalFunctionTriggersPath, "term
removal Function Triggers" );
       _unwantedPrefixes = FileUtil.readOneColumn( unwantedPrefixesPath, "unwanted Prefixes"
);
       _unwantedSuffixes = FileUtil.readOneColumn( unwantedSuffixesPath, "unwanted Suffixes"
);
       _modifierSuffixes = FileUtil.readOneColumn( modifierSuffixesPath, "modifier Suffixes"
);
@@ -112,9 +116,9 @@ final public class UmlsTermUtil {
       if ( extractAbbreviations ) {
          // add embedded abbreviations
          extractedTerms = extractAbbreviations( validText );
-         if ( extractedTerms.isEmpty() ) {
-            extractedTerms = autoExtractAcronyms( validText );
-         }
+//         if ( extractedTerms.isEmpty() ) {
+//            extractedTerms = autoExtractAcronyms( validText );
+//         }
       }
       if ( extractedTerms.isEmpty() ) {
          extractedTerms = extractModifiers( validText );
@@ -123,36 +127,36 @@ final public class UmlsTermUtil {
          extractedTerms.add( validText );
          return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength,
maxWordCount );
       }
-      // Check for embedded and / or terms
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = autoExtractColonParaTerms( validText );
-      }
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = autoExtractOrParaTerms( validText );
-      }
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = autoExtractColonBracketTerms( validText );
-      }
-      //      if ( extractedTerms.isEmpty() ) {
-      //         extractedTerms = autoExtractAndBracketTerms( validText );
-      //      }
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = autoExtractOrBracketTerms( validText );
-      }
-      if ( extractedTerms.isEmpty() ) {
-         extractedTerms = autoExtractAndOrOtherTerms( validText );
-      }
-      if ( !extractedTerms.isEmpty() ) {
-         //         System.out.println( validText );
-         //         for ( String et : extractedTerms ) {
-         //            System.out.println("  " + et);
-         //         }
-         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength,
maxWordCount );
-      } else {
+//      // Check for embedded and / or terms
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractColonParaTerms( validText );
+//      }
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractOrParaTerms( validText );
+//      }
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractColonBracketTerms( validText );
+//      }
+//      //      if ( extractedTerms.isEmpty() ) {
+//      //         extractedTerms = autoExtractAndBracketTerms( validText );
+//      //      }
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractOrBracketTerms( validText );
+//      }
+//      if ( extractedTerms.isEmpty() ) {
+//         extractedTerms = autoExtractAndOrOtherTerms( validText );
+//      }
+//      if ( !extractedTerms.isEmpty() ) {
+//         //         System.out.println( validText );
+//         //         for ( String et : extractedTerms ) {
+//         //            System.out.println("  " + et);
+//         //         }
+//         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength,
maxWordCount );
+//      } else {
          Collection<String> texts = new HashSet<>( 1 );
          texts.add( validText );
          return getFormattedTexts( getPluralTerms( getValidTexts( texts ) ), minWordLength,
maxWordCount );
-      }
+//      }
    }
 
    static private Collection<String> getPluralTerms( final Collection<String>
texts ) {
@@ -197,6 +201,11 @@ final public class UmlsTermUtil {
             return false;
          }
       }
+      for ( String removalFunction : _removalFunctionTriggers ) {
+         if ( text.contains( removalFunction ) ) {
+            return false;
+         }
+      }
       return true;
    }
 
@@ -210,14 +219,14 @@ final public class UmlsTermUtil {
 
    private String getValidText( final String text ) {
       // remove form underlines
-      if ( text.contains( "_ _ _" ) ) {
-         final int lastParen = text.lastIndexOf( '(' );
-         final int lastDash = text.indexOf( "_ _ _" );
-         final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
-         if ( deleteIndex > 0 ) {
-            return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
-         }
-      }
+//      if ( text.contains( "_ _ _" ) ) {
+//         final int lastParen = text.lastIndexOf( '(' );
+//         final int lastDash = text.indexOf( "_ _ _" );
+//         final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
+//         if ( deleteIndex > 0 ) {
+//            return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+//         }
+//      }
       // remove unmatched parentheses, brackets, etc.
       //      if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
       //         return getValidText( text.substring( 1 ).trim() );
@@ -264,10 +273,16 @@ final public class UmlsTermUtil {
                strippedText = strippedText.substring( 0, strippedText.length() - suffix.length()
).trim();
             }
          }
+         if ( !isTextValid( strippedText ) ) {
+            return "";
+         }
       }
       if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
          return "";
       }
+//      if ( strippedText.length() != text.trim().length() ) {
+//         System.out.println( text.trim() + " > " + strippedText );
+//      }
       return strippedText;
    }
 



Mime
View raw message