ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1624062 [3/4] - in /ctakes/sandbox/dictionarytool: data/default/ src/org/apache/ctakes/dictionarytool/ src/org/apache/ctakes/dictionarytool/reader/ src/org/apache/ctakes/dictionarytool/util/ src/org/apache/ctakes/dictionarytool/util/collec...
Date Wed, 10 Sep 2014 17:30:43 GMT
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java Wed Sep 10 17:30:42 2014
@@ -1,5 +1,7 @@
 package org.apache.ctakes.dictionarytool.util;
 
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
 import javax.swing.filechooser.FileSystemView;
 import java.io.BufferedReader;
 import java.io.BufferedWriter;
@@ -12,6 +14,7 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.logging.Logger;
 
 /**
@@ -154,7 +157,7 @@ final public class FileUtil {
 
    static public Collection<String> readOneColumn( final String listFilePath, final String description ) {
       System.out.println( "Reading " + description + " from " + listFilePath );
-      final Collection<String> listItems = new HashSet<String>();
+      final Collection<String> listItems = new HashSet<>();
       long lineCount = 0;
       try {
          final BufferedReader reader = createReader( listFilePath );
@@ -175,7 +178,31 @@ final public class FileUtil {
       return listItems;
    }
 
+   static public void writeNamedSets( final String filePath, final String description,
+                                      final HashSetMap<String, String> namedSets ) {
+      System.out.println( "Writing " + description + " to " + filePath );
+      long lineCount = 0;
+      try {
+         final BufferedWriter writer = createWriter( filePath );
+         for ( Map.Entry<String, Set<String>> namedSet : namedSets.entrySet() ) {
+            lineCount++;
+            writer.write( TokenUtil.createBsvLine( namedSet.getKey(),
+                                                   TokenUtil.createCsvLine( namedSet.getValue() ) ) );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+         writer.close();
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing " + description + " on line " + lineCount + " in file " + filePath );
+      }
+      System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
+   }
 
+   /**
+    * @deprecated
+    */
    static public void writeNamedSets( final String filePath, final String description,
                                       final Map<String, Collection<String>> namedSets ) {
       System.out.println( "Writing " + description + " to " + filePath );
@@ -198,9 +225,12 @@ final public class FileUtil {
       System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
    }
 
-   static public Map<String, Collection<String>> readNamedSets( final String filePath, final String description ) {
+   /**
+    * @deprecated
+    */
+   static public Map<String, Collection<String>> readNamedSetsOld( final String filePath, final String description ) {
       final Collection<String> lines = readOneColumn( filePath, description );
-      final Map<String, Collection<String>> namedSets = new HashMap<String, Collection<String>>( lines.size() );
+      final Map<String, Collection<String>> namedSets = new HashMap<>( lines.size() );
       for ( String line : lines ) {
          final List<String> nameAndList = TokenUtil.getBsvItems( line );
          if ( nameAndList == null || nameAndList.size() != 2 ) {
@@ -212,5 +242,18 @@ final public class FileUtil {
       return namedSets;
    }
 
+   static public HashSetMap<String, String> readNamedSets( final String filePath, final String description ) {
+      final Collection<String> lines = readOneColumn( filePath, description );
+      final HashSetMap<String, String> namedSets = new HashSetMap<>( lines.size() );
+      for ( String line : lines ) {
+         final List<String> nameAndList = TokenUtil.getBsvItems( line );
+         if ( nameAndList == null || nameAndList.size() != 2 ) {
+            System.err.println( "Bad line " + line );
+            continue;
+         }
+         namedSets.addAll( nameAndList.get( 0 ), TokenUtil.getCsvItems( nameAndList.get( 1 ) ) );
+      }
+      return namedSets;
+   }
 
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java Wed Sep 10 17:30:42 2014
@@ -47,23 +47,31 @@ final public class JdbcUtil {
 
    //   static public String createRowInsertSql( final String tableName, final int valueCount ) {
    static public String createRowInsertSql( final String tableName, final Enum... fields ) {
+      final String[] fieldNames = new String[fields.length];
+      int i = 0;
+      for ( Enum field : fields ) {
+         fieldNames[i] = field.name();
+         i++;
+      }
+      return createRowInsertSql( tableName, fieldNames );
+   }
 
+   static public String createRowInsertSql( final String tableName, final String... fieldNames ) {
       final StringBuilder sb = new StringBuilder( "insert into" );
       sb.append( " " ).append( tableName );
       sb.append( " (" );
-      for ( Enum field : fields ) {
-         sb.append( field.name() ).append( ',' );
+      for ( String fieldName : fieldNames ) {
+         sb.append( fieldName ).append( ',' );
       }
       // remove last comma
       sb.setLength( sb.length() - 1 );
       sb.append( ") " );
       sb.append( " values (" );
-      for ( int i = 0; i < fields.length - 1; i++ ) {
+      for ( int i = 0; i < fieldNames.length - 1; i++ ) {
          sb.append( "?," );
       }
       sb.append( "?)" );
       return sb.toString();
    }
 
-
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java Wed Sep 10 17:30:42 2014
@@ -1,5 +1,7 @@
 package org.apache.ctakes.dictionarytool.util;
 
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.HashMap;
@@ -63,9 +65,13 @@ final public class RareWordUtil {
             // WRB
             "how", "where", "when", "however", "wherever", "whenever",
             // Mine ...
-            "no"
+            "no",
+            // additional numbers
+            "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen", "seventeen", "eighteen", "nineteen",
+            "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
+            "hundred", "thousand", "million", "billion", "trillion",
       };
-      BAD_POS_TERM_SET = new HashSet<String>( Arrays.asList( BAD_POS_TERMS ) );
+      BAD_POS_TERM_SET = new HashSet<>( Arrays.asList( BAD_POS_TERMS ) );
    }
 
    static public boolean isRarableToken( final String token ) {
@@ -82,9 +88,33 @@ final public class RareWordUtil {
       return hasLetter && !BAD_POS_TERM_SET.contains( token );
    }
 
-
+   /**
+    * @param cuiTexts -
+    * @return -
+    * @deprecated
+    */
    static public Map<String, Integer> getTokenCounts( final Map<String, Collection<String>> cuiTexts ) {
-      final Map<String, Integer> tokenCounts = new HashMap<String, Integer>();
+      final Map<String, Integer> tokenCounts = new HashMap<>();
+      for ( Collection<String> texts : cuiTexts.values() ) {
+         for ( String text : texts ) {
+            final String[] tokens = text.split( "\\s+" );
+            for ( String token : tokens ) {
+               if ( RareWordUtil.isRarableToken( token ) ) {
+                  Integer count = tokenCounts.get( token );
+                  if ( count == null ) {
+                     count = 0;
+                  }
+                  tokenCounts.put( token, (count + 1) );
+               }
+            }
+
+         }
+      }
+      return tokenCounts;
+   }
+
+   static public Map<String, Integer> getTokenCounts( final HashSetMap<Long, String> cuiTexts ) {
+      final Map<String, Integer> tokenCounts = new HashMap<>();
       for ( Collection<String> texts : cuiTexts.values() ) {
          for ( String text : texts ) {
             final String[] tokens = text.split( "\\s+" );
@@ -132,4 +162,31 @@ final public class RareWordUtil {
    //      return bestIndex;
    //   }
 
+
+   static public final class IndexedRareWord {
+      final public String __word;
+      final public int __index;
+      final public int __tokenCount;
+
+      private IndexedRareWord( final String word, final int index, final int tokenCount ) {
+         __word = word;
+         __index = index;
+         __tokenCount = tokenCount;
+      }
+   }
+
+   static public IndexedRareWord getIndexedRareWord( final String text,
+                                                     final Map<String, Integer> tokenCounts ) {
+      final String[] tokens = text.split( "\\s+" );
+      int bestIndex = 0;
+      int bestCount = Integer.MAX_VALUE;
+      for ( int i = 0; i < tokens.length; i++ ) {
+         Integer count = tokenCounts.get( tokens[i] );
+         if ( count != null && count < bestCount ) {
+            bestIndex = i;
+            bestCount = count;
+         }
+      }
+      return new IndexedRareWord( tokens[bestIndex], bestIndex, tokens.length );
+   }
 }

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,356 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 7/15/14
+ */
+final public class RxNormTermUtil {
+
+   private RxNormTermUtil() {
+   }
+
+   public enum RxNameType {
+      //      FullTradeName, TrimTradeName,  --> turned out to be mostly company brand names, not drug brand names
+      //      FullFormula, TrimFormula,      --> not easily detectable without bracketed tradename
+      ExtractFormula, ExtractName, ExtractSource,
+      FullEntry, TrimEntry;
+   }
+
+   static public final class RxNormCui {
+      final private Long __cuiCode;
+      final private HashSetMap<RxNameType, String> __rxNames = new HashSetMap<>();
+
+      private RxNormCui( final Long cuiCode ) {
+         __cuiCode = cuiCode;
+      }
+
+      private boolean hasType( final RxNameType rxNameType ) {
+         return __rxNames.get( rxNameType ) != null && !__rxNames.get( rxNameType ).isEmpty();
+      }
+
+      private void place( final RxNameType rxNameType, final String text ) {
+         if ( text.length() > 2 ) {
+            if ( rxNameType == RxNameType.TrimEntry ) {
+               extractFormulas( text );
+            }
+            __rxNames.place( rxNameType, text );
+         }
+      }
+
+      private Collection<String> obtain( final RxNameType rxNameType ) {
+         return __rxNames.obtain( rxNameType );
+      }
+
+      private Collection<String> obtainAll() {
+         final Collection<String> all = new HashSet<>();
+         for ( RxNameType rxNameType : RxNameType.values() ) {
+            all.addAll( __rxNames.obtain( rxNameType ) );
+         }
+         return all;
+      }
+
+      private void placeAll( final RxNameType rxNameType, final Collection<String> texts ) {
+         for ( String text : texts ) {
+            place( rxNameType, text );
+         }
+      }
+
+      private void extractFormulas( final String text ) {
+         if ( !text.endsWith( ")" ) ) {
+            return;
+         }
+         final int paraIndex = text.lastIndexOf( '(' );
+         if ( paraIndex < 1 ) {
+            return;
+         }
+         final String tradeName = text.substring( 0, paraIndex ).trim();
+         final String formula = text.substring( paraIndex + 1, text.length() - 1 ).trim();
+         if ( formula.split( "\\/" ).length > 2 ) {
+            place( RxNameType.ExtractName, tradeName );
+            place( RxNameType.ExtractFormula, formula );
+            place( RxNameType.ExtractSource, text );
+         }
+      }
+
+      private void clearFormulas() {
+         __rxNames.clear( RxNameType.ExtractName );
+         __rxNames.clear( RxNameType.ExtractFormula );
+         __rxNames.clear( RxNameType.ExtractSource );
+      }
+      //      private void cleanFormulas() {
+      //         final Collection<String> allTerms = obtainAll();
+      //         if ( allTerms.size() <= 2 ) {
+      //            return;
+      //         }
+      //         final Collection<String> removals = new HashSet<>();
+      //         final Collection<String> additions = new HashSet<>();
+      //         for ( String text : allTerms ) {
+      //            if ( text.endsWith( ")" ) ) {
+      //               final int paraIndex = text.lastIndexOf( '(' );
+      //               if ( paraIndex > 0 ) {
+      //                  final String tradeName = text.substring( 0, paraIndex ).trim();
+      //                  if ( allTerms.contains( tradeName ) ) {
+      //                     final String formula = text.substring( paraIndex+1, text.length()-1 ).trim();
+      //                     if ( formula.split( "\\/" ).length > 2 ) {
+      //                        removals.add( text );
+      //                        if ( !allTerms.contains( formula ) ) {
+      //                           additions.add( formula );
+      //                        }
+      //                     }
+      //                  }
+      //               }
+      //            }
+      //         }
+      //         for ( RxNameType rxNameType : RxNameType.values() ) {
+      //            obtain( rxNameType ).removeAll( removals );
+      //         }
+      //         placeAll( RxNameType.TrimEntry, additions );
+      //      }
+
+      private void copy( final RxNormCui rxNormCui ) {
+         for ( RxNameType rxNameType : RxNameType.values() ) {
+            placeAll( rxNameType, rxNormCui.obtain( rxNameType ) );
+         }
+      }
+   }
+
+
+   static private void fillDeliveryFreeTerm( final RxNameType rxNameType1,
+                                             final RxNameType rxNameType2,
+                                             final List<RxNormCui> rxNormCuis,
+                                             final int primaryTermIndex,
+                                             final Collection<Integer> removalIndices ) {
+      final RxNormCui primaryTerm = rxNormCuis.get( primaryTermIndex );
+      final Collection<String> primaryTexts = primaryTerm.obtain( rxNameType1 );
+      final int size = rxNormCuis.size();
+      RxNormCui rxNormCui = null;
+      for ( int i = primaryTermIndex + 1; i < size; i++ ) {
+         if ( removalIndices.contains( i ) ) {
+            continue;
+         }
+         rxNormCui = rxNormCuis.get( i );
+         final Collection<String> texts = rxNormCui.obtain( rxNameType2 );
+         if ( texts.isEmpty() ) {
+            continue;
+         }
+         boolean synonym = false;
+         for ( String primary : primaryTexts ) {
+            for ( String text : texts ) {
+               if ( primary.equals( text ) ) {
+                  System.out.println( primaryTerm.__cuiCode + " | " + primary
+                                            + " ==== " + rxNormCui.__cuiCode + " | " + text );
+                  synonym = true;
+                  break;
+               }
+            }
+         }
+         if ( synonym ) {
+            primaryTerm.copy( rxNormCui );
+            removalIndices.add( i );
+         }
+      }
+   }
+
+   static public void fillDeliveryFreeTerms( final List<RxNormCui> rxNormCuis,
+                                             final Collection<Integer> removalIndices,
+                                             final RxNameType rxNameType1,
+                                             final RxNameType rxNameType2 ) {
+      final int size = rxNormCuis.size();
+      RxNormCui rxNormCui = null;
+      for ( int i = 0; i < size; i++ ) {
+         if ( removalIndices.contains( i ) ) {
+            continue;
+         }
+         rxNormCui = rxNormCuis.get( i );
+         final Collection<String> nameTypes = rxNormCui.obtain( rxNameType1 );
+         if ( nameTypes.isEmpty() ) {
+            continue;
+         }
+         fillDeliveryFreeTerm( rxNameType1, rxNameType2, rxNormCuis, i, removalIndices );
+         // Because terms can have "out-of-order" matching synonyms: A1 : C1; B1 : C1 but B was already passed by
+         fillDeliveryFreeTerm( rxNameType1, rxNameType2, rxNormCuis, i, removalIndices );
+      }
+   }
+
+   static public void fillDeliveryFreeTerms( final List<RxNormCui> rxNormCuis1,
+                                             final List<RxNormCui> rxNormCuis2,
+                                             final Collection<Integer> removalIndices2,
+                                             final RxNameType rxNameType1,
+                                             final RxNameType rxNameType2 ) {
+      final int size = rxNormCuis1.size();
+      RxNormCui rxNormCui = null;
+      for ( int i = 0; i < size; i++ ) {
+         rxNormCui = rxNormCuis1.get( i );
+         final Collection<String> nameTypes = rxNormCui.obtain( rxNameType1 );
+         if ( nameTypes.isEmpty() ) {
+            continue;
+         }
+         fillDeliveryFreeTerm( rxNameType1, rxNameType2, rxNormCuis1, rxNormCuis2, i, removalIndices2 );
+         // Because terms can have "out-of-order" matching synonyms: A1 : C1; B1 : C1 but B was already passed by
+         fillDeliveryFreeTerm( rxNameType1, rxNameType2, rxNormCuis1, rxNormCuis2, i, removalIndices2 );
+      }
+   }
+
+   static private void fillDeliveryFreeTerm( final RxNameType rxNameType1,
+                                             final RxNameType rxNameType2,
+                                             final List<RxNormCui> rxNormCuis1,
+                                             final List<RxNormCui> rxNormCuis2,
+                                             final int primaryTermIndex,
+                                             final Collection<Integer> removalIndices2 ) {
+      final RxNormCui primaryTerm = rxNormCuis1.get( primaryTermIndex );
+      final Collection<String> primaryTexts = primaryTerm.obtain( rxNameType1 );
+      final int size = rxNormCuis2.size();
+      RxNormCui rxNormCui = null;
+      for ( int i = 0; i < size; i++ ) {
+         if ( removalIndices2.contains( i ) ) {
+            continue;
+         }
+         rxNormCui = rxNormCuis2.get( i );
+         final Collection<String> texts = rxNormCui.obtain( rxNameType2 );
+         if ( texts.isEmpty() ) {
+            continue;
+         }
+         boolean synonym = false;
+         for ( String primary : primaryTexts ) {
+            for ( String text : texts ) {
+               if ( primary.equals( text ) ) {
+                  System.out.println( primaryTerm.__cuiCode + " | " + primary
+                                            + " ====== " + rxNormCui.__cuiCode + " | " + text );
+                  synonym = true;
+                  break;
+               }
+            }
+         }
+         if ( synonym ) {
+            primaryTerm.copy( rxNormCui );
+            removalIndices2.add( i );
+         }
+      }
+   }
+
+
+   /**
+    * @param cuiTexts Map of Cuis to Sets of Texts
+    * @return Map of Cuis to Sets of Texts
+    */
+   static public HashSetMap<Long, String> getDeliveryFreeTerms( final HashSetMap<Long, String> cuiTexts ) {
+      final Map<Long, RxNormCui> rxNormCuiMap = new HashMap<>( cuiTexts.size() );
+      for ( Map.Entry<Long, Set<String>> entry : cuiTexts.entrySet() ) {
+         fillRxNormCui( rxNormCuiMap, entry );
+      }
+      final List<RxNormCui> fullRxNormCuis = new ArrayList<>();
+      final List<RxNormCui> trimRxNormCuis = new ArrayList<>();
+      for ( RxNormCui rxNormCui : rxNormCuiMap.values() ) {
+         if ( rxNormCui.hasType( RxNameType.FullEntry ) ) {
+            fullRxNormCuis.add( rxNormCui );
+         } else {
+            trimRxNormCuis.add( rxNormCui );
+         }
+      }
+      rxNormCuiMap.clear();
+
+      final Collection<Integer> removalIndices = new HashSet<>();
+      System.out.println( "Start rxNormCuis: " + fullRxNormCuis.size() + " , " + trimRxNormCuis.size() );
+      // If a full entry name matches another full entry name, merge them
+      fillDeliveryFreeTerms( fullRxNormCuis, removalIndices, RxNameType.FullEntry, RxNameType.FullEntry );
+      removeRxNormCuis( fullRxNormCuis, removalIndices );
+      System.out.println( "After Full Entry & Entry rxNormCuis: " + fullRxNormCuis.size() );
+
+      // If a trimmed entry name matches a trimmed entry name, merge them
+      fillDeliveryFreeTerms( trimRxNormCuis, removalIndices, RxNameType.TrimEntry, RxNameType.TrimEntry );
+      removeRxNormCuis( trimRxNormCuis, removalIndices );
+      System.out.println( "After Trim Entry & Trim Entry rxNormCuis: " + trimRxNormCuis.size() );
+
+      // If a full entry name matches a trimmed entry name, merge them
+      fillDeliveryFreeTerms( fullRxNormCuis, trimRxNormCuis, removalIndices,
+                             RxNameType.FullEntry, RxNameType.TrimEntry );
+      removeRxNormCuis( trimRxNormCuis, removalIndices );
+      System.out.println( "After Full Entry & Trim Entry rxNormCuis: "
+                                + fullRxNormCuis.size() + " " + trimRxNormCuis.size() );
+
+      // If a full entry name matches a trimmed entry extracted name, merge them
+      fillDeliveryFreeTerms( fullRxNormCuis, trimRxNormCuis, removalIndices,
+                             RxNameType.FullEntry, RxNameType.ExtractName );
+      removeRxNormCuis( trimRxNormCuis, removalIndices );
+      System.out.println( "After Full Entry & Extract Name rxNormCuis: "
+                                + fullRxNormCuis.size() + " " + trimRxNormCuis.size() );
+
+      // If a full entry name matches a trimmed entry extracted formula, merge them
+      fillDeliveryFreeTerms( fullRxNormCuis, trimRxNormCuis, removalIndices,
+                             RxNameType.FullEntry, RxNameType.ExtractFormula );
+      removeRxNormCuis( trimRxNormCuis, removalIndices );
+      System.out.println( "After Full Entry & Extract Formula rxNormCuis: "
+                                + fullRxNormCuis.size() + " " + trimRxNormCuis.size() );
+
+      // Cui to list of text per cui
+      final HashSetMap<Long, String> cuiMap = new HashSetMap<>();
+      for ( RxNormCui rxNormCui : fullRxNormCuis ) {
+         rxNormCui.clearFormulas();
+         cuiMap.addAll( rxNormCui.__cuiCode, rxNormCui.obtainAll() );
+      }
+      fullRxNormCuis.clear();
+      for ( RxNormCui rxNormCui : trimRxNormCuis ) {
+         rxNormCui.clearFormulas();
+         cuiMap.addAll( rxNormCui.__cuiCode, rxNormCui.obtainAll() );
+      }
+      trimRxNormCuis.clear();
+      return cuiMap;
+   }
+
+   static private void removeRxNormCuis( final List<RxNormCui> rxNormCuis,
+                                         final Collection<Integer> removalIndices ) {
+      final Collection<RxNormCui> removalCuis = new HashSet<>();
+      for ( Integer i : removalIndices ) {
+         removalCuis.add( rxNormCuis.get( i ) );
+      }
+      rxNormCuis.removeAll( removalCuis );
+      removalIndices.clear();
+   }
+
+   static private void fillRxNormCui( final Map<Long, RxNormCui> rxNormCuiMap,
+                                      final Map.Entry<Long, Set<String>> cuiTexts ) {
+      RxNormCui rxNormCui = rxNormCuiMap.get( cuiTexts.getKey() );
+      if ( rxNormCui == null ) {
+         rxNormCui = new RxNormCui( cuiTexts.getKey() );
+         rxNormCuiMap.put( cuiTexts.getKey(), rxNormCui );
+      }
+
+      for ( String text : cuiTexts.getValue() ) {
+         if ( text.length() > 7 && text.indexOf( " _ # " ) == text.length() - 6 ) {
+            // hack for multi-entry rxnorm repeats
+            text = text.substring( 0, text.length() - 6 ).trim();
+         }
+         if ( text.startsWith( "{" ) || text.endsWith( "]" ) ) {
+            // start curly brackets are repeats
+            // end brackets more often than not denote brand name only, not brand name drug name
+            // all drugs with brand name entries should have non-brand name entries as well
+            // end brackets also have simple modifiers like [coated], which are elsewhere
+            continue;
+         }
+         // deal with full text
+         populateRxNormCui( rxNormCui, text );
+      }
+   }
+
+   static private void populateRxNormCui( final RxNormCui rxNormCui, final String text ) {
+      final String deliveryFree = DeliveryUtil.getDeliveryFreeText( text );
+      if ( deliveryFree.length() < 4 || deliveryFree.equals( text ) ) {
+         rxNormCui.place( RxNameType.FullEntry, text );
+      } else {
+         rxNormCui.place( RxNameType.TrimEntry, deliveryFree );
+      }
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RxNormTermUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,163 @@
+package org.apache.ctakes.dictionarytool.util;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 6/4/14
+ */
+final public class TermPhonemator {
+
+   private TermPhonemator() {
+   }
+
+
+   static private String phenominateTerm( final String text ) {
+      //      final char[] textChars = text.toCharArray();
+      final String[] splits = text.toLowerCase().split( "\\s+" );
+      if ( splits.length == 0 ) {
+         return "";
+      }
+      final StringBuilder sb = new StringBuilder();
+      for ( String split : splits ) {
+         sb.append( phenominateWord( split ) );
+         sb.append( ' ' );
+      }
+      sb.setLength( sb.length() - 1 );
+      return sb.toString();
+   }
+
+   static private final char NULL_PHONE = '\\';
+
+   static private String phenominateWord( final String text ) {
+      final char[] chars = text.toCharArray();
+      final StringBuilder sb = new StringBuilder();
+      int start = 0;
+      char phoneme = getSpecialStart( chars );
+      if ( phoneme != NULL_PHONE ) {
+         sb.append( phoneme );
+         start = 2;
+      } else if ( isVowel( chars[0] ) ) {
+         // Always use first vowel as-is
+         sb.append( chars[0] );
+         start = 1;
+         // kludge to prevent double vowel starts
+         phoneme = 'a';
+      }
+      int length = text.length();
+      char endPhone = getSpecialEnd( chars );
+      if ( endPhone != NULL_PHONE ) {
+         length--;
+      }
+      char lastPhoneme = phoneme;
+      for ( int i = start; i < length; i++ ) {
+         if ( isVowel( chars[i] ) ) {
+            phoneme = 'a';
+         } else {
+            // set up default phoneme as normal character
+            phoneme = chars[i];
+            // check for special characters
+            switch ( chars[i] ) {
+               case 'c':
+                  if ( isNext( chars, i + 1, 'h' ) ) {
+                     phoneme = 'x';
+                     i++;
+                  } else if ( isNext( chars, i + 1, 'i' ) && isNext( chars, i + 2, 'a' ) ) {
+                     phoneme = 'x';
+                  } else if ( isNext( chars, i + 1, 'i', 'e', 'y' ) ) {
+                     phoneme = 's';
+                  } else {
+                     phoneme = 'k';
+                  }
+                  break;
+               case 'd':
+                  if ( isNext( chars, i + 1, 'g' ) && isNext( chars, i + 2, 'e', 'y', 'i' ) ) {
+                     phoneme = 'j';
+                     i++;
+                  } else {
+                     phoneme = 't';
+                  }
+                  break;
+               case 'g':
+                  if ( isNext( chars, i + 1, 'g' ) ) {
+                     phoneme = 'k';
+                     i++;
+                  } else if ( isNext( chars, i + 1, 'i', 'e', 'y' ) ) {
+                     phoneme = 'j';
+                  } else if ( isNext( chars, i + 1, 'h' ) && !isNext( chars, i + 2, 'a', 'e', 'i', 'o', 'u', 'y' ) ) {
+                     // drop g
+                     continue;
+                  } else {
+                     phoneme = 'k';
+                  }
+                  break;
+               case 'h':
+                  if ( lastPhoneme == 'a' && !isNext( chars, i + 1, 'a', 'e', 'i', 'o', 'u', 'y' ) ) {
+                     // drop h
+                     continue;
+                  }
+                  break;
+               case 'p':
+                  if ( isNext( chars, i + 1, 'h' ) ) {
+                     phoneme = 'f';
+                     i++;
+                  }
+                  break;
+               // TODO http://en.wikipedia.org/wiki/Metaphone
+
+            }
+         }
+         if ( phoneme != lastPhoneme ) {
+            sb.append( phoneme );
+            lastPhoneme = phoneme;
+         }
+      }
+
+
+      return sb.toString();
+   }
+
+   static private boolean isNext( final char[] chars, final int index, final char... checks ) {
+      if ( index >= chars.length ) {
+         return false;
+      }
+      for ( char c : checks ) {
+         if ( chars[index] == c ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   static private boolean isVowel( final char c ) {
+      return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' || c == 'y';
+   }
+
+   static private char getSpecialStart( final char[] chars ) {
+      if ( chars[1] == 'n' && (chars[0] == 'g' || chars[0] == 'k' || chars[0] == 'p') ) {
+         return 'n';
+      }
+      if ( chars[0] == 'w' && chars[1] == 'r' ) {
+         return 'r';
+      }
+      if ( chars[0] == 'p' && chars[1] == 's' ) {
+         return 's';
+      }
+      if ( chars[0] == 'x' ) {
+         return 's';
+      }
+      return NULL_PHONE;
+   }
+
+   static private char getSpecialEnd( final char[] chars ) {
+      final int length = chars.length;
+      if ( chars[length - 1] == 'b' && chars[length - 2] == 'm' ) {
+         return 'm';
+      }
+      //      if ( chars[ length-1 ] == 'h' && chars[length-2] == 'g' ) {
+      //         return 'h';
+      //      }
+      return NULL_PHONE;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TermPhonemator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java Wed Sep 10 17:30:42 2014
@@ -116,16 +116,28 @@ final public class TextTokenizer {
 
 
    static public List<String> getTokens( final String word ) {
-      final List<String> tokens = new ArrayList<String>();
+      return getTokens( word, false );
+   }
+
+   static public List<String> getTokens( final String word, final boolean separateDigits ) {
+      final List<String> tokens = new ArrayList<>();
       final StringBuilder sb = new StringBuilder();
       final int count = word.length();
+      boolean wasDigit = false;
       for ( int i = 0; i < count; i++ ) {
          final char c = word.charAt( i );
          if ( Character.isLetterOrDigit( c ) ) {
+            if ( sb.length() != 0 && separateDigits && (wasDigit && !Character.isDigit( c )) ) {
+               // separating characters from digits, add the current word
+               tokens.add( sb.toString() );
+               sb.setLength( 0 );
+            }
+            wasDigit = Character.isDigit( c );
             // Appending character to current word
             sb.append( c );
             continue;
          }
+         wasDigit = false;
          if ( c != '-' ) {
             // have a symbol other than dash
             if ( sb.length() != 0 ) {
@@ -163,6 +175,10 @@ final public class TextTokenizer {
    }
 
    static public String getTokenizedText( final String text ) {
+      return getTokenizedText( text, false );
+   }
+
+   static public String getTokenizedText( final String text, final boolean separateDigits ) {
       if ( text.isEmpty() ) {
          return text;
       }
@@ -177,7 +193,7 @@ final public class TextTokenizer {
       }
       final StringBuilder sb = new StringBuilder();
       for ( String split : splits ) {
-         final List<String> tokens = getTokens( split );
+         final List<String> tokens = getTokens( split, separateDigits );
          for ( String token : tokens ) {
             sb.append( token ).append( " " );
          }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java Wed Sep 10 17:30:42 2014
@@ -31,7 +31,7 @@ final public class TokenUtil {
       if ( line == null || line.trim().isEmpty() ) {
          return Collections.emptyList();
       }
-      final List<String> tokens = new ArrayList<String>();
+      final List<String> tokens = new ArrayList<>();
       int startIndex = 0;
       int stopIndex = line.indexOf( separator );
       while ( stopIndex > 0 && stopIndex < line.length() ) {
@@ -49,10 +49,16 @@ final public class TokenUtil {
 
 
    static public String createBsvLine( final Collection<String> values ) {
+      if ( values == null ) {
+         return "";
+      }
       return createBsvLine( values.toArray( new String[values.size()] ) );
    }
 
    static public String createBsvLine( final String... values ) {
+      if ( values.length == 0 ) {
+         return "";
+      }
       final StringBuilder sb = new StringBuilder();
       for ( String value : values ) {
          sb.append( value ).append( "|" );
@@ -69,6 +75,9 @@ final public class TokenUtil {
    }
 
    static public String createCsvLine( final String... values ) {
+      if ( values.length == 0 ) {
+         return "";
+      }
       final StringBuilder sb = new StringBuilder();
       for ( String value : values ) {
          sb.append( value ).append( "," );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsFileName.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsFileName.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsFileName.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsFileName.java Wed Sep 10 17:30:42 2014
@@ -8,6 +8,8 @@ package org.apache.ctakes.dictionarytool
 public enum UmlsFileName {
    CUI_TUI_MAP( "MRSTY.RRF" ),
    CUI_TERM_MAP( "MRCONSO.RRF" ),
+   // TODO DEBUG
+   //   CUI_TERM_MAP( "MRCONSO_SNO_RX.RRF" ),
    RELATION_LIST( "MRREL.RRF" );
    final public String _filename;
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java Wed Sep 10 17:30:42 2014
@@ -1,5 +1,7 @@
 package org.apache.ctakes.dictionarytool.util;
 
+import org.apache.ctakes.dictionarytool.util.index.MrconsoIndex;
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.util.Collection;
@@ -8,6 +10,7 @@ import java.util.List;
 
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.CUI;
 import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.SOURCE;
+import static org.apache.ctakes.dictionarytool.util.index.MrconsoIndex.TERM_TYPE;
 
 /**
  * Author: SPF
@@ -25,23 +28,67 @@ final public class UmlsSourceTypeCuiVali
     *
     * @param rrfPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
     * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
-    * @param cuis        current list of cuis
+    * @param cuiCodes    current list of cui codes
     * @return Subset of cuis that exist in in the given sources
     */
-   static public Collection<String> getSourceTypeValidCuis( final String rrfPath,
-                                                            final Collection<String> sourceTypes,
-                                                            final Collection<String> cuis ) {
-      final Collection<String> validCuis = new HashSet<String>( cuis.size() );
+   static public Collection<Long> getSourceTypeValidCuis( final String rrfPath,
+                                                          final Collection<String> sourceTypes,
+                                                          final Collection<Long> cuiCodes ) {
+      final Collection<Long> validCuis = new HashSet<>( cuiCodes.size() );
       long lineCount = 0;
       try {
          final BufferedReader reader = FileUtil.createReader( rrfPath );
          List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
          while ( tokens != null ) {
             lineCount++;
-            if ( tokens.size() > SOURCE._index && sourceTypes.contains( tokens.get( SOURCE._index ) ) ) {
-               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI._index ) );
-               if ( cuis.contains( cui ) ) {
-                  validCuis.add( cui );
+            if ( tokens.size() > SOURCE._index && sourceTypes.contains( getToken( tokens, SOURCE ) ) ) {
+               final Long cuiCode = CuiTuiUtil.getCuiCode( getToken( tokens, CUI ) );
+               if ( cuiCodes.contains( cuiCode ) ) {
+                  validCuis.add( cuiCode );
+               }
+            }
+            if ( lineCount % 2000 == 0 ) {
+               System.out.print( "." );
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "File Line " + lineCount + "\t Valid Cuis " + validCuis.size() );
+               }
+            }
+            tokens = FileUtil.readBsvTokens( reader, rrfPath );
+         }
+         reader.close();
+      } catch ( IOException ioE ) {
+         System.err.println( ioE.getMessage() );
+      }
+      System.out.println( "File Lines " + lineCount + "\t Valid Cuis " + validCuis.size() );
+      return validCuis;
+   }
+
+   /**
+    * Can cull the given collection of cuis
+    *
+    * @param rrfPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
+    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
+    * @param cuiCodes    current list of cui codes
+    * @param termTypes   desired term type names as appear in rrf: IN, PIN, MIN, BN, etc.
+    * @return Subset of cuis that exist in the given sources with one of the given term types
+    */
+   static public Collection<Long> getSourceTypeValidCuis( final String rrfPath,
+                                                          final Collection<String> sourceTypes,
+                                                          final Collection<Long> cuiCodes,
+                                                          final Collection<String> termTypes ) {
+      final Collection<Long> validCuis = new HashSet<>( cuiCodes.size() );
+      long lineCount = 0;
+      try {
+         final BufferedReader reader = FileUtil.createReader( rrfPath );
+         List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
+         while ( tokens != null ) {
+            lineCount++;
+            if ( tokens.size() > SOURCE._index
+                  && sourceTypes.contains( getToken( tokens, SOURCE ) )
+                  && termTypes.contains( getToken( tokens, TERM_TYPE ) ) ) {
+               final Long cuiCode = CuiTuiUtil.getCuiCode( getToken( tokens, CUI ) );
+               if ( cuiCodes.contains( cuiCode ) ) {
+                  validCuis.add( cuiCode );
                }
             }
             if ( lineCount % 2000 == 0 ) {
@@ -65,21 +112,24 @@ final public class UmlsSourceTypeCuiVali
     *
     * @param rrfPath     path to the UMLS_ROOT Meta/MRCONSO.RRF file
     * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
-    * @param cuis        current list of cuis
+    * @param cuiCodes    current list of cui codes
     * @return Subset of cuis that don't exist in in the given sources
     */
-   static public Collection<String> getSourceTypeInvalidCuis( final String rrfPath,
-                                                              final Collection<String> sourceTypes,
-                                                              final Collection<String> cuis ) {
-      final Collection<String> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuis );
-      final Collection<String> invalidCuis = new HashSet<String>( cuis.size() - validCuis.size() );
-      for ( String cui : cuis ) {
-         if ( !validCuis.contains( cui ) ) {
-            invalidCuis.add( cui );
+   static public Collection<Long> getSourceTypeInvalidCuis( final String rrfPath,
+                                                            final Collection<String> sourceTypes,
+                                                            final Collection<Long> cuiCodes ) {
+      final Collection<Long> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuiCodes );
+      final Collection<Long> invalidCuis = new HashSet<>( cuiCodes.size() - validCuis.size() );
+      for ( Long cuiCode : cuiCodes ) {
+         if ( !validCuis.contains( cuiCode ) ) {
+            invalidCuis.add( cuiCode );
          }
       }
       return invalidCuis;
    }
 
+   static private String getToken( final List<String> tokens, final MrconsoIndex mrconsoIndex ) {
+      return tokens.get( mrconsoIndex._index );   // no bounds check here: callers must ensure the row is long enough
+   }
 
 }

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java Wed Sep 10 17:30:42 2014
@@ -66,10 +66,37 @@ final public class UmlsTermUtil {
    }
 
    public Collection<String> getFormattedTexts( final String text ) {
-      return getFormattedTexts( text, true );
+      return getFormattedTexts( text, true, 1, Integer.MAX_VALUE );
    }
 
-   public Collection<String> getFormattedTexts( final String text, final boolean extractAbbreviations ) {
+   /** Removes, in place, terms with fewer than minWordLength characters and terms having more than
+    *  maxWordCount words of three or more characters; returns the same (modified) collection. */
+   public Collection<String> getFormattedTexts( final Collection<String> extractedTerms,
+                                                final int minWordLength, final int maxWordCount ) {
+      final Collection<String> rejects = new HashSet<>();
+      for ( String candidate : extractedTerms ) {
+         if ( candidate.length() < minWordLength ) {
+            rejects.add( candidate );
+            continue;
+         }
+         final String[] words = candidate.split( "\\s+" );
+         if ( words.length <= maxWordCount ) {
+            continue;
+         }
+         int longWordCount = 0;
+         for ( String word : words ) {
+            if ( word.length() > 2 && ++longWordCount > maxWordCount ) {
+               rejects.add( candidate );
+               break;
+            }
+         }
+      }
+      extractedTerms.removeAll( rejects );
+      return extractedTerms;
+   }
+
+   public Collection<String> getFormattedTexts( final String text, final boolean extractAbbreviations,
+                                                final int minWordLength, final int maxWordCount ) {
       final String tokenizedText = TextTokenizer.getTokenizedText( text );
       if ( tokenizedText == null || tokenizedText.isEmpty() ) {
          return Collections.emptyList();
@@ -94,7 +121,7 @@ final public class UmlsTermUtil {
       }
       if ( !extractedTerms.isEmpty() ) {
          extractedTerms.add( validText );
-         return getPluralTerms( getValidTexts( extractedTerms ) );
+         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
       }
       // Check for embedded and / or terms
       if ( extractedTerms.isEmpty() ) {
@@ -120,16 +147,16 @@ final public class UmlsTermUtil {
          //         for ( String et : extractedTerms ) {
          //            System.out.println("  " + et);
          //         }
-         return getPluralTerms( getValidTexts( extractedTerms ) );
+         return getFormattedTexts( getPluralTerms( getValidTexts( extractedTerms ) ), minWordLength, maxWordCount );
       } else {
-         Collection<String> texts = new HashSet<String>( 1 );
+         Collection<String> texts = new HashSet<>( 1 );
          texts.add( validText );
-         return getPluralTerms( getValidTexts( texts ) );
+         return getFormattedTexts( getPluralTerms( getValidTexts( texts ) ), minWordLength, maxWordCount );
       }
    }
 
    static private Collection<String> getPluralTerms( final Collection<String> texts ) {
-      final Collection<String> plurals = new HashSet<String>();
+      final Collection<String> plurals = new HashSet<>();
       for ( String text : texts ) {
          if ( text.endsWith( "( s )" ) ) {
             final String singular = text.substring( 0, text.length() - 5 ).trim();
@@ -174,7 +201,7 @@ final public class UmlsTermUtil {
    }
 
    private Collection<String> getValidTexts( final Collection<String> texts ) {
-      final Collection<String> validTexts = new HashSet<String>( texts.size() );
+      final Collection<String> validTexts = new HashSet<>( texts.size() );
       for ( String text : texts ) {
          validTexts.add( getValidText( text ) );
       }
@@ -255,7 +282,7 @@ final public class UmlsTermUtil {
             final String abbrTerm
                   = abbreviation.replace( ":", "" ).replace( "(", "" ).replace( ")", "" ).replace( "-", "" )
                   .replace( "[", "" ).replace( "]", "" ).replace( "&", "" ).trim();
-            final Collection<String> extractedAbbreviations = new HashSet<String>( 2 );
+            final Collection<String> extractedAbbreviations = new HashSet<>( 2 );
             extractedAbbreviations.add( noAbbrTerm );
             extractedAbbreviations.add( abbrTerm );
             return extractedAbbreviations;
@@ -270,7 +297,7 @@ final public class UmlsTermUtil {
             final String mainText = tokenizedText.substring( 0, tokenizedText.length() - modifier.length() ).trim();
             final String modifierText
                   = modifier.replace( "(", "" ).replace( ")", "" ).replace( "-", "" ).replace( ",", "" ).trim();
-            final Collection<String> modifiedTexts = new HashSet<String>( 2 );
+            final Collection<String> modifiedTexts = new HashSet<>( 2 );
             modifiedTexts.add( tokenizedText );
             modifiedTexts.add( modifierText + " " + mainText );
             return modifiedTexts;
@@ -304,7 +331,7 @@ final public class UmlsTermUtil {
                acronym.length() - 1 ) ) {
             return Collections.emptyList();
          }
-         final Collection<String> extractedAbbreviations = new HashSet<String>( 2 );
+         final Collection<String> extractedAbbreviations = new HashSet<>( 2 );
          extractedAbbreviations.add( acronym );
          extractedAbbreviations.add( definition );
          return extractedAbbreviations;
@@ -326,7 +353,7 @@ final public class UmlsTermUtil {
       if ( andOrIndex > 0 ) {
          splitter = "\\] & / or \\[";
       }
-      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      final Collection<String> extractedTerms = new HashSet<>( 2 );
       final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
       final String types = tokenizedText.substring( colonIndex + 1 ).trim();
       final String[] splits = types.split( splitter );
@@ -348,7 +375,7 @@ final public class UmlsTermUtil {
       if ( andIndex < 0 || tokenizedText.indexOf( "] or [" ) < andIndex ) {
          return Collections.emptyList();
       }
-      final Collection<String> extractedTerms = new HashSet<String>( 3 );
+      final Collection<String> extractedTerms = new HashSet<>( 3 );
       final String thing = tokenizedText.substring( 0, andIndex - 1 ).trim();
       extractedTerms.add( thing );
       final String types = tokenizedText.substring( andIndex + 3 ).trim();
@@ -370,13 +397,13 @@ final public class UmlsTermUtil {
          final String ofTerm = tokenizedText.substring( lastOf ).trim();
          final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
                                                                                                       lastOf ).trim() );
-         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
          for ( String ofText : ofExtractions ) {
             ofTexts.add( ofText + " " + ofTerm );
          }
          return ofTexts;
       }
-      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      final Collection<String> extractedTerms = new HashSet<>( 2 );
       String splitter = "\\] or \\[";
       if ( tokenizedText.contains( "] & / or [" ) ) {
          splitter = "\\] & / or \\[";
@@ -400,13 +427,13 @@ final public class UmlsTermUtil {
          final String ofTerm = tokenizedText.substring( lastOf ).trim();
          final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0,
                                                                                                       lastOf ).trim() );
-         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         final Collection<String> ofTexts = new HashSet<>( ofExtractions.size() );
          for ( String ofText : ofExtractions ) {
             ofTexts.add( ofText + " " + ofTerm );
          }
          return ofTexts;
       }
-      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      final Collection<String> extractedTerms = new HashSet<>( 2 );
       String splitter = "\\) or \\(";
       if ( tokenizedText.contains( ") & / or (" ) ) {
          splitter = "\\) & / or \\(";
@@ -435,7 +462,7 @@ final public class UmlsTermUtil {
       if ( andOrIndex > 0 ) {
          splitter = "\\) & / or \\(";
       }
-      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      final Collection<String> extractedTerms = new HashSet<>( 2 );
       final String thing = tokenizedText.substring( 0, colonIndex - 1 ).trim();
       final String types = tokenizedText.substring( colonIndex + 1 ).trim();
       final String[] splits = types.split( splitter );
@@ -457,7 +484,7 @@ final public class UmlsTermUtil {
       if ( otherIndex < 0 ) {
          return Collections.emptyList();
       }
-      final Collection<String> otherTexts = new HashSet<String>( 2 );
+      final Collection<String> otherTexts = new HashSet<>( 2 );
       otherTexts.add( tokenizedText.substring( 0, otherIndex ).trim() );
       otherTexts.add( tokenizedText.substring( otherIndex + 14 ).trim() );
       return otherTexts;

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,112 @@
+package org.apache.ctakes.dictionarytool.util.collection;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * HashMap from a key to an ArrayList of values; place, obtain and addAll create a key's list on first use.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 7/23/14
+ */
+final public class ArrayListMap<K, V> extends HashMap<K, List<V>> implements CollectionMap<K, V> {
+
+   public ArrayListMap() {
+      super();
+   }
+
+   /**
+    * @param size initial capacity passed to the HashMap constructor
+    */
+   public ArrayListMap( final int size ) {
+      super( size );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean containsValue( final K key, final V value ) {
+      final List<V> list = get( key );
+      return list == null ? false : list.contains( value );
+   }
+
+   /**
+    * {@inheritDoc}
+    * The list for the key is created if needed; List.add always reports <tt>true</tt>, duplicates included.
+    */
+   @Override
+   public boolean place( final K key, final V value ) {
+      return obtain( key ).add( value );
+   }
+
+   /**
+    * {@inheritDoc}
+    * Never returns null: an empty list is created and stored for a key that has none.
+    */
+   @Override
+   public Collection<V> obtain( final K key ) {
+      final List<V> existing = get( key );
+      if ( existing != null ) {
+         return existing;
+      }
+      final List<V> created = new ArrayList<>();
+      put( key, created );
+      return created;
+   }
+
+   /**
+    * {@inheritDoc}
+    * Removes the first occurrence only; the key stays mapped even when its list becomes empty.
+    * NOTE(review): Java 8 added Map.remove(Object, Object); this overload may clash with it - confirm.
+    */
+   @Override
+   public void remove( final K key, final V value ) {
+      final List<V> list = get( key );
+      if ( list != null ) {
+         list.remove( value );
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    * The count is the growth in size of the key's list.
+    */
+   @Override
+   public int addAll( final K key, final Collection<V> collection ) {
+      final Collection<V> target = obtain( key );
+      final int sizeBefore = target.size();
+      target.addAll( collection );
+      return target.size() - sizeBefore;
+   }
+
+   /**
+    * {@inheritDoc}
+    * The key itself stays mapped, to an empty list.
+    */
+   @Override
+   public void clear( final K key ) {
+      final List<V> list = get( key );
+      if ( list == null ) {
+         return;
+      }
+      list.clear();
+   }
+
+   /**
+    * {@inheritDoc}
+    * The new map holds the very same list instances as this map; the values are not copied.
+    */
+   @Override
+   public Map<K, Collection<V>> toSimpleMap() {
+      final Map<K, Collection<V>> plainMap = new HashMap<>( size() );
+      for ( K key : keySet() ) {
+         plainMap.put( key, obtain( key ) );
+      }
+      return plainMap;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/ArrayListMap.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,80 @@
+package org.apache.ctakes.dictionarytool.util.collection;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 6/24/14
+ */
+public interface CollectionMap<K, V> {
+
+   /**
+    * Tests whether a collection is mapped to the given key.
+    *
+    * @param key key for internal collection
+    * @return <tt>true</tt> if this CollectionMap contains the key
+    */
+   boolean containsKey( K key );
+
+   /**
+    * Tests whether the collection mapped to the given key holds the given value.
+    *
+    * @param key   key for internal collection
+    * @param value value to look for in the internal collection
+    * @return <tt>true</tt> if a collection is mapped to the key and that collection contains the value
+    */
+   boolean containsValue( K key, V value );
+
+   // TODO rename place > forcePut , obtain > forceGet ?
+
+   /**
+    * Adds the value to the collection mapped to the key.
+    *
+    * @param key   key for internal collection
+    * @param value value to add to the internal collection
+    * @return <tt>true</tt> if the internal collection changed as a result of the call
+    */
+   boolean place( K key, V value );
+
+   /**
+    * Fetches the collection mapped to the key.
+    *
+    * @param key key for internal collection
+    * @return collection mapped to the key; ArrayListMap and HashSetMap create an empty one if the key has none
+    */
+   Collection<V> obtain( K key );
+
+   /**
+    * Removes the value from the collection mapped to the key.
+    *
+    * @param key   key for internal collection
+    * @param value value to remove from the internal collection
+    */
+   void remove( K key, V value );
+
+   /**
+    * Adds every element of the given collection to the internal collection mapped to the key.
+    *
+    * @param key        key for internal collection
+    * @param collection collection of values to add to the internal collection
+    * @return the number of new items added
+    */
+   int addAll( K key, Collection<V> collection );
+
+   /**
+    * Empties the collection mapped to the key.
+    *
+    * @param key key for internal collection
+    */
+   void clear( K key );
+
+   /**
+    * Converts this object to a plain java.util.Map of key to java.util.Collection.
+    *
+    * @return map of java.util.Collection
+    */
+   Map<K, Collection<V>> toSimpleMap();
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/CollectionMap.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,113 @@
+package org.apache.ctakes.dictionarytool.util.collection;
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * HashMap from a key to a HashSet of values; place, obtain and addAll create a key's set on first use.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 6/24/14
+ */
+final public class HashSetMap<K, V> extends HashMap<K, Set<V>> implements CollectionMap<K, V> {
+
+   public HashSetMap() {
+      super();
+   }
+
+   /**
+    * @param size initial capacity passed to the HashMap constructor
+    */
+   public HashSetMap( final int size ) {
+      super( size );
+   }
+
+   /**
+    * {@inheritDoc}
+    * A key with no set yields <tt>false</tt>.
+    */
+   @Override
+   public boolean containsValue( final K key, final V value ) {
+      final Set<V> set = get( key );
+      return set == null ? false : set.contains( value );
+   }
+
+   /**
+    * {@inheritDoc}
+    * The set for the key is created if needed; <tt>false</tt> means the value was already present.
+    */
+   @Override
+   public boolean place( final K key, final V value ) {
+      return obtain( key ).add( value );
+   }
+
+   /**
+    * {@inheritDoc}
+    * Never returns null: an empty set is created and stored for a key that has none.
+    */
+   @Override
+   public Collection<V> obtain( final K key ) {
+      final Set<V> existing = get( key );
+      if ( existing != null ) {
+         return existing;
+      }
+      final Set<V> created = new HashSet<>();
+      put( key, created );
+      return created;
+   }
+
+   /**
+    * {@inheritDoc}
+    * The key stays mapped even when its set becomes empty.
+    * NOTE(review): Java 8 added Map.remove(Object, Object); this overload may clash with it - confirm.
+    */
+   @Override
+   public void remove( final K key, final V value ) {
+      final Set<V> set = get( key );
+      if ( set != null ) {
+         set.remove( value );
+      }
+   }
+
+   /**
+    * {@inheritDoc}
+    * The count is the growth in size of the key's set, so values already present are not counted.
+    */
+   @Override
+   public int addAll( final K key, final Collection<V> collection ) {
+      final Collection<V> target = obtain( key );
+      final int sizeBefore = target.size();
+      target.addAll( collection );
+      return target.size() - sizeBefore;
+   }
+
+   /**
+    * {@inheritDoc}
+    * The key itself stays mapped, to an empty set.
+    */
+   @Override
+   public void clear( final K key ) {
+      final Set<V> set = get( key );
+      if ( set == null ) {
+         return;
+      }
+      set.clear();
+   }
+
+   /**
+    * {@inheritDoc}
+    * The new map holds the very same set instances as this map; the values are not copied.
+    */
+   @Override
+   public Map<K, Collection<V>> toSimpleMap() {
+      final Map<K, Collection<V>> plainMap = new HashMap<>( size() );
+      for ( K key : keySet() ) {
+         plainMap.put( key, obtain( key ) );
+      }
+      return plainMap;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/collection/HashSetMap.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/index/MrconsoIndex.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/index/MrconsoIndex.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/index/MrconsoIndex.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/index/MrconsoIndex.java Wed Sep 10 17:30:42 2014
@@ -6,7 +6,7 @@ package org.apache.ctakes.dictionarytool
  * Date: 1/23/14
  */
 public enum MrconsoIndex {
-   CUI( 0 ), LANGUAGE( 1 ), STATUS( 2 ), SOURCE( 11 ), SOURCE_CODE( 13 ), TEXT( 14 );
+   CUI( 0 ), LANGUAGE( 1 ), STATUS( 2 ), FORM( 4 ), SOURCE( 11 ), TERM_TYPE( 12 ), SOURCE_CODE( 13 ), TEXT( 14 );
    final public int _index;
 
    private MrconsoIndex( final int index ) {

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,122 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Collection;
+
+import static org.apache.ctakes.dictionarytool.reader.UmlsCodesForCuisReader.CuiCodeInfo;
+
+/**
+ * Writes the codes of {@link CuiCodeInfo} items to database tables, one table per code name.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/2/14
+ */
+final public class CuiCodesDbWriter {
+
+   /**
+    * How the text of a code is converted and bound to parameter 2 of a row insert statement.
+    */
+   private enum CodeType {
+      INT {
+         @Override
+         void bindCode( final PreparedStatement rowInsertSql, final String code ) throws SQLException {
+            rowInsertSql.setInt( 2, Integer.parseInt( code ) );
+         }
+      },
+      LONG {
+         @Override
+         void bindCode( final PreparedStatement rowInsertSql, final String code ) throws SQLException {
+            rowInsertSql.setLong( 2, Long.parseLong( code ) );
+         }
+      },
+      STRING {
+         @Override
+         void bindCode( final PreparedStatement rowInsertSql, final String code ) throws SQLException {
+            rowInsertSql.setString( 2, code );
+         }
+      };
+
+      /**
+       * @param rowInsertSql statement on which the code parameter is set
+       * @param code         text of the code
+       * @throws SQLException          if the parameter cannot be set
+       * @throws NumberFormatException if the text cannot be parsed for a numeric type
+       */
+      abstract void bindCode( final PreparedStatement rowInsertSql, final String code ) throws SQLException;
+   }
+
+   private CuiCodesDbWriter() {
+   }
+
+   /**
+    * Writes the TUI, SNOMEDCT, RXNORM, ICD9CM, ICD10PCS and PREFTERM tables, in that order.
+    * An SQL failure is reported on System.err and ends only the table that is being written.
+    *
+    * @param cuiCodeInfos cuis and the codes that each has per code name
+    * @param url          passed to JdbcUtil to create the database connection
+    * @param user         passed to JdbcUtil to create the database connection
+    * @param pass         passed to JdbcUtil to create the database connection
+    */
+   static public void writeCuiCodeInfo( final Collection<CuiCodeInfo> cuiCodeInfos,
+                                        final String url, final String user, final String pass ) {
+      // NOTE(review): the connection is never closed here, as before - confirm who owns it before closing it.
+      final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+      writeCuiTable( connection, "TUI", CodeType.INT, cuiCodeInfos );
+      writeCuiTable( connection, "SNOMEDCT", CodeType.LONG, cuiCodeInfos );
+      writeCuiTable( connection, "RXNORM", CodeType.LONG, cuiCodeInfos );
+      writeCuiTable( connection, "ICD9CM", CodeType.STRING, cuiCodeInfos );
+      writeCuiTable( connection, "ICD10PCS", CodeType.STRING, cuiCodeInfos );
+      writeCuiTable( connection, "PREFTERM", CodeType.STRING, cuiCodeInfos );
+   }
+
+   /**
+    * Inserts one row per cui and code into the named table, then executes a commit.
+    * A code that cannot be parsed for the code type is reported on System.err and skipped.
+    * Both statements are closed on every path, including when an insert fails.
+    *
+    * @param connection   database connection on which the statements are created
+    * @param name         name used for the table, for its code column and for the codes to obtain
+    * @param codeType     how each code is bound to the code parameter
+    * @param cuiCodeInfos cuis and the codes that each has per code name
+    */
+   static private void writeCuiTable( final Connection connection, final String name, final CodeType codeType,
+                                      final Collection<CuiCodeInfo> cuiCodeInfos ) {
+      final String sql = JdbcUtil.createRowInsertSql( name, "CUI", name );
+      long lineCount = 0;
+      try ( PreparedStatement rowInsertSql = connection.prepareStatement( sql ) ) {
+         for ( CuiCodeInfo cuiCodeInfo : cuiCodeInfos ) {
+            final Collection<String> codes = cuiCodeInfo.obtain( name );
+            if ( codes == null || codes.isEmpty() ) {
+               continue;
+            }
+            for ( String code : codes ) {
+               try {
+                  // The code is bound first so that an unparsable code is skipped before anything else is done
+                  codeType.bindCode( rowInsertSql, code );
+                  rowInsertSql.setLong( 1, cuiCodeInfo.getCuiCode() );
+                  rowInsertSql.executeUpdate();
+                  lineCount++;
+                  if ( lineCount % 100000 == 0 ) {
+                     System.out.println( "DB Row " + lineCount );
+                  }
+               } catch ( NumberFormatException nfE ) {
+                  System.err.println( "Could not create Code for " + code );
+               }
+            }
+         }
+         System.out.println( "DB Rows " + lineCount );
+         try ( Statement statement = connection.createStatement() ) {
+            statement.execute( "commit" );
+         }
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      }
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesDbWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,70 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Collection;
+
+import static org.apache.ctakes.dictionarytool.reader.UmlsCodesForCuisReader.CuiCodeInfo;
+
+/**
+ * Writes cuis and their codes to a text file, one line per cui item.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 3/28/14
+ */
+final public class CuiCodesWriter {
+
+   private CuiCodesWriter() {
+   }
+
+   /**
+    * Writes one line per item, built by {@code TokenUtil.createBsvLine} from the cui, its tuis and its
+    * SNOMEDCT, RXNORM, ICD9CM, ICD10PCS and PREFTERM codes.  Progress is printed every 100000 lines.
+    * An I/O failure is reported with its cause on System.err and ends the writing; it is not propagated.
+    *
+    * @param termFilePath path of the file to write
+    * @param cuiCodeInfos cuis and the codes that each has per code name
+    */
+   static public void writeCuiCodeInfo( final String termFilePath,
+                                        final Collection<CuiCodeInfo> cuiCodeInfos ) {
+      System.out.println( "Writing map of Cuis and Codes to " + termFilePath );
+      long lineCount = 0;
+      // try-with-resources closes the writer on every path, an explicit close is not needed
+      try ( BufferedWriter writer = FileUtil.createWriter( termFilePath ) ) {
+         for ( CuiCodeInfo cuiCodeInfo : cuiCodeInfos ) {
+            lineCount++;
+            writer.write( TokenUtil.createBsvLine( cuiCodeInfo.getCui(),
+                                                   TokenUtil.createCsvLine(
+                                                         CuiTuiUtil.getStringAsTuis( cuiCodeInfo.obtain( "TUI" ) ) ),
+                                                   createField( cuiCodeInfo, "SNOMEDCT" ),
+                                                   createField( cuiCodeInfo, "RXNORM" ),
+                                                   createField( cuiCodeInfo, "ICD9CM" ),
+                                                   createField( cuiCodeInfo, "ICD10PCS" ),
+                                                   createField( cuiCodeInfo, "PREFTERM" ) ) );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing Term on line " + lineCount + " in file " + termFilePath
+                             + " : " + ioE.getMessage() );
+         // The file is incomplete, so it is not reported as written
+         return;
+      }
+      System.out.println( "Wrote " + lineCount + " terms to " + termFilePath );
+   }
+
+   /**
+    * @return the codes that the cui has for the code name, joined by {@code TokenUtil.createCsvLine}
+    */
+   static private String createField( final CuiCodeInfo cuiCodeInfo, final String codeName ) {
+      return TokenUtil.createCsvLine( cuiCodeInfo.obtain( codeName ) );
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiCodesWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,20 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
+/**
+ * Declares the method by which a cui map is written, given two {@link HashSetMap}s.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/2/14
+ */
+public interface CuiMapWriter {
+
+   /**
+    * @param cuiTuis  sets of Strings by Integer key - presumably tuis by cui code, confirm with implementations
+    * @param cuiTexts sets of Strings by Integer key - presumably texts by cui code, confirm with implementations
+    */
+   void writeCuiMap( HashSetMap<Integer, String> cuiTuis, HashSetMap<Integer, String> cuiTexts );
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiMapWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java?rev=1624062&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java Wed Sep 10 17:30:42 2014
@@ -0,0 +1,20 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
+
+/**
+ * Declares the method by which cui terms are written, given two {@link HashSetMap}s.
+ * <p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/2/14
+ */
+public interface CuiTermWriter {
+
+   /**
+    * @param cuiTuis  sets of Strings by Integer key - presumably tuis by cui code, confirm with implementations
+    * @param cuiTexts sets of Strings by Integer key - presumably texts by cui code, confirm with implementations
+    */
+   void writeCuiTerms( HashSetMap<Integer, String> cuiTuis, HashSetMap<Integer, String> cuiTexts );
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTermWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java Wed Sep 10 17:30:42 2014
@@ -1,12 +1,14 @@
 package org.apache.ctakes.dictionarytool.writer;
 
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.FileUtil;
 import org.apache.ctakes.dictionarytool.util.TokenUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.io.BufferedWriter;
 import java.io.IOException;
-import java.util.Collection;
 import java.util.Map;
+import java.util.Set;
 
 
 /**
@@ -19,13 +21,14 @@ final public class CuiTextsMapWriter {
    private CuiTextsMapWriter() {
    }
 
-   static public void writeCuiTexts( final String termFilePath, final Map<String, Collection<String>> cuiTexts ) {
+   static public void writeCuiTexts( final String termFilePath, final HashSetMap<Long, String> cuiTexts ) {
       System.out.println( "Writing map of Cuis and Texts to " + termFilePath );
       long lineCount = 0;
       try {
          final BufferedWriter writer = FileUtil.createWriter( termFilePath );
-         for ( Map.Entry<String, Collection<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
-            final String cui = cuiTextsEntry.getKey();
+         for ( Map.Entry<Long, Set<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
+            final Long code = cuiTextsEntry.getKey();
+            final String cui = CuiTuiUtil.getAsCui( code );
             for ( String text : cuiTextsEntry.getValue() ) {
                lineCount++;
                writer.write( TokenUtil.createBsvLine( cui, text ) );

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java Wed Sep 10 17:30:42 2014
@@ -1,9 +1,7 @@
 package org.apache.ctakes.dictionarytool.writer;
 
 import org.apache.ctakes.dictionarytool.util.FileUtil;
-
-import java.util.Collection;
-import java.util.Map;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 /**
  * Author: SPF
@@ -17,7 +15,7 @@ final public class CuiTuiMapWriter {
 
 
    static private void writeCuiTuiMap( final String cuiTuiFilePath,
-                                       final Map<String, Collection<String>> cuisAndTuis ) {
+                                       final HashSetMap<String, String> cuisAndTuis ) {
       FileUtil.writeNamedSets( cuiTuiFilePath, "map of Cuis and Tuis", cuisAndTuis );
    }
 

Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java Wed Sep 10 17:30:42 2014
@@ -1,6 +1,8 @@
 package org.apache.ctakes.dictionarytool.writer;
 
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
 import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.sql.Connection;
 import java.sql.PreparedStatement;
@@ -8,6 +10,7 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.Collection;
 import java.util.Map;
+import java.util.Set;
 
 /**
  * <p>
@@ -42,38 +45,40 @@ final public class FirstWordDbWriter {
    }
 
 
-   static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
-                                      final Map<String, Collection<String>> cuiTexts,
+   static public void writeTermsToDb( final HashSetMap<Long, Integer> cuiTuis,
+                                      final HashSetMap<Long, String> cuiTexts,
                                       final String url, final String user, final String pass, final String tableName ) {
       final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
       final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
       System.out.println( "Writing to " + tableName );
       try {
-
          final PreparedStatement rowInsertSql = connection.prepareStatement( sql );
          long lineCount = 0;
-         for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
-            final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
-            if ( tuis == null ) {
+         for ( Map.Entry<Long, Set<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+            final Collection<Integer> tuiCodes = cuiTuis.get( cuiTextEntry.getKey() );
+            if ( tuiCodes == null ) {
                continue;
             }
+            final String cui = CuiTuiUtil.getAsCui( cuiTextEntry.getKey() );
             for ( String text : cuiTextEntry.getValue() ) {
                final String[] tokens = text.split( "\\s+" );
-               rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
-               rowInsertSql.setString( FIELD.FWORD.__index, tokens[0] );
-               rowInsertSql.setString( FIELD.TEXT.__index, text );
-               rowInsertSql.setString( FIELD.CODE.__index, cuiTextEntry.getKey() );
-               rowInsertSql.setString( FIELD.SOURCETYPE.__index, "UMLS_ROOT" );
-               rowInsertSql.setString( FIELD.TUI.__index, getSingleTui( tuis ) );
-               rowInsertSql.executeUpdate();
-               lineCount++;
-               if ( lineCount % 100000 == 0 ) {
-                  System.out.println( "DB Row " + lineCount );
+               for ( Integer tuiCode : tuiCodes ) {
+                  final String tui = CuiTuiUtil.getAsTui( tuiCode );
+                  rowInsertSql.setString( FIELD.CUI.__index, cui );
+                  rowInsertSql.setString( FIELD.FWORD.__index, tokens[0] );
+                  rowInsertSql.setString( FIELD.TEXT.__index, text );
+                  rowInsertSql.setString( FIELD.CODE.__index, cui );
+                  rowInsertSql.setString( FIELD.SOURCETYPE.__index, "UMLS_ROOT" );
+                  rowInsertSql.setString( FIELD.TUI.__index, tui );
+                  rowInsertSql.executeUpdate();
+                  lineCount++;
+                  if ( lineCount % 100000 == 0 ) {
+                     System.out.println( "DB Row " + lineCount );
+                  }
                }
             }
          }
          System.out.println( "DB Rows " + lineCount );
-
          final Statement statement = connection.createStatement();
          statement.execute( "commit" );
          rowInsertSql.close();
@@ -82,13 +87,12 @@ final public class FirstWordDbWriter {
       }
    }
 
-
-   static private String getSingleTui( final Collection<String> tuis ) {
-      for ( String tui : tuis ) {
-         return tui;
-      }
-      return "T000";
-   }
-
+   //   static private Integer getSingleTuiCode( final Collection<Integer> tuis ) {
+   //      for ( Integer tuiCode : tuis ) {
+   //         return tuiCode;
+   //      }
+   //      return 0;
+   //   }
+   //
 
 }



Mime
View raw message