ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1572710 [2/2] - in /ctakes/sandbox/dictionarytool: ./ data/ data/default/ lib/ src/ src/META-INF/ src/org/ src/org/apache/ src/org/apache/ctakes/ src/org/apache/ctakes/dictionarytool/ src/org/apache/ctakes/dictionarytool/reader/ src/org/ap...
Date Thu, 27 Feb 2014 19:19:55 GMT
Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,105 @@
+package org.apache.ctakes.dictionarytool.reader;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.RRF_INDEX;
+import org.apache.ctakes.dictionarytool.util.UmlsTermUtil;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Collects, per wanted Cui, the formatted English texts found in an RRF file.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+final public class UmlsTextsForCuisReader {
+
+//   static private final String RRF_PATH = "C:/Spiffy/App/umls/2013AA/2013AA/META/MRCONSO.RRF";
+
+
+   private UmlsTextsForCuisReader() {
+   }
+
+   /**
+    * Reads the file row by row and gathers the formatted texts of every English row whose Cui is wanted.
+    * The returned sets are created here; collections handed out by umlsTermUtil are copied, never stored
+    * or modified.
+    *
+    * @param rrfPath      path to the RRF file; each row is tokenized with {@link FileUtil#readBsvTokens}
+    * @param wantedCuis   Cuis for which texts should be collected
+    * @param umlsTermUtil supplies the formatted texts for the raw text of a row
+    * @return map of Cui to the distinct formatted texts found for it; Cuis without texts are absent
+    */
+   static public Map<String, Collection<String>> readTextsForCuis( final String rrfPath,
+                                                                    final Collection<String> wantedCuis,
+                                                                    final UmlsTermUtil umlsTermUtil ) {
+      System.out.println( "Compiling map of Umls Cuis and Texts" );
+      long lineCount = 0;
+      long textCount = 0;
+      final Map<String, Collection<String>> cuisAndText = new HashMap<String, Collection<String>>( wantedCuis.size() );
+      final BufferedReader reader = FileUtil.createReader( rrfPath );
+      try {
+         List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
+         while ( tokens != null ) {
+            lineCount++;
+            textCount += addTexts( tokens, wantedCuis, umlsTermUtil, cuisAndText );
+            // progress is reported for every row, including rows that contributed nothing
+            if ( lineCount % 2000 == 0 ) {
+               System.out.print( "." );
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "File Line " + lineCount + "\t Terms " + textCount );
+               }
+            }
+            tokens = FileUtil.readBsvTokens( reader, rrfPath );
+         }
+      } finally {
+         // release the file handle even when a runtime exception escapes the loop
+         try {
+            reader.close();
+         } catch ( IOException ioE ) {
+            System.err.println( ioE.getMessage() );
+         }
+      }
+      System.out.println( "File Line " + lineCount + "\t Terms " + textCount );
+      return cuisAndText;
+   }
+
+   /**
+    * Adds the formatted texts of a single row to the map when the row is English and its Cui is wanted.
+    *
+    * @return number of texts that were new for the row's Cui
+    */
+   static private int addTexts( final List<String> tokens,
+                                final Collection<String> wantedCuis,
+                                final UmlsTermUtil umlsTermUtil,
+                                final Map<String, Collection<String>> cuisAndText ) {
+      if ( tokens.size() <= RRF_INDEX.TEXT._index || !tokens.get( RRF_INDEX.LANGUAGE._index ).equals( "ENG" ) ) {
+         return 0;
+      }
+      final String cui = CuiTuiUtil.getAsCui( tokens.get( RRF_INDEX.CUI._index ) );
+      if ( !wantedCuis.contains( cui ) ) {
+         return 0;
+      }
+      final Collection<String> formattedTexts = umlsTermUtil.getFormattedTexts( tokens.get( RRF_INDEX.TEXT._index ) );
+      if ( formattedTexts == null || formattedTexts.isEmpty() ) {
+         return 0;
+      }
+      Collection<String> textsForCui = cuisAndText.get( cui );
+      if ( textsForCui == null ) {
+         // own set per Cui: the formatter's collection may be shared or unmodifiable
+         textsForCui = new HashSet<String>();
+         cuisAndText.put( cui, textsForCui );
+      }
+      final int oldSize = textsForCui.size();
+      textsForCui.addAll( formattedTexts );
+      return textsForCui.size() - oldSize;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTextsForCuisReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,87 @@
+package org.apache.ctakes.dictionarytool.reader;
+
+import org.apache.ctakes.dictionarytool.util.CuiTuiUtil;
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * Collects, per wanted Cui, the Tuis listed for it in a Cui to Tui mapping file.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+final public class UmlsTuisForCuisReader {
+
+   private UmlsTuisForCuisReader() {
+   }
+
+//   static private final String CUI_TUI_PATH = "C:/Spiffy/App/umls/2013AA/2013AA/META/MRSTY.RRF";
+
+   // token positions of the Cui and the Tui within a row of the mapping file
+   static private final int CUI_INDEX = 0;
+   static private final int TUI_INDEX = 1;
+
+   /**
+    * Reads the mapping file row by row and gathers the Tuis of every row whose Cui is wanted.
+    * Wanted Cuis that never appear in the file are reported on standard out.
+    * The given collection of Cuis is only read, never modified.
+    *
+    * @param cuiTuiMapPath path to the mapping file; each row is tokenized with {@link FileUtil#readBsvTokens}
+    * @param cuis          Cuis for which Tuis should be collected
+    * @return map of Cui to the set of Tuis found for it; Cuis without Tuis are absent
+    */
+   static public Map<String, Collection<String>> readUmlsTuisForCuis( final String cuiTuiMapPath,
+                                                                      final Collection<String> cuis ) {
+      System.out.println( "Compiling list of Tuis for wanted Cuis using " + cuiTuiMapPath );
+      long lineCount = 0;
+      final Map<String, Collection<String>> cuisAndTuis = new HashMap<String, Collection<String>>( cuis.size() );
+      final BufferedReader reader = FileUtil.createReader( cuiTuiMapPath );
+      try {
+         List<String> tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
+         while ( tokens != null ) {
+            lineCount++;
+            if ( tokens.size() > TUI_INDEX ) {
+               final String cui = CuiTuiUtil.getAsCui( tokens.get( CUI_INDEX ) );
+               if ( cuis.contains( cui ) ) {
+                  Collection<String> tuis = cuisAndTuis.get( cui );
+                  if ( tuis == null ) {
+                     tuis = new HashSet<String>( 1 );
+                     cuisAndTuis.put( cui, tuis );
+                  }
+                  tuis.add( CuiTuiUtil.getAsTui( tokens.get( TUI_INDEX ) ) );
+               }
+            }
+            // progress is reported for every row, including rows of unwanted Cuis
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount + "\t Cuis " + cuisAndTuis.size() );
+            }
+            tokens = FileUtil.readBsvTokens( reader, cuiTuiMapPath );
+         }
+      } finally {
+         // release the file handle even when a runtime exception escapes the loop
+         try {
+            reader.close();
+         } catch ( IOException ioE ) {
+            System.err.println( ioE.getMessage() );
+         }
+      }
+      System.out.println( "File Lines " + lineCount + "\t Cuis " + cuisAndTuis.size() );
+      // Determine the missing Cuis on a copy; calling removeAll on the parameter would strip every found
+      // Cui from the caller's collection
+      final Collection<String> missingCuis = new HashSet<String>( cuis );
+      missingCuis.removeAll( cuisAndTuis.keySet() );
+      for ( String missingCui : missingCuis ) {
+         System.out.println( "Could not find Tuis for Cui " + missingCui );
+      }
+      return cuisAndTuis;
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/reader/UmlsTuisForCuisReader.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,187 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import net.jcip.annotations.NotThreadSafe;
+
+/**
+ * Command line properties of the Dictionary Creator.
+ * Option values are stored in the constants of {@link Option}, so they are shared by all instances.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 2/26/14
+ */
+@NotThreadSafe
+final public class CreatorProperties {
+
+   static private final String DEFAULT_DATA_DIR = "./data/default";
+   static private final String DEFAULT_TUI_FILE = DEFAULT_DATA_DIR + "/CtakesAllTuis.txt";
+   static private final String DEFAULT_SOURCE_FILE = DEFAULT_DATA_DIR + "/CtakesSources.txt";
+
+
+   private boolean _rareWordIndex = true;
+
+   /**
+    * Parses the command line.  Prints the help and exits the JVM when no arguments are given, when help is
+    * requested (-? or -h) or when a mandatory option is missing.
+    *
+    * @param args command line arguments
+    */
+   public CreatorProperties( final String ... args ) {
+      if ( args.length == 0 ) {
+         printHelp();
+         System.exit( 0 );
+      }
+      for ( String arg : args ) {
+         if ( arg.equalsIgnoreCase( "-fw" ) ) {
+            _rareWordIndex = false;
+         } else if ( arg.equals( "-?" ) || arg.equalsIgnoreCase( "-h" ) ) {
+            printHelp();
+            System.exit( 0 );
+         }
+      }
+      for ( Option option : Option.values() ) {
+         option.parseValue( args );
+      }
+      if ( !ensurePropertiesOk() ) {
+         printHelp();
+         System.exit( 1 );
+      }
+      assignCtakesDefaults();
+   }
+
+   /**
+    * Prints the usage text, including one line per {@link Option}, to standard out.
+    */
+   static private void printHelp() {
+      System.out.println( "Dictionary Creator: Creates a flat file Cui|Text or Database Dictionary from UMLS and Orangebook" );
+      System.out.println( "Database Dictionary can be indexed by each Text's First Word or Rarest Word (for the dictionary)" );
+      System.out.println( "Minimal Usage: DictionaryCreator -umls pathToUmlsRoot -ol pathToFlatFileOutput" );
+      System.out.println( "" );
+      System.out.println( "-fw \t\tCreate First Word Index" );
+//      System.out.println( "-ct \t\tUse cTakes default setup (default)" );
+      for ( Option option : Option.values() ) {
+         System.out.println( option.getHelp() );
+      }
+      System.out.println( "The UMLS Root Directory must be specified" );
+      System.out.println( "One form of output must be specified using either -ol or -db and -tbl" );
+      System.out.println( "The default index type for databases is Rare Word Index" );
+      System.out.println( "If an Orangebook Path is not specified then (orangebook) medication terms are not written" );
+      System.out.println( "If a Format Data Directory is not specified then the default is used: " + DEFAULT_DATA_DIR );
+      System.out.println( "If an Input Tui List Path is not specified then the cTakes Tuis are used: " + DEFAULT_TUI_FILE );
+      System.out.println( "If a Source Type List Path is not specified then Snomed is used: " + DEFAULT_SOURCE_FILE );
+   }
+
+   /**
+    * Checks the mandatory options: an output (-ol, or both -db and -tbl) and the UMLS root (-umls).
+    * Every problem is reported on standard error before the result is returned.
+    *
+    * @return true if all mandatory options have a value
+    */
+   private boolean ensurePropertiesOk() {
+      boolean ok = true;
+      if ( !Option.TERM_LIST.hasValue()
+            && (!Option.DATA_BASE.hasValue() || !Option.DATA_TABLE.hasValue()) ) {
+         System.err.println( "Need an output location" );
+         ok = false;
+      }
+      if ( !Option.UMLS_ROOT.hasValue() ) {
+         System.err.println( "Need an UMLS_ROOT root directory" );
+         // a missing UMLS root must fail the check, otherwise the run continues without input
+         ok = false;
+      }
+      return ok;
+   }
+
+   /**
+    * Fills in the default format data directory, Tui list and source list for options left unset.
+    */
+   private void assignCtakesDefaults() {
+      if ( !Option.FORMAT_DATA.hasValue() ) {
+         Option.FORMAT_DATA.parseValue( Option.FORMAT_DATA.__key, DEFAULT_DATA_DIR );
+      }
+      if ( !Option.TUI_LIST.hasValue() ) {
+         Option.TUI_LIST.parseValue( Option.TUI_LIST.__key, DEFAULT_TUI_FILE );
+      }
+      if ( !Option.SOURCE.hasValue() ) {
+         Option.SOURCE.parseValue( Option.SOURCE.__key, DEFAULT_SOURCE_FILE );
+      }
+   }
+
+   /**
+    * @return true if a rare word indexed dictionary should be created
+    */
+   public boolean isRareWordIndex() {
+      return _rareWordIndex;
+   }
+
+   /**
+    * Command line options that take a value; each constant holds its own parsed value.
+    */
+   static public enum Option {
+      UMLS_ROOT( "Umls Root Directory", "-umls" ),
+      ORANGE_BOOK( "Orangebook Path", "-ob" ),
+      FORMAT_DATA( "Format Data Directory", "-fd" ),
+      TUI_LIST( "Input Tui List Path", "-tui" ),
+//      SEM_LIST( "Input Semantic Group List Path", "-sem" ),
+      SOURCE( "Source Type List Path", "-src" ),
+      TERM_LIST( "Output Cui and Term List Path", "-ol" ),
+      DATA_BASE( "Output Database Url", "-db" ),
+      DATA_TABLE( "Output Database Table", "-tbl" );
+      final private String __name;
+      final private String __key;
+      private String __value;
+
+      private Option( final String name, final String key ) {
+         __name = name;
+         __key = key;
+      }
+
+      public String getName() {
+         return __name;
+      }
+
+      public String getKey() {
+         return __key;
+      }
+
+      public String getValue() {
+         return __value;
+      }
+
+      public boolean hasValue() {
+         return __value != null && !__value.isEmpty();
+      }
+
+      /**
+       * Takes the argument that follows this option's key as the value.  When the key is given more than
+       * once a warning is printed and the last value wins.
+       *
+       * @param args arguments to search for the key
+       */
+      private void parseValue( final String ... args ) {
+         if ( args == null || args.length == 0 ) {
+            // nothing to search; also keeps the last-argument check below within bounds
+            return;
+         }
+         if ( args[args.length-1].equalsIgnoreCase( __key ) ) {
+            System.err.println( "An argument is needed for " + __name + " (" + __key + ")" );
+            return;
+         }
+         for ( int i=0; i<args.length-1; i++ ) {
+            if ( args[i].equalsIgnoreCase( __key ) ) {
+               if ( hasValue() ) {
+                  System.err.println( __name + " (" + __key + ") has been set more than once" );
+               }
+               __value = args[i+1];
+               // don't break yet, check for repeat setting
+            }
+         }
+      }
+
+      public String getHelp() {
+         return getKey() + " \t\t" + getName();
+      }
+   }
+
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CreatorProperties.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,38 @@
+package org.apache.ctakes.dictionarytool.util;
+
+/**
+ * Normalizes Cui and Tui codes to their upper case, prefixed form.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+final public class CuiTuiUtil {
+
+   private CuiTuiUtil() {
+   }
+
+   /**
+    * @param code a Cui, with or without its leading "C", in any case
+    * @return the trimmed, upper case code, starting with "C"
+    */
+   static public String getAsCui( final String code ) {
+      return withPrefix( code, "C" );
+   }
+
+   /**
+    * @param code a Tui, with or without its leading "T", in any case
+    * @return the trimmed, upper case code, starting with "T"
+    */
+   static public String getAsTui( final String code ) {
+      return withPrefix( code, "T" );
+   }
+
+   /**
+    * Trims and upper cases the code, then prepends the prefix unless the code already starts with it.
+    */
+   static private String withPrefix( final String code, final String prefix ) {
+      final String normalized = code.trim().toUpperCase();
+      return normalized.startsWith( prefix ) ? normalized : prefix + normalized;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/CuiTuiUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,312 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import javax.swing.filechooser.FileSystemView;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.Closeable;
+import java.io.File;
+import java.io.FileReader;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+/**
+ * File helpers: path expansion, reader and writer creation, reading and writing of simple list files.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class FileUtil {
+
+   private FileUtil() {
+   }
+
+   static private final Logger LOGGER = Logger.getLogger( "FileUtil" );
+
+   /**
+    * Expands the shorthand at the start of a path: null, empty or "." become the working directory,
+    * a leading "~" becomes the user home and leading ".." steps climb from the working directory.
+    * Any other path is returned unchanged.  Exits the JVM when a ".." step cannot be taken.
+    *
+    * @param dirPath path that may start with "~", "." or ".."
+    * @return the expanded path
+    */
+   static public String parseDirText( final String dirPath ) {
+      if ( dirPath == null || dirPath.isEmpty() ) {
+         return parseDirText( "." );
+      } else if ( dirPath.startsWith( "~" ) ) {
+         final String userHome = System.getProperty( "user.home" );
+         if ( userHome == null || userHome.isEmpty() ) {
+            return dirPath;
+         }
+         // Swap only the leading tilde, by concatenation: replaceAll would also replace tildes inside names
+         // and reads '\' and '$' in the home path as escapes, which mangles Windows home directories
+         return parseDirText( userHome + dirPath.substring( 1 ) );
+      } else if ( dirPath.equals( "." ) ) {
+         final String userDir = System.getProperty("user.dir");
+         if ( userDir == null || userDir.isEmpty() ) {
+            return FileSystemView.getFileSystemView().getDefaultDirectory().getPath();
+         }
+         return userDir;
+      } else if ( isParentStep( dirPath ) ) {
+         File cwd = new File( parseDirText( "." ) );
+         String remainder = dirPath;
+         while ( isParentStep( remainder ) ) {
+            final File parent = cwd.isDirectory() ? cwd.getParentFile() : null;
+            if ( parent == null ) {
+               // not a directory, or already at the file system root
+               LOGGER.severe( "Invalid directory " + dirPath );
+               System.exit( 1 );
+            }
+            cwd = parent;
+            remainder = remainder.length() > 3 ? remainder.substring( 3 ) : "";
+         }
+         // keep whatever follows the ".." steps instead of dropping it
+         return remainder.isEmpty() ? cwd.getPath() : new File( cwd, remainder ).getPath();
+      }
+      return dirPath;
+   }
+
+   /**
+    * @return true if the path is ".." or starts with ".." followed by a path separator
+    */
+   static private boolean isParentStep( final String path ) {
+      return path.equals( ".." ) || path.startsWith( "../" ) || path.startsWith( "..\\" );
+   }
+
+   /**
+    * Opens a buffered reader on the file.  Exits the JVM when the file cannot be read.
+    * NOTE(review): FileReader decodes with the platform default charset - confirm that suits the input files.
+    *
+    * @param filePath path of the file, expanded with {@link #parseDirText}
+    * @return a reader on the file
+    */
+   static public BufferedReader createReader( final String filePath ) {
+      final String formattedPath = parseDirText( filePath );
+      final File file = new File( formattedPath );
+      if ( !file.canRead() ) {
+         System.err.println( "Cannot read file " + filePath );
+         System.exit( 1 );
+      }
+      try {
+         return new BufferedReader( new FileReader( file ) );
+      } catch ( IOException ioE ) {
+         System.err.println( "Cannot create Reader for " + filePath );
+         System.err.println( ioE.getMessage() );
+         System.exit( 1 );
+      }
+      return null;
+   }
+
+   /**
+    * Opens a buffered writer that appends to the file, creating missing parent directories first.
+    * Exits the JVM when the writer cannot be created.
+    *
+    * @param filePath path of the file, expanded with {@link #parseDirText}
+    * @return a writer on the file
+    */
+   static public BufferedWriter createWriter( final String filePath ) {
+      final String formattedPath = parseDirText( filePath );
+      final File file = new File( formattedPath );
+      if ( file.getParentFile() != null && !file.getParentFile().isDirectory() ) {
+         file.getParentFile().mkdirs();
+      }
+      try {
+         return new BufferedWriter( new FileWriter( file, true ) );
+      } catch ( IOException ioE ) {
+         System.err.println( "Cannot create Writer for " + filePath );
+         System.err.println( ioE.getMessage() );
+         System.exit( 1 );
+      }
+      return null;
+   }
+
+   /**
+    * Reads the next content line, skipping empty lines and lines that start with "//".
+    *
+    * @param reader   reader to take the line from
+    * @param filePath path of the file being read, used in the error message only
+    * @return the trimmed line, or null at the end of the file or after a read error
+    */
+   static public String readLine( final BufferedReader reader, final String filePath ) {
+      try {
+         String line = reader.readLine();
+         while ( line != null ) {
+            line = line.trim();
+            if ( !line.isEmpty() && !line.startsWith( "//" ) ) {
+               return line;
+            }
+            line = reader.readLine();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error reading from file " + filePath );
+      }
+      return null;
+   }
+
+   /**
+    * @return the next content line split by TokenUtil.getBsvItems, or null when no line is left
+    */
+   static public List<String> readBsvTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getBsvItems( line );
+   }
+
+   /**
+    * @return the next content line split by TokenUtil.getCsvItems, or null when no line is left
+    */
+   static public List<String> readCsvTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getCsvItems( line );
+   }
+
+   /**
+    * @return the next content line split by TokenUtil.getTildeItems, or null when no line is left
+    */
+   static public List<String> readTildeTokens( final BufferedReader reader, final String filePath ) {
+      final String line = readLine( reader, filePath );
+      if ( line == null ) {
+         return null;
+      }
+      return TokenUtil.getTildeItems( line );
+   }
+
+   /**
+    * Writes one item per line, appending to the file, and reports progress on standard out.
+    *
+    * @param filePath    path of the file to write
+    * @param description name of the written items, used in messages only
+    * @param list        items to write
+    */
+   static public void writeOneColumn( final String filePath, final String description,
+                                      final Collection<String> list ) {
+      System.out.println( "Writing " + description + " to " + filePath );
+      long lineCount = 0;
+      final BufferedWriter writer = createWriter( filePath );
+      try {
+         for ( String item : list ) {
+            lineCount++;
+            writer.write( item );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing " + description + " on line " + lineCount + " in file " + filePath );
+      } finally {
+         // always close, so buffered lines are flushed and the handle is released after a failure too
+         close( writer, filePath );
+      }
+      System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
+   }
+
+
+   /**
+    * Reads the distinct content lines of a file, reporting progress on standard out.
+    *
+    * @param listFilePath path of the file to read
+    * @param description  name of the read items, used in messages only
+    * @return the distinct content lines, in no particular order
+    */
+   static public Collection<String> readOneColumn( final String listFilePath, final String description ) {
+      System.out.println( "Reading " + description + " from " + listFilePath );
+      final Collection<String> listItems = new HashSet<String>();
+      long lineCount = 0;
+      final BufferedReader reader = createReader( listFilePath );
+      try {
+         String line = readLine( reader, listFilePath );
+         while ( line != null ) {
+            lineCount++;
+            listItems.add( line );
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+            line = readLine( reader, listFilePath );
+         }
+      } finally {
+         close( reader, listFilePath );
+      }
+      System.out.println( "File Lines " + lineCount + "\t " + description + " " + listItems.size() );
+      return listItems;
+   }
+
+
+   /**
+    * Writes one line per entry, appending to the file.  Each line is built from the entry's name and
+    * values with TokenUtil.createBsvLine and TokenUtil.createCsvLine.
+    *
+    * @param filePath    path of the file to write
+    * @param description name of the written items, used in messages only
+    * @param namedSets   names and their values
+    */
+   static public void writeNamedSets( final String filePath, final String description,
+                                      final Map<String, Collection<String>> namedSets ) {
+      System.out.println( "Writing " + description + " to " + filePath );
+      long lineCount = 0;
+      final BufferedWriter writer = createWriter( filePath );
+      try {
+         for ( Map.Entry<String, Collection<String>> namedSet : namedSets.entrySet() ) {
+            lineCount++;
+            writer.write( TokenUtil.createBsvLine( namedSet.getKey(),
+                                                   TokenUtil.createCsvLine( namedSet.getValue() ) ) );
+            writer.newLine();
+            if ( lineCount % 100000 == 0 ) {
+               System.out.println( "File Line " + lineCount );
+            }
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing " + description + " on line " + lineCount + " in file " + filePath );
+      } finally {
+         close( writer, filePath );
+      }
+      System.out.println( "Wrote " + lineCount + " " + description + " to " + filePath );
+   }
+
+   /**
+    * Reads a file of named sets, one per line.  Lines that do not split into exactly a name and a
+    * value list are reported on standard error and skipped.
+    *
+    * @param filePath    path of the file to read
+    * @param description name of the read items, used in messages only
+    * @return names and their values
+    */
+   static public Map<String, Collection<String>> readNamedSets( final String filePath, final String description ) {
+      final Collection<String> lines = readOneColumn( filePath, description );
+      final Map<String, Collection<String>> namedSets = new HashMap<String, Collection<String>>( lines.size() );
+      for ( String line : lines ) {
+         final List<String> nameAndList = TokenUtil.getBsvItems( line );
+         if ( nameAndList == null || nameAndList.size() != 2 ) {
+            System.err.println( "Bad line " + line );
+            continue;
+         }
+         namedSets.put( nameAndList.get( 0 ), TokenUtil.getCsvItems( nameAndList.get( 1 ) ) );
+      }
+      return namedSets;
+   }
+
+   /**
+    * Closes the reader or writer and reports a failure on standard error instead of throwing.
+    */
+   static private void close( final Closeable closeable, final String filePath ) {
+      try {
+         closeable.close();
+      } catch ( IOException ioE ) {
+         System.err.println( "Error closing file " + filePath );
+         System.err.println( ioE.getMessage() );
+      }
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/FileUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,92 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.sql.Connection;
+import java.sql.Driver;
+import java.sql.DriverManager;
+import java.sql.SQLException;
+
+/**
+ * Helpers for opening the database connection and for building insert statements.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/21/14
+ */
+final public class JdbcUtil {
+
+   private JdbcUtil() {
+   }
+
+   static private final String JDBC_DRIVER = "org.hsqldb.jdbcDriver";
+
+
+   /**
+    * Instantiates the driver class and registers it with the DriverManager.
+    * Exits the JVM when the driver cannot be instantiated or registered.
+    */
+   static public void registerDriver() {
+      try {
+         Driver driver = (Driver) Class.forName( JDBC_DRIVER ).newInstance();
+         DriverManager.registerDriver( driver );
+      } catch ( Exception e ) {
+         // TODO At least four different exceptions are thrown here, and should be caught and handled individually
+         System.err.println( "Could not register Driver " + JDBC_DRIVER );
+         System.err.println( e.getMessage() );
+         System.exit( 1 );
+      }
+   }
+
+   /**
+    * Registers the driver and opens a connection.  Exits the JVM when the connection cannot be established.
+    *
+    * @param url  database url
+    * @param user user name
+    * @param pass password
+    * @return the open connection
+    */
+   static public Connection createDatabaseConnection( final String url, final String user, final String pass ) {
+      registerDriver();
+      System.out.println( "Connecting to " + url + " as " + user );
+      Connection connection = null;
+      try {
+         connection = DriverManager.getConnection( url, user, pass );
+      } catch ( SQLException sqlE ) {
+         // thrown by DriverManager.getConnection(..)
+         System.err.println( "Could not establish connection to " + url + " as " + user );
+         System.err.println( sqlE.getMessage() );
+         System.exit( 1 );
+      }
+      return connection;
+   }
+
+//   static public String createRowInsertSql( final String tableName, final int valueCount ) {
+   /**
+    * Builds a parameterized insert statement with one column and one "?" placeholder per field.
+    *
+    * @param tableName name of the table to insert into
+    * @param fields    enum constants whose names are the column names; at least one is required
+    * @return sql of the form "insert into TABLE (A,B)  values (?,?)"
+    * @throws IllegalArgumentException if no field is given, as zero columns would yield malformed sql
+    */
+   static public String createRowInsertSql( final String tableName, final Enum<?> ... fields ) {
+      if ( fields == null || fields.length == 0 ) {
+         throw new IllegalArgumentException( "At least one field is needed to insert into " + tableName );
+      }
+      final StringBuilder sb = new StringBuilder( "insert into" );
+      sb.append( " " ).append( tableName );
+      sb.append( " (" );
+      for ( Enum<?> field : fields ) {
+         sb.append( field.name() ).append( ',' );
+      }
+      // remove last comma
+      sb.setLength( sb.length() - 1 );
+      sb.append( ") " );
+      sb.append( " values (" );
+      for ( int i = 0; i < fields.length - 1; i++ ) {
+         sb.append( "?," );
+      }
+      sb.append( "?)" );
+      return sb.toString();
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/JdbcUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,22 @@
+package org.apache.ctakes.dictionarytool.util;
+
+/**
+ * Token positions of the columns read from an RRF row.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/23/14
+ */
+public enum RRF_INDEX {
+   CUI( 0 ),
+   LANGUAGE( 1 ),
+   SOURCE( 11 ),
+   TEXT( 14 );
+
+   /** zero-based position of the column within the tokenized row */
+   final public int _index;
+
+   RRF_INDEX( final int index ) {
+      _index = index;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RRF_INDEX.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,171 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+final public class RareWordUtil {
+
+   private RareWordUtil() {}
+
+   // The standard excluded part-of-speech tags are
+   //   VB,VBD,VBG,VBN,VBP,VBZ,CC,CD,DT,EX,LS,MD,PDT,POS,PP,PP$,PRP,PRP$,RP,TO,WDT,WP,WPS,WRB
+   // Enumerating every verb is impractical, but the closed-class words of the other tags can be listed.
+   // Verbs should be rare in the dictionaries, except perhaps the activity and concept dictionaries.
+   // Open question from the original author: why not WP$ (possessive wh- pronoun "whose")?
+   // PP$ is a Brown POS tag, not Penn Treebank (as are the rest).
+
+   /** Lower-case words that may never be chosen as the rare word of a term. */
+   static private final Set<String> BAD_POS_TERM_SET = new HashSet<String>( Arrays.asList(
+         // CD  cardinal number
+         "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
+         // CC  coordinating conjunction
+         "and", "or", "but", "for", "nor", "so", "yet",
+         // DT  determiner
+         "this", "that", "these", "those", "the",
+         // EX  existential there
+         "there",
+         // MD  modal
+         "can", "should", "will", "may", "might", "must", "could", "would",
+         // PDT  predeterminer
+         "some", "any", "all", "both", "half", "none", "twice",
+         // PP  prepositional phrase (preposition)
+         "at", "before", "after", "behind", "beneath", "beside", "between", "into", "through", "across", "of",
+         "concerning", "like", "except", "with", "without", "toward", "to", "past", "against", "during", "until",
+         "throughout", "below", "besides", "beyond", "from", "inside", "near", "outside", "since", "upon",
+         // PP$  possessive personal pronoun - Brown POS tag, not Penn TreeBank
+         "my", "our",
+         // PRP  personal pronoun
+         "i", "you", "he", "she", "it",
+         // PRP$  possessive pronoun
+         "mine", "yours", "his", "hers", "its", "ours", "theirs",
+         // RP  particle  - this contains some prepositions
+         "about", "off", "up", "along", "away", "back", "by", "down", "forward", "in", "on", "out",
+         "over", "around", "under",
+         // TO  to  - also a preposition
+         "to",
+         // WDT  wh- determiner
+         "what", "whatever", "which", "whichever",
+         // WP, WPS  wh- pronoun, nominative wh- pronoun
+         "who", "whom", "which", "that", "whoever", "whomever",
+         // WRB
+         "how", "where", "when", "however", "wherever", "whenever",
+         // Added by the original author
+         "no"
+   ) );
+
+   /**
+    * Decides whether a token is eligible to be counted as a rare word.
+    *
+    * @param token single token; the comparison against the excluded words is case-sensitive
+    * @return true if the token is longer than one character, is not an excluded word,
+    *         and contains at least one letter
+    */
+   static public boolean isRarableToken( final String token ) {
+      final int length = token.length();
+      if ( length <= 1 ) {
+         return false;
+      }
+      if ( BAD_POS_TERM_SET.contains( token ) ) {
+         return false;
+      }
+      for ( int index = 0; index < length; index++ ) {
+         if ( Character.isLetter( token.charAt( index ) ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * Counts how often each eligible token occurs across all given texts.
+    *
+    * @param cuiTexts map whose values are collections of whitespace-delimited texts; keys are not used
+    * @return map of token to its number of occurrences, containing only tokens accepted by
+    *         {@link #isRarableToken(String)}
+    */
+   static public Map<String, Integer> getTokenCounts( final Map<String, Collection<String>> cuiTexts ) {
+      final Map<String, Integer> tokenCounts = new HashMap<String, Integer>();
+      for ( Collection<String> texts : cuiTexts.values() ) {
+         for ( String text : texts ) {
+            for ( String token : text.split( "\\s+" ) ) {
+               if ( !isRarableToken( token ) ) {
+                  continue;
+               }
+               final Integer previous = tokenCounts.get( token );
+               tokenCounts.put( token, previous == null ? 1 : previous + 1 );
+            }
+         }
+      }
+      return tokenCounts;
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/RareWordUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,191 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+final public class TextTokenizer {
+
+   private TextTokenizer() {
+   }
+
+   /** Hyphenated prefixes that stay attached to the word that follows them, e.g. "anti-inflammatory". */
+   static private final String[] PREFIXES = {
+         "e-",
+         "a-",
+         "u-",
+         "x-",
+         "agro-",
+         "ante-",
+         "anti-",
+         "arch-",
+         "be-",
+         "bi-",
+         "bio-",
+         "co-",
+         "counter-",
+         "cross-",
+         "cyber-",
+         "de-",
+         "eco-",
+         "ex-",
+         "extra-",
+         "inter-",
+         "intra-",
+         "macro-",
+         "mega-",
+         "micro-",
+         "mid-",
+         "mini-",
+         "multi-",
+         "neo-",
+         "non-",
+         "over-",
+         "pan-",
+         "para-",
+         "peri-",
+         "post-",
+         "pre-",
+         "pro-",
+         "pseudo-",
+         "quasi-",
+         "re-",
+         "semi-",
+         "sub-",
+         "super-",
+         "tri-",
+         "ultra-",
+         "un-",
+         "uni-",
+         "vice-",
+         // From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
+         "electro-",
+         "gasto-",
+         "homo-",
+         "hetero-",
+         "ortho-",
+         "phospho-",
+   };
+
+   /** Hyphenated suffixes that stay attached to the word that precedes them, e.g. "three-fold". */
+   static private final String[] SUFFIXES = {"-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
+                                             "-o-torium", "-rama", "-wise"};
+
+
+   /**
+    * @param word text to scan from its first character
+    * @return the leading run of letters and digits in word; empty if word starts with any other character
+    */
+   static private String getNextCharTerm( final String word ) {
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( !Character.isLetterOrDigit( c ) ) {
+            return sb.toString();
+         }
+         sb.append( c );
+      }
+      return sb.toString();
+   }
+
+   /**
+    * @param word text collected immediately before a dash
+    * @return true if word followed by a dash is one of the known {@link #PREFIXES}
+    */
+   static private boolean isPrefix( final String word ) {
+      final String prefixQ = word + "-";
+      for ( String prefix : PREFIXES ) {
+         if ( prefix.equals( prefixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param word       full word being tokenized
+    * @param startIndex index of the first character after a dash
+    * @return true if the letter/digit run starting at startIndex, preceded by a dash,
+    *         is one of the known {@link #SUFFIXES}
+    */
+   static private boolean isSuffix( final String word, final int startIndex ) {
+      // Nothing follows the dash.  The previous test was inverted (length >= startIndex), which is
+      // true for every valid index, so no suffix was ever recognized.
+      if ( startIndex >= word.length() ) {
+         return false;
+      }
+      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
+      if ( nextCharTerm.isEmpty() ) {
+         return false;
+      }
+      final String suffixQ = "-" + nextCharTerm;
+      for ( String suffix : SUFFIXES ) {
+         if ( suffix.equals( suffixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+
+   /**
+    * Splits a single whitespace-free word into tokens.  Runs of letters and digits form a token, every
+    * other character is a token of its own.  A dash is kept inside the current token when the text before
+    * it is a known prefix or the text after it is a known suffix; otherwise the dash is its own token.
+    *
+    * @param word word without whitespace
+    * @return tokens in order of appearance; empty for an empty word
+    */
+   static public List<String> getTokens( final String word ) {
+      final List<String> tokens = new ArrayList<String>();
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i = 0; i < count; i++ ) {
+         final char c = word.charAt( i );
+         if ( Character.isLetterOrDigit( c ) ) {
+            // Appending character to current word
+            sb.append( c );
+            continue;
+         }
+         if ( c == '-' && (isPrefix( sb.toString() ) || isSuffix( word, i + 1 )) ) {
+            // what precedes is a prefix or what follows is a suffix,
+            // so the dash belongs to the current word
+            sb.append( c );
+            continue;
+         }
+         // any other symbol, or a dash that is not part of an affix: close the current word ...
+         if ( sb.length() != 0 ) {
+            tokens.add( sb.toString() );
+            sb.setLength( 0 );
+         }
+         // ... and add the symbol as its own token
+         tokens.add( String.valueOf( c ) );
+      }
+      if ( sb.length() != 0 ) {
+         // add the final word
+         tokens.add( sb.toString() );
+      }
+      return tokens;
+   }
+
+   /**
+    * Lower-cases the text, removes one trailing comma, semicolon or period, and tokenizes each
+    * whitespace-delimited word with {@link #getTokens(String)}.
+    *
+    * @param text text to tokenize
+    * @return the tokens joined by single spaces; empty if the text is empty or only whitespace
+    */
+   static public String getTokenizedText( final String text ) {
+      if ( text.isEmpty() ) {
+         return text;
+      }
+      final String[] splits = text.toLowerCase().split( "\\s+" );
+      if ( splits.length == 0 ) {
+         return "";
+      }
+      final String lastSplit = splits[splits.length - 1];
+      if ( lastSplit.endsWith( "," ) || lastSplit.endsWith( ";" ) || lastSplit.endsWith( "." ) ) {
+         // get rid of last comma or semicolon or period
+         splits[splits.length - 1] = lastSplit.substring( 0, lastSplit.length() - 1 );
+      }
+      final StringBuilder sb = new StringBuilder();
+      for ( String split : splits ) {
+         for ( String token : getTokens( split ) ) {
+            sb.append( token ).append( " " );
+         }
+      }
+      // drop the space appended after the last token
+      sb.setLength( Math.max( 0, sb.length() - 1 ) );
+      return sb.toString();
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TextTokenizer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,77 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class TokenUtil {
+
+   private TokenUtil() {
+   }
+
+   /**
+    * @param line line of bar-separated values
+    * @return the values of the line split on '|'; empty list for a null or blank line
+    */
+   static public List<String> getBsvItems( final String line ) {
+      return getSeparatedValueItems( line, '|' );
+   }
+
+   /**
+    * @param line line of tilde-separated values
+    * @return the values of the line split on '~'; empty list for a null or blank line
+    */
+   static public List<String> getTildeItems( final String line ) {
+      return getSeparatedValueItems( line, '~' );
+   }
+
+   /**
+    * @param line line of comma-separated values; quoting is not interpreted
+    * @return the values of the line split on ','; empty list for a null or blank line
+    */
+   static public List<String> getCsvItems( final String line ) {
+      return getSeparatedValueItems( line, ',' );
+   }
+
+   /**
+    * Splits a line on every occurrence of the separator, keeping empty values so that positions are
+    * preserved.  Text after the last separator is returned as the final value; if the line ends with the
+    * separator no trailing empty value is added.
+    *
+    * @param line      line to split
+    * @param separator character that delimits values
+    * @return values in order of appearance; empty list for a null or blank line
+    */
+   static private List<String> getSeparatedValueItems( final String line, final char separator ) {
+      if ( line == null || line.trim().isEmpty() ) {
+         return Collections.emptyList();
+      }
+      final List<String> tokens = new ArrayList<String>();
+      int startIndex = 0;
+      int stopIndex = line.indexOf( separator );
+      // >= 0 rather than > 0: a separator at index 0 means the first value is empty, not that
+      // the line has no separators
+      while ( stopIndex >= 0 ) {
+         tokens.add( line.substring( startIndex, stopIndex ) );
+         startIndex = stopIndex + 1;
+         stopIndex = line.indexOf( separator, startIndex );
+      }
+      // comparing against length() - 1 silently dropped a final value of exactly one character
+      if ( startIndex < line.length() ) {
+         tokens.add( line.substring( startIndex ) );
+      }
+      return tokens;
+   }
+
+
+   /**
+    * @param values values to join
+    * @return the values joined by '|'; empty string if there are no values
+    */
+   static public String createBsvLine( final Collection<String> values ) {
+      return createBsvLine( values.toArray( new String[values.size()] ) );
+   }
+
+   /**
+    * @param values values to join
+    * @return the values joined by '|'; empty string if there are no values
+    */
+   static public String createBsvLine( final String... values ) {
+      return joinValues( '|', values );
+   }
+
+   /**
+    * @param values values to join; values are not quoted or escaped
+    * @return the values joined by ','; empty string if there are no values
+    */
+   static public String createCsvLine( final Collection<String> values ) {
+      return createCsvLine( values.toArray( new String[values.size()] ) );
+   }
+
+   /**
+    * @param values values to join; values are not quoted or escaped
+    * @return the values joined by ','; empty string if there are no values
+    */
+   static public String createCsvLine( final String... values ) {
+      return joinValues( ',', values );
+   }
+
+   /**
+    * Joins values with a separator between neighbours.  Unlike trimming a trailing separator from the
+    * buffer, this cannot fail when there are no values.
+    */
+   static private String joinValues( final char separator, final String... values ) {
+      final StringBuilder sb = new StringBuilder();
+      for ( int i = 0; i < values.length; i++ ) {
+         if ( i > 0 ) {
+            sb.append( separator );
+         }
+         sb.append( values[i] );
+      }
+      return sb.toString();
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/TokenUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,79 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/17/14
+ */
+final public class UmlsSourceTypeCuiValidator {
+
+   private UmlsSourceTypeCuiValidator() {}
+
+
+   /**
+    * Can cull the given collection of cuis.
+    * Progress is printed to standard output while the file is read.  An IOException is reported on
+    * standard error and the cuis collected up to that point are returned.
+    * @param rrfPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
+    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
+    * @param cuis current list of cuis
+    * @return Subset of cuis that exist in the given sources
+    */
+   static public Collection<String> getSourceTypeValidCuis( final String rrfPath,
+                                                            final Collection<String> sourceTypes,
+                                                            final Collection<String> cuis ) {
+      final Collection<String> validCuis = new HashSet<String>( cuis.size() );
+      long lineCount = 0;
+      try {
+         final BufferedReader reader = FileUtil.createReader( rrfPath );
+         try {
+            List<String> tokens = FileUtil.readBsvTokens( reader, rrfPath );
+            while ( tokens != null ) {
+               lineCount++;
+               // lines too short to hold a source column are ignored
+               if ( tokens.size() > RRF_INDEX.SOURCE._index
+                     && sourceTypes.contains( tokens.get( RRF_INDEX.SOURCE._index ) ) ) {
+                  final String cui = CuiTuiUtil.getAsCui( tokens.get( RRF_INDEX.CUI._index ) );
+                  if ( cuis.contains( cui ) ) {
+                     validCuis.add( cui );
+                  }
+               }
+               if ( lineCount % 2000 == 0 ) {
+                  System.out.print( "." );
+                  if ( lineCount % 100000 == 0 ) {
+                     System.out.println( "File Line " + lineCount + "\t Valid Cuis " + validCuis.size() );
+                  }
+               }
+               tokens = FileUtil.readBsvTokens( reader, rrfPath );
+            }
+         } finally {
+            // close the file even when reading a line fails part way through
+            reader.close();
+         }
+      } catch ( IOException ioE ) {
+         System.err.println( ioE.getMessage() );
+      }
+      System.out.println( "File Lines " + lineCount + "\t Valid Cuis " + validCuis.size() );
+      return validCuis;
+   }
+
+   /**
+    * Given a collection of cuis, returns all of the cuis that don't exist for the given source types
+    * @param rrfPath path to the UMLS_ROOT Meta/MRCONSO.RRF file
+    * @param sourceTypes desired source type names as appear in rrf: RXNORM, SNOMEDCT, MSH, etc.
+    * @param cuis current list of cuis
+    * @return Subset of cuis that don't exist in the given sources
+    */
+   static public Collection<String> getSourceTypeInvalidCuis( final String rrfPath,
+                                                              final Collection<String> sourceTypes,
+                                                              final Collection<String> cuis ) {
+      final Collection<String> validCuis = getSourceTypeValidCuis( rrfPath, sourceTypes, cuis );
+      final Collection<String> invalidCuis = new HashSet<String>( cuis.size() - validCuis.size() );
+      for ( String cui : cuis ) {
+         if ( !validCuis.contains( cui ) ) {
+            invalidCuis.add( cui );
+         }
+      }
+      return invalidCuis;
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsSourceTypeCuiValidator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,469 @@
+package org.apache.ctakes.dictionarytool.util;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+
+
+/**
+ * Contains all the methods used to parse individual text definitions of umls terms
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+final public class UmlsTermUtil {
+
+
+   /** The data files that are looked up by name inside a single data directory. */
+   static private enum DATA_FILE {
+      REMOVAL_PREFIX_TRIGGERS( "RemovalPrefixTriggers.txt" ),
+      REMOVAL_SUFFIX_TRIGGERS( "RemovalSuffixTriggers.txt" ),
+      REMOVAL_COLON_TRIGGERS( "RemovalColonTriggers.txt" ),
+      UNWANTED_PREFIXES( "UnwantedPrefixes.txt" ),
+      UNWANTED_SUFFIXES( "UnwantedSuffixes.txt" ),
+      MODIFIER_SUFFIXES( "ModifierSuffixes.txt" ),
+      RIGHT_ABBREVIATIONS( "RightAbbreviations.txt" );
+
+      /** File name, without any directory. */
+      final private String __name;
+
+      private DATA_FILE( final String fileName ) {
+         this.__name = fileName;
+      }
+   }
+
+   static private String getDataPath( final String dataDir, final DATA_FILE dataFile ) {
+      return dataDir + '/' + dataFile.__name;
+   }
+
+   final private Collection<String> _removalPrefixTriggers;
+   final private Collection<String> _removalSuffixTriggers;
+   final private Collection<String> _removalColonTriggers;
+   final private Collection<String> _unwantedPrefixes;
+   final private Collection<String> _unwantedSuffixes;
+   final private Collection<String> _modifierSuffixes;
+   final private Collection<String> _abbreviations;
+
+   /**
+    * Creates a term utility from the seven data files, each looked up by its fixed
+    * {@link DATA_FILE} name directly inside one directory.
+    *
+    * @param dataDir directory containing the data files; joined to each file name with '/'
+    */
+   public UmlsTermUtil( final String dataDir ) {
+      this( getDataPath( dataDir, DATA_FILE.REMOVAL_PREFIX_TRIGGERS ),
+           getDataPath( dataDir, DATA_FILE.REMOVAL_SUFFIX_TRIGGERS ),
+           getDataPath( dataDir, DATA_FILE.REMOVAL_COLON_TRIGGERS ),
+           getDataPath( dataDir, DATA_FILE.UNWANTED_PREFIXES ),
+           getDataPath( dataDir, DATA_FILE.UNWANTED_SUFFIXES ),
+           getDataPath( dataDir, DATA_FILE.MODIFIER_SUFFIXES ),
+           getDataPath( dataDir, DATA_FILE.RIGHT_ABBREVIATIONS ) );
+   }
+
+   /**
+    * Creates a term utility from individually specified data files.  Each file is read with
+    * FileUtil.readOneColumn; the second argument of each call is a short description of the file's
+    * content (presumably used for logging - confirm in FileUtil).
+    *
+    * @param removalPrefixTriggersPath file of text starts that make a whole term invalid
+    * @param removalSuffixTriggersPath file of text ends that make a whole term invalid
+    * @param removalColonTriggersPath  file of substrings that make a whole term invalid
+    * @param unwantedPrefixesPath      file of prefixes that are stripped from a term
+    * @param unwantedSuffixesPath      file of suffixes that are stripped from a term
+    * @param modifierSuffixesPath      file of trailing modifiers that are moved to the front of a term
+    * @param abbreviationsPath         file of trailing abbreviations that are split off a term
+    */
+   public UmlsTermUtil( final String removalPrefixTriggersPath, final String removalSuffixTriggersPath,
+                        final String removalColonTriggersPath,
+                        final String unwantedPrefixesPath, final String unwantedSuffixesPath,
+                        final String modifierSuffixesPath, final String abbreviationsPath ) {
+      _removalPrefixTriggers = FileUtil.readOneColumn( removalPrefixTriggersPath, "term removal Prefix Triggers" );
+      _removalSuffixTriggers = FileUtil.readOneColumn( removalSuffixTriggersPath, "term removal Suffix Triggers" );
+      _removalColonTriggers = FileUtil.readOneColumn( removalColonTriggersPath, "term removal Colon Triggers" );
+      _unwantedPrefixes = FileUtil.readOneColumn( unwantedPrefixesPath, "unwanted Prefixes" );
+      _unwantedSuffixes = FileUtil.readOneColumn( unwantedSuffixesPath, "unwanted Suffixes" );
+      _modifierSuffixes = FileUtil.readOneColumn( modifierSuffixesPath, "modifier Suffixes" );
+      _abbreviations = FileUtil.readOneColumn( abbreviationsPath, "Abbreviations to expand" );
+   }
+
+   /**
+    * Tokenizes, validates and cleans a term text, then derives the texts to use for it.
+    * Abbreviation, acronym and modifier extraction are tried first, in that order; when one of them
+    * yields terms, the cleaned text itself is added to the result.  Otherwise the colon/or
+    * parenthesis and bracket extractions are tried in order, and if none yields anything the cleaned
+    * text alone is used.  Every result is passed through {@link #getPluralTerms(Collection)}.
+    *
+    * @param text raw term text
+    * @return texts for the term; an empty list if the text is empty or invalid
+    */
+   public Collection<String> getFormattedTexts( final String text ) {
+      final String tokenizedText = TextTokenizer.getTokenizedText( text );
+      if ( tokenizedText == null || tokenizedText.isEmpty() || !isTextValid( tokenizedText ) ) {
+         return Collections.emptyList();
+      }
+      final String validText = getValidText( tokenizedText );
+      if ( validText == null || validText.isEmpty() ) {
+         return Collections.emptyList();
+      }
+      // embedded abbreviations, acronyms, modifiers: first extraction with a result wins
+      Collection<String> terms = extractAbbreviations( validText );
+      if ( terms.isEmpty() ) {
+         terms = autoExtractAcronyms( validText );
+      }
+      if ( terms.isEmpty() ) {
+         terms = extractModifiers( validText );
+      }
+      if ( !terms.isEmpty() ) {
+         terms.add( validText );
+         return getPluralTerms( terms );
+      }
+      // embedded and / or terms; the and-bracket extraction is currently disabled
+      terms = autoExtractColonParaTerms( validText );
+      if ( terms.isEmpty() ) {
+         terms = autoExtractOrParaTerms( validText );
+      }
+      if ( terms.isEmpty() ) {
+         terms = autoExtractColonBracketTerms( validText );
+      }
+      if ( terms.isEmpty() ) {
+         terms = autoExtractOrBracketTerms( validText );
+      }
+      if ( terms.isEmpty() ) {
+         // nothing could be extracted, fall back to the cleaned text itself
+         terms = new HashSet<String>( 1 );
+         terms.add( validText );
+      }
+      return getPluralTerms( terms );
+   }
+
+   /**
+    * For every text ending in the tokenized optional plural marker "( s )", adds the text without
+    * the marker and the same text with an "s" appended.  The original texts are kept.
+    *
+    * @param texts modifiable collection of texts; it is changed in place
+    * @return the same collection instance that was passed in
+    */
+   static private Collection<String> getPluralTerms( final Collection<String> texts ) {
+      final String optionalPlural = "( s )";
+      // collected separately because texts cannot be modified while it is being iterated
+      final Collection<String> additions = new HashSet<String>();
+      for ( String text : texts ) {
+         if ( !text.endsWith( optionalPlural ) ) {
+            continue;
+         }
+         final String singular = text.substring( 0, text.length() - optionalPlural.length() ).trim();
+         additions.add( singular );
+         additions.add( singular + "s" );
+      }
+      texts.addAll( additions );
+      return texts;
+   }
+
+   private boolean isTextValid( final String text ) {
+      // Check for illegal characters
+      for ( int i = 0; i < text.length(); i++ ) {
+         if ( text.charAt( i ) < ' ' || text.charAt( i ) > '~'  ) {
+            return false;
+         }
+      }
+      // Check for auto-created note form
+      if ( text.split( "@" ).length > 2 ) {
+         return false;
+      }
+      if ( text.length() == 3 && text.charAt( 0 ) == '(' ) {
+         return false;
+      }
+      for ( String removalPrefix : _removalPrefixTriggers ) {
+         if ( text.startsWith( removalPrefix ) ) {
+            return false;
+         }
+      }
+      for ( String removalSuffix : _removalSuffixTriggers ) {
+         if ( text.endsWith( removalSuffix ) ) {
+            return false;
+         }
+      }
+      for ( String removalColon : _removalColonTriggers ) {
+         if ( text.contains( removalColon ) ) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * Cleans a tokenized text: cuts off form underlines ("_ _ _") and repeatedly strips the unwanted
+    * prefixes and suffixes until the text no longer changes.
+    *
+    * @param text tokenized text
+    * @return the cleaned text, or an empty string if the cleaned text contains both '(' and '['
+    */
+   private String getValidText( final String text ) {
+      // remove form underlines
+      if ( text.contains( "_ _ _" ) ) {
+         final int lastParen = text.lastIndexOf( '(' );
+         final int lastDash = text.indexOf( "_ _ _" );
+         // NOTE(review): when the text has no '(' lastParen is -1, so deleteIndex is 0 and the
+         // underline is not removed at all - confirm whether that is intended
+         final int deleteIndex = Math.max( 0, Math.min( lastParen, lastDash ) );
+         if ( deleteIndex > 0 ) {
+            // deleteIndex - 1 also drops the character just before the cut (a space in tokenized text)
+            return getValidText( text.substring( 0, deleteIndex - 1 ).trim() );
+         }
+      }
+      // remove unmatched parentheses, brackets, etc.  (the rules below are currently disabled)
+//      if ( text.startsWith( "(" ) && !text.contains( ")" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "[" ) && !text.contains( "]" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "(" ) && text.endsWith( ") or" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 4 ).trim() );
+//      }
+//      if ( text.startsWith( "or (" ) ) {
+//         return getValidText( text.substring( 2 ).trim() );
+//      }
+//      if ( text.startsWith( "\"" ) && text.endsWith( "\"" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.startsWith( "(" ) && text.endsWith( ")" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+//      }
+//      if ( text.startsWith( "[" ) && text.endsWith( "]" ) ) {
+//         return getValidText( text.substring( 1, text.length() - 2 ).trim() );
+//      }
+//      if ( text.startsWith( "&" ) ) {
+//         return getValidText( text.substring( 1 ).trim() );
+//      }
+//      if ( text.endsWith( "]" ) && !text.contains( "[" ) ) {
+//         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+//      }
+//      if ( text.endsWith( ")" ) && !text.contains( "(" ) ) {
+//         return getValidText( text.substring( 0, text.length() - 2 ).trim() );
+//      }
+      String strippedText = text.trim();
+      // Text in umls can have multiple suffixes and/or prefixes.  Stripping just once doesn't do the trick
+      int lastLength = Integer.MAX_VALUE;
+      // keep making passes until a full pass leaves the length unchanged
+      while ( lastLength != strippedText.length() ) {
+         lastLength = strippedText.length();
+         for ( String prefix : _unwantedPrefixes ) {
+            if ( strippedText.startsWith( prefix ) ) {
+               strippedText = strippedText.substring( prefix.length() ).trim();
+            }
+         }
+         for ( String suffix : _unwantedSuffixes ) {
+            if ( strippedText.endsWith( suffix ) ) {
+               strippedText = strippedText.substring( 0, strippedText.length() - suffix.length() ).trim();
+            }
+         }
+      }
+      // text that still mixes parentheses and brackets is discarded
+      if ( strippedText.contains( "(" ) && strippedText.contains( "[" ) ) {
+         return "";
+      }
+      return strippedText;
+   }
+
+
+   private Collection<String> extractAbbreviations( final String tokenizedText ) {
+      for ( String abbreviation : _abbreviations ) {
+         if ( tokenizedText.endsWith( abbreviation )
+               && !tokenizedText.contains( ":" ) && !tokenizedText.contains( " of " )
+               && !tokenizedText.contains( " for " ) ) {
+            final String noAbbrTerm
+                  = tokenizedText.substring( 0, tokenizedText.length() - abbreviation.length() ).trim();
+            final String abbrTerm
+                  = abbreviation.replace( ":", "" ).replace( "(", "" ).replace( ")", "" ).replace( "-", "" )
+                  .replace( "[", "" ).replace( "]", "" ).replace( "&", "" ).trim();
+            final Collection<String> extractedAbbreviations = new HashSet<String>( 2 );
+            extractedAbbreviations.add( noAbbrTerm );
+            extractedAbbreviations.add( abbrTerm );
+            return extractedAbbreviations;
+         }
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Moves a known trailing modifier to the front of the text: "main modifier" becomes
+    * "modifier main", with parentheses removed from the modifier.  Only the first matching modifier
+    * is used.
+    *
+    * @param tokenizedText tokenized term text
+    * @return modifiable collection holding the single rearranged text, or an empty list if the text
+    *         does not end with a known modifier
+    */
+   private Collection<String> extractModifiers( final String tokenizedText ) {
+      for ( String modifier : _modifierSuffixes ) {
+         if ( !tokenizedText.endsWith( modifier ) ) {
+            continue;
+         }
+         final int cutIndex = tokenizedText.length() - modifier.length();
+         final String mainText = tokenizedText.substring( 0, cutIndex ).trim();
+         final String bareModifier = modifier.replace( "(", "" ).replace( ")", "" ).trim();
+         final Collection<String> rearranged = new HashSet<String>( 2 );
+         rearranged.add( bareModifier + " " + mainText );
+         return rearranged;
+      }
+      return Collections.emptyList();
+   }
+
+   /**
+    * Detects text of the form "ABC - DEF" where the part before the dash looks like an acronym
+    * of the part after it, and returns both parts as separate terms.
+    * The candidate acronym must be at most 8 characters (6 if it is a single word), at most 2 words,
+    * and not "dose".  Its first character must match the first character of the definition unless the
+    * definition contains "' s".  Its length must equal the number of words in the definition and its
+    * last character must match the first character of the last definition word.
+    *
+    * @param tokenizedText term text to inspect
+    * @return the acronym and its definition, or an empty collection when the text does not qualify
+    */
+   private Collection<String> autoExtractAcronyms( final String tokenizedText ) {
+      final int dashIndex = tokenizedText.indexOf( '-' );
+      if ( dashIndex <= 1 ) {
+         return Collections.emptyList();
+      }
+      // have text ABC - DEF, the character directly before the dash is not part of the acronym
+      final String acronym = tokenizedText.substring( 0, dashIndex - 1 ).trim();
+      final int acronymLength = acronym.length();
+      if ( acronymLength == 0 || acronymLength > 8 || acronym.equals( "dose" ) ) {
+         return Collections.emptyList();
+      }
+      final int acronymWordCount = acronym.split( "\\s+" ).length;
+      if ( acronymWordCount > 2 || (acronymWordCount == 1 && acronymLength > 6) ) {
+         return Collections.emptyList();
+      }
+      final String definition = tokenizedText.substring( dashIndex + 1 ).trim();
+      if ( definition.isEmpty() ) {
+         return Collections.emptyList();
+      }
+      final boolean sameFirstLetter = acronym.charAt( 0 ) == definition.charAt( 0 );
+      if ( !sameFirstLetter && !definition.contains( "' s" ) ) {
+         return Collections.emptyList();
+      }
+      final String[] definitionWords = definition.split( "\\s+" );
+      final String lastWord = definitionWords[definitionWords.length - 1];
+      if ( acronymLength != definitionWords.length
+           || lastWord.charAt( 0 ) != acronym.charAt( acronymLength - 1 ) ) {
+         return Collections.emptyList();
+      }
+      final Collection<String> acronymAndDefinition = new HashSet<String>( 2 );
+      acronymAndDefinition.add( acronym );
+      acronymAndDefinition.add( definition );
+      return acronymAndDefinition;
+   }
+
+   /**
+    * Expands text of the form "thing : [type] or [type]" (or with "] &amp; / or [" as separator)
+    * into one term per bracketed type.
+    * A type of "nos", "nec", "unspecified", "other" or nothing yields the formatted thing alone;
+    * any other type yields both "type thing" and "thing type", each passed through
+    * {@code getFormattedTexts}.
+    *
+    * @param tokenizedText term text to inspect
+    * @return the expanded terms, or an empty collection when the text has no colon or no bracket
+    * separator at or after the colon
+    */
+   private Collection<String> autoExtractColonBracketTerms( final String tokenizedText ) {
+      final int colonIndex = tokenizedText.indexOf( ':' );
+      if ( colonIndex < 0 ) {
+         return Collections.emptyList();
+      }
+      final int orIndex = tokenizedText.indexOf( "] or [" );
+      final int andOrIndex = tokenizedText.indexOf( "] & / or [" );
+      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
+         return Collections.emptyList();
+      }
+      String splitter = "\\] or \\[";
+      if ( andOrIndex > 0 ) {
+         splitter = "\\] & / or \\[";
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      // Take everything before the colon and let trim() remove the separating whitespace.
+      // Using colonIndex - 1 threw StringIndexOutOfBoundsException for a colon at index 0
+      // and cut off the last character when no space preceded the colon.
+      final String thing = tokenizedText.substring( 0, colonIndex ).trim();
+      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
+      final String[] splits = types.split( splitter );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
+               || split.equals( "other" ) || split.isEmpty() ) {
+            // The type carries no information, so only the thing itself is a term
+            extractedTerms.addAll( getFormattedTexts( thing ) );
+         } else {
+            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "thing ( &amp; [type] or [type]" into the thing itself plus,
+    * for each bracketed type, "type thing" and "thing type" passed through {@code getFormattedTexts}.
+    *
+    * @param tokenizedText term text to inspect
+    * @return the expanded terms, or an empty collection when the text has no "( &amp;" or no
+    * "] or [" at or after it
+    */
+   private Collection<String> autoExtractAndBracketTerms( final String tokenizedText ) {
+      final int andIndex = tokenizedText.indexOf( "( &" );
+      if ( andIndex < 0 || tokenizedText.indexOf( "] or [" ) < andIndex ) {
+         return Collections.emptyList();
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 3 );
+      // Take everything before "( &" and let trim() remove the separating whitespace.
+      // Using andIndex - 1 threw StringIndexOutOfBoundsException when "( &" started the text
+      // and cut off the last character when no space preceded it.
+      final String thing = tokenizedText.substring( 0, andIndex ).trim();
+      extractedTerms.add( thing );
+      // Skip the 3 characters of "( &"
+      final String types = tokenizedText.substring( andIndex + 3 ).trim();
+      final String[] splits = types.split( "\\] or \\[" );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+         extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+      }
+      return extractedTerms;
+   }
+
+   private Collection<String> autoExtractOrBracketTerms( final String tokenizedText ) {
+      if ( !tokenizedText.contains( "] or [" ) && !tokenizedText.contains( "] & / or [" ) ) {
+         return Collections.emptyList();
+      }
+      final int lastOf = tokenizedText.lastIndexOf( " of " );
+      if ( lastOf > tokenizedText.lastIndexOf( ']' ) ) {
+         final String ofTerm = tokenizedText.substring( lastOf ).trim();
+         final Collection<String> ofExtractions = autoExtractOrBracketTerms( tokenizedText.substring( 0, lastOf ).trim() );
+         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         for ( String ofText : ofExtractions ) {
+            ofTexts.add( ofText + " " + ofTerm );
+         }
+         return ofTexts;
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      String splitter = "\\] or \\[";
+      if ( tokenizedText.contains( "] & / or [" ) ) {
+         splitter = "\\] & / or \\[";
+      }
+      final String[] splits = tokenizedText.split( splitter );
+      for ( String split : splits ) {
+         split = trimBracketText( split );
+         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
+            extractedTerms.addAll( getFormattedTexts( split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "(term) or (term)" (or with ") &amp; / or (" as separator) into
+    * one formatted term per parenthesised alternative.
+    * When " of " follows the last closing parenthesis, the trailing "of ..." phrase is appended to
+    * every alternative extracted from the text before it.
+    * Alternatives equal to "operation", "therapy" or "provision of" are dropped.
+    *
+    * @param tokenizedText term text to inspect
+    * @return the expanded terms, or an empty collection when the text has no parenthesis separator
+    */
+   private Collection<String> autoExtractOrParaTerms( final String tokenizedText ) {
+      if ( !tokenizedText.contains( ") or (" ) && !tokenizedText.contains( ") & / or (" ) ) {
+         return Collections.emptyList();
+      }
+      final int lastOf = tokenizedText.lastIndexOf( " of " );
+      if ( lastOf > tokenizedText.lastIndexOf( ')' ) ) {
+         final String ofTerm = tokenizedText.substring( lastOf ).trim();
+         // Recurse with the parenthesis variant.  The bracket variant was called here before,
+         // which found no "] or [" in parenthesised text and so discarded every alternative.
+         final Collection<String> ofExtractions = autoExtractOrParaTerms( tokenizedText.substring( 0, lastOf ).trim() );
+         final Collection<String> ofTexts = new HashSet<String>( ofExtractions.size() );
+         for ( String ofText : ofExtractions ) {
+            ofTexts.add( ofText + " " + ofTerm );
+         }
+         return ofTexts;
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      String splitter = "\\) or \\(";
+      if ( tokenizedText.contains( ") & / or (" ) ) {
+         splitter = "\\) & / or \\(";
+      }
+      final String[] splits = tokenizedText.split( splitter );
+      for ( String split : splits ) {
+         split = trimParaText( split );
+         if ( !split.equals( "operation" ) && !split.equals( "therapy" ) && !split.equals( "provision of" ) ) {
+            extractedTerms.addAll( getFormattedTexts( split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   /**
+    * Expands text of the form "thing : (type) or (type)" (or with ") &amp; / or (" as separator)
+    * into one term per parenthesised type.
+    * A type of "nos", "nec", "unspecified", "other" or nothing yields the formatted thing alone;
+    * any other type yields both "type thing" and "thing type", each passed through
+    * {@code getFormattedTexts}.
+    *
+    * @param tokenizedText term text to inspect
+    * @return the expanded terms, or an empty collection when the text has no colon, the colon comes
+    * after the first opening parenthesis, or there is no parenthesis separator at or after the colon
+    */
+   private Collection<String> autoExtractColonParaTerms( final String tokenizedText ) {
+      final int colonIndex = tokenizedText.indexOf( ':' );
+      if ( colonIndex < 0 || colonIndex > tokenizedText.indexOf( '(' ) ) {
+         return Collections.emptyList();
+      }
+      final int orIndex = tokenizedText.indexOf( ") or (" );
+      final int andOrIndex = tokenizedText.indexOf( ") & / or (" );
+      if ( Math.max( orIndex, andOrIndex ) < colonIndex ) {
+         return Collections.emptyList();
+      }
+      String splitter = "\\) or \\(";
+      if ( andOrIndex > 0 ) {
+         splitter = "\\) & / or \\(";
+      }
+      final Collection<String> extractedTerms = new HashSet<String>( 2 );
+      // Take everything before the colon and let trim() remove the separating whitespace.
+      // Using colonIndex - 1 threw StringIndexOutOfBoundsException for a colon at index 0
+      // and cut off the last character when no space preceded the colon.
+      final String thing = tokenizedText.substring( 0, colonIndex ).trim();
+      final String types = tokenizedText.substring( colonIndex + 1 ).trim();
+      final String[] splits = types.split( splitter );
+      for ( String split : splits ) {
+         split = trimParaText( split );
+         if ( split.equals( "nos" ) || split.equals( "nec" ) || split.equals( "unspecified" )
+               || split.equals( "other" ) || split.isEmpty() ) {
+            // The type carries no information, so only the thing itself is a term
+            extractedTerms.addAll( getFormattedTexts( thing ) );
+         } else {
+            extractedTerms.addAll( getFormattedTexts( split + " " + thing ) );
+            extractedTerms.addAll( getFormattedTexts( thing + " " + split ) );
+         }
+      }
+      return extractedTerms;
+   }
+
+   static private String trimParaText( String paraText ) {
+      if ( paraText.startsWith( "(" ) ) {
+         paraText = paraText.substring( 1 );
+      }
+      if ( paraText.endsWith( " nos " ) || paraText.endsWith( " nec " ) ) {
+         return paraText.substring( 0, paraText.length()-4 ).trim();
+      } else  if ( paraText.endsWith( ", unspecified " ) ) {
+         return paraText.substring( 0, paraText.length() - 14 ).trim();
+      } else if ( paraText.endsWith( " nos )" ) || paraText.endsWith( " nec )" ) ) {
+         return paraText.substring( 0, paraText.length() - 5 ).trim();
+      } else  if ( paraText.endsWith( ", unspecified )" ) ) {
+         return paraText.substring( 0, paraText.length() - 15 ).trim();
+      } else if ( paraText.endsWith( ")" ) ) {
+         return paraText.substring( 0, paraText.length()-1 ).trim();
+      }
+      return paraText.trim();
+   }
+
+   static private String trimBracketText( String bracketText ) {
+      if ( bracketText.startsWith( "[" ) ) {
+         bracketText = bracketText.substring( 1 );
+      }
+      if ( bracketText.endsWith( " nos " ) || bracketText.endsWith( " nec " ) ) {
+         return bracketText.substring( 0, bracketText.length()-4 ).trim();
+      } else  if ( bracketText.endsWith( ", unspecified " ) ) {
+         return bracketText.substring( 0, bracketText.length() - 14 ).trim();
+      } else if ( bracketText.endsWith( " nos ]" ) || bracketText.endsWith( " nec ]" ) ) {
+         return bracketText.substring( 0, bracketText.length() - 5 ).trim();
+      } else  if ( bracketText.endsWith( ", unspecified ]" ) ) {
+         return bracketText.substring( 0, bracketText.length() - 15 ).trim();
+      } else if ( bracketText.endsWith( "]" ) ) {
+         return bracketText.substring( 0, bracketText.length()-1 ).trim();
+      }
+      return bracketText.trim();
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/util/UmlsTermUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,46 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+
+import java.io.BufferedWriter;
+import java.io.IOException;
+import java.util.Collection;
+import java.util.Map;
+
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class CuiTextsMapWriter {
+
+   private CuiTextsMapWriter() {
+   }
+
+   static public void writeCuiTexts( final String termFilePath, final Map<String, Collection<String>> cuiTexts ) {
+      System.out.println( "Writing map of Cuis and Texts to " + termFilePath );
+      long lineCount = 0;
+      try {
+         final BufferedWriter writer = FileUtil.createWriter( termFilePath );
+         for (  Map.Entry<String,Collection<String>> cuiTextsEntry : cuiTexts.entrySet() ) {
+            final String cui = cuiTextsEntry.getKey();
+            for ( String text : cuiTextsEntry.getValue() ) {
+               lineCount++;
+               writer.write( TokenUtil.createBsvLine( cui, text ) );
+               writer.newLine();
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "File Line " + lineCount );
+               }
+            }
+         }
+         writer.close();
+      } catch ( IOException ioE ) {
+         System.err.println( "Error writing Term on line " + lineCount + " in file " + termFilePath );
+      }
+      System.out.println( "Wrote " + lineCount + " terms to " + termFilePath );
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTextsMapWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,25 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Writes a map of cuis and their tuis to file.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class CuiTuiMapWriter {
+
+   private CuiTuiMapWriter() {
+   }
+
+
+   /**
+    * Delegates to {@code FileUtil.writeNamedSets} with the description "map of Cuis and Tuis".
+    * Made public: as a private static method of a non-instantiable class it could not be called
+    * at all, unlike the equivalent methods of the other writers in this package.
+    *
+    * @param cuiTuiFilePath path of the file to write
+    * @param cuisAndTuis    map of cui to the tuis for that cui
+    */
+   static public void writeCuiTuiMap( final String cuiTuiFilePath,
+                                      final Map<String, Collection<String>> cuisAndTuis ) {
+      FileUtil.writeNamedSets( cuiTuiFilePath, "map of Cuis and Tuis", cuisAndTuis );
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/CuiTuiMapWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,92 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Writes terms to a database table indexed by the first word of each term text.
+ * <p>
+ * CREATE CACHED TABLE UMLS_MS_2011AB (
+ *    CUI VARCHAR_IGNORECASE(8) NOT NULL,
+ *    FWORD VARCHAR_IGNORECASE(80) NOT NULL,
+ *    TEXT VARCHAR_IGNORECASE(2048) NOT NULL,
+ *    CODE VARCHAR_IGNORECASE(45) NOT NULL,
+ *    SOURCETYPE VARCHAR_IGNORECASE(45) NOT NULL,
+ *    TUI VARCHAR_IGNORECASE(4) NOT NULL
+ * );
+ * CREATE INDEX IDX_UMLS_MS_2011AB ON UMLS_MS_2011AB( FWORD );
+ * COMMIT;
+ * </p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class FirstWordDbWriter {
+
+   private FirstWordDbWriter() {}
+
+
+   /**
+    * Table columns with their 1-based parameter index in the row insert statement.
+    */
+   static private enum FIELD {
+      CUI(1),FWORD(2),TEXT(3),CODE(4),SOURCETYPE(5),TUI(6);
+      final private int __index;
+      FIELD( final int index ) {
+         __index = index;
+      }
+   }
+
+
+   /**
+    * Inserts one row per cui and text pair, then issues a commit.
+    * Cuis without an entry in {@code cuiTuis} are skipped, as are texts without any token.
+    * The cui is also written as CODE, SOURCETYPE is always "UMLS_ROOT", and only one tui is written.
+    * SQL failures are reported on stderr, not thrown.
+    *
+    * @param cuiTuis   map of cui to the tuis for that cui
+    * @param cuiTexts  map of cui to the texts for that cui
+    * @param url       database url
+    * @param user      database user
+    * @param pass      database password
+    * @param tableName name of the table to insert into
+    */
+   static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
+                             final Map<String, Collection<String>> cuiTexts,
+                             final String url, final String user, final String pass, final String tableName ) {
+      // NOTE(review): the connection is not closed here as JdbcUtil may own or share it - confirm
+      final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+      final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
+      System.out.println( "Writing to " + tableName );
+      PreparedStatement rowInsertSql = null;
+      Statement statement = null;
+      try {
+         rowInsertSql = connection.prepareStatement( sql );
+         long lineCount = 0;
+         for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+            final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
+            if ( tuis == null ) {
+               continue;
+            }
+            for ( String text : cuiTextEntry.getValue() ) {
+               final String[] tokens = text.split( "\\s+" );
+               if ( tokens.length == 0 ) {
+                  // Whitespace-only text splits into an empty array, there is no first word to index
+                  continue;
+               }
+               rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
+               rowInsertSql.setString( FIELD.FWORD.__index, tokens[0] );
+               rowInsertSql.setString( FIELD.TEXT.__index, text );
+               rowInsertSql.setString( FIELD.CODE.__index, cuiTextEntry.getKey() );
+               rowInsertSql.setString( FIELD.SOURCETYPE.__index, "UMLS_ROOT" );
+               rowInsertSql.setString( FIELD.TUI.__index, getSingleTui( tuis ) );
+               rowInsertSql.executeUpdate();
+               lineCount++;
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "DB Row " + lineCount );
+               }
+            }
+         }
+         System.out.println( "DB Rows " + lineCount );
+
+         statement = connection.createStatement();
+         statement.execute( "commit" );
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      } finally {
+         // Release both statements even when an insert or the commit failed
+         closeStatement( statement );
+         closeStatement( rowInsertSql );
+      }
+   }
+
+
+   /**
+    * Closes the statement if there is one, reporting any failure on stderr.
+    *
+    * @param statement statement to close, may be null
+    */
+   static private void closeStatement( final Statement statement ) {
+      if ( statement == null ) {
+         return;
+      }
+      try {
+         statement.close();
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      }
+   }
+
+
+   /**
+    * @param tuis collection of tuis for a cui
+    * @return the first tui in iteration order, or "T000" if the collection is empty
+    */
+   static private String getSingleTui( final Collection<String> tuis ) {
+      if ( tuis.isEmpty() ) {
+         return "T000";
+      }
+      return tuis.iterator().next();
+   }
+
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/FirstWordDbWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,93 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.JdbcUtil;
+import org.apache.ctakes.dictionarytool.util.RareWordUtil;
+import org.apache.ctakes.dictionarytool.util.TokenUtil;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.SQLException;
+import java.sql.Statement;
+import java.util.Collection;
+import java.util.Map;
+
+
+/**
+ * <p>
+ * CREATE CACHED TABLE CTAKES_UMLS (
+ *    CUI VARCHAR_IGNORECASE(12),
+ *    TUI VARCHAR_IGNORECASE(48),
+ *    RINDEX INTEGER,
+ *    TCOUNT INTEGER,
+ *    TEXT VARCHAR_IGNORECASE(255),
+ *    RWORD VARCHAR_IGNORECASE(48)
+ * );
+ * CREATE INDEX IDX_CTAKES_UMLS ON CTAKES_UMLS( RWORD );
+ * COMMIT;
+ * </p>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/15/14
+ */
+final public class RareWordDbWriter {
+
+   private RareWordDbWriter() {}
+
+   static private enum FIELD {
+      CUI(1),TUI(2),RINDEX(3),TCOUNT(4),TEXT(5),RWORD(6);
+      final private int __index;
+      FIELD( final int index ) {
+         __index = index;
+      }
+   }
+
+
+   static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
+                            final Map<String, Collection<String>> cuiTexts,
+                            final String url, final String user, final String pass, final String tableName ) {
+      final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+      final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
+      try {
+         final PreparedStatement rowInsertSql = connection.prepareStatement( sql );
+         final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( cuiTexts );
+         long lineCount = 0;
+         for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+            final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
+            if ( tuis == null ) {
+               continue;
+            }
+            for ( String text : cuiTextEntry.getValue() ) {
+               final String[] tokens = text.split( "\\s+" );
+               int bestIndex = 0;
+               int bestCount = Integer.MAX_VALUE;
+               for ( int i = 0; i < tokens.length; i++ ) {
+                  Integer count = tokenCounts.get( tokens[i] );
+                  if ( count != null && count < bestCount ) {
+                     bestIndex = i;
+                     bestCount = count;
+                  }
+               }
+               rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
+               rowInsertSql.setString( FIELD.TUI.__index, TokenUtil.createCsvLine( tuis ) );
+               rowInsertSql.setInt( FIELD.RINDEX.__index, bestIndex );
+               rowInsertSql.setInt( FIELD.TCOUNT.__index, tokens.length );
+               rowInsertSql.setString( FIELD.TEXT.__index, text );
+               rowInsertSql.setString( FIELD.RWORD.__index, tokens[bestIndex] );
+               rowInsertSql.executeUpdate();
+               lineCount++;
+               if ( lineCount % 100000 == 0 ) {
+                  System.out.println( "DB Row " + lineCount );
+               }
+            }
+         }
+         System.out.println( "DB Rows " + lineCount );
+
+         final Statement statement = connection.createStatement();
+         statement.execute( "commit" );
+         rowInsertSql.close();
+      } catch ( SQLException sqlE ) {
+         System.err.println( sqlE.getMessage() );
+      }
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java?rev=1572710&view=auto
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java (added)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java Thu Feb 27 19:19:54 2014
@@ -0,0 +1,20 @@
+package org.apache.ctakes.dictionarytool.writer;
+
+import org.apache.ctakes.dictionarytool.util.FileUtil;
+
+import java.util.Collection;
+
+/**
+ * Writes a collection of tuis to file.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/16/14
+ */
+final public class TuiListWriter {
+
+   // Static utility class, not to be instantiated
+   private TuiListWriter() {}
+
+   /**
+    * Delegates to {@code FileUtil.writeOneColumn} with the description "list of Tuis".
+    * Presumably one tui is written per line; verify against FileUtil.
+    *
+    * @param tuiFilePath path of the file to write
+    * @param typeTuis    tuis to write
+    */
+   static public void writeTuiList( final String tuiFilePath, final Collection<String> typeTuis ) {
+      FileUtil.writeOneColumn( tuiFilePath, "list of Tuis", typeTuis );
+   }
+
+}

Propchange: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/TuiListWriter.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message