ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1624062 [4/4] - in /ctakes/sandbox/dictionarytool: data/default/ src/org/apache/ctakes/dictionarytool/ src/org/apache/ctakes/dictionarytool/reader/ src/org/apache/ctakes/dictionarytool/util/ src/org/apache/ctakes/dictionarytool/util/collec...
Date Wed, 10 Sep 2014 17:30:43 GMT
Modified: ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java?rev=1624062&r1=1624061&r2=1624062&view=diff
==============================================================================
--- ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java (original)
+++ ctakes/sandbox/dictionarytool/src/org/apache/ctakes/dictionarytool/writer/RareWordDbWriter.java Wed Sep 10 17:30:42 2014
@@ -2,7 +2,7 @@ package org.apache.ctakes.dictionarytool
 
 import org.apache.ctakes.dictionarytool.util.JdbcUtil;
 import org.apache.ctakes.dictionarytool.util.RareWordUtil;
-import org.apache.ctakes.dictionarytool.util.TokenUtil;
+import org.apache.ctakes.dictionarytool.util.collection.HashSetMap;
 
 import java.sql.Connection;
 import java.sql.PreparedStatement;
@@ -10,71 +10,106 @@ import java.sql.SQLException;
 import java.sql.Statement;
 import java.util.Collection;
 import java.util.Map;
+import java.util.Set;
 
 
 /**
  * <p>
- * CREATE CACHED TABLE CTAKES_UMLS (
- * CUI VARCHAR_IGNORECASE(12),
- * TUI VARCHAR_IGNORECASE(48),
+ * CREATE CACHED TABLE CUI_TERMS (
+ * CUI BIGINT,
  * RINDEX INTEGER,
  * TCOUNT INTEGER,
- * TEXT VARCHAR_IGNORECASE(255),
- * RWORD VARCHAR_IGNORECASE(48)
+ * TEXT VARCHAR(255),
+ * RWORD VARCHAR(48)
  * );
- * CREATE INDEX IDX_CTAKES_UMLS ON CTAKES_UMLS( RWORD );
+ * CREATE INDEX IDX_CUI_TERMS ON CUI_TERMS( RWORD );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE TUI ( CUI BIGINT, TUI INTEGER );
+ * CREATE INDEX IDX_TUI ON TUI( CUI );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE SNOMEDCT ( CUI BIGINT, SNOMEDCT BIGINT );
+ * CREATE INDEX IDX_SNOMEDCT ON SNOMEDCT( CUI );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE RXNORM ( CUI BIGINT, RXNORM BIGINT );
+ * CREATE INDEX IDX_RXNORM ON RXNORM( CUI );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE ICD9CD ( CUI BIGINT, ICD9CD VARCHAR( 48 ) );
+ * CREATE INDEX IDX_ICD9CD ON ICD9CD( CUI );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE ICD10PCS ( CUI BIGINT, ICD10PCS VARCHAR( 48 ) );
+ * CREATE INDEX IDX_ICD10PCS ON ICD10PCS( CUI );
+ * COMMIT;
+ * <p/>
+ * CREATE CACHED TABLE PREFTERM ( CUI BIGINT, PREFTERM VARCHAR( 255 ) );
+ * CREATE INDEX IDX_PREFTERM ON PREFTERM( CUI );
  * COMMIT;
  * </p>
+ * <p/>
  * Author: SPF
  * Affiliation: CHIP-NLP
  * Date: 1/15/14
  */
 final public class RareWordDbWriter {
-
    private RareWordDbWriter() {
    }
 
-   static private enum FIELD {
-      CUI( 1 ), TUI( 2 ), RINDEX( 3 ), TCOUNT( 4 ), TEXT( 5 ), RWORD( 6 );
+   static private enum CuiTermsField {
+      CUI( "CUI", 1, Long.class ), RINDEX( "RINDEX", 2, Integer.class ), TCOUNT( "TCOUNT", 3, Integer.class ),
+      TEXT( "TEXT", 4, String.class ), RWORD( "RWORD", 5, String.class );
+      final private String __name;
+      final private int __index;
+      final private Class __classType;
+
+      CuiTermsField( final String name, final int index, final Class classType ) {
+         __name = name;
+         __index = index;
+         __classType = classType;
+      }
+   }
+
+   static private enum CuiMapField {
+      CUI( "CUI", 1, Long.class ), TUI( "TUI", 2, Integer.class ),
+      ICD9( "ICD9", 3, String.class ), ICD10( "ICD10", 4, String.class ),
+      SNOMED( "SNOMED", 5, Long.class ), RXNORM( "RXNORM", 6, Long.class ), PREFTERM( "PREFTERM", 7, String.class );
+      final private String __name;
       final private int __index;
+      final private Class __classType;
 
-      FIELD( final int index ) {
+      CuiMapField( final String name, final int index, final Class classType ) {
+         __name = name;
          __index = index;
+         __classType = classType;
       }
    }
 
 
-   static public void writeTermsToDb( final Map<String, Collection<String>> cuiTuis,
-                                      final Map<String, Collection<String>> cuiTexts,
-                                      final String url, final String user, final String pass, final String tableName ) {
+   static public void writeCuiTerms( final HashSetMap<Long, Integer> cuiTuis,
+                                     final HashSetMap<Long, String> cuiTexts,
+                                     final String url, final String user, final String pass, final String tableName ) {
       final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
-      final String sql = JdbcUtil.createRowInsertSql( tableName, FIELD.values() );
+      final String sql = JdbcUtil.createRowInsertSql( tableName, CuiTermsField.values() );
       try {
          final PreparedStatement rowInsertSql = connection.prepareStatement( sql );
          final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( cuiTexts );
          long lineCount = 0;
-         for ( Map.Entry<String, Collection<String>> cuiTextEntry : cuiTexts.entrySet() ) {
-            final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
+         for ( Map.Entry<Long, Set<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+            final Collection<Integer> tuis = cuiTuis.get( cuiTextEntry.getKey() );
             if ( tuis == null ) {
                continue;
             }
             for ( String text : cuiTextEntry.getValue() ) {
-               final String[] tokens = text.split( "\\s+" );
-               int bestIndex = 0;
-               int bestCount = Integer.MAX_VALUE;
-               for ( int i = 0; i < tokens.length; i++ ) {
-                  Integer count = tokenCounts.get( tokens[i] );
-                  if ( count != null && count < bestCount ) {
-                     bestIndex = i;
-                     bestCount = count;
-                  }
-               }
-               rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
-               rowInsertSql.setString( FIELD.TUI.__index, TokenUtil.createCsvLine( tuis ) );
-               rowInsertSql.setInt( FIELD.RINDEX.__index, bestIndex );
-               rowInsertSql.setInt( FIELD.TCOUNT.__index, tokens.length );
-               rowInsertSql.setString( FIELD.TEXT.__index, text );
-               rowInsertSql.setString( FIELD.RWORD.__index, tokens[bestIndex] );
+               final RareWordUtil.IndexedRareWord indexedRareWord = RareWordUtil.getIndexedRareWord( text,
+                                                                                                     tokenCounts );
+               rowInsertSql.setLong( CuiTermsField.CUI.__index, cuiTextEntry.getKey() );
+               rowInsertSql.setInt( CuiTermsField.RINDEX.__index, indexedRareWord.__index );
+               rowInsertSql.setInt( CuiTermsField.TCOUNT.__index, indexedRareWord.__tokenCount );
+               rowInsertSql.setString( CuiTermsField.TEXT.__index, text );
+               rowInsertSql.setString( CuiTermsField.RWORD.__index, indexedRareWord.__word );
                rowInsertSql.executeUpdate();
                lineCount++;
                if ( lineCount % 100000 == 0 ) {
@@ -92,4 +127,63 @@ final public class RareWordDbWriter {
       }
    }
 
+   // TODO switch to cui map
+   //   static public void writeCuiMap( final HashSetMap<Long, String> cuiTuis,
+   //                                      final ArrayListMap<Long, String> cuiTexts,
+   //                                      final String url, final String user, final String pass, final String tableName ) {
+   //      final Connection connection = JdbcUtil.createDatabaseConnection( url, user, pass );
+   //      final String sql = JdbcUtil.createRowInsertSql( tableName, CuiTermsField.values() );
+   //      try {
+   //         final PreparedStatement rowInsertSql = connection.prepareStatement( sql );
+   //         final Map<String, Integer> tokenCounts = RareWordUtil.getTokenCounts( cuiTexts );
+   //         long lineCount = 0;
+   //         for ( Map.Entry<Long, List<String>> cuiTextEntry : cuiTexts.entrySet() ) {
+   //            final Collection<String> tuis = cuiTuis.get( cuiTextEntry.getKey() );
+   //            if ( tuis == null ) {
+   //               continue;
+   //            }
+   //            for ( String text : cuiTextEntry.getValue() ) {
+   //               final String[] tokens = text.split( "\\s+" );
+   //               int bestIndex = 0;
+   //               int bestCount = Integer.MAX_VALUE;
+   //               for ( int i = 0; i < tokens.length; i++ ) {
+   //                  Integer count = tokenCounts.get( tokens[i] );
+   //                  if ( count != null && count < bestCount ) {
+   //                     bestIndex = i;
+   //                     bestCount = count;
+   //                  }
+   //               }
+   //               //               rowInsertSql.setString( FIELD.CUI.__index, cuiTextEntry.getKey() );
+   //               // TODO refactor all code to use long for Cuis and int for Tuis.
+   //               // TODO reference and store all cuis as long (sql bigint)
+   //               // TODO put Tuis in secondary table with alternate codes (sno: rx:, etc.) and preferred term
+   //               // TODO On lookup, grab appropriate cuis and text from main table
+   //               // TODO After lookup, upon store of terms, grab tuis, alternate codes, and preferred terms from table 2
+   //               // TODO overall storage requirements -should- be smaller
+   //               // TODO Plus, alternate codes and preferred terms may be tied to Tuis ...
+   //
+   //               rowInsertSql.setLong( CuiTermsField.CUI.__index, cuiTextEntry.getKey() );
+   //               //               rowInsertSql.setString( FIELD.TUI.__index, TokenUtil.createCsvLine( tuis ) );
+   //               rowInsertSql.setInt( CuiTermsField.RINDEX.__index, bestIndex );
+   //               rowInsertSql.setInt( CuiTermsField.TCOUNT.__index, tokens.length );
+   //               rowInsertSql.setString( CuiTermsField.TEXT.__index, text );
+   //               rowInsertSql.setString( CuiTermsField.RWORD.__index, tokens[bestIndex] );
+   //               rowInsertSql.executeUpdate();
+   //               lineCount++;
+   //               if ( lineCount % 100000 == 0 ) {
+   //                  System.out.println( "DB Row " + lineCount );
+   //               }
+   //            }
+   //         }
+   //         System.out.println( "DB Rows " + lineCount );
+   //
+   //         final Statement statement = connection.createStatement();
+   //         statement.execute( "commit" );
+   //         rowInsertSql.close();
+   //      } catch ( SQLException sqlE ) {
+   //         System.err.println( sqlE.getMessage() );
+   //      }
+   //   }
+
+
 }



Mime
View raw message