ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1571820 [3/4] - in /ctakes/sandbox/ctakes-dictionary-lookup2: ./ desc/ desc/analysis_engine/ doc/ example/ example/desc/ example/desc/analysis_engine/ example/desc/analysis_engine/ctakes-dictionary-lookup2/ src/ src/main/ src/main/java/ sr...
Date Tue, 25 Feb 2014 20:54:26 GMT
Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,211 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.core.resource.FileResource;
+import org.apache.ctakes.core.resource.JdbcConnectionResource;
+import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
+import org.jdom.Element;
+
+import java.io.File;
+import java.sql.Connection;
+
+/**
+ * Static factory for {@link RareWordDictionary} instances backed by external resources.
+ * <p>
+ * Two factory methods are live: {@link #createRareWordJdbc} and {@link #createRareWordBsv}.
+ * </p>
+ * TODO
+ * This factory can also create a RareWordDictionary by wrapping the older Jdbc, Lucene, StringTable (CSV)
+ * descriptors.  However, to prevent the dependency upon the current Dictionary-Lookup module and its "Dictionary"
+ * interface, the wrapping methods have been commented out.  Uncommenting, linking, and rebuilding is possible
+ * if use of an older dictionary resource is required.
+ * TODO
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 2/20/14
+ */
+final public class DictionaryFactory {
+
+   // Utility class: not instantiable
+   private DictionaryFactory() {}
+
+
+   /**
+    * A JDBC accessible RareWordDictionary is the preferred fast lookup dictionary
+    *
+    * @param implementationElement contains properties for the implementation; its "tableName" attribute is read
+    * @param externalResource      jdbc accessible db, must be a {@link JdbcConnectionResource}
+    * @param entityTypeId          type of entity that the dictionary contains, specified by a {@code TYPE_ID} value;
+    *                              that constant is not declared in this class
+    * @return a dictionary that uses the specified db for rare word lookup
+    * @throws org.apache.uima.analysis_engine.annotator.AnnotatorContextException if the {@code externalResource}
+    *                              is null or is not a db
+    */
+   static public RareWordDictionary createRareWordJdbc( final Element implementationElement,
+                                                        final Object externalResource, final String entityTypeId )
+         throws AnnotatorContextException {
+      checkResourceType( JdbcConnectionResource.class, externalResource );
+      final String tableName = implementationElement.getAttributeValue( "tableName" );
+      final Connection connection = ((JdbcConnectionResource) externalResource).getConnection();
+      return new JdbcRareWordDictionary( entityTypeId, connection, tableName );
+   }
+
+   /**
+    * A RareWordDictionary for a simple user created bar-separated value (bsv) file
+    *
+    * @param externalResourceKey contains the name of the implementation
+    * @param externalResource    bar-separated value (bsv) file, must be a {@link FileResource}
+    * @param entityTypeId        type of entity that the dictionary contains, specified by a {@code TYPE_ID} value;
+    *                            that constant is not declared in this class
+    * @return a dictionary that uses the specified file for rare word lookup
+    * @throws AnnotatorContextException if the {@code externalResource} is null or is not a file
+    */
+   static public RareWordDictionary createRareWordBsv( final String externalResourceKey, final Object externalResource,
+                                                       final String entityTypeId )
+         throws AnnotatorContextException {
+      checkResourceType( FileResource.class, externalResource );
+      final File bsvFile = ((FileResource) externalResource).getFile();
+      return new BsvRareWordDictionary( externalResourceKey, entityTypeId, bsvFile );
+   }
+
+//   /**
+//    * A RareWordDictionary for an older "first word lookup" lucene table/index.
+//    * The old uber-configurable dictionary paradigm should be abandoned in favor of something stricter yet simpler
+//    *
+//    * @param rootElement         contains information about fields in the lucene table
+//    * @param externalResourceKey contains the name of the implementation
+//    * @param externalResource    lucene table
+//    * @param entityTypeId        type of entity that the dictionary contains, specified by {@link this.TYPE_ID}
+//    * @return a dictionary that uses the specified lucene table for rare word lookup
+//    * @throws AnnotatorContextException if the {@code externalResource} is not lucene
+//    * @deprecated Fixed index/naming schemes in the data are so much easier than flex in the data and fixed in the desc
+//    */
+//   @Deprecated
+//   static public RareWordDictionary createWrappedLucene( final Element rootElement, final String externalResourceKey,
+//                                                         final Object externalResource, final String entityTypeId )
+//         throws AnnotatorContextException {
+//      checkResourceType( LuceneIndexReaderResource.class, externalResource );
+//      final IndexReader indexReader = ((LuceneIndexReaderResource) externalResource).getIndexReader();
+//      final IndexSearcher indexSearcher = new IndexSearcher( indexReader );
+//      // Added 'MaxListSize' ohnlp-Bugs-3296301
+//      final Element lookupFieldElement = rootElement.getChild( "lookupField" );
+//      final String lookupFieldName = lookupFieldElement.getAttributeValue( "fieldName" );
+//      return createWrappedDictionary( externalResourceKey, entityTypeId,
+//                                      new LuceneDictionaryImpl( indexSearcher, lookupFieldName, MAX_LIST_SIZE ) );
+//   }
+//
+//   /**
+//    * A RareWordDictionary for an older "first word lookup" jdbc accessible db.
+//    * The old uber-configurable dictionary paradigm should be abandoned in favor of something stricter yet simpler
+//    *
+//    * @param rootElement           contains information about fields in the jdbc accessible db
+//    * @param implementationElement contains properties for the implementation
+//    * @param externalResourceKey   contains the name of the implementation
+//    * @param externalResource      jdbc accessible db
+//    * @param entityTypeId          type of entity that the dictionary contains, specified by {@link this.TYPE_ID}
+//    * @return a dictionary that uses the specified db for rare word lookup
+//    * @throws AnnotatorContextException if the {@code externalResource} is not a db
+//    * @deprecated Fixed index/naming schemes in the data are so much easier than flex in the data and fixed in the desc
+//    */
+//   @Deprecated
+//   static public RareWordDictionary createWrappedJdbc( final Element rootElement, final Element implementationElement,
+//                                                       final String externalResourceKey,
+//                                                       final Object externalResource, final String entityTypeId )
+//         throws AnnotatorContextException {
+//      checkResourceType( JdbcConnectionResource.class, externalResource );
+//      final String tableName = implementationElement.getAttributeValue( "tableName" );
+//      final Element lookupFieldElement = rootElement.getChild( "lookupField" );
+//      final String lookupFieldName = lookupFieldElement.getAttributeValue( "fieldName" );
+//      final Connection connection = ((JdbcConnectionResource) externalResource).getConnection();
+//      return createWrappedDictionary( externalResourceKey, entityTypeId,
+//                                      new MemReleaseJdbcDictionaryImpl( connection, tableName, lookupFieldName ) );
+//   }
+//
+//   /**
+//    * A RareWordDictionary for a simple user created comma-separated value (csv) file.
+//    * The old uber-configurable dictionary paradigm should be abandoned in favor of something stricter yet simpler
+//    *
+//    * @param rootElement           contains information about fields in the csv file
+//    * @param implementationElement contains properties for the implementation
+//    * @param externalResourceKey   contains the name of the implementation
+//    * @param externalResource      comma-separated value (csv) file
+//    * @param entityTypeId          type of entity that the dictionary contains, specified by {@link this.TYPE_ID}
+//    * @return a dictionary that uses the specified file for rare word lookup
+//    * @throws AnnotatorContextException if the {@code externalResource} is not a file
+//    * @deprecated Fixed index/naming schemes in the data are so much easier than flex in the data and fixed in the desc
+//    */
+//   @Deprecated
+//   static public RareWordDictionary createWrappedCsv( final Element rootElement, final Element implementationElement,
+//                                                      final String externalResourceKey,
+//                                                      final Object externalResource, final String entityTypeId )
+//         throws AnnotatorContextException {
+//      checkResourceType( FileResource.class, externalResource );
+//      final String fieldDelimiter = implementationElement.getAttributeValue( "delimiter" );
+//      final String indexFieldNames = implementationElement.getAttributeValue( "indexedFieldNames" );
+//      final String[] fieldNames = indexFieldNames.split( "," );
+//      for ( int i = 0; i < fieldNames.length; i++ ) {
+//         fieldNames[i] = fieldNames[i].trim();
+//      }
+//      final File csvFile = ((FileResource) externalResource).getFile();
+//      try {
+//         final StringTable stringTable = StringTableFactory.build( new FileReader( csvFile ),
+//                                                                   fieldDelimiter, fieldNames, true );
+//         final Element lookupFieldElement = rootElement.getChild( "lookupField" );
+//         final String lookupFieldName = lookupFieldElement.getAttributeValue( "fieldName" );
+//         return createWrappedDictionary( externalResourceKey, entityTypeId,
+//                                         new StringTableDictionaryImpl( stringTable, lookupFieldName ) );
+//      } catch ( IOException ioE ) {
+//         throw new AnnotatorContextException( "Could not build StringTable from " + csvFile.getPath(),
+//                                              new Object[0], ioE );
+//      }
+//   }
+//
+//   /**
+//    * Wraps an {@link Dictionary} using {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionaryWrapper}
+//    *
+//    * @param externalResourceKey contains the name of the implementation
+//    * @param entityTypeId        type of entity that the dictionary contains, specified by {@link this.TYPE_ID}
+//    * @param dictionary          implementation of older Dictionary interface
+//    * @return a class that implements {@link RareWordDictionary} but <i>does not</i> necessarily
+//    *         perform lookup by rare word
+//    * @throws AnnotatorContextException if {@code dictionary} is null
+//    */
+//   static public RareWordDictionary createWrappedDictionary( final String externalResourceKey,
+//                                                             final String entityTypeId, final Dictionary dictionary )
+//         throws AnnotatorContextException {
+//      if ( dictionary != null ) {
+//         return new RareWordDictionaryWrapper( externalResourceKey, entityTypeId, dictionary );
+//      }
+//      throw new AnnotatorContextException( "Could not wrap a null Dictionary for " + externalResourceKey,
+//                                           new Object[0] );
+//   }
+
+
+   /**
+    * Convenience method that throws an {@link AnnotatorContextException} when an external resource is
+    * not of the correct type.  A {@code null} resource is reported the same way, with "null" as its type name.
+    *
+    * @param expectedClassType expected resource class
+    * @param typeValue         Object that should be an implementation of the {@code expectedClassType}
+    * @throws AnnotatorContextException if {@code typeValue} is null or is an incorrect class type
+    */
+   static private void checkResourceType( final Class<?> expectedClassType, final Object typeValue )
+         throws AnnotatorContextException {
+      if ( expectedClassType.isInstance( typeValue ) ) {
+         return;
+      }
+      // isInstance( null ) is false, so typeValue can be null here.  Calling getClass() on it would raise a
+      // NullPointerException and hide the intended AnnotatorContextException from the caller.
+      final String actualTypeName = typeValue == null ? "null" : typeValue.getClass().getName();
+      throw new AnnotatorContextException( "Expected external resource to be " + expectedClassType.getName()
+                                                 + " not " + actualTypeName, new Object[0] );
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.log4j.Logger;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Preferred dictionary to use for large collections of terms.
+ * Column indices within the database are constant and not configurable: CUI TUI RINDEX TCOUNT TEXT RWORD
+ * If a configurable implementation is desired then create an extension.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 3/26/13
+ */
+final public class JdbcRareWordDictionary extends AbstractRareWordDictionary {
+
+   /**
+    * Column (field) indices in the database.  Notice that these are constant and not configurable.
+    * If a configurable implementation is desired then create an extension.
+    */
+   static private enum FIELD_INDEX {
+      CUI( 1 ), TUI( 2 ), RINDEX( 3 ), TCOUNT( 4 ), TEXT( 5 ), RWORD( 6 );
+      final private int __index;
+      private FIELD_INDEX( final int index ) {
+         __index = index;
+      }
+   }
+
+   // LOG4J logger based on class name
+   final private Logger _logger = Logger.getLogger( getClass().getName() );
+
+   final private Connection _connection;
+   final private String _tableName;
+   private PreparedStatement _metadataStatement;
+
+   /**
+    *
+    * @param semanticGroup the type of term that exists in the dictionary: Anatomical Site, Disease/Disorder, Drug, etc.
+    * @param connection database connection
+    * @param tableName name of the database table to use for lookup.  Used as the simple name for the dictionary
+    */
+   public JdbcRareWordDictionary( final String semanticGroup,
+                                  final Connection connection,
+                                  final String tableName ) {
+      super( tableName, semanticGroup );
+      _connection = connection;
+      _tableName = tableName;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<RareWordTerm> getRareWordHits( final String rareWordText ) {
+      final List<RareWordTerm> rareWordTerms = new ArrayList<RareWordTerm>();
+      try {
+         initMetaDataStatement( rareWordText );
+         final ResultSet resultSet = _metadataStatement.executeQuery();
+         while ( resultSet.next() ) {
+            final RareWordTerm rareWordTerm = new RareWordTerm( resultSet.getString( FIELD_INDEX.TEXT.__index),
+                                                                resultSet.getString( FIELD_INDEX.CUI.__index ),
+                                                                resultSet.getString( FIELD_INDEX.TUI.__index ),
+                                                                resultSet.getString( FIELD_INDEX.RWORD.__index ),
+                                                                resultSet.getInt( FIELD_INDEX.RINDEX.__index ),
+                                                                resultSet.getInt( FIELD_INDEX.TCOUNT.__index ) );
+            rareWordTerms.add( rareWordTerm );
+         }
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         resultSet.close();
+      } catch ( SQLException e ) {
+         _logger.error( e.getMessage() );
+      }
+      return rareWordTerms;
+   }
+
+   /**
+    *
+    * @param rareWordText text of the rare word to use for term lookup
+    * @return an sql call to use for term lookup
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   private PreparedStatement initMetaDataStatement( final String rareWordText ) throws SQLException {
+      if ( _metadataStatement == null ) {
+         final String lookupSql = "SELECT * FROM " + _tableName + " WHERE RWORD = ?";
+         _metadataStatement = _connection.prepareStatement( lookupSql );
+      }
+      _metadataStatement.clearParameters();
+      _metadataStatement.setString( 1, rareWordText );
+      return _metadataStatement;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Map;
+
+/**
+ * A RareWordDictionary that keeps its terms in memory, looking them up in a Map keyed by rare word
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class MemRareWordDictionary extends AbstractRareWordDictionary {
+
+   // Rare word token -> every term containing that token.  Used like "First Word Token Lookup" but faster
+   final private Map<String,Collection<RareWordTerm>> _rareWordTermMap;
+
+   /**
+    * Stores a reference to the given map; the map is not copied.
+    *
+    * @param name            passed to the superclass constructor
+    * @param semanticGroup   passed to the superclass constructor
+    * @param rareWordTermMap Map with a Rare Word (tokens) as key, and RareWordTerm Collection as value
+    */
+   public MemRareWordDictionary( final String name, final String semanticGroup,
+                                 final Map<String, Collection<RareWordTerm>> rareWordTermMap ) {
+      super( name, semanticGroup );
+      _rareWordTermMap = rareWordTermMap;
+   }
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return the Collection mapped to {@code rareWordText}, or an empty List when the map has no value for it
+    */
+   @Override
+   public Collection<RareWordTerm> getRareWordHits( final String rareWordText ) {
+      final Collection<RareWordTerm> termsForWord = _rareWordTermMap.get( rareWordText );
+      return termsForWord != null ? termsForWord : Collections.<RareWordTerm>emptyList();
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,62 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+
+import java.util.Collection;
+
+/**
+ * Contract for a dictionary whose terms are looked up by the rarest word that they contain
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+public interface RareWordDictionary {
+
+   /**
+    * Dictionaries are kept in a collection identified by Type identifier and Name,
+    * so each dictionary should have a unique Type / Name combination if possible.
+    * @return simple name for the dictionary
+    */
+   String getName();
+
+   /**
+    * @return the kind of term held by the dictionary: Anatomical Site, Disease/Disorder, Drug, etc.
+    */
+   String getSemanticGroup();
+
+   /**
+    * A single token may appear in any number of dictionary terms, including none.  It may appear as its -own-
+    * form or as an alternate canonical variant.  This method will check the dictionary for both
+    * @param fastLookupToken a single-word token
+    * @return zero or more terms that contain the lookup token
+    */
+   Collection<RareWordTerm> getRareWordHits( FastLookupToken fastLookupToken );
+
+   /**
+    * Fetch every term in the dictionary that contains a given rare word
+    *
+    * @param rareWordText text of the rare word
+    * @return all terms within the dictionary that contain {@code rareWordText}
+    */
+   Collection<RareWordTerm> getRareWordHits( String rareWordText );
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionaryWrapper.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionaryWrapper.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionaryWrapper.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionaryWrapper.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+//package org.apache.ctakes.dictionary.lookup2.bsv;
+//
+//import org.apache.ctakes.dictionary.lookup.Dictionary;
+//import org.apache.ctakes.dictionary.lookup.DictionaryException;
+//import org.apache.ctakes.dictionary.lookup.MetaDataHit;
+//import org.apache.log4j.Logger;
+//
+//import java.util.Collection;
+//import java.util.Collections;
+//import java.util.HashSet;
+//
+///**
+// * Author: SPF
+// * Affiliation: CHIP-NLP
+// * Date: 11/20/13
+// */
+//public class RareWordDictionaryWrapper extends AbstractRareWordDictionary implements Dictionary {
+//
+//   // LOG4J logger based on class name
+//   final private Logger _logger = Logger.getLogger( getClass().getName() );
+//
+//   final private Dictionary _metaDataHitDictionary;
+//
+//   /**
+//    *
+//    * @param name name of the database table to use for lookup.  Used as the simple name for the dictionary
+//    * @param entityTypeId the type of term that exists in the dictionary: Anatomical Site, Disease/Disorder, Drug, etc.
+//    * @param metaDataHitDictionary older dictionary to wrap for lookup
+//    */
+//   public RareWordDictionaryWrapper( final String name, final String entityTypeId, final Dictionary metaDataHitDictionary ) {
+//      super( name, entityTypeId );
+//      _metaDataHitDictionary = metaDataHitDictionary;
+//   }
+//
+//   /**
+//    * {@inheritDoc}
+//    */
+//   @Override
+//   public void retainMetaData( final String metaFieldName )  {
+//      _metaDataHitDictionary.retainMetaData( metaFieldName );
+//   }
+//
+//   /**
+//    * {@inheritDoc}
+//    */
+//   @Override
+//   public boolean contains( String text ) throws DictionaryException {
+//      return _metaDataHitDictionary.contains( text );
+//   }
+//
+//   /**
+//    * {@inheritDoc}
+//    */
+//   @Override
+//   public Collection<MetaDataHit> getEntries( String text ) throws DictionaryException {
+//      return _metaDataHitDictionary.getEntries( text );
+//   }
+//
+//
+//
+//   /**
+//    * Uses metadatahit metafieldvalues of cui tui wordindex tokenlength text rareword
+//    *
+//    * {@inheritDoc}
+//    */
+//   @Override
+//   public Collection<RareWordTerm> getRareWordHits( final String rareWordText ) throws DictionaryException {
+//      final Collection<MetaDataHit> metaDataHits = getEntries( rareWordText );
+//      if ( metaDataHits == null || metaDataHits.isEmpty() ) {
+//         return Collections.emptySet();
+//      }
+//      final Collection<RareWordTerm> rareWordTerms = new HashSet<RareWordTerm>( metaDataHits.size() );
+//      for ( MetaDataHit metaDataHit : metaDataHits ) {
+//         final String text = metaDataHit.getMetaFieldValue( "text" );
+//         final String rareWord = metaDataHit.getMetaFieldValue( "rareword" );
+//         final String cui = metaDataHit.getMetaFieldValue( "cui" );
+//         final String tui = metaDataHit.getMetaFieldValue( "tui" );
+//         int index = -1;
+//         int length = -1;
+//         try {
+//            index = Integer.parseInt( metaDataHit.getMetaFieldValue( "wordindex" ) );
+//            length = Integer.parseInt( metaDataHit.getMetaFieldValue( "tokenlength" ) );
+//         } catch ( NumberFormatException nfE ) {
+//            _logger.warn( "No wordindex or tokenlength in metaDataHit " + metaDataHit );
+//            index = 0;
+//            length = text.split( "\\s+" ).length;
+//         }
+//         if ( index >=0 && length >0 ) {
+//            rareWordTerms.add( new RareWordTerm( text, cui, tui, rareWord, index, length ) );
+//         }
+//      }
+//      return rareWordTerms;
+//   }
+//
+//
+//}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionaryWrapper.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,391 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.LookupUtil;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.logging.Logger;
+
+/**
+ * Given a collection of {@link CuiTuiTerm} Objects,
+ * this factory can create a Map of {@link org.apache.ctakes.dictionary.lookup2.term.RareWordTerm} collections
+ * indexed by rare word.
+ * This map can be used to create a {@link MemRareWordDictionary}
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class RareWordTermMapCreator {
+
+   static private final Logger LOGGER = Logger.getLogger( "RareWordTermMapCreator" );
+
+   // Static utility class, never instantiated
+   private RareWordTermMapCreator() {}
+
+   // Hyphenated prefixes: a dash that directly follows one of these stays inside the token
+   static private final String [] PREFIXES = {
+         "e-",
+         "a-",
+         "u-",
+         "x-",
+         "agro-",
+         "ante-",
+         "anti-",
+         "arch-",
+         "be-",
+         "bi-",
+         "bio-",
+         "co-",
+         "counter-",
+         "cross-",
+         "cyber-",
+         "de-",
+         "eco-",
+         "ex-",
+         "extra-",
+         "inter-",
+         "intra-",
+         "macro-",
+         "mega-",
+         "micro-",
+         "mid-",
+         "mini-",
+         "multi-",
+         "neo-",
+         "non-",
+         "over-",
+         "pan-",
+         "para-",
+         "peri-",
+         "post-",
+         "pre-",
+         "pro-",
+         "pseudo-",
+         "quasi-",
+         "re-",
+         "semi-",
+         "sub-",
+         "super-",
+         "tri-",
+         "ultra-",
+         "un-",
+         "uni-",
+         "vice-",
+         // From email from Colin Warner <colinw@ldc.upenn.edu> on 7/25/2010
+         "electro-",
+         "gasto-",
+         "homo-",
+         "hetero-",
+         "ortho-",
+         "phospho-",
+   };
+   // Hyphenated suffixes: a dash that directly precedes one of these stays inside the token
+   static private final String [] SUFFIXES = {"-esque", "-ette", "-fest", "-fold", "-gate", "-itis", "-less", "-most",
+                                              "-o-torium", "-rama", "-wise"};
+
+   // LookupDesc for the standard excluded pos tags are
+   //   VB,VBD,VBG,VBN,VBP,VBZ,CC,CD,DT,EX,LS,MD,PDT,POS,PP,PP$,PRP,PRP$,RP,TO,WDT,WP,WPS,WRB
+   // Listing every verb in the language seems a pain, but listing the others is possible.
+   // Verbs should be rare in the dictionaries, excepting perhaps the activity and concept dictionaries
+   // CD, CC, DT, EX, MD, PDT, PP, PP$, PRP, PRP$, RP, TO, WDT, WP, WPS, WRB
+   // why not WP$ (possessive wh- pronoun "whose")
+   // PP$ is a Brown POS tag, not Penn Treebank (as are the rest)
+   static private final String[] BAD_POS_TERMS = {
+         // CD  cardinal number
+         "zero", "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
+         // CC  coordinating conjunction
+         "and", "or", "but", "for", "nor", "so", "yet",
+         // DT  determiner
+         "this", "that", "these", "those", "the",
+         // EX  existential there
+         "there",
+         // MD  modal
+         "can", "should", "will", "may", "might", "must", "could", "would",
+         // PDT  predeterminer
+         "some", "any", "all", "both", "half", "none", "twice",
+         // PP  prepositional phrase (preposition)
+         "at", "before", "after", "behind", "beneath", "beside", "between", "into", "through", "across", "of",
+         "concerning", "like", "except", "with", "without", "toward", "to", "past", "against", "during", "until",
+         "throughout", "below", "besides", "beyond", "from", "inside", "near", "outside", "since", "upon",
+         // PP$  possessive personal pronoun - Brown POS tag, not Penn TreeBank
+         "my", "our",
+         // PRP  personal pronoun
+         "i", "you", "he", "she", "it",
+         // PRP$  possessive pronoun
+         "mine", "yours", "his", "hers", "its", "ours", "theirs",
+         // RP  particle  - this contains some prepositions
+         "about", "off", "up", "along", "away", "back", "by", "down", "forward", "in", "on", "out",
+         "over", "around", "under",
+         // TO  to  - also a preposition
+         "to",
+         // WDT  wh- determiner
+         "what", "whatever", "which", "whichever",
+         // WP, WPS  wh- pronoun, nominative wh- pronoun
+         "who", "whom", "which", "that", "whoever", "whomever",
+         // WRB
+         "how", "where", "when", "however", "wherever", "whenever",
+   };
+
+   /**
+    * Groups the given terms by their rare word.
+    * The rare word of a term is chosen by {@link #getRareWord(String, java.util.Map)} using token counts
+    * gathered over all of the given terms.
+    *
+    * @param cuiTuiTerms terms to index; their text was tokenized by the {@link CuiTuiTerm} constructor
+    * @return map of rare word to the collection of terms that are looked up by that rare word
+    */
+   static public Map<String,Collection<RareWordTerm>> createRareWordTermMap( final Collection<CuiTuiTerm> cuiTuiTerms ) {
+      final Map<String,Collection<RareWordTerm>> rareWordTermMap = new HashMap<String,Collection<RareWordTerm>>();
+      final Map<String,Integer> tokenCountMap = createTokenCountMap( cuiTuiTerms );
+      for ( CuiTuiTerm cuiTuiTerm : cuiTuiTerms ) {
+         final String term = cuiTuiTerm.getTerm();
+         final String rareWord = getRareWord( term, tokenCountMap );
+         final int wordIndex = getWordIndex( term, rareWord );
+         if ( wordIndex < 0 ) {
+            LOGGER.warning( "Bad Rare Word Index for " + rareWord + " in " + term );
+            continue;
+         }
+         // Only count the tokens of terms that are actually stored
+         final int tokenCount = getTokenCount( term );
+         Collection<RareWordTerm> rareWordTerms = rareWordTermMap.get( rareWord );
+         if ( rareWordTerms == null ) {
+            rareWordTerms = new ArrayList<RareWordTerm>();
+            rareWordTermMap.put( rareWord, rareWordTerms );
+         }
+         rareWordTerms.add( new RareWordTerm( term, cuiTuiTerm.getCui(), cuiTuiTerm.getTui(),
+                                              rareWord, wordIndex, tokenCount ) );
+      }
+      return rareWordTermMap;
+   }
+
+   /**
+    * Counts how often each rarable token occurs over all of the given terms.
+    *
+    * @param cuiTuiTerms terms whose space-separated tokens are counted
+    * @return map of token to its number of occurrences; non-rarable tokens are absent
+    */
+   static private Map<String,Integer> createTokenCountMap( final Collection<CuiTuiTerm> cuiTuiTerms ) {
+      final Map<String,Integer> tokenCountMap = new HashMap<String, Integer>();
+      for ( CuiTuiTerm cuiTuiTerm : cuiTuiTerms ) {
+         final String[] tokens = LookupUtil.fastSplit( cuiTuiTerm.getTerm(), ' ' );
+         for ( String token : tokens ) {
+            // Don't bother to store counts for tokens that can never be a rare word
+            // (single-character tokens, tokens without a letter, excluded part-of-speech words)
+            if ( !isRarableToken( token ) ) {
+               continue;
+            }
+            final Integer count = tokenCountMap.get( token );
+            tokenCountMap.put( token, count == null ? 1 : count+1 );
+         }
+      }
+      return tokenCountMap;
+   }
+
+   /**
+    * Picks the token of a term that is used as its lookup index.
+    *
+    * @param tokenizedTerm term text with tokens separated by single spaces
+    * @param tokenCountMap occurrence counts of rarable tokens, see {@link #createTokenCountMap(java.util.Collection)}
+    * @return the only token of a single-token term; otherwise the rarable token with the lowest count
+    *         (the first such token on ties), or the first token if no token is rarable
+    */
+   static private String getRareWord( final String tokenizedTerm, final Map<String,Integer> tokenCountMap ) {
+      final String[] tokens = LookupUtil.fastSplit( tokenizedTerm, ' ' );
+      if ( tokens.length == 1 ) {
+         return tokens[0];
+      }
+      String bestWord = tokens[0];
+      int bestCount = Integer.MAX_VALUE;
+      for ( String token : tokens ) {
+         if ( isRarableToken( token ) ) {
+            final Integer count = tokenCountMap.get( token );
+            if ( count != null && count < bestCount ) {
+               bestWord = token;
+               bestCount = count;
+            }
+         }
+      }
+      return bestWord;
+   }
+
+   /**
+    * @param token single token of a term
+    * @return true if the token is longer than one character, contains at least one letter
+    *         and is not one of the {@link #BAD_POS_TERMS}
+    */
+   static private boolean isRarableToken( final String token ) {
+      if ( token.length() <= 1 ) {
+         return false;
+      }
+      boolean hasLetter = false;
+      for ( int i=0; i<token.length(); i++ ) {
+         if ( Character.isLetter( token.charAt( i ) ) ) {
+            hasLetter = true;
+            break;
+         }
+      }
+      if ( !hasLetter ) {
+         return false;
+      }
+      for ( String badPosTerm : BAD_POS_TERMS ) {
+         if ( token.equals( badPosTerm ) ) {
+            return false;
+         }
+      }
+      return true;
+   }
+
+   /**
+    * @param tokenizedTerm term text with tokens separated by single spaces
+    * @param word token to find
+    * @return zero-based index of the first token equal to the given word, or -1 if there is none
+    */
+   static private int getWordIndex( final String tokenizedTerm, final String word ) {
+      int index = 0;
+      final String[] tokens = LookupUtil.fastSplit( tokenizedTerm, ' ' );
+      for ( String token : tokens ) {
+         if ( token.equals( word ) ) {
+            return index;
+         }
+         index++;
+      }
+      return -1;
+   }
+
+   /**
+    * @param tokenizedTerm term text with tokens separated by single spaces
+    * @return number of tokens in the term
+    */
+   static private int getTokenCount( final String tokenizedTerm ) {
+      return LookupUtil.fastSplit( tokenizedTerm, ' ' ).length;
+   }
+
+
+
+   // Can also use:
+   // tokenizer = new TokenizerPTB();  List<Token> tokenList = tokenizer.tokenize( term );
+   // for( token ) {
+   //   startIndex = token.getStartOffset();
+   //   endIndex = token.getEndOffset();
+   //   tokenText = term.substring( startIndex, endIndex+1 );
+   //   sb.append( tokenText ).append( " " );
+   // }
+   // but what a roundabout!
+   /**
+    * Splits a term on whitespace, tokenizes each piece with {@link #getTokens(String)}
+    * and joins all tokens with single spaces.
+    *
+    * @param term raw term text
+    * @return the tokens of the term separated by single spaces; empty if the term has no tokens
+    */
+   static private String getTokenizedTerm( final String term ) {
+      if ( term.isEmpty() ) {
+         return term;
+      }
+      final String[] splits = term.split( "\\s+" );
+      if ( splits.length == 0 ) {
+         return "";
+      }
+      final StringBuilder sb = new StringBuilder();
+      for ( String split : splits ) {
+         final List<String> tokens = getTokens( split );
+         for ( String token : tokens ) {
+            sb.append( token ).append( " " );
+         }
+      }
+      // drop the trailing separator
+      sb.setLength( Math.max( 0, sb.length()-1 ) );
+      return sb.toString();
+   }
+
+   /**
+    * Tokenizes a single whitespace-free word.
+    * Runs of letters and digits form a token and every other character is a token of its own.
+    * The exception is a dash that directly follows a known prefix or directly precedes a known suffix:
+    * such a dash stays inside the surrounding token.
+    *
+    * @param word text without whitespace
+    * @return tokens of the word in order of appearance
+    */
+   static private List<String> getTokens( final String word ) {
+      final List<String> tokens = new ArrayList<String>();
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i=0; i<count; i++ ) {
+         final char c = word.charAt( i );
+         if ( Character.isLetterOrDigit( c ) ) {
+            sb.append( c );
+            continue;
+         }
+         // what precedes is a prefix or what follows is a suffix, so append the dash and move on
+         if ( c == '-' && ( isPrefix( sb.toString() ) || isSuffix( word, i+1 ) ) ) {
+            sb.append( c );
+            continue;
+         }
+         // any other non-alphanumeric character ends the current token and is a token by itself
+         if ( sb.length() != 0 ) {
+            tokens.add( sb.toString() );
+            sb.setLength( 0 );
+         }
+         tokens.add( String.valueOf( c ) );
+      }
+      if ( sb.length() != 0 ) {
+         tokens.add( sb.toString() );
+      }
+      return tokens;
+   }
+
+   /**
+    * @param word text that directly precedes a dash
+    * @return true if the text followed by a dash is one of the {@link #PREFIXES}
+    */
+   static private boolean isPrefix( final String word ) {
+      final String prefixQ = word + "-";
+      for ( String prefix : PREFIXES ) {
+         if ( prefix.equals( prefixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param word whole word that contains a dash
+    * @param startIndex index of the character directly after the dash
+    * @return true if the run of letters and digits that starts at the given index,
+    *         preceded by a dash, is one of the {@link #SUFFIXES}
+    */
+   static private boolean isSuffix( final String word, final int startIndex ) {
+      // Nothing follows the dash.  The original test was inverted ( word.length() >= startIndex ),
+      // which rejected every dash inside a word and so disabled suffix detection completely.
+      if ( startIndex >= word.length() ) {
+         return false;
+      }
+      final String nextCharTerm = getNextCharTerm( word.substring( startIndex ) );
+      if ( nextCharTerm.isEmpty() ) {
+         return false;
+      }
+      final String suffixQ = "-" + nextCharTerm;
+      for ( String suffix : SUFFIXES ) {
+         if ( suffix.equals( suffixQ ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   /**
+    * @param word some text
+    * @return the leading run of letters and digits of the text; empty if the text starts with another character
+    */
+   static private String getNextCharTerm( final String word ) {
+      final StringBuilder sb = new StringBuilder();
+      final int count = word.length();
+      for ( int i=0; i<count; i++ ) {
+         final char c = word.charAt( i );
+         if ( !Character.isLetterOrDigit( c ) ) {
+            return sb.toString();
+         }
+         sb.append( c );
+      }
+      return sb.toString();
+   }
+
+
+   /**
+    * Immutable holder of a cui, a tui and the tokenized text of a term.
+    * Two instances are equal when their term, cui and tui are equal.
+    */
+   static public class CuiTuiTerm {
+      final private String __term;
+      final private String __cui;
+      final private String __tui;
+      final private int __hashcode;
+      /**
+       * @param cui  cui of the term; a leading "C" is added if it is absent
+       * @param tui  tui of the term; a leading "T" is added if it is absent
+       * @param term raw text of the term; it is stored in tokenized form
+       */
+      public CuiTuiTerm( final String cui, final String tui, final String term ) {
+         __term = getTokenizedTerm( term );
+         __cui = cui.startsWith( "C" ) ? cui : "C"+cui;
+         __tui = tui.startsWith( "T" ) ? tui : "T"+tui;
+         // all fields are final, so the hash code is computed once
+         __hashcode = (__cui+"_"+__tui+"_"+__term).hashCode();
+      }
+      /**
+       * @return cui, always starting with "C"
+       */
+      public String getCui() {
+         return __cui;
+      }
+      /**
+       * @return tui, always starting with "T"
+       */
+      public String getTui() {
+         return __tui;
+      }
+      /**
+       * @return tokenized text of the term, tokens separated by single spaces
+       */
+      public String getTerm() {
+         return __term;
+      }
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public boolean equals( final Object value ) {
+         return value instanceof CuiTuiTerm
+               && __term.equals( ((CuiTuiTerm)value).__term )
+               && __cui.equals( ((CuiTuiTerm)value).__cui )
+               && __tui.equals( ((CuiTuiTerm)value).__tui );
+      }
+      /**
+       * {@inheritDoc}
+       */
+      @Override
+      public int hashCode() {
+         return __hashcode;
+      }
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,110 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.relation;
+
+import org.apache.log4j.Logger;
+
+import java.sql.Connection;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+import static org.apache.ctakes.dictionary.lookup2.consumer.WsdTermConsumer.RelatedCui;
+
+/**
+ * TODO  -- work in progress for use in WSD
+ *
+ *
+ *
+* Author: SPF
+* Affiliation: CHIP-NLP
+* Date: 12/16/13
+*/
+public class CuiRelationsJdbc {
+
+   /**
+    * Column (field) indices in the database.  Notice that these are constant and not configurable.
+    * If a configurable implementation is desired then create an extension.
+    */
+   static private enum FIELD_INDEX {
+      CUI( 1 ), RELATION_TYPE( 2 );
+      final private int __index;
+      private FIELD_INDEX( final int index ) {
+         __index = index;
+      }
+   }
+
+   // LOG4J logger based on class name
+   final private Logger _logger = Logger.getLogger( getClass().getName() );
+
+   final private Connection _connection;
+   final private String _tableName;
+   private PreparedStatement _metadataStatement;
+
+   /**
+    *
+    */
+   public CuiRelationsJdbc( final Connection connection, final String tableName ) {
+      _connection = connection;
+      _tableName = tableName;
+   }
+
+   /**
+    * @param cui cui to check for relations
+    * @return all relations (cui and relation)
+    */
+   public Collection<RelatedCui> getCuiRelations( final String cui ) {
+      final List<RelatedCui> relatedCuis = new ArrayList<RelatedCui>();
+      try {
+         initMetaDataStatement( cui );
+         final ResultSet resultSet = _metadataStatement.executeQuery();
+         while ( resultSet.next() ) {
+            final RelatedCui relatedCui = new RelatedCui( resultSet.getString( FIELD_INDEX.CUI.__index),
+                                                          resultSet.getString( FIELD_INDEX.RELATION_TYPE.__index ) );
+            relatedCuis.add( relatedCui );
+         }
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         resultSet.close();
+      } catch ( SQLException e ) {
+         _logger.error( e.getMessage() );
+      }
+      return relatedCuis;
+   }
+
+   /**
+    *
+    * @param cui text of the rare word to use for term lookup
+    * @return an sql call to use for term lookup
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   private PreparedStatement initMetaDataStatement( final String cui ) throws SQLException {
+      if ( _metadataStatement == null ) {
+         final String lookupSql = "SELECT * FROM " + _tableName + " WHERE RWORD = ?";
+         _metadataStatement = _connection.prepareStatement( lookupSql );
+      }
+      _metadataStatement.clearParameters();
+      _metadataStatement.setString( 1, cui );
+      return _metadataStatement;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,129 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.term;
+
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * Container class for terms in a {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary}
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/18/13
+ */
+@Immutable
+final public class RareWordTerm {
+
+   // Term content
+   final private String _text;
+   final private String _cui;
+   final private String _tui;
+   // Lookup information
+   final private String _rareWord;
+   final private int _rareWordIndex;
+   final private int _tokenCount;
+   // Computed once in the constructor, all fields are final
+   final private int _hashCode;
+
+   /**
+    * Creates a term; the hash code is derived from cui, tui and text.
+    *
+    * @param text          full text of term
+    * @param cui           umls cui for the term
+    * @param tui           semantic type tui for the term
+    * @param rareWord      rare word in the term that is used for lookup
+    * @param rareWordIndex index of the rare word within the term
+    * @param tokenCount    number of tokens within the term
+    */
+   public RareWordTerm( final String text, final String cui, final String tui,
+                        final String rareWord, final int rareWordIndex,
+                        final int tokenCount ) {
+      _text = text;
+      _cui = cui;
+      _tui = tui;
+      _rareWord = rareWord;
+      _rareWordIndex = rareWordIndex;
+      _tokenCount = tokenCount;
+      final StringBuilder hashKey = new StringBuilder();
+      hashKey.append( cui ).append( tui ).append( text );
+      _hashCode = hashKey.toString().hashCode();
+   }
+
+   /**
+    * @return full text of term
+    */
+   public String getText() {
+      return _text;
+   }
+
+   /**
+    * @return umls cui for the term
+    */
+   public String getCui() {
+      return _cui;
+   }
+
+   /**
+    * @return semantic type tui for the term
+    */
+   public String getTui() {
+      return _tui;
+   }
+
+   /**
+    * @return rare word in the term that is used for lookup
+    */
+   public String getRareWord() {
+      return _rareWord;
+   }
+
+   /**
+    * @return index of the rare word within the term
+    */
+   public int getRareWordIndex() {
+      return _rareWordIndex;
+   }
+
+   /**
+    * @return number of tokens within the term
+    */
+   public int getTokenCount() {
+      return _tokenCount;
+   }
+
+   /**
+    * Two terms are equal when their cui, text and tui are equal;
+    * the rare word, its index and the token count are not compared.
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      if ( value instanceof RareWordTerm ) {
+         final RareWordTerm that = (RareWordTerm)value;
+         return that._cui.equals( _cui ) && that._text.equals( _text ) && that._tui.equals( _tui );
+      }
+      return false;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return hash code of the concatenated cui, tui and text
+    */
+   @Override
+   public int hashCode() {
+      return _hashCode;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.term;
+
+import org.apache.ctakes.dictionary.lookup2.textspan.DefaultTextSpan;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * A {@link RareWordTerm} tied to a text span
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/18/13
+ */
+@Immutable
+final public class SpannedRareWordTerm {
+
+   final private TextSpan _textSpan;
+   final private RareWordTerm _rareWordTerm;
+   // Computed once in the constructor, both fields are final
+   final private int _hashCode;
+
+   /**
+    * Convenience constructor that wraps the offsets in a {@link DefaultTextSpan}.
+    *
+    * @param rareWordTerm contains a term from a {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary}
+    * @param startOffset  the start index of the term
+    * @param endOffset    the end index of the term
+    */
+   public SpannedRareWordTerm( final RareWordTerm rareWordTerm, final int startOffset, final int endOffset ) {
+      this( rareWordTerm, new DefaultTextSpan( startOffset, endOffset ) );
+   }
+
+   /**
+    * @param rareWordTerm contains a term from a {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary}
+    * @param spanKey      the span of the term
+    */
+   public SpannedRareWordTerm( final RareWordTerm rareWordTerm, final TextSpan spanKey ) {
+      _rareWordTerm = rareWordTerm;
+      _textSpan = spanKey;
+      final int termHash = _rareWordTerm.hashCode();
+      final int spanHash = _textSpan.hashCode();
+      _hashCode = termHash + spanHash;
+   }
+
+   /**
+    * @return a span with the start and end indices used for this lookup token
+    */
+   public TextSpan getTextSpan() {
+      return _textSpan;
+   }
+
+   /**
+    * @return the term that was discovered in this span
+    */
+   public RareWordTerm getRareWordTerm() {
+      return _rareWordTerm;
+   }
+
+   /**
+    * Two SpannedRareWordTerm objects are equal if their text spans are equal
+    * and their RareWordTerm objects are equal.
+    * {@inheritDoc}
+    */
+   @Override
+   public boolean equals( final Object value ) {
+      if ( !( value instanceof SpannedRareWordTerm ) ) {
+         return false;
+      }
+      final SpannedRareWordTerm that = (SpannedRareWordTerm)value;
+      if ( !_textSpan.equals( that._textSpan ) ) {
+         return false;
+      }
+      return _rareWordTerm.equals( that._rareWordTerm );
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return the sum of the hash codes of the RareWordTerm and the text span
+    */
+   @Override
+   public int hashCode() {
+      return _hashCode;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.textspan;
+
+
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * A useful key for hash collections based upon start and end indices.
+ * This is faster than using a String key, because {@link String#hashCode()} must iterate over the
+ * character array of every newly created string (new String(..), .substring(..), .toLowerCase(..), ...).
+ */
+@Immutable
+final public class DefaultTextSpan implements TextSpan {
+   final private int _start;
+   final private int _end;
+   // Computed once in the constructor, both indices are final
+   final private int _hashCode;
+
+   /**
+    * Given span indices should be ordered start < end, but it is not an absolute requirement.
+    *
+    * @param start start index of a span, be it of a string or other
+    * @param end   end index of a span,  be it of a  string or other
+    */
+   public DefaultTextSpan( final int start, final int end ) {
+      _start = start;
+      _end = end;
+      _hashCode = _start + 1000 * _end;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getStart() {
+      return _start;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getEnd() {
+      return _end;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return a hashcode based upon the start and end indices of this span key
+    */
+   @Override
+   public int hashCode() {
+      return _hashCode;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return true iff the object is a DefaultTextSpan with equal start and equal end indices
+    */
+   @Override
+   public boolean equals( final Object object ) {
+      if ( !( object instanceof DefaultTextSpan ) ) {
+         return false;
+      }
+      final DefaultTextSpan that = (DefaultTextSpan)object;
+      return _start == that._start && _end == that._end;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return "TextSpan for span [start index] to [end index]"
+    */
+   @Override
+   public String toString() {
+      final StringBuilder sb = new StringBuilder( "TextSpan for span " );
+      sb.append( _start ).append( " to " ).append( _end );
+      return sb.toString();
+   }
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,109 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.textspan;
+
+
+import javax.annotation.concurrent.Immutable;
+import java.util.Collection;
+
+/**
+ * A useful key for hash collections, built from start and end indices plus the missing internal spans.
+ * This is faster than using String as {@link String#hashCode()}
+ * iterates over the internal character array of a new string (new(..), .substring(..), .lowercase(..), ...).
+ *
+ * There is a much better version of this in org.chboston.chip.nlp.annotation but this will do for now.
+ */
+@Immutable
+final public class MultiTextSpan implements TextSpan {
+
+   final private int _start;
+   final private int _end;
+   final private Collection<TextSpan> _missingSpans;
+   final private int _hashCode;
+
+   /**
+    * Given span indices should be ordered start < end, but it is not an absolute requirement.
+    * @param start start index of a span, be it of a string or other
+    * @param end end index of a span, be it of a string or other
+    * @param missingSpans spans absent from the start-to-end range; kept by reference (not copied), must not be null
+    */
+   public MultiTextSpan( final int start, final int end, final Collection<TextSpan> missingSpans ) {
+      _start = start;
+      _end = end;
+      _missingSpans = missingSpans;
+      _hashCode = start + 1000 * end + missingSpans.hashCode();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getStart() {
+      return _start;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public int getEnd() {
+      return _end;
+   }
+
+   /** @return the missing spans collection exactly as it was given to the constructor */
+   public Collection<TextSpan> getMissingSpans() {
+      return _missingSpans;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return the hashcode stored at construction, based upon the start index, end index and missing spans
+    */
+   @Override
+   public int hashCode() {
+      return _hashCode;
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return true iff the object is a MultiTextSpan with equal start, equal end and equal missing spans
+    */
+   @Override
+   public boolean equals( final Object object ) {
+      if ( !(object instanceof MultiTextSpan) ) {
+         return false;
+      }
+      final MultiTextSpan other = (MultiTextSpan)object;
+      return _start == other._start && _end == other._end && _missingSpans.equals( other._missingSpans );
+   }
+
+   /**
+    * {@inheritDoc}
+    * @return "Discontiguous TextSpan for span [start index] to [end index] but missing:\n[missing spans]"
+    */
+   @Override
+   public String toString() {
+      final StringBuilder sb = new StringBuilder( "Discontiguous TextSpan for span " );
+      sb.append( _start ).append( " to " ).append( _end ).append( " but missing:\n" );
+      for ( final TextSpan missing : _missingSpans ) {
+         sb.append( "   " ).append( missing.toString() ).append( '\n' );
+      }
+      return sb.toString();
+   }
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.textspan;
+
+/** Exposes the start index and the end index of a text span.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/29/14
+ */
+public interface TextSpan {
+   /**
+    * @return the start index used for this text span
+    */
+   int getStart();
+
+   /**
+    * @return the end index used for this text span
+    */
+   int getEnd();
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.util;
+
+import org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer;
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+
+import javax.annotation.concurrent.Immutable;
+import java.util.Collection;
+
+/**
+ * Simple Container class that holds a {@link org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary}
+ * collection and a {@link org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer}, both stored by reference.
+ */
+@Immutable
+final public class DictionarySpec {
+   final private Collection<RareWordDictionary> _dictionaries;
+   final private TermConsumer _termConsumer;
+   public DictionarySpec( final Collection<RareWordDictionary> dictionaries,
+                          final TermConsumer termConsumer ) {
+      _dictionaries = dictionaries;   // not copied: later changes to the given collection are visible here
+      _termConsumer = termConsumer;
+   }
+
+   /**
+    * @return all dictionaries to use for term lookup; this is the collection instance given to the constructor
+    */
+   public Collection<RareWordDictionary> getDictionaries() {
+      return _dictionaries;
+   }
+
+   /**
+    * @return the consumer to add terms to the Cas; this is the instance given to the constructor
+    */
+   public TermConsumer getConsumer() {
+      return _termConsumer;
+   }
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.util;
+
+
+import org.apache.ctakes.dictionary.lookup2.textspan.DefaultTextSpan;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import javax.annotation.concurrent.Immutable;
+
+/**
+ * Container class that holds a text span, actual text, and possible variant text for a lookup token.
+ * This class maintains (forces) lowercase text for lookup
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/29/13
+ */
+@Immutable
+final public class FastLookupToken {
+
+   final private TextSpan _textSpan;
+   final private String _text;
+   private String _variant;
+
+   public FastLookupToken( final Annotation jcasAnnotation ) {
+      _textSpan = new DefaultTextSpan( jcasAnnotation.getBegin(), jcasAnnotation.getEnd() );
+      _text = jcasAnnotation.getCoveredText().toLowerCase();
+      if ( jcasAnnotation instanceof WordToken ) {
+         final String canonicalForm = ((WordToken)jcasAnnotation).getCanonicalForm();
+         // If canonical is not null AND not the same as the plain text then it is a valid variant for lookup
+         if ( canonicalForm != null && !canonicalForm.equals( _text ) ) {
+            _variant = canonicalForm;
+         }
+      }
+   }
+
+   /**
+    * @return a span with the start and end indices used for this lookup token
+    */
+   public TextSpan getTextSpan() {
+      return _textSpan;
+   }
+
+   /**
+    * @return the start index used for this lookup token
+    */
+   public int getStart() {
+      return _textSpan.getStart();
+   }
+
+   /**
+    * @return the end index used for this lookup token
+    */
+   public int getEnd() {
+      return _textSpan.getEnd();
+   }
+
+   /**
+    * @return the actual text in the document for the lookup token, in lowercase
+    */
+   public String getText() {
+      return _text;
+   }
+
+   /**
+    * @return possible canonical variant text for the lookup token, in lowercase, or null if none
+    */
+   public String getVariant() {
+      return _variant;
+   }
+
+   /**
+    * Two lookup tokens are equal iff the spans are equal.
+    * @param value -
+    * @return true if {@code value} is a {@code FastLookupToken} and has a span equal to this token's span
+    */
+   public boolean equals( final Object value ) {
+      return value != null && value instanceof FastLookupToken
+            && _textSpan.equals( ((FastLookupToken)value).getTextSpan() );
+   }
+
+   /**
+    * @return hashCode created from the Span
+    */
+   public int hashCode() {
+      return _textSpan.hashCode();
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,58 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.util;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 12/16/13
+ */
+final public class LookupUtil {
+
+   private LookupUtil() {}
+
+   /**
+    * Splits a string using a character.  Faster than String.split( regex )
+    * Empty substrings before or between delimiters are kept; one after a final delimiter is dropped.
+    * @param line full text to split
+    * @param c character at which to split
+    * @return array of substrings or the original line if there are no characters c
+    */
+   static public String[] fastSplit( final String line, final char c ) {
+      int nextSplit = line.indexOf( c );
+      if ( nextSplit < 0 ) {
+         return new String[]{line};
+      }
+      final List<String> splits = new ArrayList<String>();
+      int lastSplit = -1;
+      // >= 0, not > 0: a delimiter at index 0 is a real split point, only -1 means "no more delimiters"
+      while ( nextSplit >= 0 ) {
+         splits.add( line.substring( lastSplit+1, nextSplit ) );
+         lastSplit = nextSplit;
+         nextSplit = line.indexOf( c, lastSplit+1 );
+      }
+      if ( lastSplit+1 < line.length() ) {
+         splits.add( line.substring( lastSplit+1 ) );
+      }
+      return splits.toArray( new String[ splits.size() ] );
+   }
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,125 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.util;
+
+import org.apache.ctakes.typesystem.type.constants.CONST;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+
+/**
+ * Utility class to aid in the handling of semantic groups, semantic types, and tuis.
+ * Used most by the term consumers.
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 2/25/14
+ */
+final public class SemanticUtil {
+
+   private SemanticUtil() {}
+
+   // cTakes types
+   static private final String[] DRUG = { "T109", "T110", "T114", "T115", "T116", "T118", "T119",
+                                          "T121", "T122", "T123", "T124", "T125", "T126", "T127",
+                                          "T129", "T130", "T131", "T195", "T196", "T197", "T200", "T203" };
+   static private final String[] DISO = { "T019", "T020", "T037", "T047", "T048", "T049", "T050", "T190", "T191" };
+   static private final String[] FIND = { "T033", "T034", "T040", "T041", "T042", "T043", "T044", "T045", "T046",
+                                          "T056", "T057", "T184" };
+   static private final String[] PROC = { "T059", "T060", "T061" };
+   static private final String[] ANAT = { "T021","T022", "T023", "T024", "T025", "T026", "T029", "T030" };
+
+   // non-cTakes types
+   // cTakes ID 7.  What is Clinical Attribute?  Just the single [standard] type?
+   //   static private final String[] CLNQ = { "T201" };
+   // cTakes ID 8
+   //   static private final String[] DEVI = { "T203", "T074", "T075" };
+   // cTakes ID 9.  What is LAB?  T034 is cTakes FIND and [standard] PHEN (test result), T059 is cTakes and [standard] PROC
+   //   static private final String[] LABQ = { "T034", "T059" };
+   // cTakes ID 10
+   //   static private final String[] PHEN = { "T034", "T038", "T068", "T069", "T067", "T070" };
+
+
+   static private final Collection<String> ANAT_TUIS = new HashSet<String>( Arrays.asList( ANAT ) );
+   static private final Collection<String> DISO_TUIS = new HashSet<String>( Arrays.asList( DISO ) );
+   static private final Collection<String> FIND_TUIS = new HashSet<String>( Arrays.asList( FIND ) );
+   static private final Collection<String> PROC_TUIS = new HashSet<String>( Arrays.asList( PROC ) );
+   static private final Collection<String> DRUG_TUIS = new HashSet<String>( Arrays.asList( DRUG ) );
+
+
+   static public final String UNKNOWN_SEMANTIC_GROUP = "UNKNOWN_SEMANTIC_GROUP";
+   static public final String UNKNOWN_SEMANTIC_ZERO = "0";
+
+
+   /**
+    * cTakes IdentifiedAnnotation only accepts an integer as a typeId, which historically map to cTakes semantic groups
+    * @param entityType DRUG, DISO, FIND, PROC or ANAT in any letter case, or the text of an integer type id
+    * @return the integer value of the entity type or {@code CONST.NE_TYPE_ID_UNKNOWN} if none or improperly formed
+    */
+   static public int getSemanticGroupId( final String entityType ) {
+      if ( entityType == null || entityType.isEmpty() ) {
+         return CONST.NE_TYPE_ID_UNKNOWN;
+      }
+      final String[] groupNames = { "DRUG", "DISO", "FIND", "PROC", "ANAT" };
+      final int[] groupIds = { CONST.NE_TYPE_ID_DRUG, CONST.NE_TYPE_ID_DISORDER, CONST.NE_TYPE_ID_FINDING,
+                               CONST.NE_TYPE_ID_PROCEDURE, CONST.NE_TYPE_ID_ANATOMICAL_SITE };
+      for ( int i = 0; i < groupNames.length; i++ ) {
+         if ( groupNames[ i ].equalsIgnoreCase( entityType ) ) {
+            return groupIds[ i ];
+         }
+      }
+      try {
+         return Integer.parseInt( entityType );
+      } catch ( NumberFormatException nfe ) {
+         return CONST.NE_TYPE_ID_UNKNOWN;
+      }
+   }
+
+   /**
+    * Maps each tui of a comma-delimited list to its cTakes semantic group id.
+    * @param tuis a comma-delimited collection of tuis that apply to some annotation
+    * @return all cTakes groups for the given tuis; an unrecognized tui adds {@code CONST.NE_TYPE_ID_UNKNOWN}
+    */
+   static public Collection<Integer> getSemanticGroupIdFromTui( final String tuis ) {
+      final Collection<Integer> groupIds = new HashSet<Integer>( 1 );
+      for ( String tui : LookupUtil.fastSplit( tuis, ',' ) ) {
+         groupIds.add( getGroupIdForTui( tui ) );
+      }
+      return groupIds;
+   }
+
+   /**
+    * @return the cTakes group id whose tui set contains the given tui, else {@code CONST.NE_TYPE_ID_UNKNOWN}
+    */
+   static private int getGroupIdForTui( final String tui ) {
+      if ( ANAT_TUIS.contains( tui ) ) {
+         return CONST.NE_TYPE_ID_ANATOMICAL_SITE;
+      } else if ( DISO_TUIS.contains( tui ) ) {
+         return CONST.NE_TYPE_ID_DISORDER;
+      } else if ( FIND_TUIS.contains( tui ) ) {
+         return CONST.NE_TYPE_ID_FINDING;
+      } else if ( PROC_TUIS.contains( tui ) ) {
+         return CONST.NE_TYPE_ID_PROCEDURE;
+      } else if ( DRUG_TUIS.contains( tui ) ) {
+         return CONST.NE_TYPE_ID_DRUG;
+      }
+      return CONST.NE_TYPE_ID_UNKNOWN;
+   }
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message