ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1624032 [1/3] - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2: ae/ concept/ consumer/ dictionary/ relation/ term/ textspan/ util/ util/collection/
Date Wed, 10 Sep 2014 15:27:25 GMT
Author: seanfinan
Date: Wed Sep 10 15:27:24 2014
New Revision: 1624032

URL: http://svn.apache.org/r1624032
Log:
**** What follows has to do with some additions to dictionary lookup capabilities.
**** The features were requested by a third party, but the advantages are good enough to share.
**** Since this module is still -not used-, I feel ok making so many changes.
Updates to TermAnnotator classes to use primitive codes instead of String, decreasing memory requirements (and increasing hash speed)
Added Concept, which offloads tui requirements, adds lookup possibilities for preferred term and additional codes
Consumers updated to use primitives and store concepts in cas as Type System UmlsConcept
Dictionaries updated to use primitives
Terms updated to use primitives and offload tui to Concepts 
.getLength() added to TextSpan
Added CollectionMap, slightly different from what is in google.collection 
Added CuiCodeUtil and TuiCodeUtil to switch between primitive and String representations of cui and tui, taking into account a possibly missing C or T prefix
DictionarySpec updated to contain ConceptFactory in addition to Dictionary
**** JavaDocs still need to be updated ****

Added:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/AbstractConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/Concept.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptCode.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/CuiCodeUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/TuiCodeUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/collection/
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/collection/ArrayListMap.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/collection/CollectionMap.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/collection/HashSetMap.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/collection/ImmutableCollectionMap.java
Modified:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/AbstractJCasTermAnnotator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryFactory.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/JdbcRareWordDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/MemRareWordDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordDictionary.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/RareWordTermMapCreator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/relation/CuiRelationsJdbc.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/SpannedRareWordTerm.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/DefaultTextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/MultiTextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/textspan/TextSpan.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/DictionarySpec.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/FastLookupToken.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/LookupUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/SemanticUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/TokenMatchUtil.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/UmlsUserApprover.java

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/AbstractJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/AbstractJCasTermAnnotator.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/AbstractJCasTermAnnotator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/AbstractJCasTermAnnotator.java Wed Sep 10 15:27:24 2014
@@ -21,16 +21,16 @@ package org.apache.ctakes.dictionary.loo
 import org.apache.ctakes.core.fsm.token.NumberToken;
 import org.apache.ctakes.core.resource.FileResource;
 import org.apache.ctakes.core.util.JCasUtil;
-import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.concept.Concept;
+import org.apache.ctakes.dictionary.lookup2.concept.ConceptFactory;
 import org.apache.ctakes.dictionary.lookup2.dictionary.DictionaryDescriptorParser;
-import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
 import org.apache.ctakes.dictionary.lookup2.util.DictionarySpec;
 import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
-import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
-import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
-import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.HashSetMap;
+import org.apache.ctakes.typesystem.type.syntax.*;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
@@ -44,20 +44,12 @@ import org.apache.uima.resource.Resource
 import org.apache.uima.resource.ResourceInitializationException;
 
 import java.io.File;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
+import java.util.*;
 
 /**
  * Performs the basic initialization with uima context, including the parse of the dictionary specifications file.
  * Has a
- *
+ * <p/>
  * Author: SPF
  * Affiliation: CHIP-NLP
  * Date: 12/6/13
@@ -68,11 +60,17 @@ abstract public class AbstractJCasTermAn
    // LOG4J logger based on interface name
    final private Logger _logger = Logger.getLogger( "AbstractJCasTermAnnotator" );
 
-   /** specifies the type of window to use for lookup */
+   /**
+    * specifies the type of window to use for lookup
+    */
    static private final String WINDOW_ANNOT_PRP_KEY = "windowAnnotations";
-   /** optional part of speech tags for tokens that should not be used for lookup */
+   /**
+    * optional part of speech tags for tokens that should not be used for lookup
+    */
    static private final String EXC_TAGS_PRP_KEY = "exclusionTags";
-   /** optional minimum span for tokens that should not be used for lookup */
+   /**
+    * optional minimum span for tokens that should not be used for lookup
+    */
    static private final String MIN_SPAN_PRP_KEY = "minimumSpan";
 
    private DictionarySpec _dictionarySpec;
@@ -80,9 +78,9 @@ abstract public class AbstractJCasTermAn
    // type of lookup window to use, typically "LookupWindowAnnotation" or "Sentence"
    private int _lookupWindowType;
    // set of exclusion POS tags (lower cased), may be null
-   private final Set<String> _exclusionPartsOfSpeech = new HashSet<String>();
+   private final Set<String> _exclusionPartsOfSpeech = new HashSet<>();
    // minimum span required to use token for lookup
-   private int _minimumLookupSpan = 3;
+   protected int _minimumLookupSpan = 3;
 
    /**
     * {@inheritDoc}
@@ -102,7 +100,7 @@ abstract public class AbstractJCasTermAn
             for ( String tag : tagArr ) {
                _exclusionPartsOfSpeech.add( tag.toUpperCase() );
             }
-            final List<String> posList = new ArrayList<String>( _exclusionPartsOfSpeech );
+            final List<String> posList = new ArrayList<>( _exclusionPartsOfSpeech );
             Collections.sort( posList );
             final StringBuilder sb = new StringBuilder();
             for ( String pos : posList ) {
@@ -117,33 +115,37 @@ abstract public class AbstractJCasTermAn
             _minimumLookupSpan = parseInt( minimumSpan, MIN_SPAN_PRP_KEY, _minimumLookupSpan );
          }
          _logger.info( "Using minimum lookup token span: " + _minimumLookupSpan );
-         final FileResource fileResource = (FileResource) uimaContext.getResourceObject( DICTIONARY_DESCRIPTOR_KEY );
+         final FileResource fileResource = (FileResource)uimaContext.getResourceObject( DICTIONARY_DESCRIPTOR_KEY );
          final File descriptorFile = fileResource.getFile();
          _dictionarySpec = DictionaryDescriptorParser.parseDescriptor( descriptorFile, uimaContext );
-      } catch ( ResourceAccessException raE ) {
-         throw new ResourceInitializationException( raE );
-      } catch ( AnnotatorContextException acE ) {
-         throw new ResourceInitializationException( acE );
+      } catch ( ResourceAccessException | AnnotatorContextException multE ) {
+         throw new ResourceInitializationException( multE );
       }
    }
 
+
+
    /**
     * {@inheritDoc}
     */
    @Override
    public void process( final JCas jcas ) throws AnalysisEngineProcessException {
-      _logger.info( "Starting processing" );
+      _logger.debug( "Starting processing" );
       final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
       final AnnotationIndex annotationIndex = indexes.getAnnotationIndex( _lookupWindowType );
       if ( annotationIndex == null ) {  // I don't trust AnnotationIndex.size(), so don't check
          return;
       }
-      final Map<RareWordDictionary,Collection<SpannedRareWordTerm>> dictionaryTermsMap
-            = new HashMap<RareWordDictionary, Collection<SpannedRareWordTerm>>();
+      final Map<RareWordDictionary, CollectionMap<TextSpan, Long>> dictionaryTermsMap
+            = new HashMap<>( getDictionaries().size() );
+      for ( RareWordDictionary dictionary : getDictionaries() ) {
+         final CollectionMap<TextSpan, Long> textSpanCuis = new HashSetMap<>();
+         dictionaryTermsMap.put( dictionary, textSpanCuis );
+      }
       final Iterator windowIterator = annotationIndex.iterator();
       try {
          while ( windowIterator.hasNext() ) {
-            final Annotation window = (Annotation) windowIterator.next();
+            final Annotation window = (Annotation)windowIterator.next();
             if ( isWindowOk( window ) ) {
                processWindow( jcas, window, dictionaryTermsMap );
             }
@@ -153,12 +155,26 @@ abstract public class AbstractJCasTermAn
          _logger.warn( iobE.getMessage() );
       }
       // Let the consumer handle uniqueness and ordering - some may not care
-      for ( Map.Entry<RareWordDictionary, Collection<SpannedRareWordTerm>> entry : dictionaryTermsMap.entrySet() ) {
-         _dictionarySpec.getConsumer().consumeHits( jcas, entry.getKey(), entry.getValue() );
+      for ( Map.Entry<RareWordDictionary, CollectionMap<TextSpan, Long>> dictionaryCuis : dictionaryTermsMap.entrySet() ) {
+         final RareWordDictionary dictionary = dictionaryCuis.getKey();
+         final CollectionMap<TextSpan, Long> textSpanCuis = dictionaryCuis.getValue();
+         final Collection<Long> allDictionaryCuis = new HashSet<>();
+         for ( Collection<Long> cuiCodes : textSpanCuis.getAllCollections() ) {
+            allDictionaryCuis.addAll( cuiCodes );
+         }
+         final Collection<ConceptFactory> conceptFactories
+               = _dictionarySpec.getPairedConceptFactories( dictionary.getName() );
+         final CollectionMap<Long, Concept> allConceptsMap = new HashSetMap<>();
+         for ( ConceptFactory conceptFactory : conceptFactories ) {
+            final Map<Long, Concept> conceptMap = conceptFactory.createConcepts( allDictionaryCuis );
+            allConceptsMap.placeMap( conceptMap );
+         }
+         _dictionarySpec.getConsumer().consumeHits( jcas, dictionary, textSpanCuis, allConceptsMap );
       }
-      _logger.info( "Finished processing" );
+      _logger.debug( "Finished processing" );
    }
 
+
    /**
     * {@inheritDoc}
     */
@@ -175,79 +191,78 @@ abstract public class AbstractJCasTermAn
    public boolean isWindowOk( final Annotation window ) {
       final String coveredText = window.getCoveredText();
       return !coveredText.equals( "section id" )
-            && !coveredText.startsWith( "[start section id" )
-            && !coveredText.startsWith( "[end section id" );
+             && !coveredText.startsWith( "[start section id" )
+             && !coveredText.startsWith( "[end section id" );
    }
 
+
    /**
     * {@inheritDoc}
     */
    @Override
    public void processWindow( final JCas jcas, final Annotation window,
-                              final Map<RareWordDictionary, Collection<SpannedRareWordTerm>> dictionaryTermsMap ) {
-      final List<FastLookupToken> allTokens = new ArrayList<FastLookupToken>();
-      final List<Integer> lookupTokenIndices = new ArrayList<Integer>();
+                              final Map<RareWordDictionary, CollectionMap<TextSpan, Long>> dictionaryTerms ) {
+      final List<FastLookupToken> allTokens = new ArrayList<>();
+      final List<Integer> lookupTokenIndices = new ArrayList<>();
       getAnnotationsInWindow( jcas, window, allTokens, lookupTokenIndices );
-      findTerms( getDictionaries(), allTokens, lookupTokenIndices, dictionaryTermsMap );
+      findTerms( getDictionaries(), allTokens, lookupTokenIndices, dictionaryTerms );
    }
 
    /**
     * Given a set of dictionaries, tokens, and lookup token indices, populate a terms map with discovered terms
-    * @param dictionaries -
-    * @param allTokens    -
+    *
+    * @param dictionaries       -
+    * @param allTokens          -
     * @param lookupTokenIndices -
     * @param dictionaryTermsMap -
     */
    private void findTerms( final Collection<RareWordDictionary> dictionaries,
                            final List<FastLookupToken> allTokens, final List<Integer> lookupTokenIndices,
-                           final Map<RareWordDictionary, Collection<SpannedRareWordTerm>> dictionaryTermsMap ) {
-      Collection<SpannedRareWordTerm> termsFromDictionary;
+                           final Map<RareWordDictionary, CollectionMap<TextSpan, Long>> dictionaryTermsMap ) {
       for ( RareWordDictionary dictionary : dictionaries ) {
-         termsFromDictionary = dictionaryTermsMap.get( dictionary );
-         if ( termsFromDictionary == null ) {
-            termsFromDictionary = new ArrayList<SpannedRareWordTerm>();
-            dictionaryTermsMap.put( dictionary, termsFromDictionary );
-         }
+         CollectionMap<TextSpan, Long> termsFromDictionary = dictionaryTermsMap.get( dictionary );
          findTerms( dictionary, allTokens, lookupTokenIndices, termsFromDictionary );
       }
    }
 
    /**
     * Given a dictionary, tokens, and lookup token indices, populate a terms collection with discovered terms
-    * @param dictionary -
-    * @param allTokens  -
+    *
+    * @param dictionary          -
+    * @param allTokens           -
     * @param lookupTokenIndices  -
     * @param termsFromDictionary -
     */
    abstract void findTerms( RareWordDictionary dictionary,
-                            List<FastLookupToken> allTokens, List<Integer> lookupTokenIndices,
-                            Collection<SpannedRareWordTerm> termsFromDictionary );
+                            List<FastLookupToken> allTokens,
+                            List<Integer> lookupTokenIndices,
+                            CollectionMap<TextSpan, Long> termsFromDictionary );
 
 
    /**
     * For the given lookup window fills two collections with 1) All tokens in the window,
     * and 2) indexes of tokens in the window to be used for lookup
-    * @param jcas -
-    * @param window annotation lookup window
-    * @param allTokens filled with all tokens, including punctuation, etc.
+    *
+    * @param jcas               -
+    * @param window             annotation lookup window
+    * @param allTokens          filled with all tokens, including punctuation, etc.
     * @param lookupTokenIndices filled with indices of tokens to use for lookup
     */
    protected void getAnnotationsInWindow( final JCas jcas, final Annotation window,
-                                        final List<FastLookupToken> allTokens,
-                                        final List<Integer> lookupTokenIndices ) {
+                                          final List<FastLookupToken> allTokens,
+                                          final List<Integer> lookupTokenIndices ) {
       final List<BaseToken> allBaseTokens = org.uimafit.util.JCasUtil.selectCovered( jcas, BaseToken.class, window );
       for ( BaseToken baseToken : allBaseTokens ) {
          if ( baseToken instanceof NewlineToken ) {
             continue;
          }
          final boolean isNonLookup = baseToken instanceof PunctuationToken
-               || baseToken instanceof NumberToken
-               || baseToken instanceof ContractionToken
-               || baseToken instanceof SymbolToken;
+                                     || baseToken instanceof NumberToken
+                                     || baseToken instanceof ContractionToken
+                                     || baseToken instanceof SymbolToken;
          // We are only interested in tokens that are -words-
-         // getEnd() and getBegin() are both inclusive, so end - begin is actually text.length()-1
-         if ( !isNonLookup && baseToken.getEnd() - baseToken.getBegin() + 1 >= _minimumLookupSpan ) {
-           // POS exclusion logic for first word lookup
+         if ( !isNonLookup ) {
+            // POS exclusion logic for first word lookup
             final String partOfSpeech = baseToken.getPartOfSpeech();
             if ( partOfSpeech == null || !_exclusionPartsOfSpeech.contains( partOfSpeech ) ) {
                lookupTokenIndices.add( allTokens.size() );
@@ -259,7 +274,6 @@ abstract public class AbstractJCasTermAn
    }
 
 
-
    protected int parseInt( final Object value, final String name, final int defaultValue ) {
       if ( value instanceof Integer ) {
          return (Integer)value;

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java Wed Sep 10 15:27:24 2014
@@ -20,9 +20,11 @@ package org.apache.ctakes.dictionary.loo
 
 import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
 import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
-import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.textspan.DefaultTextSpan;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
 import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
 import org.apache.ctakes.dictionary.lookup2.util.TokenMatchUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
 
 import java.util.Collection;
 import java.util.List;
@@ -40,8 +42,9 @@ final public class DefaultJCasTermAnnota
     */
    @Override
    public void findTerms( final RareWordDictionary dictionary,
-                           final List<FastLookupToken> allTokens, final List<Integer> lookupTokenIndices,
-                           final Collection<SpannedRareWordTerm> termsFromDictionary ) {
+                          final List<FastLookupToken> allTokens,
+                          final List<Integer> lookupTokenIndices,
+                          final CollectionMap<TextSpan, Long> termsFromDictionary ) {
       Collection<RareWordTerm> rareWordHits;
       for ( Integer lookupTokenIndex : lookupTokenIndices ) {
          final FastLookupToken lookupToken = allTokens.get( lookupTokenIndex );
@@ -50,9 +53,12 @@ final public class DefaultJCasTermAnnota
             continue;
          }
          for ( RareWordTerm rareWordHit : rareWordHits ) {
+            if ( lookupToken.getLength() < _minimumLookupSpan ) {
+               continue;
+            }
             if ( rareWordHit.getTokenCount() == 1 ) {
                // Single word term, add and move on
-               termsFromDictionary.add( new SpannedRareWordTerm( rareWordHit, lookupToken.getTextSpan() ) );
+               termsFromDictionary.placeValue( lookupToken.getTextSpan(), rareWordHit.getCuiCode() );
                continue;
             }
             final int termStartIndex = lookupTokenIndex - rareWordHit.getRareWordIndex();
@@ -64,7 +70,7 @@ final public class DefaultJCasTermAnnota
             if ( TokenMatchUtil.isTermMatch( rareWordHit, allTokens, termStartIndex, termEndIndex ) ) {
                final int spanStart = allTokens.get( termStartIndex ).getStart();
                final int spanEnd = allTokens.get( termEndIndex ).getEnd();
-               termsFromDictionary.add( new SpannedRareWordTerm( rareWordHit, spanStart, spanEnd ) );
+               termsFromDictionary.placeValue( new DefaultTextSpan( spanStart, spanEnd ), rareWordHit.getCuiCode() );
             }
          }
       }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java Wed Sep 10 15:27:24 2014
@@ -19,11 +19,12 @@
 package org.apache.ctakes.dictionary.lookup2.ae;
 
 import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.textspan.DefaultTextSpan;
 import org.apache.ctakes.dictionary.lookup2.textspan.MultiTextSpan;
 import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
-import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
-import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
 import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
 import org.apache.uima.resource.ResourceInitializationException;
@@ -47,9 +48,13 @@ final public class OverlapJCasTermAnnota
    private int _consecutiveSkipMax = 2;
    private int _totalSkipMax = 4;
 
-   /** specifies the number of consecutive non-comma tokens that can be skipped */
+   /**
+    * specifies the number of consecutive non-comma tokens that can be skipped
+    */
    static private final String CONS_SKIP_PRP_KEY = "consecutiveSkips";
-   /** specifies the number of total tokens that can be skipped */
+   /**
+    * specifies the number of total tokens that can be skipped
+    */
    static private final String TOTAL_SKIP_PRP_KEY = "totalTokenSkips";
 
 
@@ -72,13 +77,15 @@ final public class OverlapJCasTermAnnota
       _logger.info( "Maximum tokens that can be skipped: " + _totalSkipMax );
    }
 
+
    /**
     * {@inheritDoc}
     */
    @Override
    public void findTerms( final RareWordDictionary dictionary,
-                          final List<FastLookupToken> allTokens, final List<Integer> lookupTokenIndices,
-                          final Collection<SpannedRareWordTerm> termsFromDictionary ) {
+                          final List<FastLookupToken> allTokens,
+                          final List<Integer> lookupTokenIndices,
+                          final CollectionMap<TextSpan, Long> termsFromDictionary ) {
       Collection<RareWordTerm> rareWordHits;
       for ( Integer lookupTokenIndex : lookupTokenIndices ) {
          final FastLookupToken lookupToken = allTokens.get( lookupTokenIndex );
@@ -87,9 +94,12 @@ final public class OverlapJCasTermAnnota
             continue;
          }
          for ( RareWordTerm rareWordHit : rareWordHits ) {
+            if ( lookupToken.getLength() < _minimumLookupSpan ) {
+               continue;
+            }
             if ( rareWordHit.getTokenCount() == 1 ) {
                // Single word term, add and move on
-               termsFromDictionary.add( new SpannedRareWordTerm( rareWordHit, lookupToken.getTextSpan() ) );
+               termsFromDictionary.placeValue( lookupToken.getTextSpan(), rareWordHit.getCuiCode() );
                continue;
             }
             final int termStartIndex = lookupTokenIndex - rareWordHit.getRareWordIndex();
@@ -97,37 +107,39 @@ final public class OverlapJCasTermAnnota
                // term will extend beyond window
                continue;
             }
-            final SpannedRareWordTerm overlapTerm = getOverlapTerm( allTokens, lookupTokenIndex, rareWordHit,
-                                                                    _consecutiveSkipMax, _totalSkipMax );
-            if ( overlapTerm != null ) {
-               termsFromDictionary.add( overlapTerm );
+            final TextSpan overlapSpan = getOverlapTerm( allTokens, lookupTokenIndex, rareWordHit,
+                  _consecutiveSkipMax, _totalSkipMax );
+            if ( overlapSpan != null ) {
+               termsFromDictionary.placeValue( overlapSpan, rareWordHit.getCuiCode() );
             }
          }
       }
    }
 
+
    /**
     * Check to see if a given term overlaps a set of tokens
-    * @param allTokens all tokens in a window
+    *
+    * @param allTokens        all tokens in a window
     * @param lookupTokenIndex index of rare word in the window of all tokens
-    * @param rareWordHit some possible term
+    * @param rareWordHit      some possible term
     * @return a spanned term that is in the window in some overlapping manner, or null
     */
-   static private SpannedRareWordTerm getOverlapTerm( final List<FastLookupToken> allTokens, final int lookupTokenIndex,
-                                                      final RareWordTerm rareWordHit,
-                                                      final int consecutiveSkipMax, final int totalSkipMax ) {
+   static private TextSpan getOverlapTerm( final List<FastLookupToken> allTokens, final int lookupTokenIndex,
+                                           final RareWordTerm rareWordHit,
+                                           final int consecutiveSkipMax, final int totalSkipMax ) {
       final String[] rareWordTokens = fastSplit( rareWordHit.getText(), rareWordHit.getTokenCount() );
-      final List<TextSpan> missingSpanKeys = new ArrayList<TextSpan>();
+      final List<TextSpan> missingSpanKeys = new ArrayList<>();
       int consecutiveSkips = 0;
       int totalSkips = 0;
       int firstWordIndex = -1;
       if ( rareWordHit.getRareWordIndex() == 0 ) {
          firstWordIndex = lookupTokenIndex;
       } else {
-         int nextRareWordIndex = rareWordHit.getRareWordIndex()-1;
-         for ( int allTokensIndex=lookupTokenIndex-1; allTokensIndex>=0; allTokensIndex-- ) {
+         int nextRareWordIndex = rareWordHit.getRareWordIndex() - 1;
+         for ( int allTokensIndex = lookupTokenIndex - 1; allTokensIndex >= 0; allTokensIndex-- ) {
             if ( rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getText() )
-                  || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
+                 || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
                nextRareWordIndex--;
                if ( nextRareWordIndex < 0 ) {
                   firstWordIndex = allTokensIndex;
@@ -149,19 +161,19 @@ final public class OverlapJCasTermAnnota
                break;
             }
          }
-         if  ( firstWordIndex == -1 ) {
+         if ( firstWordIndex == -1 ) {
             return null;
          }
       }
       int lastWordIndex = -1;
-      if ( rareWordHit.getRareWordIndex() == rareWordHit.getTokenCount()-1 ) {
+      if ( rareWordHit.getRareWordIndex() == rareWordHit.getTokenCount() - 1 ) {
          lastWordIndex = lookupTokenIndex;
       } else {
          consecutiveSkips = 0;
-         int nextRareWordIndex = rareWordHit.getRareWordIndex()+1;
-         for ( int allTokensIndex=lookupTokenIndex+1; allTokensIndex<allTokens.size(); allTokensIndex++ ) {
+         int nextRareWordIndex = rareWordHit.getRareWordIndex() + 1;
+         for ( int allTokensIndex = lookupTokenIndex + 1; allTokensIndex < allTokens.size(); allTokensIndex++ ) {
             if ( rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getText() )
-                  || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
+                 || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
                nextRareWordIndex++;
                if ( nextRareWordIndex >= rareWordHit.getTokenCount() ) {
                   lastWordIndex = allTokensIndex;
@@ -185,31 +197,27 @@ final public class OverlapJCasTermAnnota
          }
       }
       if ( missingSpanKeys.isEmpty() ) {
-         return new SpannedRareWordTerm( rareWordHit,
-                                         allTokens.get( firstWordIndex ).getStart(),
-                                         allTokens.get( lastWordIndex ).getEnd() );
-      }
-      final TextSpan discontiguousSpanKey = new MultiTextSpan( allTokens.get( firstWordIndex ).getStart(),
-                                                                     allTokens.get( lastWordIndex ).getEnd(),
-                                                                     missingSpanKeys );
-      return new SpannedRareWordTerm( rareWordHit, discontiguousSpanKey );
+         return new DefaultTextSpan( allTokens.get( firstWordIndex ).getStart(),
+               allTokens.get( lastWordIndex ).getEnd() );
+      }
+      return new MultiTextSpan( allTokens.get( firstWordIndex ).getStart(),
+            allTokens.get( lastWordIndex ).getEnd(), missingSpanKeys );
    }
 
 
-
    static private String[] fastSplit( final String line, final int tokenCount ) {
       final String[] tokens = new String[tokenCount];
       int tokenIndex = 0;
       int previousSpaceIndex = -1;
       int spaceIndex = line.indexOf( ' ' );
       while ( spaceIndex > 0 && tokenIndex < tokenCount ) {
-         tokens[tokenIndex] = line.substring( previousSpaceIndex+1, spaceIndex );
+         tokens[tokenIndex] = line.substring( previousSpaceIndex + 1, spaceIndex );
          tokenIndex++;
          previousSpaceIndex = spaceIndex;
-         spaceIndex = line.indexOf( ' ', previousSpaceIndex+1 );
+         spaceIndex = line.indexOf( ' ', previousSpaceIndex + 1 );
       }
-      if ( previousSpaceIndex+1 < line.length() ) {
-         tokens[tokenCount-1] = line.substring( previousSpaceIndex+1 );
+      if ( previousSpaceIndex + 1 < line.length() ) {
+         tokens[tokenCount - 1] = line.substring( previousSpaceIndex + 1 );
       }
       return tokens;
    }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java Wed Sep 10 15:27:24 2014
@@ -19,16 +19,16 @@
 package org.apache.ctakes.dictionary.lookup2.ae;
 
 import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
-import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.tcas.Annotation;
 
-import java.util.Collection;
 import java.util.Map;
 
 /**
  * Processes an Annotation window in the cas, adding discovered terms to a map.
- *
+ * <p/>
  * Author: SPF
  * Affiliation: CHIP-NLP
  * Date: 12/5/13
@@ -37,18 +37,23 @@ public interface WindowProcessor {
 
    /**
     * Some windows should be skipped entirely, such as "[section *]"
+    *
     * @param window annotation in which to search for terms
     * @return true if window should be processed, false if it should not
     */
    boolean isWindowOk( Annotation window );
 
+
    /**
     * Processes a window of annotations for dictionary terms
-    * @param jcas -
-    * @param window annotation in which to search for terms
-    * @param dictionaryTermsMap map of entity types and terms for those types in the window
+    *
+    * @param jcas              -
+    * @param window            annotation in which to search for terms
+    * @param dictionaryTerms map of entity types and terms for those types in the window
     */
-   void processWindow( JCas jcas, Annotation window,
-                       Map<RareWordDictionary, Collection<SpannedRareWordTerm>> dictionaryTermsMap );
+   void processWindow( JCas jcas,
+                       Annotation window,
+                       Map<RareWordDictionary, CollectionMap<TextSpan, Long>> dictionaryTerms );
+
 
 }

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/AbstractConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/AbstractConceptFactory.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/AbstractConceptFactory.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/AbstractConceptFactory.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,45 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Skeleton {@link ConceptFactory} that stores the factory name and builds concept maps
+ * by calling {@link #createConcept(Long)} once per cui code.
+ * <p/>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/4/2014
+ */
+abstract public class AbstractConceptFactory implements ConceptFactory {
+
+   final private String _name;
+
+   /**
+    * @param name simple name for this factory, returned unchanged by {@link #getName()}
+    */
+   public AbstractConceptFactory( final String name ) {
+      _name = name;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * Null and empty concepts are left out of the returned map, so a cui for which no
+    * additional information exists has no entry.
+    * {@inheritDoc}
+    */
+   @Override
+   public Map<Long, Concept> createConcepts( final Collection<Long> cuiCodes ) {
+      final Map<Long, Concept> concepts = new HashMap<>( cuiCodes.size() );
+      for ( Long code : cuiCodes ) {
+         final Concept candidate = createConcept( code );
+         if ( candidate == null || candidate.isEmpty() ) {
+            // nothing worth storing for this cui
+            continue;
+         }
+         concepts.put( code, candidate );
+      }
+      return concepts;
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/BsvConceptFactory.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,191 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.LookupUtil;
+import org.apache.ctakes.dictionary.lookup2.util.TuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.HashSetMap;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.*;
+
+import static org.apache.ctakes.dictionary.lookup2.concept.ConceptCode.TUI;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/9/2014
+ */
+final public class BsvConceptFactory implements ConceptFactory {
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvConceptFactory" );
+
+   static private final String BSV_FILE_PATH = "bsvPath";
+
+
+   final private ConceptFactory _delegateFactory;
+
+
+   /**
+    * @param name        simple name for this factory
+    * @param uimaContext not used by this constructor
+    * @param properties  supplies the path of the bsv file under the key "bsvPath"
+    */
+   public BsvConceptFactory( final String name, final UimaContext uimaContext, final Properties properties ) {
+      this( name, properties.getProperty( BSV_FILE_PATH ) );
+   }
+
+   /**
+    * @param name        simple name for this factory
+    * @param bsvFilePath path of the bsv file that contains the concept rows
+    */
+   public BsvConceptFactory( final String name,  final String bsvFilePath ) {
+      this( name, new File( bsvFilePath ) );
+   }
+
+   /**
+    * Parses the bsv file and hands the resulting concepts to an in-memory delegate factory.
+    * Each parsed row becomes one Concept that holds the row's cui, preferred term and single tui,
+    * stored under the row's cui code.
+    * NOTE(review): rows are stored with Map.put, so a later row with the same cui replaces the earlier one
+    * and the earlier tui is dropped - confirm that one row per cui is the expected file layout.
+    *
+    * @param name    simple name for this factory
+    * @param bsvFile bsv file that contains the concept rows
+    */
+   public BsvConceptFactory( final String name, final File bsvFile ) {
+      final Collection<CuiTuiTerm> parsedTerms = parseBsvFile( bsvFile );
+      final Map<Long,Concept> concepts = new HashMap<>( parsedTerms.size() );
+      for ( CuiTuiTerm parsedTerm : parsedTerms ) {
+         final String cui = parsedTerm.getCui();
+         final CollectionMap<ConceptCode,String> tuiCodes = new HashSetMap<>( 1 );
+         tuiCodes.placeValue( ConceptCode.TUI, TuiCodeUtil.getAsTui( parsedTerm.getTui() ) );
+         final Long cuiCode = CuiCodeUtil.getCuiCode( cui );
+         concepts.put( cuiCode, new Concept( cui, parsedTerm.getPrefTerm(), tuiCodes ) );
+      }
+      _delegateFactory = new MemConceptFactory( name, concepts );
+   }
+
+
+   /**
+    * {@inheritDoc}
+    *
+    * @return the name held by the in-memory factory to which this factory delegates
+    */
+   @Override
+   public String getName() {
+      return _delegateFactory.getName();
+   }
+
+   /**
+    * Delegates to the in-memory factory that was filled from the bsv file.
+    * {@inheritDoc}
+    */
+   @Override
+   public Concept createConcept( final Long cuiCode )  {
+      return _delegateFactory.createConcept( cuiCode );
+   }
+
+   /**
+    * Delegates to the in-memory factory that was filled from the bsv file.
+    * {@inheritDoc}
+    */
+   @Override
+   public Map<Long, Concept> createConcepts( final Collection<Long> cuiCodes ) {
+      return _delegateFactory.createConcepts( cuiCodes );
+   }
+
+
+   /**
+    * Create a collection of {@link CuiTuiTerm} Objects by parsing a bsv file.
+    * The file can be in one of three columnar formats:
+    * <p>
+    * CUI|TUI
+    * </p>
+    * or
+    * <p>
+    * CUI|TUI|Text
+    * </p>
+    * or
+    * <p>
+    * CUI|TUI|Text|PreferredTerm
+    * </p>
+    * Lines that start with "//" or "#" are treated as comments and skipped.
+    * A line that cannot be turned into a term is reported with a warning and skipped.
+    * If reading fails, the error is logged and the terms that were read before the failure are returned.
+    *
+    * @param bsvFile file containing term rows and bsv columns
+    * @return collection of all valid terms read from the bsv file
+    */
+   static private Collection<CuiTuiTerm> parseBsvFile( final File bsvFile ) {
+      final Collection<CuiTuiTerm> cuiTuiTerms = new ArrayList<>();
+      // try-with-resources closes the reader even when reading fails part way through the file
+      try ( BufferedReader reader = new BufferedReader( new FileReader( bsvFile ) ) ) {
+         // Reading in the loop header guarantees that every pass advances to the next line,
+         // including the passes that skip a comment line
+         for ( String line = reader.readLine(); line != null; line = reader.readLine() ) {
+            if ( line.startsWith( "//" ) || line.startsWith( "#" ) ) {
+               continue;
+            }
+            final String[] columns = LookupUtil.fastSplit( line, '|' );
+            final CuiTuiTerm cuiTuiTerm = createCuiTuiTerm( columns );
+            if ( cuiTuiTerm != null ) {
+               // Add to the dictionary
+               cuiTuiTerms.add( cuiTuiTerm );
+            } else {
+               LOGGER.warn( "Bad BSV line " + line + " in " + bsvFile.getPath() );
+            }
+         }
+      } catch ( IOException ioE ) {
+         // Pass the exception itself so that the stack trace is not lost
+         LOGGER.error( ioE.getMessage(), ioE );
+      }
+      return cuiTuiTerms;
+   }
+
+   /**
+    * Builds a term from the columns of one bsv row.  The first column is the CUI and the second is the TUI.
+    * When there are four or more columns the fourth is the preferred term text,
+    * otherwise the preferred term is the empty string.  All stored values are trimmed.
+    *
+    * @param columns columns of one row: CUI|TUI, CUI|TUI|Text or CUI|TUI|Text|PreferredTerm
+    * @return a term created from the columns, or null if there are fewer than two columns
+    * or if the CUI or TUI column is blank
+    */
+   static private CuiTuiTerm createCuiTuiTerm( final String[] columns ) {
+      if ( columns.length < 2 ) {
+         return null;
+      }
+      final int cuiIndex = 0;
+      final int tuiIndex = 1;
+      // third column is text, fourth column is preferred term text
+      final int termIndex = (columns.length >= 4) ? 3 : -1;
+      // Trim once so that the values that are validated are the values that are stored
+      final String cui = columns[cuiIndex].trim();
+      final String tui = columns[tuiIndex].trim();
+      if ( cui.isEmpty() || tui.isEmpty() ) {
+         return null;
+      }
+      // NOTE(review): a "T000" default for a blank TUI used to follow this guard, where it could never be reached.
+      // Rows with a blank TUI are still rejected; confirm whether they should default to an unknown tui instead
+      final String term = (termIndex < 0) ? "" : columns[termIndex].trim();
+      return new CuiTuiTerm( cui, tui, term );
+   }
+
+   /**
+    * Immutable value holding the cui, tui and preferred term text parsed from one bsv row.
+    * Equality uses all three values; the hash code is computed once, at construction, from the same three values.
+    */
+   static public class CuiTuiTerm {
+
+      final private String __cui;
+      final private String __tui;
+      final private String __prefTerm;
+      final private int __hashcode;
+
+      /**
+       * @param cui           concept unique identifier text
+       * @param tui           type unique identifier text
+       * @param preferredTerm preferred term text
+       */
+      public CuiTuiTerm( final String cui, final String tui, final String preferredTerm ) {
+         __cui = cui;
+         __tui = tui;
+         __prefTerm = preferredTerm;
+         __hashcode = (__cui + "_" + __tui + "_" + __prefTerm).hashCode();
+      }
+
+      public String getCui() {
+         return __cui;
+      }
+
+      public String getTui() {
+         return __tui;
+      }
+
+      public String getPrefTerm() {
+         return __prefTerm;
+      }
+
+      /**
+       * @param value object to compare
+       * @return true if value is a CuiTuiTerm with equal cui, tui and preferred term
+       */
+      @Override
+      public boolean equals( final Object value ) {
+         if ( !(value instanceof CuiTuiTerm) ) {
+            return false;
+         }
+         final CuiTuiTerm other = (CuiTuiTerm)value;
+         // Objects.equals tolerates null fields, which the public constructor does not forbid
+         // and which the hash code computation already accepts
+         return Objects.equals( __prefTerm, other.__prefTerm )
+                && Objects.equals( __tui, other.__tui )
+                && Objects.equals( __cui, other.__cui );
+      }
+
+      @Override
+      public int hashCode() {
+         return __hashcode;
+      }
+   }
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/Concept.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/Concept.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/Concept.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/Concept.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,78 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+import org.apache.ctakes.dictionary.lookup2.util.SemanticUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.HashSetMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.ImmutableCollectionMap;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+
+import javax.annotation.concurrent.Immutable;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+
+/**
+ * Holder for what is known about one cui: its preferred text, its codes grouped by {@link ConceptCode},
+ * and the ctakes semantic group ids derived from its tuis.
+ * <p/>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+@Immutable
+final public class Concept {
+
+   static public final String PREFERRED_TERM_UNKNOWN = "Unknown Preferred Term";
+
+   final private String _cui;
+   final private String _preferredText;
+   final private CollectionMap<ConceptCode, String> _codes;
+   final private Collection<Integer> _ctakesSemantics;
+
+   /**
+    * Creates a concept with empty preferred text and no codes.
+    *
+    * @param cui concept unique identifier text
+    */
+   public Concept( final String cui ) {
+      this( cui, "" );
+   }
+
+   /**
+    * Creates a concept with no codes.
+    *
+    * @param cui           concept unique identifier text
+    * @param preferredText preferred term text for the cui
+    */
+   public Concept( final String cui, final String preferredText ) {
+      this( cui, preferredText, new HashSetMap<ConceptCode, String>( 0 ) );
+   }
+
+   /**
+    * The semantic group ids are derived here, once, from the TUI codes.
+    * When no group id is obtained the unknown type id is used instead.
+    *
+    * @param cui           concept unique identifier text
+    * @param preferredText preferred term text for the cui, may be null
+    * @param codes         codes for the cui, keyed by kind of code
+    */
+   public Concept( final String cui, final String preferredText, final CollectionMap<ConceptCode, String> codes ) {
+      _cui = cui;
+      _preferredText = preferredText;
+      _codes = new ImmutableCollectionMap<>( codes );
+      final Collection<Integer> semanticGroups = new HashSet<>();
+      // Attempt to obtain one or more valid type ids from the tuis of the term
+      for ( String tui : getCodes( ConceptCode.TUI ) ) {
+         semanticGroups.add( SemanticUtil.getTuiSemanticGroupId( tui ) );
+      }
+      if ( semanticGroups.isEmpty() ) {
+         semanticGroups.add( CONST.NE_TYPE_ID_UNKNOWN );
+      }
+      _ctakesSemantics = Collections.unmodifiableCollection( semanticGroups );
+   }
+
+   public String getCui() {
+      return _cui;
+   }
+
+   /**
+    * @return the preferred text, or {@link #PREFERRED_TERM_UNKNOWN} if the preferred text is null
+    */
+   public String getPreferredText() {
+      return (_preferredText == null) ? PREFERRED_TERM_UNKNOWN : _preferredText;
+   }
+
+   /**
+    * @param codeType kind of code wanted
+    * @return the codes of the given kind, as provided by the code map
+    */
+   public Collection<String> getCodes( final ConceptCode codeType ) {
+      return _codes.getCollection( codeType );
+   }
+
+   /**
+    * @return the type of term that exists in the dictionary: Anatomical Site, Disease/Disorder, Drug, etc.
+    */
+   public Collection<Integer> getCtakesSemantics() {
+      return _ctakesSemantics;
+   }
+
+   /**
+    * @return true if there is neither preferred text (null or empty) nor any code
+    */
+   public boolean isEmpty() {
+      final boolean hasPreferredText = _preferredText != null && !_preferredText.isEmpty();
+      return !hasPreferredText && _codes.isEmpty();
+   }
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptCode.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptCode.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptCode.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptCode.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,12 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+/**
+ * Kinds of code that can be stored for a {@link Concept}; used as the key of the concept's code map.
+ * <p/>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+public enum ConceptCode {
+
+   TUI,
+   SNOMEDCT,
+   RXNORM,
+   ICD9CM,
+   ICD10PCS
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptFactory.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptFactory.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/ConceptFactory.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,35 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Term Attribute Repository used to lookup term attributes by the cui of the term
+ * <p/>
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+public interface ConceptFactory {
+
+   /**
+    * @return simple name for this concept factory
+    */
+   String getName();
+
+   /**
+    * @param cuiCode concept unique identifier, as a numeric code
+    * @return the information about the concept that exists in the repository.
+    */
+   Concept createConcept( Long cuiCode );
+
+   /**
+    * @param cuiCodes concept unique identifiers, as numeric codes
+    * @return the information about the concepts that exist in the repository, keyed by cui code.
+    */
+   Map<Long, Concept> createConcepts( Collection<Long> cuiCodes );
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/JdbcConceptFactory.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,220 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.TuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.HashSetMap;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+
+import java.sql.*;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Properties;
+
+import static org.apache.ctakes.dictionary.lookup2.concept.ConceptCode.*;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+public class JdbcConceptFactory extends AbstractConceptFactory {
+
+
+   // LOG4J logger based on class name
+   final private Logger _logger = Logger.getLogger( getClass().getName() );
+
+
+   // TODO move to Constants class
+   static private final String JDBC_DRIVER = "jdbcDriver";
+   static private final String JDBC_URL = "jdbcUrl";
+   static private final String JDBC_USER = "jdbcUser";
+   static private final String JDBC_PASS = "jdbcPass";
+   static private final String TUI_TABLE = "tuiTable";
+   static private final String PREF_TERM_TABLE = "prefTermTable";
+   static private final String SNOMED_TABLE = "snomedTable";
+   static private final String RXNORM_TABLE = "rxnormTable";
+   static private final String ICD9_TABLE = "icd9Table";
+   static private final String ICD10_TABLE = "icd10Table";
+
+
+   final private Connection _connection;
+   private PreparedStatement _selectTuiCall;
+   private PreparedStatement _selectPrefTermCall;
+   private PreparedStatement _selectSnomedCall;
+   private PreparedStatement _selectRxNormCall;
+   private PreparedStatement _selectIcd9Call;
+   private PreparedStatement _selectIcd10Call;
+
+
+   /**
+    * Reads the connection settings and table names from properties and passes them to the full constructor.
+    * The keys read are "jdbcDriver", "jdbcUrl", "jdbcUser", "jdbcPass", "tuiTable", "prefTermTable",
+    * "snomedTable", "rxnormTable", "icd9Table" and "icd10Table".
+    *
+    * @param name        simple name for this factory
+    * @param uimaContext not used by this constructor
+    * @param properties  source of the connection settings and table names
+    * @throws ClassNotFoundException if thrown by the full constructor
+    * @throws InstantiationException if thrown by the full constructor
+    * @throws IllegalAccessException if thrown by the full constructor
+    */
+   public JdbcConceptFactory( final String name, final UimaContext uimaContext, final Properties properties )
+         throws ClassNotFoundException, InstantiationException, IllegalAccessException {
+      this( name,
+            properties.getProperty( JDBC_DRIVER ), properties.getProperty( JDBC_URL ),
+            properties.getProperty( JDBC_USER ), properties.getProperty( JDBC_PASS ),
+            properties.getProperty( TUI_TABLE ), properties.getProperty( PREF_TERM_TABLE ),
+            properties.getProperty( SNOMED_TABLE ), properties.getProperty( RXNORM_TABLE ),
+            properties.getProperty( ICD9_TABLE ), properties.getProperty( ICD10_TABLE ) );
+   }
+
+   /**
+    * Registers the jdbc driver, opens the connection and prepares one select statement per named table.
+    * A table name that is null, empty or "null" gets no statement, and the matching lookup is skipped
+    * when concepts are created.
+    *
+    * @param name         simple name for this factory
+    * @param jdbcDriver   class name of the jdbc driver
+    * @param jdbcUrl      url of the database
+    * @param jdbcUser     database user
+    * @param jdbcPass     database password
+    * @param tuiName      name of the table with tuis
+    * @param prefTermName name of the table with preferred terms
+    * @param snomedName   name of the table with snomed codes
+    * @param rxnormName   name of the table with rxnorm codes
+    * @param icd9Name     name of the table with icd9 codes
+    * @param icd10Name    name of the table with icd10 codes
+    * @throws ClassNotFoundException if the driver class cannot be found
+    * @throws InstantiationException if the driver cannot be instantiated or registered,
+    *                                or if the connection cannot be created
+    * @throws IllegalAccessException if the driver class or its constructor is not accessible
+    */
+   public JdbcConceptFactory( final String name,
+                              final String jdbcDriver, final String jdbcUrl,
+                              final String jdbcUser, final String jdbcPass,
+                              final String tuiName, final String prefTermName,
+                              final String snomedName, final String rxnormName,
+                              final String icd9Name, final String icd10Name )
+         throws ClassNotFoundException, InstantiationException, IllegalAccessException {
+      super( name );
+      try {
+         final Driver driver = (Driver)Class.forName( jdbcDriver ).newInstance();
+         DriverManager.registerDriver( driver );
+      } catch ( SQLException sqlE ) {
+         _logger.error( "Could not register Driver " + jdbcDriver, sqlE );
+         throw new InstantiationException( "Could not register Driver " + jdbcDriver );
+      } catch ( ClassNotFoundException | InstantiationException | IllegalAccessException multE ) {
+         _logger.error( "Could not create Driver " + jdbcDriver, multE );
+         throw multE;
+      }
+      // A local is used so that the final field is assigned exactly once, after the connection attempt
+      Connection connection = null;
+      try {
+         connection = DriverManager.getConnection( jdbcUrl, jdbcUser, jdbcPass );
+      } catch ( SQLException sqlE ) {
+         _logger.error( "Could not create Connection with " + jdbcUrl + " as " + jdbcUser, sqlE );
+         throw new InstantiationException( "Could not create Connection with " + jdbcUrl + " as " + jdbcUser );
+      }
+      _connection = connection;
+      try {
+         _selectTuiCall = createSelectCall( tuiName );
+         _selectPrefTermCall = createSelectCall( prefTermName );
+         _selectSnomedCall = createSelectCall( snomedName );
+         _selectRxNormCall = createSelectCall( rxnormName );
+         _selectIcd9Call = createSelectCall( icd9Name );
+         _selectIcd10Call = createSelectCall( icd10Name );
+      } catch ( SQLException sqlE ) {
+         // Best effort: the failure is only logged.  The statement that failed and every statement after it
+         // stay null, so construction succeeds with those lookups unavailable
+         _logger.error( "Could not create Concept Data Selection Call", sqlE );
+      }
+   }
+
+
+   /**
+    * Queries every table for which a select statement exists and collects the results in one Concept.
+    * The preferred term is null when there is no preferred term statement.
+    * {@inheritDoc}
+    */
+   @Override
+   public Concept createConcept( final Long cuiCode ) {
+      String preferredTerm = null;
+      if ( _selectPrefTermCall != null ) {
+         preferredTerm = getPreferredTerm( cuiCode );
+      }
+      final CollectionMap<ConceptCode, String> conceptCodes = new HashSetMap<>( 5 );
+      if ( _selectTuiCall != null ) {
+         conceptCodes.addAllValues( TUI, getTuis( cuiCode ) );
+      }
+      if ( _selectSnomedCall != null ) {
+         conceptCodes.addAllValues( SNOMEDCT, getLongCodes( _selectSnomedCall, cuiCode ) );
+      }
+      if ( _selectRxNormCall != null ) {
+         conceptCodes.addAllValues( RXNORM, getLongCodes( _selectRxNormCall, cuiCode ) );
+      }
+      if ( _selectIcd9Call != null ) {
+         conceptCodes.addAllValues( ICD9CM, getStringCodes( _selectIcd9Call, cuiCode ) );
+      }
+      if ( _selectIcd10Call != null ) {
+         conceptCodes.addAllValues( ICD10PCS, getStringCodes( _selectIcd10Call, cuiCode ) );
+      }
+      final String cui = CuiCodeUtil.getAsCui( cuiCode );
+      return new Concept( cui, preferredTerm, conceptCodes );
+   }
+
+
+   /**
+    * Looks up the tuis of a cui with the tui select statement.
+    *
+    * @param cuiCode cui code bound to the CUI parameter of the statement
+    * @return tuis built from the int in the second column of each row found.
+    * If the query fails the error is logged and the tuis read before the failure are returned
+    */
+   private Collection<String> getTuis( final Long cuiCode ) {
+      final Collection<String> tuis = new HashSet<>();
+      try {
+         fillSelectCall( _selectTuiCall, cuiCode );
+         // Drivers cannot be relied upon to close result sets automatically.
+         // try-with-resources closes the ResultSet on every path, including when reading a row throws
+         try ( ResultSet resultSet = _selectTuiCall.executeQuery() ) {
+            while ( resultSet.next() ) {
+               tuis.add( TuiCodeUtil.getAsTui( resultSet.getInt( 2 ) ) );
+            }
+         }
+      } catch ( SQLException e ) {
+         // Pass the exception itself so that the stack trace is not lost
+         _logger.error( e.getMessage(), e );
+      }
+      return tuis;
+   }
+
+   /**
+    * Looks up the preferred term of a cui with the preferred term select statement.
+    *
+    * @param cuiCode cui code bound to the CUI parameter of the statement
+    * @return the string in the second column of the first row found,
+    * or an empty string if there is no row or if the query fails (the error is logged)
+    */
+   private String getPreferredTerm( final Long cuiCode ) {
+      String preferredName = "";
+      try {
+         fillSelectCall( _selectPrefTermCall, cuiCode );
+         // Drivers cannot be relied upon to close result sets automatically.
+         // try-with-resources closes the ResultSet on every path, including when reading a row throws
+         try ( ResultSet resultSet = _selectPrefTermCall.executeQuery() ) {
+            if ( resultSet.next() ) {
+               preferredName = resultSet.getString( 2 );
+            }
+         }
+      } catch ( SQLException e ) {
+         // Pass the exception itself so that the stack trace is not lost
+         _logger.error( e.getMessage(), e );
+      }
+      return preferredName;
+   }
+
+
+   /**
+    * Looks up codes that are stored as numbers and returns them as text.
+    *
+    * @param selectCall statement that selects the rows of one code table by cui
+    * @param cuiCode    cui code bound to the CUI parameter of the statement
+    * @return the long in the second column of each row found, converted to a string.
+    * If the query fails the error is logged and the codes read before the failure are returned
+    */
+   private Collection<String> getLongCodes( PreparedStatement selectCall, final Long cuiCode ) {
+      final Collection<String> codes = new HashSet<>();
+      try {
+         fillSelectCall( selectCall, cuiCode );
+         // Drivers cannot be relied upon to close result sets automatically.
+         // try-with-resources closes the ResultSet on every path, including when reading a row throws
+         try ( ResultSet resultSet = selectCall.executeQuery() ) {
+            while ( resultSet.next() ) {
+               codes.add( Long.toString( resultSet.getLong( 2 ) ) );
+            }
+         }
+      } catch ( SQLException e ) {
+         // Pass the exception itself so that the stack trace is not lost
+         _logger.error( e.getMessage(), e );
+      }
+      return codes;
+   }
+
+
+   /**
+    * Looks up codes that are stored as text.
+    *
+    * @param selectCall statement that selects the rows of one code table by cui
+    * @param cuiCode    cui code bound to the CUI parameter of the statement
+    * @return the string in the second column of each row found.
+    * If the query fails the error is logged and the codes read before the failure are returned
+    */
+   private Collection<String> getStringCodes( PreparedStatement selectCall, final Long cuiCode ) {
+      final Collection<String> codes = new HashSet<>();
+      try {
+         fillSelectCall( selectCall, cuiCode );
+         // Drivers cannot be relied upon to close result sets automatically.
+         // try-with-resources closes the ResultSet on every path, including when reading a row throws
+         try ( ResultSet resultSet = selectCall.executeQuery() ) {
+            while ( resultSet.next() ) {
+               codes.add( resultSet.getString( 2 ) );
+            }
+         }
+      } catch ( SQLException e ) {
+         // Pass the exception itself so that the stack trace is not lost
+         _logger.error( e.getMessage(), e );
+      }
+      return codes;
+   }
+
+   /**
+    * Prepares the statement that selects all rows of a table for one cui.
+    *
+    * @param tableName name of the table to query
+    * @return the prepared statement, or null if the table name is null, empty or "null" in any letter case
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   private PreparedStatement createSelectCall( final String tableName ) throws SQLException {
+      final boolean hasNoTable = tableName == null
+                                 || tableName.isEmpty()
+                                 || tableName.equalsIgnoreCase( "null" );
+      if ( hasNoTable ) {
+         return null;
+      }
+      // NOTE(review): the table name is concatenated into the sql because an identifier cannot be bound as a
+      // parameter; it is assumed to come from trusted configuration - confirm before accepting other sources
+      return _connection.prepareStatement( "SELECT * FROM " + tableName + " WHERE CUI = ?" );
+   }
+
+
+   /**
+    * Clears the parameters of a select statement and binds the cui code as its first parameter.
+    *
+    * @param selectCall statement to prepare for execution
+    * @param cuiCode    cui code to bind; it is unboxed, so it must not be null
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   private void fillSelectCall( final PreparedStatement selectCall, final Long cuiCode ) throws SQLException {
+      selectCall.clearParameters();
+      selectCall.setLong( 1, cuiCode );
+   }
+
+
+
+}

Added: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java?rev=1624032&view=auto
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java (added)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/concept/MemConceptFactory.java Wed Sep 10 15:27:24 2014
@@ -0,0 +1,36 @@
+package org.apache.ctakes.dictionary.lookup2.concept;
+
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+
+import java.util.Map;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 9/9/2014
+ */
+final public class MemConceptFactory extends AbstractConceptFactory {
+
+   // Map of cui codes to Concepts, kept by reference from the constructor.  createConcept(..) adds an entry for any cui code not yet mapped
+   final private Map<Long, Concept> _conceptMap;
+
+
+   public MemConceptFactory( final String name, final Map<Long, Concept> conceptMap ) {
+      super( name );
+      _conceptMap = conceptMap;
+   }
+
+   /**
+    * {@inheritDoc}  Returns the Concept mapped to cuiCode; when none is mapped a new Concept is created from the cui, stored in the map and returned
+    */
+   @Override
+   public Concept createConcept( final Long cuiCode ) {
+      Concept concept = _conceptMap.get( cuiCode );
+      if ( concept == null ) {
+         concept = new Concept( CuiCodeUtil.getAsCui( cuiCode ) );
+         _conceptMap.put( cuiCode, concept );
+      }
+      return concept;
+   }
+
+}

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java Wed Sep 10 15:27:24 2014
@@ -18,18 +18,16 @@
  */
 package org.apache.ctakes.dictionary.lookup2.consumer;
 
+import org.apache.ctakes.dictionary.lookup2.concept.Concept;
 import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
 import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
-import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
-import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
-import org.apache.ctakes.dictionary.lookup2.util.SemanticUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
+import org.apache.ctakes.dictionary.lookup2.util.collection.HashSetMap;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
 
-import java.util.ArrayList;
 import java.util.Collection;
-import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Map;
 import java.util.Properties;
@@ -49,91 +47,71 @@ abstract public class AbstractTermConsum
       _codingScheme = properties.getProperty( CODING_SCHEME_PRP_KEY );
    }
 
+
+
    /**
-    *
-    * @param jcas -
-    * @param codingScheme -
-    * @param typeId  cTakes IdentifiedAnnotation only accepts an integer as a typeId
-    * @param lookupHitMap map of spans to terms for those spans
+    * @param jcas           cas handed on unchanged from consumeHits(..)
+    * @param codingScheme   coding scheme, obtained by consumeHits(..) from getCodingScheme()
+    * @param cTakesSemantic cTakes semantic used as type id; cTakes IdentifiedAnnotation only accepts an integer as a typeId
+    * @param textSpanCuis   map of text spans to the cui codes for those spans that have the given cTakesSemantic
+    * @param cuiConcepts    map of cui codes to the concepts for those cuis
     * @throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
     */
-   abstract protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
-                                              final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+   abstract protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int cTakesSemantic,
+                                              final CollectionMap<TextSpan, Long> textSpanCuis,
+                                              final CollectionMap<Long, Concept> cuiConcepts )
          throws AnalysisEngineProcessException;
 
-
    /**
     * {@inheritDoc}
     */
    @Override
-   public void consumeHits( final JCas jcas, final RareWordDictionary dictionary,
-                            final Collection<SpannedRareWordTerm> dictionaryTerms )
+   public void consumeHits( final JCas jcas,
+                            final RareWordDictionary dictionary,
+                            final CollectionMap<TextSpan, Long> textSpanCuis,
+                            final CollectionMap<Long, Concept> cuiConcepts )
          throws AnalysisEngineProcessException {
       final String codingScheme = getCodingScheme();
-      final String entityType = dictionary.getSemanticGroup();
-      if ( entityType.equals( SemanticUtil.UNKNOWN_SEMANTIC_GROUP )
-            || entityType.equals( SemanticUtil.UNKNOWN_SEMANTIC_ZERO ) ) {
-         // The dictionary may have more than one type, create a map of types to terms and use them all
-         final Map<Integer,Collection<SpannedRareWordTerm>> typeIdLookupHitMap
-               = createTypeIdLookupHitMap( dictionaryTerms );
-         for ( Map.Entry<Integer,Collection<SpannedRareWordTerm>> typeIdLookupHits : typeIdLookupHitMap.entrySet() ) {
-            final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap = createLookupHitMap( typeIdLookupHits.getValue() );
-            consumeTypeIdHits( jcas, codingScheme, typeIdLookupHits.getKey(), lookupHitMap );
+      final Collection<Integer> usedcTakesSemantics = getUsedcTakesSemantics( cuiConcepts );
+      // The concepts may have more than one cTakes semantic; for each, gather the span cuis whose concepts have it and consume them
+      for ( Integer cTakesSemantic : usedcTakesSemantics ) {
+         final CollectionMap<TextSpan, Long> semanticCuis = new HashSetMap<>();
+         for ( Map.Entry<TextSpan, Collection<Long>> spanCuis : textSpanCuis ) {
+            for ( Long cuiCode : spanCuis.getValue() ) {
+               final Collection<Concept> concepts = cuiConcepts.getCollection( cuiCode );
+               if ( hascTakesSemantic( cTakesSemantic, concepts ) ) {
+                  semanticCuis.placeValue( spanCuis.getKey(), cuiCode );
+               }
+            }
          }
-         return;
+         consumeTypeIdHits( jcas, codingScheme, cTakesSemantic, semanticCuis, cuiConcepts );
       }
-      // The dictionary has one type, consume all using that type id
-      final int typeId = SemanticUtil.getSemanticGroupId( entityType );
-      final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap = createLookupHitMap( dictionaryTerms );
-      consumeTypeIdHits( jcas, codingScheme, typeId, lookupHitMap );
    }
 
-
    protected String getCodingScheme() {
       return _codingScheme;
    }
 
-   /**
-    *
-    * @param spannedRareWordTerms discovered terms
-    * @return Map of terms for each span
-    */
-   static protected Map<TextSpan, Collection<RareWordTerm>> createLookupHitMap(
-         final Collection<SpannedRareWordTerm> spannedRareWordTerms ) {
-      final Map<TextSpan,Collection<RareWordTerm>> lookupHitMap = new HashMap<TextSpan, Collection<RareWordTerm>>();
-      for ( SpannedRareWordTerm spannedRareWordTerm : spannedRareWordTerms ) {
-         Collection<RareWordTerm> rareWordTerms = lookupHitMap.get( spannedRareWordTerm.getTextSpan() );
-         if ( rareWordTerms == null ) {
-            rareWordTerms = new HashSet<RareWordTerm>();
-            lookupHitMap.put( spannedRareWordTerm.getTextSpan(), rareWordTerms );
+
+
+   static protected Collection<Integer> getUsedcTakesSemantics( final CollectionMap<Long, Concept> cuiConcepts ) {
+      final Collection<Integer> usedSemanticTypes = new HashSet<>();
+      for ( Collection<Concept> concepts : cuiConcepts.getAllCollections() ) {
+         for ( Concept concept : concepts ) {
+            usedSemanticTypes.addAll( concept.getCtakesSemantics() );
          }
-         rareWordTerms.add( spannedRareWordTerm.getRareWordTerm() );
       }
-      return lookupHitMap;
+      return usedSemanticTypes;
    }
 
-   /**
-    *
-    * @param spannedRareWordTerms discovered terms
-    * @return Map of type Ids and the discovered terms for each
-    */
-   static protected Map<Integer,Collection<SpannedRareWordTerm>> createTypeIdLookupHitMap(
-         final Collection<SpannedRareWordTerm> spannedRareWordTerms ) {
-      final Map<Integer,Collection<SpannedRareWordTerm>> typeIdLookupHitMap
-            = new HashMap<Integer, Collection<SpannedRareWordTerm>>( 6 );
-      for ( SpannedRareWordTerm spannedTerm : spannedRareWordTerms ) {
-         // Attempt to obtain one or more valid type ids from the tuis of the term
-         final Collection<Integer> typeIds = SemanticUtil.getSemanticGroupIdFromTui( spannedTerm.getRareWordTerm().getTui() );
-         for ( Integer typeId : typeIds ) {
-            Collection<SpannedRareWordTerm> typeIdHits = typeIdLookupHitMap.get( typeId );
-            if ( typeIdHits == null ) {
-               typeIdHits = new ArrayList<SpannedRareWordTerm>();
-               typeIdLookupHitMap.put( typeId, typeIdHits );
-            }
-            typeIdHits.add( spannedTerm );
+   static private boolean hascTakesSemantic( final Integer cTakesSemantic,
+                                             final Collection<Concept> concepts  ) {
+      for ( Concept concept : concepts ) {
+         if ( concept.getCtakesSemantics().contains( cTakesSemantic ) ) {
+            return true;
          }
       }
-      return typeIdLookupHitMap;
+      return false;
    }
 
 

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java?rev=1624032&r1=1624031&r2=1624032&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java Wed Sep 10 15:27:24 2014
@@ -18,29 +18,31 @@
  */
 package org.apache.ctakes.dictionary.lookup2.consumer;
 
+import org.apache.ctakes.dictionary.lookup2.concept.Concept;
+import org.apache.ctakes.dictionary.lookup2.concept.ConceptCode;
 import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
-import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.CuiCodeUtil;
+import org.apache.ctakes.dictionary.lookup2.util.SemanticUtil;
+import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
 import org.apache.ctakes.typesystem.type.constants.CONST;
 import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
-import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
-import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
-import org.apache.ctakes.typesystem.type.textsem.EntityMention;
-import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
-import org.apache.ctakes.typesystem.type.textsem.LabMention;
-import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
-import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
-import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.ctakes.typesystem.type.textsem.*;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
 
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
+import java.util.*;
+
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_DRUG;
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_ANATOMICAL_SITE;
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_DISORDER;
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_FINDING;
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_LAB;
+import static org.apache.ctakes.typesystem.type.constants.CONST.NE_TYPE_ID_PROCEDURE;
+
+
+
 
 /**
  * Author: SPF
@@ -54,67 +56,35 @@ final public class DefaultTermConsumer e
       super( uimaContext, properties );
    }
 
+
    /**
-    *
-    * @param jcas -
-    * @param codingScheme -
-    * @param typeId  cTakes IdentifiedAnnotation only accepts an integer as a typeId
-    * @param lookupHitMap map of spans to terms for those spans
+    * @param jcas           -
+    * @param codingScheme   -
+    * @param cTakesSemantic cTakes IdentifiedAnnotation only accepts an integer as a cTakesSemantic
     * @throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
     */
-   protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
-                                     final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+   protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int cTakesSemantic,
+                                     final CollectionMap<TextSpan, Long> textSpanCuis,
+                                     final CollectionMap<Long, Concept> cuiConcepts )
          throws AnalysisEngineProcessException {
-      // Set of Cuis to avoid duplicates at this offset
-      final Set<String> cuiSet = new HashSet<String>();
       // Collection of UmlsConcept objects
-      final Collection<UmlsConcept> conceptList = new ArrayList<UmlsConcept>();
+      final Collection<UmlsConcept> umlsConceptList = new ArrayList<>();
       try {
-         for ( Map.Entry<TextSpan, Collection<RareWordTerm>> entry : lookupHitMap.entrySet() ) {
-            cuiSet.clear();
-            conceptList.clear();
-            for ( RareWordTerm lookupHit : entry.getValue() ) {
-               final String cui = lookupHit.getCui() ;
-               if ( cuiSet.add( cui ) ) {
-                  final UmlsConcept concept = new UmlsConcept( jcas );
-                  concept.setCodingScheme( codingScheme );
-                  concept.setCui( cui );
-                  concept.setTui( lookupHit.getTui() );
-                  conceptList.add( concept );
-               }
+         for ( Map.Entry<TextSpan, Collection<Long>> spanCuis : textSpanCuis ) {
+            umlsConceptList.clear();
+            for ( Long cuiCode : spanCuis.getValue() ) {
+               umlsConceptList.addAll( createUmlsConcepts( jcas, codingScheme, cTakesSemantic, cuiCode, cuiConcepts ) );
             }
-            // Skip updating CAS if all Concepts for this type were filtered out for this span.
-            if ( conceptList.isEmpty() ) {
-               continue;
-            }
-            // code is only valid if the covered text is also present in the filter
-            final int neBegin = entry.getKey().getStart();
-            final int neEnd = entry.getKey().getEnd();
-            final FSArray conceptArr = new FSArray( jcas, conceptList.size() );
+            final FSArray conceptArr = new FSArray( jcas, umlsConceptList.size() );
             int arrIdx = 0;
-            for ( UmlsConcept umlsConcept : conceptList ) {
+            for ( UmlsConcept umlsConcept : umlsConceptList ) {
                conceptArr.set( arrIdx, umlsConcept );
                arrIdx++;
             }
-            IdentifiedAnnotation annotation;
-            if ( typeId == CONST.NE_TYPE_ID_DRUG ) {
-               annotation = new MedicationMention( jcas );
-            } else if ( typeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
-               annotation = new AnatomicalSiteMention( jcas );
-            } else if ( typeId == CONST.NE_TYPE_ID_DISORDER ) {
-               annotation = new DiseaseDisorderMention( jcas );
-            } else if ( typeId == CONST.NE_TYPE_ID_FINDING ) {
-               annotation = new SignSymptomMention( jcas );
-            } else if ( typeId == CONST.NE_TYPE_ID_LAB ) {
-               annotation = new LabMention( jcas );
-            } else if ( typeId == CONST.NE_TYPE_ID_PROCEDURE ) {
-               annotation = new ProcedureMention( jcas );
-            } else {
-               annotation = new EntityMention( jcas );
-            }
-            annotation.setTypeID( typeId );
-            annotation.setBegin( neBegin );
-            annotation.setEnd( neEnd );
+            final IdentifiedAnnotation annotation = createSemanticAnnotation( jcas, cTakesSemantic );
+            annotation.setTypeID( cTakesSemantic );
+            annotation.setBegin( spanCuis.getKey().getStart() );
+            annotation.setEnd( spanCuis.getKey().getEnd() );
             annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
             annotation.setOntologyConceptArr( conceptArr );
             annotation.addToIndexes();
@@ -126,5 +96,83 @@ final public class DefaultTermConsumer e
       }
    }
 
+   static private IdentifiedAnnotation createSemanticAnnotation( final JCas jcas, final int cTakesSemantic ) {
+      switch( cTakesSemantic ) {
+         case NE_TYPE_ID_DRUG: {
+            return new MedicationMention( jcas );
+         }
+         case NE_TYPE_ID_ANATOMICAL_SITE: {
+            return new AnatomicalSiteMention( jcas );
+         }
+         case NE_TYPE_ID_DISORDER: {
+            return new DiseaseDisorderMention( jcas );
+         }
+         case NE_TYPE_ID_FINDING: {
+            return new SignSymptomMention( jcas );
+         }
+         case NE_TYPE_ID_LAB: {
+            return new LabMention( jcas );
+         }
+         case NE_TYPE_ID_PROCEDURE: {
+            return new ProcedureMention( jcas );
+         }
+      }
+      return new EntityMention( jcas );
+   }
+
+
+   static private Collection<UmlsConcept> createUmlsConcepts( final JCas jcas,
+                                                              final String codingScheme,
+                                                              final int cTakesSemantic,
+                                                              final Long cui,
+                                                              final CollectionMap<Long, Concept> conceptMap ) {
+      final Collection<Concept> concepts = conceptMap.getCollection( cui );
+      if ( concepts == null || concepts.isEmpty() ) {
+         return Arrays.asList( createUmlsConcept( jcas, codingScheme, cui, null, null, null ) );
+      }
+      final Collection<UmlsConcept> umlsConcepts = new HashSet<>();
+      for ( Concept concept : concepts ) {
+         final String preferredText = concept.getPreferredText();
+         // The cTakes Type System for UmlsConcepts is inadequate: a single Cui can have multiple Tuis, Snomed, RxNorm and icd codes.
+         // Adding -disconnected- Ontology concepts is not correct, as an ontology concept such as snomed
+         // is actually connected to a cui, semantic type, preferredTerm - and cannot be stored alone just for a span.
+         // So only Tui codes are read here: one UmlsConcept per tui in this semantic group, or one without tui if the concept has no tuis
+         final Collection<String> tuis = concept.getCodes( ConceptCode.TUI );
+         if ( !tuis.isEmpty() ) {
+            for ( String tui : tuis ) {
+               // the concept could have tuis outside this cTakes semantic group; those tuis are skipped
+               if ( SemanticUtil.getTuiSemanticGroupId( tui ) == cTakesSemantic ) {
+                  umlsConcepts.add( createUmlsConcept( jcas, codingScheme, cui, tui, preferredText, null ) );
+               }
+            }
+         } else {
+            umlsConcepts.add( createUmlsConcept( jcas, codingScheme, cui, null, preferredText, null ) );
+         }
+      }
+      return umlsConcepts;
+   }
+
+
+   // Creates one UmlsConcept: coding scheme and cui are always set; tui and code only when non-null, preferredText only when non-empty.
+   // The cTakes Type System UmlsConcept is inadequate: a single Cui can have multiple Tuis, Snomed and RxNorm and icd codes.
+   // Propagating a handful of UmlsConcepts for a single Cui with multiple Snomeds creates bloat
+   static private UmlsConcept createUmlsConcept( final JCas jcas, final String codingScheme,
+                                                 final Long cuiCode, final String tui,
+                                                 final String preferredText, final String code ) {
+      final UmlsConcept umlsConcept = new UmlsConcept( jcas );
+      umlsConcept.setCodingScheme( codingScheme );
+      umlsConcept.setCui( CuiCodeUtil.getAsCui( cuiCode ) );
+      if ( tui != null ) {
+         umlsConcept.setTui( tui );
+      }
+      if ( preferredText != null && !preferredText.isEmpty() ) {
+         umlsConcept.setPreferredText( preferredText );
+      }
+      if ( code != null ) {
+         umlsConcept.setCode( code );
+      }
+      return umlsConcept;
+   }
+
 
 }



Mime
View raw message