ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1571820 [2/4] - in /ctakes/sandbox/ctakes-dictionary-lookup2: ./ desc/ desc/analysis_engine/ doc/ example/ example/desc/ example/desc/analysis_engine/ example/desc/analysis_engine/ctakes-dictionary-lookup2/ src/ src/main/ src/main/java/ sr...
Date Tue, 25 Feb 2014 20:54:26 GMT
Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,217 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.ae;
+
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.textspan.MultiTextSpan;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.resource.ResourceInitializationException;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.List;
+
+/**
+ * Checks for terms that overlap a window.  All tokens of the term must exist in the window in order,
+ * but not necessarily contiguously
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 12/6/13
+ */
+final public class OverlapJCasTermAnnotator extends AbstractJCasTermAnnotator {
+
+   // LOG4J logger named after this class
+   final private Logger _logger = Logger.getLogger( "OverlapJCasTermAnnotator" );
+
+   private int _consecutiveSkipMax = 2;
+   private int _totalSkipMax = 4;
+
+   /** specifies the number of consecutive non-comma tokens that can be skipped */
+   static private final String CONS_SKIP_PRP_KEY = "consecutiveSkips";
+   /** specifies the number of total tokens that can be skipped */
+   static private final String TOTAL_SKIP_PRP_KEY = "totalTokenSkips";
+
+
+   /**
+    * Reads the optional skip limits from the UIMA context once the superclass has been initialized.
+    * A limit that is not configured keeps its default: 2 consecutive skips and 4 total skips.
+    * {@inheritDoc}
+    *
+    * @param uimaContext context queried for the "consecutiveSkips" and "totalTokenSkips" parameters
+    * @throws ResourceInitializationException propagated from the calls made in this method
+    */
+   @Override
+   public void initialize( final UimaContext uimaContext ) throws ResourceInitializationException {
+      super.initialize( uimaContext );
+      final String consecutiveSkipText = (String)uimaContext.getConfigParameterValue( CONS_SKIP_PRP_KEY );
+      if ( consecutiveSkipText != null ) {
+         _consecutiveSkipMax = parseInt( consecutiveSkipText, CONS_SKIP_PRP_KEY, _consecutiveSkipMax );
+      }
+      final String totalSkipText = (String)uimaContext.getConfigParameterValue( TOTAL_SKIP_PRP_KEY );
+      if ( totalSkipText != null ) {
+         // Pass the current total-skip value, mirroring the consecutive-skip call above.
+         // NOTE(review): the third argument of the inherited parseInt is presumably the fallback value - confirm
+         _totalSkipMax = parseInt( totalSkipText, TOTAL_SKIP_PRP_KEY, _totalSkipMax );
+      }
+      _logger.info( "Maximum consecutive tokens that can be skipped: " + _consecutiveSkipMax );
+      _logger.info( "Maximum tokens that can be skipped: " + _totalSkipMax );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void findTerms( final RareWordDictionary dictionary,
+                          final List<FastLookupToken> allTokens, final List<Integer> lookupTokenIndices,
+                          final Collection<SpannedRareWordTerm> termsFromDictionary ) {
+      Collection<RareWordTerm> rareWordHits;
+      for ( Integer lookupTokenIndex : lookupTokenIndices ) {
+         final FastLookupToken lookupToken = allTokens.get( lookupTokenIndex );
+         rareWordHits = dictionary.getRareWordHits( lookupToken );
+         if ( rareWordHits == null || rareWordHits.isEmpty() ) {
+            continue;
+         }
+         for ( RareWordTerm rareWordHit : rareWordHits ) {
+            if ( rareWordHit.getTokenCount() == 1 ) {
+               // Single word term, add and move on
+               termsFromDictionary.add( new SpannedRareWordTerm( rareWordHit, lookupToken.getTextSpan() ) );
+               continue;
+            }
+            final int termStartIndex = lookupTokenIndex - rareWordHit.getRareWordIndex();
+            if ( termStartIndex < 0 || termStartIndex + rareWordHit.getTokenCount() > allTokens.size() ) {
+               // term will extend beyond window
+               continue;
+            }
+            final SpannedRareWordTerm overlapTerm = getOverlapTerm( allTokens, lookupTokenIndex, rareWordHit,
+                                                                    _consecutiveSkipMax, _totalSkipMax );
+            if ( overlapTerm != null ) {
+               termsFromDictionary.add( overlapTerm );
+            }
+         }
+      }
+   }
+
+   /**
+    * Check to see if a given term overlaps a set of tokens.
+    * Starting at the rare word, the window is scanned backward for the term tokens that precede the rare word
+    * and then forward for the term tokens that follow it.  A window token matches a term token when the term token
+    * equals either the window token's text or its variant.  Window tokens that do not match are recorded as
+    * missing spans and counted as skips; the search fails when a skip limit is exceeded or when the window is
+    * exhausted before every term token has been matched.
+    * @param allTokens all tokens in a window
+    * @param lookupTokenIndex index of rare word in the window of all tokens
+    * @param rareWordHit some possible term
+    * @param consecutiveSkipMax maximum number of window tokens that can be skipped in a row
+    * @param totalSkipMax maximum number of window tokens that can be skipped, counted over both scans
+    * @return a spanned term that is in the window in some overlapping manner, or null
+    */
+   static private SpannedRareWordTerm getOverlapTerm( final List<FastLookupToken> allTokens, final int lookupTokenIndex,
+                                                      final RareWordTerm rareWordHit,
+                                                      final int consecutiveSkipMax, final int totalSkipMax ) {
+      // Term text split on spaces; one entry is expected per term token
+      final String[] rareWordTokens = fastSplit( rareWordHit.getText(), rareWordHit.getTokenCount() );
+      // Spans of the window tokens that were skipped while matching
+      final List<TextSpan> missingSpanKeys = new ArrayList<TextSpan>();
+      int consecutiveSkips = 0;
+      int totalSkips = 0;
+      int firstWordIndex = -1;
+      if ( rareWordHit.getRareWordIndex() == 0 ) {
+         // The rare word is the first token of the term, nothing to find before it
+         firstWordIndex = lookupTokenIndex;
+      } else {
+         // Backward scan: match the term tokens that precede the rare word, last one first
+         int nextRareWordIndex = rareWordHit.getRareWordIndex()-1;
+         for ( int allTokensIndex=lookupTokenIndex-1; allTokensIndex>=0; allTokensIndex-- ) {
+            if ( rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getText() )
+                  || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
+               nextRareWordIndex--;
+               if ( nextRareWordIndex < 0 ) {
+                  // Every preceding term token has been matched
+                  firstWordIndex = allTokensIndex;
+                  break;
+               }
+               consecutiveSkips = 0;
+               continue;
+            }
+            missingSpanKeys.add( allTokens.get( allTokensIndex ).getTextSpan() );
+            if ( !allTokens.get( allTokensIndex ).getText().equals( "," ) ) {
+               // things like "blood, urine, sputum cultures" should pick up "blood culture" and "urine culture"
+               // A comma does not count as a consecutive skip here, but it still counts toward the total below
+               consecutiveSkips++;
+               if ( consecutiveSkips > consecutiveSkipMax ) {
+                  break;
+               }
+            }
+            totalSkips++;
+            if ( totalSkips > totalSkipMax ) {
+               break;
+            }
+         }
+         if  ( firstWordIndex == -1 ) {
+            // Skip limit exceeded or start of window reached before the term start was found
+            return null;
+         }
+      }
+      int lastWordIndex = -1;
+      if ( rareWordHit.getRareWordIndex() == rareWordHit.getTokenCount()-1 ) {
+         // The rare word is the last token of the term, nothing to find after it
+         lastWordIndex = lookupTokenIndex;
+      } else {
+         // Forward scan: the consecutive count restarts, the total count carries over from the backward scan
+         consecutiveSkips = 0;
+         int nextRareWordIndex = rareWordHit.getRareWordIndex()+1;
+         for ( int allTokensIndex=lookupTokenIndex+1; allTokensIndex<allTokens.size(); allTokensIndex++ ) {
+            if ( rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getText() )
+                  || rareWordTokens[nextRareWordIndex].equals( allTokens.get( allTokensIndex ).getVariant() ) ) {
+               nextRareWordIndex++;
+               if ( nextRareWordIndex >= rareWordHit.getTokenCount() ) {
+                  // Every following term token has been matched
+                  lastWordIndex = allTokensIndex;
+                  break;
+               }
+               consecutiveSkips = 0;
+               continue;
+            }
+            missingSpanKeys.add( allTokens.get( allTokensIndex ).getTextSpan() );
+            // Unlike the backward scan, a comma counts as a consecutive skip in this direction
+            consecutiveSkips++;
+            if ( consecutiveSkips > consecutiveSkipMax ) {
+               break;
+            }
+            totalSkips++;
+            if ( totalSkips > totalSkipMax ) {
+               break;
+            }
+         }
+         if ( lastWordIndex == -1 ) {
+            // Skip limit exceeded or end of window reached before the term end was found
+            return null;
+         }
+      }
+      if ( missingSpanKeys.isEmpty() ) {
+         // No token was skipped, so the term is contiguous in the window
+         return new SpannedRareWordTerm( rareWordHit,
+                                         allTokens.get( firstWordIndex ).getStart(),
+                                         allTokens.get( lastWordIndex ).getEnd() );
+      }
+      // At least one token was skipped, keep the skipped spans along with the outer span
+      final TextSpan discontiguousSpanKey = new MultiTextSpan( allTokens.get( firstWordIndex ).getStart(),
+                                                                     allTokens.get( lastWordIndex ).getEnd(),
+                                                                     missingSpanKeys );
+      return new SpannedRareWordTerm( rareWordHit, discontiguousSpanKey );
+   }
+
+
+
+   /**
+    * Splits a line on single space characters into an array of a known size.
+    * Text remaining after the last consumed space is stored in the final array slot.
+    * @param line text to split
+    * @param tokenCount size of the returned array
+    * @return array of length tokenCount holding the pieces of the line
+    */
+   static private String[] fastSplit( final String line, final int tokenCount ) {
+      final String[] pieces = new String[tokenCount];
+      int filled = 0;
+      int pieceStart = 0;
+      for ( int spaceAt = line.indexOf( ' ' );
+            spaceAt > 0 && filled < tokenCount;
+            spaceAt = line.indexOf( ' ', pieceStart ) ) {
+         pieces[filled] = line.substring( pieceStart, spaceAt );
+         filled++;
+         pieceStart = spaceAt + 1;
+      }
+      if ( pieceStart < line.length() ) {
+         pieces[tokenCount-1] = line.substring( pieceStart );
+      }
+      return pieces;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.ae;
+
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.tcas.Annotation;
+
+import java.util.Collection;
+import java.util.Map;
+
+/**
+ * Processes an Annotation window in the cas, adding discovered terms to a map.
+ * Implementations first decide whether a window is worth processing and then search it for dictionary terms.
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 12/5/13
+ */
+public interface WindowProcessor {
+
+   /**
+    * Some windows should be skipped entirely, such as "[section *]"
+    * @param window annotation in which to search for terms
+    * @return true if window should be processed, false if it should not
+    */
+   boolean isWindowOk( Annotation window );
+
+   /**
+    * Processes a window of annotations for dictionary terms
+    * @param jcas cas that contains the window
+    * @param window annotation in which to search for terms
+    * @param dictionaryTermsMap map keyed by dictionary, holding the spanned terms found for that dictionary
+    */
+   void processWindow( JCas jcas, Annotation window,
+                       Map<RareWordDictionary, Collection<SpannedRareWordTerm>> dictionaryTermsMap );
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/WindowProcessor.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.consumer;
+
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.SemanticUtil;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 2/5/14
+ */
+abstract public class AbstractTermConsumer implements TermConsumer {
+
+   static private final String CODING_SCHEME_PRP_KEY = "codingScheme";
+
+   final private String _codingScheme;
+
+   /**
+    * Reads the coding scheme from the "codingScheme" property.
+    * @param uimaContext not used by this constructor
+    * @param properties source of the "codingScheme" value; the stored value is null when the key is absent
+    */
+   public AbstractTermConsumer( final UimaContext uimaContext, final Properties properties ) {
+      _codingScheme = properties.getProperty( CODING_SCHEME_PRP_KEY );
+   }
+
+   /**
+    * Consumes the terms that were found for a single type id.
+    * Called by {@link #consumeHits} once per type id with the terms grouped by text span.
+    * @param jcas cas handed through from consumeHits
+    * @param codingScheme coding scheme obtained from {@link #getCodingScheme()}
+    * @param typeId  cTakes IdentifiedAnnotation only accepts an integer as a typeId
+    * @param lookupHitMap map of spans to terms for those spans
+    * @throws org.apache.uima.analysis_engine.AnalysisEngineProcessException
+    */
+   abstract protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
+                                              final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+         throws AnalysisEngineProcessException;
+
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public void consumeHits( final JCas jcas, final RareWordDictionary dictionary,
+                            final Collection<SpannedRareWordTerm> dictionaryTerms )
+         throws AnalysisEngineProcessException {
+      final String codingScheme = getCodingScheme();
+      final String entityType = dictionary.getSemanticGroup();
+      if ( entityType.equals( SemanticUtil.UNKNOWN_SEMANTIC_GROUP )
+            || entityType.equals( SemanticUtil.UNKNOWN_SEMANTIC_ZERO ) ) {
+         // The dictionary may have more than one type, create a map of types to terms and use them all
+         final Map<Integer,Collection<SpannedRareWordTerm>> typeIdLookupHitMap
+               = createTypeIdLookupHitMap( dictionaryTerms );
+         for ( Map.Entry<Integer,Collection<SpannedRareWordTerm>> typeIdLookupHits : typeIdLookupHitMap.entrySet() ) {
+            final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap = createLookupHitMap( typeIdLookupHits.getValue() );
+            consumeTypeIdHits( jcas, codingScheme, typeIdLookupHits.getKey(), lookupHitMap );
+         }
+         return;
+      }
+      // The dictionary has one type, consume all using that type id
+      final int typeId = SemanticUtil.getSemanticGroupId( entityType );
+      final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap = createLookupHitMap( dictionaryTerms );
+      consumeTypeIdHits( jcas, codingScheme, typeId, lookupHitMap );
+   }
+
+
+   /**
+    * @return the coding scheme read from the properties at construction; may be null if none was specified
+    */
+   protected String getCodingScheme() {
+      return _codingScheme;
+   }
+
+   /**
+    * Groups discovered terms by the text span in which they were found.
+    * @param spannedRareWordTerms discovered terms
+    * @return Map of text spans to the set of terms found at each span
+    */
+   static protected Map<TextSpan, Collection<RareWordTerm>> createLookupHitMap(
+         final Collection<SpannedRareWordTerm> spannedRareWordTerms ) {
+      final Map<TextSpan,Collection<RareWordTerm>> termsBySpan = new HashMap<TextSpan, Collection<RareWordTerm>>();
+      for ( SpannedRareWordTerm spannedTerm : spannedRareWordTerms ) {
+         final TextSpan span = spannedTerm.getTextSpan();
+         final RareWordTerm term = spannedTerm.getRareWordTerm();
+         final Collection<RareWordTerm> knownTerms = termsBySpan.get( span );
+         if ( knownTerms != null ) {
+            knownTerms.add( term );
+            continue;
+         }
+         // First term seen for this span
+         final Collection<RareWordTerm> newTerms = new HashSet<RareWordTerm>();
+         newTerms.add( term );
+         termsBySpan.put( span, newTerms );
+      }
+      return termsBySpan;
+   }
+
+   /**
+    * Groups discovered terms by the type ids derived from each term's tui.
+    * A term is listed under every type id returned for its tui.
+    * @param spannedRareWordTerms discovered terms
+    * @return Map of type Ids and the discovered terms for each
+    */
+   static protected Map<Integer,Collection<SpannedRareWordTerm>> createTypeIdLookupHitMap(
+         final Collection<SpannedRareWordTerm> spannedRareWordTerms ) {
+      final Map<Integer,Collection<SpannedRareWordTerm>> termsByTypeId
+            = new HashMap<Integer, Collection<SpannedRareWordTerm>>( 6 );
+      for ( SpannedRareWordTerm candidate : spannedRareWordTerms ) {
+         final String tui = candidate.getRareWordTerm().getTui();
+         for ( Integer typeId : SemanticUtil.getSemanticGroupIdFromTui( tui ) ) {
+            final Collection<SpannedRareWordTerm> knownTerms = termsByTypeId.get( typeId );
+            if ( knownTerms != null ) {
+               knownTerms.add( candidate );
+               continue;
+            }
+            // First term seen for this type id
+            final Collection<SpannedRareWordTerm> newTerms = new ArrayList<SpannedRareWordTerm>();
+            newTerms.add( candidate );
+            termsByTypeId.put( typeId, newTerms );
+         }
+      }
+      return termsByTypeId;
+   }
+
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/AbstractTermConsumer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.consumer;
+
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.AnatomicalSiteMention;
+import org.apache.ctakes.typesystem.type.textsem.DiseaseDisorderMention;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.ctakes.typesystem.type.textsem.LabMention;
+import org.apache.ctakes.typesystem.type.textsem.MedicationMention;
+import org.apache.ctakes.typesystem.type.textsem.ProcedureMention;
+import org.apache.ctakes.typesystem.type.textsem.SignSymptomMention;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class DefaultTermConsumer extends AbstractTermConsumer {
+
+
+   /**
+    * Passes both arguments to the superclass constructor; no other state is set up here.
+    * @param uimaContext handed to the superclass
+    * @param properties handed to the superclass
+    */
+   public DefaultTermConsumer( final UimaContext uimaContext, final Properties properties ) {
+      super( uimaContext, properties );
+   }
+
+   /**
+    * Adds one annotation to the cas for each text span in the map.  The annotation carries one UmlsConcept
+    * per distinct cui found at the span, and its class is chosen from the type id.
+    * {@inheritDoc}
+    * @param jcas cas to which the concepts and annotations are added
+    * @param codingScheme coding scheme set on every created concept
+    * @param typeId  cTakes IdentifiedAnnotation only accepts an integer as a typeId
+    * @param lookupHitMap map of spans to terms for those spans
+    * @throws org.apache.uima.analysis_engine.AnalysisEngineProcessException wrapping any exception thrown
+    * while the concepts and annotations are created
+    */
+   @Override
+   protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
+                                     final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+         throws AnalysisEngineProcessException {
+      // Set of Cuis to avoid duplicates at this offset
+      final Set<String> cuiSet = new HashSet<String>();
+      // Collection of UmlsConcept objects
+      final Collection<UmlsConcept> conceptList = new ArrayList<UmlsConcept>();
+      try {
+         for ( Map.Entry<TextSpan, Collection<RareWordTerm>> entry : lookupHitMap.entrySet() ) {
+            // Both collections are reused for every span
+            cuiSet.clear();
+            conceptList.clear();
+            for ( RareWordTerm lookupHit : entry.getValue() ) {
+               final String cui = lookupHit.getCui() ;
+               if ( cuiSet.add( cui ) ) {
+                  final UmlsConcept concept = new UmlsConcept( jcas );
+                  concept.setCodingScheme( codingScheme );
+                  concept.setCui( cui );
+                  concept.setTui( lookupHit.getTui() );
+                  conceptList.add( concept );
+               }
+            }
+            // Skip updating CAS if there is no Concept for this span.
+            if ( conceptList.isEmpty() ) {
+               continue;
+            }
+            final int neBegin = entry.getKey().getStart();
+            final int neEnd = entry.getKey().getEnd();
+            final FSArray conceptArr = new FSArray( jcas, conceptList.size() );
+            int arrIdx = 0;
+            for ( UmlsConcept umlsConcept : conceptList ) {
+               conceptArr.set( arrIdx, umlsConcept );
+               arrIdx++;
+            }
+            final IdentifiedAnnotation annotation = createMention( jcas, typeId );
+            annotation.setTypeID( typeId );
+            annotation.setBegin( neBegin );
+            annotation.setEnd( neEnd );
+            annotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+            annotation.setOntologyConceptArr( conceptArr );
+            annotation.addToIndexes();
+         }
+      } catch ( Exception e ) {
+         // TODO Poor form - refactor
+         // What is really thrown?  The jcas "throwFeatMissing" is not a great help
+         throw new AnalysisEngineProcessException( e );
+      }
+   }
+
+   /**
+    * Creates the annotation whose class corresponds to a type id.
+    * @param jcas cas in which the annotation is created
+    * @param typeId one of the CONST.NE_TYPE_ID_* values
+    * @return a new mention of the class for the type id, or an EntityMention for any other type id
+    */
+   static private IdentifiedAnnotation createMention( final JCas jcas, final int typeId ) {
+      if ( typeId == CONST.NE_TYPE_ID_DRUG ) {
+         return new MedicationMention( jcas );
+      } else if ( typeId == CONST.NE_TYPE_ID_ANATOMICAL_SITE ) {
+         return new AnatomicalSiteMention( jcas );
+      } else if ( typeId == CONST.NE_TYPE_ID_DISORDER ) {
+         return new DiseaseDisorderMention( jcas );
+      } else if ( typeId == CONST.NE_TYPE_ID_FINDING ) {
+         return new SignSymptomMention( jcas );
+      } else if ( typeId == CONST.NE_TYPE_ID_LAB ) {
+         return new LabMention( jcas );
+      } else if ( typeId == CONST.NE_TYPE_ID_PROCEDURE ) {
+         return new ProcedureMention( jcas );
+      }
+      return new EntityMention( jcas );
+   }
+
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/DefaultTermConsumer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,134 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.consumer;
+
+import org.apache.ctakes.dictionary.lookup2.textspan.MultiTextSpan;
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * Refine a collection of dictionary terms to only contain the most specific variations:
+ * "colon cancer" instead of "cancer", performed by span inclusion / complete containment, not overlap
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class PrecisionTermConsumer extends AbstractTermConsumer {
+
+   private final AbstractTermConsumer _idHitConsumer;
+
+   /**
+    * Creates the consumer together with a DefaultTermConsumer built from the same arguments,
+    * to which the refined hits are delegated.
+    * @param uimaContext handed to the superclass and to the wrapped DefaultTermConsumer
+    * @param properties handed to the superclass and to the wrapped DefaultTermConsumer
+    */
+   public PrecisionTermConsumer( final UimaContext uimaContext, final Properties properties ) {
+      super( uimaContext, properties );
+      _idHitConsumer = new DefaultTermConsumer( uimaContext, properties );
+   }
+
+   /**
+    * Reduces the hits to the most specific spans for the type, then hands them to the wrapped consumer.
+    * {@inheritDoc}
+    */
+   @Override
+   protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
+                                     final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+         throws AnalysisEngineProcessException {
+      _idHitConsumer.consumeTypeIdHits( jcas, codingScheme, typeId, createPreciseHitMap( lookupHitMap ) );
+   }
+
+   /**
+    * Refine a collection of dictionary terms to only contain the most specific variations:
+    * "colon cancer" instead of "cancer", performed by span inclusion /complete containment, not overlap
+    * @param lookupHitMap terms in the dictionary
+    * @return terms with the longest spans
+    */
+   static private Map<TextSpan, Collection<RareWordTerm>> createPreciseHitMap(
+         final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap ) {
+      final Collection<TextSpan> discardSpans = new HashSet<TextSpan>();
+      final List<TextSpan> textSpans = new ArrayList<TextSpan>( lookupHitMap.keySet() );
+      final int count = textSpans.size();
+      for ( int i=0; i<count; i++ ) {
+         final TextSpan spanKeyI = textSpans.get( i );
+         for ( int j=i+1; j<count; j++ ) {
+            final TextSpan spanKeyJ = textSpans.get( j );
+            if ( (spanKeyJ.getStart() <= spanKeyI.getStart() && spanKeyJ.getEnd() > spanKeyI.getEnd())
+                  || (spanKeyJ.getStart() < spanKeyI.getStart() && spanKeyJ.getEnd() >= spanKeyI.getEnd()) ) {
+               // J contains I, discard less precise concepts for span I and move on to next span I
+               if ( spanKeyJ instanceof MultiTextSpan ) {
+                  boolean spanIok = false;
+                  for ( TextSpan missingSpanKey : ((MultiTextSpan)spanKeyJ).getMissingSpans() ) {
+                     if ( (missingSpanKey.getStart() >= spanKeyI.getStart() && missingSpanKey.getStart() < spanKeyI.getEnd())
+                           || (missingSpanKey.getEnd() > spanKeyI.getStart() && missingSpanKey.getEnd() <= spanKeyI.getEnd()) ) {
+                        // I overlaps a missing span, so it is actually ok
+                        spanIok = true;
+                        break;
+                     }
+                  }
+                  if ( !spanIok ) {
+                     discardSpans.add( spanKeyI );
+                     break;
+                  }
+               } else {
+                  discardSpans.add( spanKeyI );
+                  break;
+               }
+            }
+            if ( ( (spanKeyI.getStart() <= spanKeyJ.getStart() && spanKeyI.getEnd() > spanKeyJ.getEnd() )
+                  || (spanKeyI.getStart() < spanKeyJ.getStart() && spanKeyI.getEnd() >= spanKeyJ.getEnd()) ) ) {
+               // I contains J, discard less precise concepts for span J and move on to next span J
+               if ( spanKeyI instanceof MultiTextSpan ) {
+                  boolean spanJok = false;
+                  for ( TextSpan missingSpanKey : ((MultiTextSpan)spanKeyI).getMissingSpans() ) {
+                     if ( (missingSpanKey.getStart() >= spanKeyJ.getStart() && missingSpanKey.getStart() < spanKeyJ.getEnd())
+                           || (missingSpanKey.getEnd() > spanKeyJ.getStart() && missingSpanKey.getEnd() <= spanKeyJ.getEnd()) ) {
+                        // J overlaps a missing span, so it is actually ok
+                        spanJok = true;
+                        break;
+                     }
+                  }
+                  if ( !spanJok ) {
+                     discardSpans.add( spanKeyJ );
+                  }
+               } else {
+                  discardSpans.add( spanKeyJ );
+               }
+            }
+         }
+      }
+      final Map<TextSpan, Collection<RareWordTerm>> preciseHitMap
+            = new HashMap<TextSpan, Collection<RareWordTerm>>( lookupHitMap.size() - discardSpans.size() );
+      for ( Map.Entry<TextSpan,Collection<RareWordTerm>> entry : lookupHitMap.entrySet() ) {
+         if ( !discardSpans.contains( entry.getKey() ) ) {
+            preciseHitMap.put( entry.getKey(), entry.getValue() );
+         }
+      }
+      return preciseHitMap;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/PrecisionTermConsumer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,45 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.consumer;
+
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+
+import java.util.Collection;
+
+/**
+ * Stores terms in the cas
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 12/5/13
+ */
+public interface TermConsumer {
+
+   /**
+    * Consume the terms that were discovered for one dictionary, storing them in the cas.
+    * How the terms are filtered or represented is up to the implementation.
+    *
+    * @param jcas the cas in which terms are stored
+    * @param dictionary the dictionary: Anatomical Site, Disease/Disorder, Drug, combination, etc.
+    * @param dictionaryTerms collection of discovered terms
+    * @throws AnalysisEngineProcessException declared for implementations that fail while consuming the terms
+    */
+   void consumeHits( JCas jcas, RareWordDictionary dictionary, Collection<SpannedRareWordTerm> dictionaryTerms )
+         throws AnalysisEngineProcessException;
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/TermConsumer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,283 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.consumer;
+
+import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
+import org.apache.ctakes.dictionary.lookup2.dictionary.RareWordDictionary;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.term.SpannedRareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.SemanticUtil;
+import org.apache.ctakes.typesystem.type.constants.CONST;
+import org.apache.ctakes.typesystem.type.refsem.UmlsConcept;
+import org.apache.ctakes.typesystem.type.textsem.EntityMention;
+import org.apache.ctakes.typesystem.type.textsem.IdentifiedAnnotation;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.jcas.cas.FSArray;
+
+import java.sql.Connection;
+import java.sql.Driver;
+import java.sql.DriverManager;
+import java.sql.PreparedStatement;
+import java.sql.ResultSet;
+import java.sql.SQLException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+
+/**
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 12/16/13
+ */
+public class WsdTermConsumer extends AbstractTermConsumer {
+
+   static private final String JDBC_DRIVER = "org.hsqldb.jdbcDriver";
+   static private final String DB_URL = "jdbc:hsqldb:res:resources/org/apache/ctakes/dictionary/lookup/cuiRelations/cuiRelations";
+   static private final String DB_USER = "sa";
+   static private final String DB_PASS = "";
+   static private final String DB_TABLE = "cuiRelations";
+   final private Connection _connection;
+   // Lazily created by initMetaDataStatement(..) and reused for every cui lookup
+   private PreparedStatement _metadataStatement;
+
+   /**
+    * Opens the connection to the cui relations database.
+    * @param uimaContext handed to the superclass
+    * @param properties handed to the superclass
+    */
+   public WsdTermConsumer( final UimaContext uimaContext, final Properties properties ) {
+      super( uimaContext, properties );
+      _connection = createDatabaseConnection();
+   }
+
+   /**
+    * Does nothing: this consumer stores its annotations directly in
+    * {@link #consumeHits(JCas, RareWordDictionary, Collection)}.
+    */
+   @Override
+   protected void consumeTypeIdHits( final JCas jcas, final String codingScheme, final int typeId,
+                                              final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap )
+         throws AnalysisEngineProcessException {
+      // Do nothing
+   }
+
+   /**
+    * Instantiates the jdbc driver and registers it with the {@code DriverManager}.
+    * On any failure the problem is written to System.err and the jvm is exited.
+    */
+   static private void registerDriver() {
+      try {
+         Driver driver = (Driver)Class.forName( JDBC_DRIVER ).newInstance();
+         DriverManager.registerDriver( driver );
+      } catch ( Exception e ) {
+         // TODO At least four different exceptions are thrown here, and should be caught and handled individually
+         System.err.println( "Could not register Driver " + JDBC_DRIVER );
+         System.err.println( e.getMessage() );
+         System.exit( 1 );
+      }
+   }
+
+   /**
+    * Registers the jdbc driver and connects to the cui relations database.
+    * If the connection cannot be established the problem is written to System.err and the jvm is exited.
+    * @return a connection to the cui relations database
+    */
+   static public Connection createDatabaseConnection() {
+      registerDriver();
+      Connection connection = null;
+      try {
+         connection = DriverManager.getConnection( DB_URL, DB_USER, DB_PASS );
+      } catch ( SQLException sqlE ) {
+         // thrown by DriverManager.getConnection(..)
+         System.err.println( "Could not establish connection to " + DB_URL + " as " + DB_USER );
+         System.err.println( sqlE.getMessage() );
+         System.exit( 1 );
+      }
+      return connection;
+   }
+
+   /**
+    *
+    * @param cui the cui for which related cuis should be fetched
+    * @return an sql call to use for the relation lookup, with its parameter set to the given cui
+    * @throws SQLException if the {@code PreparedStatement} could not be created or changed
+    */
+   private PreparedStatement initMetaDataStatement( final String cui ) throws SQLException {
+      if ( _metadataStatement == null ) {
+         final String lookupSql = "SELECT * FROM " + DB_TABLE + " WHERE CUI = ?";
+         _metadataStatement = _connection.prepareStatement( lookupSql );
+      }
+      _metadataStatement.clearParameters();
+      _metadataStatement.setString( 1, cui );
+      return _metadataStatement;
+   }
+
+   /**
+    * For each text span, keeps the terms judged best by {@link #getBestRareWordTerms(Collection, Collection)}
+    * and stores them as a single {@code EntityMention} with one {@code UmlsConcept} per distinct cui.
+    * {@inheritDoc}
+    */
+   @Override
+   public void consumeHits( final JCas jcas, final RareWordDictionary dictionary,
+                            final Collection<SpannedRareWordTerm> dictionaryTerms )
+         throws AnalysisEngineProcessException {
+      final String codingScheme = getCodingScheme();
+      final String entityType = dictionary.getSemanticGroup();
+      // cTakes IdentifiedAnnotation only accepts an integer as a typeId.
+      final int typeId = SemanticUtil.getSemanticGroupId( entityType );
+      // iterate over the LookupHit objects
+      final Map<TextSpan, Collection<RareWordTerm>> lookupHitMap = createLookupHitMap( dictionaryTerms );
+      // Set of Cuis to avoid duplicates at this offset
+      final Set<String> cuiSet = new HashSet<String>();
+      // Collection of UmlsConcept objects
+      final Collection<UmlsConcept> conceptList = new ArrayList<UmlsConcept>();
+      try {
+         for ( Map.Entry<TextSpan, Collection<RareWordTerm>> entry : lookupHitMap.entrySet() ) {
+            // Both collections are reused for every span
+            cuiSet.clear();
+            conceptList.clear();
+            final Collection<RareWordTerm> bestTerms = getBestRareWordTerms( entry.getValue(), dictionaryTerms );
+            for ( RareWordTerm lookupHit : bestTerms ) {
+               final String cui = lookupHit.getCui() ;
+               if ( cuiSet.add( cui ) ) {
+                  final UmlsConcept concept = new UmlsConcept( jcas );
+                  concept.setCodingScheme( codingScheme );
+                  concept.setCui( cui );
+                  concept.setTui( lookupHit.getTui() );
+                  conceptList.add( concept );
+               }
+            }
+            // Skip updating CAS if all Concepts for this type were filtered out for this span.
+            if ( conceptList.isEmpty() ) {
+               continue;
+            }
+            final int neBegin = entry.getKey().getStart();
+            final int neEnd = entry.getKey().getEnd();
+            final FSArray conceptArr = new FSArray( jcas, conceptList.size() );
+            int arrIdx = 0;
+            for ( UmlsConcept umlsConcept : conceptList ) {
+               conceptArr.set( arrIdx, umlsConcept );
+               arrIdx++;
+            }
+            final IdentifiedAnnotation identifiedAnnotation = new EntityMention( jcas );
+            identifiedAnnotation.setTypeID( typeId );
+            identifiedAnnotation.setBegin( neBegin );
+            identifiedAnnotation.setEnd( neEnd );
+            identifiedAnnotation.setDiscoveryTechnique( CONST.NE_DISCOVERY_TECH_DICT_LOOKUP );
+            identifiedAnnotation.setOntologyConceptArr( conceptArr );
+            //            identifiedAnnotation.setConfidence( 0.1 );
+            identifiedAnnotation.addToIndexes();
+         }
+      } catch ( Exception e ) {
+         // TODO Poor form - refactor
+         throw new AnalysisEngineProcessException( e );
+      }
+   }
+
+
+   /**
+    * Chooses among the terms of a single span by how strongly each is related to the other discovered terms.
+    * @param spanTerms terms that share a text span
+    * @param dictionaryTerms all terms discovered for the dictionary
+    * @return spanTerms itself if it holds at most one term; otherwise the terms whose validity reaches
+    *         the highest validity found, with that threshold capped at the strength of a synonym relation
+    */
+   private Collection<RareWordTerm> getBestRareWordTerms( final Collection<RareWordTerm> spanTerms,
+                                                            final Collection<SpannedRareWordTerm> dictionaryTerms ) {
+      if ( spanTerms.size() <= 1 ) {
+         return spanTerms;
+      }
+      final Map<RareWordTerm, Integer> termValidityMap = new HashMap<RareWordTerm, Integer>( spanTerms.size() );
+      int highestValidity = 0;
+      for ( RareWordTerm term : spanTerms ) {
+         final int validity = getValidityByRelation( term, dictionaryTerms );
+         highestValidity = Math.max( highestValidity, validity );
+         termValidityMap.put( term, validity );
+      }
+      // Anything that is a synonym or above should be valid, or highest validity
+      final int validityThreshold = Math.min( highestValidity, RelationType.SY.__relationStrength );
+      final Collection<RareWordTerm> bestTerms = new ArrayList<RareWordTerm>();
+      for ( Map.Entry<RareWordTerm,Integer> entry : termValidityMap.entrySet() ) {
+         // ">=" rather than "==": once the threshold has been capped, the strongest terms lie above it
+         if ( entry.getValue() >= validityThreshold ) {
+            bestTerms.add( entry.getKey() );
+         }
+      }
+      return bestTerms;
+   }
+
+
+   /**
+    * @param term term to score
+    * @param dictionaryTerms all terms discovered for the dictionary
+    * @return sum of the relation strengths of every cui related to the term that was also discovered
+    */
+   private int getValidityByRelation( final RareWordTerm term,
+                                             final Collection<SpannedRareWordTerm> dictionaryTerms ) {
+      final Collection<RelatedCui> relatedCuis = getRelatedCuis( term.getCui() );
+      int validity = 0;
+      for ( RelatedCui relatedCui : relatedCuis ) {
+         if ( haveCui( relatedCui.__cui, dictionaryTerms ) ) {
+            validity += relatedCui.__relationType.__relationStrength;
+         }
+      }
+      return validity;
+   }
+
+   /**
+    * Fetches the rows of the relations table for a cui.  This is best effort: if the database lookup fails
+    * the problem is written to System.err and whatever was read before the failure is returned.
+    * @param cui cui for which relations are wanted
+    * @return related cuis read from the database, possibly empty
+    */
+   private Collection<RelatedCui> getRelatedCuis( final String cui ) {
+      final List<RelatedCui> relatedCuis = new ArrayList<RelatedCui>();
+      ResultSet resultSet = null;
+      try {
+         initMetaDataStatement( cui );
+         resultSet = _metadataStatement.executeQuery();
+         while ( resultSet.next() ) {
+            final RelatedCui relatedCui = new RelatedCui( resultSet.getString( FIELD_INDEX.CUI.__index),
+                                                          resultSet.getString( FIELD_INDEX.RELATION_TYPE.__index ) );
+            relatedCuis.add( relatedCui );
+         }
+      } catch ( SQLException sqlE ) {
+         System.err.println( "Could not fetch relations for cui " + cui + " from " + DB_TABLE );
+         System.err.println( sqlE.getMessage() );
+      } finally {
+         // Though the ResultSet interface documentation states that there are automatic closures,
+         // it is up to the driver to implement this behavior ...  historically some drivers have not done so
+         if ( resultSet != null ) {
+            try {
+               resultSet.close();
+            } catch ( SQLException sqlE ) {
+               System.err.println( "Could not close relation lookup for cui " + cui );
+               System.err.println( sqlE.getMessage() );
+            }
+         }
+      }
+      return relatedCuis;
+   }
+
+   /**
+    * @param cui cui of interest
+    * @param dictionaryTerms all terms discovered for the dictionary
+    * @return true if any discovered term has the given cui
+    */
+   static private boolean haveCui( final String cui, final Collection<SpannedRareWordTerm> dictionaryTerms ) {
+      for ( SpannedRareWordTerm term : dictionaryTerms ) {
+         if ( term.getRareWordTerm().getCui().equals( cui ) ) {
+            return true;
+         }
+      }
+      return false;
+   }
+
+   static public enum RelationType {
+      // RL/SY : Synonym; SIB : Sibling; PAR : Parent; CHD : Child; RN,RB,RO : Narrow, Broad, Other; XR : No Relation
+      RL(9), SY(9), SIB(7), PAR(7), CHD(7), RN(8), RB(8), RO(5), XR(-5), UNKNOWN(0);
+      private final int __relationStrength;
+      private RelationType( final int relationStrength ) {
+         __relationStrength = relationStrength;
+      }
+      /**
+       * @param relationName name of a relation as stored in the database, may be null
+       * @return the type with exactly that name, or UNKNOWN if there is none
+       */
+      static private RelationType getRelationType( final String relationName ) {
+         for ( RelationType type : RelationType.values() ) {
+            // constant first so that a null relation name yields UNKNOWN instead of a NullPointerException
+            if ( type.name().equals( relationName ) ) {
+               return type;
+            }
+         }
+         return UNKNOWN;
+      }
+   }
+
+   /**
+    * A cui paired with the type of its relation, as read from one row of the relations table.
+    */
+   static public class RelatedCui {
+      final private String __cui;
+      final private RelationType __relationType;
+      public RelatedCui( final String cui, final String relationName ) {
+         __cui = cui;
+         __relationType = RelationType.getRelationType( relationName );
+      }
+   }
+
+   /**
+    * Column (field) indices in the database.  Notice that these are constant and not configurable.
+    * If a configurable implementation is desired then create an extension.
+    */
+   static private enum FIELD_INDEX {
+      CUI( 1 ), RELATION_TYPE( 2 );
+      final private int __index;
+      private FIELD_INDEX( final int index ) {
+         __index = index;
+      }
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/consumer/WsdTermConsumer.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,77 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+
+import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+
+import java.util.Collection;
+
+/**
+ *
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/25/13
+ */
+abstract public class AbstractRareWordDictionary implements RareWordDictionary {
+
+   final private String _name;
+   final private String _semanticGroup;
+
+   /**
+    *
+    * @param name simple name for the dictionary
+    * @param semanticGroup the type of term that exists in the dictionary: Anatomical Site, Disease/Disorder, Drug, etc.
+    */
+   public AbstractRareWordDictionary( final String name, final String semanticGroup ) {
+      _name = name;
+      _semanticGroup = semanticGroup;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _name;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getSemanticGroup() {
+      return _semanticGroup;
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<RareWordTerm> getRareWordHits( final FastLookupToken fastLookupToken ) {
+      if ( fastLookupToken.getVariant() == null ) {
+         return getRareWordHits( fastLookupToken.getText() );
+      }
+      final Collection<RareWordTerm> terms = getRareWordHits( fastLookupToken.getText() );
+      terms.addAll( getRareWordHits( fastLookupToken.getVariant() ) );
+      return terms;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/AbstractRareWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,161 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
+import org.apache.ctakes.dictionary.lookup2.term.RareWordTerm;
+import org.apache.ctakes.dictionary.lookup2.util.LookupUtil;
+import org.apache.log4j.Logger;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Map;
+
+import static org.apache.ctakes.dictionary.lookup2.dictionary.RareWordTermMapCreator.CuiTuiTerm;
+
+/**
+ * A RareWordDictionary created from a bar-separated value (BSV) file.  The file can have 2 or 3 columns,
+ * in the format CUI|TEXT or CUI|TUI|TEXT.  The text will be tokenized and rare word indexing done automatically for
+ * internal storage and retrieval.  If TUI is not supplied then the entity id of the dictionary is used as TUI.
+ * This dictionary is really just a wrapper of a {@link MemRareWordDictionary} with a file reader.
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 1/9/14
+ */
+final public class BsvRareWordDictionary implements RareWordDictionary {
+
+   static private final Logger LOGGER = Logger.getLogger( "BsvRareWordDictionary" );
+
+   // In-memory dictionary built from the bsv file; every lookup is delegated to it
+   final private RareWordDictionary _delegateDictionary;
+
+   /**
+    * @param name simple name for the dictionary
+    * @param entityId the entity id for the dictionary, also used as TUI for rows that do not specify one
+    * @param bsvFilePath path to the file containing term rows and bsv columns
+    */
+   public BsvRareWordDictionary( final String name, final String entityId, final String bsvFilePath ) {
+      this( name, entityId, new File( bsvFilePath ) );
+   }
+
+   /**
+    * @param name simple name for the dictionary
+    * @param entityId the entity id for the dictionary, also used as TUI for rows that do not specify one
+    * @param bsvFile file containing term rows and bsv columns
+    */
+   public BsvRareWordDictionary( final String name, final String entityId, final File bsvFile ) {
+      final Collection<CuiTuiTerm> cuiTuiTerms = parseBsvFile( bsvFile, entityId );
+      final Map<String,Collection<RareWordTerm>> rareWordTermMap
+            = RareWordTermMapCreator.createRareWordTermMap( cuiTuiTerms );
+      _delegateDictionary = new MemRareWordDictionary( name, entityId, rareWordTermMap );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getName() {
+      return _delegateDictionary.getName();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public String getSemanticGroup() {
+      return _delegateDictionary.getSemanticGroup();
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<RareWordTerm> getRareWordHits( final FastLookupToken fastLookupToken ) {
+      return _delegateDictionary.getRareWordHits( fastLookupToken );
+   }
+
+   /**
+    * {@inheritDoc}
+    */
+   @Override
+   public Collection<RareWordTerm> getRareWordHits( final String rareWordText ) {
+      return _delegateDictionary.getRareWordHits( rareWordText );
+   }
+
+
+   /**
+    * Create a collection of {@link RareWordTermMapCreator.CuiTuiTerm} Objects
+    * by parsing a bsv file.  The file can be in one of two columnar formats:
+    * <p>
+    *    CUI|Text
+    * </p>
+    * or
+    * <p>
+    *    CUI|TUI|Text
+    * </p>
+    * If the TUI column is omitted then the entityId for the dictionary is used as the TUI.
+    * Malformed lines are logged and skipped.  If the file cannot be opened or read the error is logged
+    * and the terms read so far (possibly none) are returned.
+    * @param bsvFile file containing term rows and bsv columns
+    * @param entityId the entity id for the dictionary
+    * @return collection of all valid terms read from the bsv file
+    */
+   static private Collection<CuiTuiTerm> parseBsvFile( final File bsvFile, final String entityId ) {
+      final Collection<CuiTuiTerm> cuiTuiTerms = new ArrayList<CuiTuiTerm>();
+      BufferedReader reader = null;
+      try {
+         reader = new BufferedReader( new FileReader( bsvFile ) );
+         String line = reader.readLine();
+         while ( line != null ) {
+            final String[] columns = LookupUtil.fastSplit( line, '|' );
+            final CuiTuiTerm cuiTuiTerm = createCuiTuiTerm( columns, entityId );
+            if ( cuiTuiTerm != null ) {
+               // Add to the dictionary
+               cuiTuiTerms.add( cuiTuiTerm );
+            } else {
+               LOGGER.warn( "Bad BSV line " + line + " in " + bsvFile.getPath() );
+            }
+            line = reader.readLine();
+         }
+      } catch ( FileNotFoundException fnfE ) {
+         LOGGER.error( fnfE.getMessage() );
+      } catch ( IOException ioE ) {
+         LOGGER.error( ioE.getMessage() );
+      } finally {
+         // Close here so that the file handle is released even when reading fails part way through
+         if ( reader != null ) {
+            try {
+               reader.close();
+            } catch ( IOException ioE ) {
+               LOGGER.error( ioE.getMessage() );
+            }
+         }
+      }
+      return cuiTuiTerms;
+   }
+
+   /**
+    * @param columns two or three columns representing CUI,Text or CUI,TUI,Text respectively
+    * @param entityId the entity id for the dictionary, used as the Term TUI should one not be specified
+    * @return a term created from the trimmed columns, with the text in lower case,
+    *         or null if the columns are malformed (wrong column count, blank CUI or blank text)
+    */
+   static private CuiTuiTerm createCuiTuiTerm( final String[] columns, final String entityId ) {
+      if ( columns.length != 2 && columns.length != 3 ) {
+         return null;
+      }
+      final int cuiIndex = 0;
+      int tuiIndex = -1;
+      int termIndex = 1;
+      if ( columns.length == 3 ) {
+         tuiIndex = 1;
+         termIndex = 2;
+      }
+      if ( columns[ cuiIndex ].trim().isEmpty() || columns[ termIndex ].trim().isEmpty() ) {
+         return null;
+      }
+      final String cui = columns[ cuiIndex ].trim();
+      // A missing or blank TUI column falls back to the entity id of the dictionary
+      final String tui = (tuiIndex < 0 || columns[tuiIndex].trim().isEmpty()) ? entityId : columns[ tuiIndex ].trim();
+      final String term = columns[ termIndex ].trim().toLowerCase();
+      return new CuiTuiTerm( cui, tui, term );
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/BsvRareWordDictionary.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java
URL: http://svn.apache.org/viewvc/ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java?rev=1571820&view=auto
==============================================================================
--- ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java (added)
+++ ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java Tue Feb 25 20:54:25 2014
@@ -0,0 +1,342 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.ctakes.dictionary.lookup2.dictionary;
+
+import org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer;
+import org.apache.ctakes.dictionary.lookup2.util.DictionarySpec;
+import org.apache.ctakes.dictionary.lookup2.util.UmlsUserApprover;
+import org.apache.log4j.Logger;
+import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
+import org.apache.uima.resource.ResourceAccessException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.jdom.Document;
+import org.jdom.Element;
+import org.jdom.JDOMException;
+import org.jdom.input.SAXBuilder;
+
+import java.io.File;
+import java.io.IOException;
+import java.lang.reflect.Constructor;
+import java.lang.reflect.InvocationTargetException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Properties;
+
+/**
+ * Parses the XML descriptor indicated by the {@code externalResource} for {@code RareWordTermsDescriptorFile}
+ * in the XML descriptor for the Rare Word Term Lookup Annotator
+ * {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}
+ * <p>
+ * If there is a problem with the descriptor then the whole pipeline goes down, so care must be taken by the User
+ * and any messages (logged or otherwise) produced by this class should be as specific as possible.  Devs take notice.
+ * For that reason a missing section or attribute of the descriptor is reported with an
+ * {@link AnnotatorContextException} that names the offending item rather than surfacing as a bare
+ * {@link NullPointerException}.
+ * </p>
+ * TODO
+ * This parser can create a RareWordDictionary by wrapping the older Jdbc, Lucene, StringTable (CSV) descriptors.
+ * However, to prevent the dependency upon the current Dictionary-Lookup module and its "Dictionary" interface,
+ * all such code has been commented out.  Uncommenting, linking, and rebuilding is possible if use of an older dictionary
+ * resource is required.
+ * TODO
+ * Author: SPF
+ * Affiliation: CHIP-NLP
+ * Date: 11/20/13
+ */
+final public class DictionaryDescriptorParser {
+
+   // LOG4J logger registered under the simple name of this class
+   static private final Logger LOGGER = Logger.getLogger( "DictionaryDescriptorParser" );
+
+   /**
+    * A <B>Utility Class</B> cannot be instantiated
+    */
+   private DictionaryDescriptorParser() {
+   }
+
+   /**
+    * XML key specifying the section that defines each {@link RareWordDictionary} that should be used for annotation
+    */
+   static private final String DICTIONARIES_KEY = "rareWordDictionaries";
+   /**
+    * Each {@link RareWordDictionary} should have an id that specifies a unique name for that dictionary.
+    * Dictionaries are collected by id, so a dictionary whose id is missing or repeats an earlier id replaces
+    * the earlier dictionary; a warning is logged when that happens.
+    */
+   static private final String NAME_ID = "id";
+   /**
+    * Each {@link RareWordDictionary} must have an external resource specified by the
+    * {@code configurableDataResourceSpecifier} in the XML descriptor for the Rare Word Term Lookup Annotator
+    * {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}.
+    * The external resource <i>does not</i> need to be unique for each dictionary.
+    */
+   static private final String EXTERNAL_RESOURCE = "externalResourceKey";
+   /**
+    * Each {@link RareWordDictionary} can utilize or ignore the case of terms.   In most situations case sensitivity
+    * is not beneficial, but it may be for some.  For instance, if it is an acronym dictionary then differentiating
+    * between "WHO" (World Health Organization) and "who" is important.
+    * The {@link org.apache.ctakes.dictionary.lookup2.ae.DefaultJCasTermAnnotator}
+    * ignores case and therefore ignores this setting.  This parser does not currently read the attribute.
+    */
+   static private final String CASE_SENSITIVE = "caseSensitive";
+   /**
+    * Each {@link RareWordDictionary} should have a numerical {@code typeId} that indicates the semantic group
+    * to which the terms in the dictionary belong.  The standard cTakes type ids are numerical and listed in
+    * {@code org.apache.ctakes.typesystem.type.constants.CONST} as
+    * <ul>
+    * <li>0  Unknown</li>
+    * <li>1  Medication / Drug</li>
+    * <li>2  Disease / Disorder</li>
+    * <li>3  Sign / Symptom (Finding)</li>
+    * <li>4  <i>Not Defined</i></li>
+    * <li>5  Procedure</li>
+    * <li>6  Anatomical Site</li>
+    * <li>7  Clinical Attribute</li>
+    * <li>8  Device</li>
+    * <li>9  Lab</li>
+    * <li>10 Phenomena</li>
+    * </ul>
+    * In truth, any coding scheme (Numerical or otherwise) can be used as long as a {@link org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer}
+    * is created to use it. That being said ...
+    */
+   static private final String TYPE_ID = "typeId";
+   /**
+    * Each {@link RareWordDictionary} must have a java implementation.
+    * It is best if this is a {@link RareWordDictionary},
+    * but it can also be an older org apache ctakes dictionary lookup Dictionary, in which case a
+    * org apache ctakes dictionary lookup2 dictionary RareWordDictionaryWrapper will be used.
+    * <p>The implementation keys that are currently accepted are:</p>
+    * <ul>
+    * <li>rareWordJdbc</li>
+    * <li>rareWordUmls</li>
+    * <li>rareWordBsv</li>
+    * </ul>
+    * <p>The keys for the older dictionaries are rejected as unsupported unless the commented-out wrapper code
+    * in this class is restored:</p>
+    * <ul>
+    * <li>luceneImpl</li>
+    * <li>jdbcImpl</li>
+    * <li>csvImpl</li>
+    * </ul>
+    */
+   static private final String IMPLEMENTATION = "implementation";
+   /**
+    * XML key specifying the section that defines the single {@link org.apache.ctakes.dictionary.lookup2.consumer.TermConsumer} that should be used to
+    * consume discovered terms.
+    */
+   static private final String CONSUMER_KEY = "rareWordConsumer";
+
+   // Added 'maxListSize'.  Size equals max int by default  - used for lucene dictionaries
+   // Never reassigned, hence final.  Passed to consumers that declare a (UimaContext, Properties, int) constructor
+   static private final int MAX_LIST_SIZE = Integer.MAX_VALUE; //ohnlp-Bugs-3296301
+
+   /**
+    * Initiates the parsing of the XML descriptor file containing definition of dictionaries and a consumer for the
+    * Rare Word Term dictionary paradigm
+    *
+    * @param descriptorFile XML-formatted file, see the dictionary-lookup resources file {@code RareWordTermsUMLS.xml}
+    *                       for an example
+    * @param uimaContext    -
+    * @return {@link org.apache.ctakes.dictionary.lookup2.util.DictionarySpec} with specification of dictionaries and a consumer as read from the
+    *         {@code descriptorFile}
+    * @throws AnnotatorContextException if the File could not be found/read, the xml could not be parsed,
+    *                                   or a required section of the descriptor is missing or invalid
+    */
+   static public DictionarySpec parseDescriptor( final File descriptorFile, final UimaContext uimaContext )
+         throws AnnotatorContextException {
+      LOGGER.info( "Parsing dictionary specifications: " + descriptorFile.getPath() );
+      final SAXBuilder saxBuilder = new SAXBuilder();
+      Document doc;
+      try {
+         doc = saxBuilder.build( descriptorFile );
+      } catch ( JDOMException jdomE ) {
+         throw new AnnotatorContextException( "Could not parse " + descriptorFile.getPath(), new Object[0], jdomE );
+      } catch ( IOException ioE ) {
+         // An IOException is a read failure, not a syntax problem, so say so
+         throw new AnnotatorContextException( "Could not read " + descriptorFile.getPath(), new Object[0], ioE );
+      }
+      final Element rootElement = doc.getRootElement();
+      // Both sections are mandatory; getRequiredChild names the missing one instead of letting a null escape
+      final Map<String, RareWordDictionary> dictionaries
+            = parseDictionaries( uimaContext, getRequiredChild( rootElement, DICTIONARIES_KEY ) );
+      final TermConsumer consumer = parseConsumerXml( uimaContext, getRequiredChild( rootElement, CONSUMER_KEY ) );
+      return new DictionarySpec( dictionaries.values(), consumer );
+   }
+
+   /**
+    * Creates dictionary engines by parsing the section defined by {@link #DICTIONARIES_KEY}
+    *
+    * @param uimaContext         -
+    * @param dictionariesElement contains definition of all dictionaries
+    * @return Mapping of dictionary names {@link #NAME_ID} to new {@link RareWordDictionary} instances
+    * @throws AnnotatorContextException if the resource specified by {@link #EXTERNAL_RESOURCE} does not match
+    *                                   the type specified by {@link #IMPLEMENTATION} or for some reason could not be used
+    */
+   static private Map<String, RareWordDictionary> parseDictionaries( final UimaContext uimaContext,
+                                                                     final Element dictionariesElement )
+         throws AnnotatorContextException {
+      final Map<String, RareWordDictionary> engines = new HashMap<String, RareWordDictionary>();
+      for ( Object child : dictionariesElement.getChildren() ) {
+         if ( !(child instanceof Element) ) {
+            continue;
+         }
+         final Element dictionaryElement = (Element) child;
+         final String id = dictionaryElement.getAttributeValue( NAME_ID );
+         final RareWordDictionary dictionary = parseDictionaryXml( uimaContext, dictionaryElement );
+         // The map keeps one dictionary per id, so make a lost dictionary visible in the log
+         if ( id == null ) {
+            LOGGER.warn( "Dictionary element <" + dictionaryElement.getName()
+                         + "> has no '" + NAME_ID + "' attribute" );
+         }
+         if ( engines.containsKey( id ) ) {
+            LOGGER.warn( "Dictionary " + NAME_ID + " '" + id
+                         + "' is used more than once, the earlier dictionary is replaced" );
+         }
+         engines.put( id, dictionary );
+      }
+      return engines;
+   }
+
+   /**
+    * Creates a dictionary by parsing each child element of {@link #DICTIONARIES_KEY}
+    *
+    * @param uimaContext       -
+    * @param dictionaryElement contains the definition of a single dictionary
+    * @return a dictionary, never null
+    * @throws AnnotatorContextException if the external resource key is missing or cannot be resolved,
+    *                                   the implementation section is missing, empty or names an unsupported
+    *                                   implementation, the umls user cannot be validated,
+    *                                   or no dictionary could be created
+    */
+   static private RareWordDictionary parseDictionaryXml( final UimaContext uimaContext,
+                                                         final Element dictionaryElement )
+         throws AnnotatorContextException {
+      final String externalResourceKey = getRequiredAttribute( dictionaryElement, EXTERNAL_RESOURCE );
+      // May be null; it is handed to the factory unchanged
+      final String entityTypeId = dictionaryElement.getAttributeValue( TYPE_ID );
+      Object externalResource;
+      try {
+         externalResource = uimaContext.getResourceObject( externalResourceKey );
+      } catch ( ResourceAccessException raE ) {
+         throw new AnnotatorContextException( "Could not access external resource " + externalResourceKey,
+                                              new Object[0], raE );
+      }
+      if ( externalResource == null ) {
+         throw new AnnotatorContextException( "Could not find external resource " + externalResourceKey,
+                                              new Object[0] );
+      }
+      // The first child of the implementation section names the implementation to use
+      final Collection implementations = getRequiredChild( dictionaryElement, IMPLEMENTATION ).getChildren();
+      if ( implementations.isEmpty() ) {
+         throw new AnnotatorContextException( "Descriptor element <" + IMPLEMENTATION + "> for external resource "
+                                              + externalResourceKey + " does not name an implementation",
+                                              new Object[0] );
+      }
+      final Element implementationElement = (Element) implementations.iterator().next();
+      final String implementationName = implementationElement.getName();
+      RareWordDictionary dictionary = null;
+      if ( implementationName.equals( "rareWordJdbc" ) ) {
+         dictionary = DictionaryFactory.createRareWordJdbc( implementationElement,
+                                                            externalResource,
+                                                            entityTypeId );
+      } else if ( implementationName.equals( "rareWordUmls" ) ) {
+         // Same jdbc dictionary as above, but only after the umls user has been validated
+         try {
+            UmlsUserApprover.validateUMLSUser( uimaContext );
+            dictionary = DictionaryFactory.createRareWordJdbc( implementationElement,
+                                                               externalResource,
+                                                               entityTypeId );
+         } catch ( ResourceInitializationException riE ) {
+            throw new AnnotatorContextException( riE );
+         }
+      } else if ( implementationName.equals( "rareWordBsv" ) ) {
+         dictionary = DictionaryFactory.createRareWordBsv( externalResourceKey, externalResource, entityTypeId );
+//      } else if ( implementationName.equals( "luceneImpl" ) ) {
+//         dictionary = DictionaryFactory.createWrappedLucene( dictionaryElement,
+//                                                                     externalResourceKey,
+//                                                                     externalResource,
+//                                                                     entityTypeId );
+//      } else if ( implementationName.equals( "jdbcImpl" ) ) {
+//         dictionary = DictionaryFactory.createWrappedJdbc( dictionaryElement,
+//                                                                   implementationElement,
+//                                                                   externalResourceKey,
+//                                                                   externalResource,
+//                                                                   entityTypeId );
+//      } else if ( implementationName.equals( "csvImpl" ) ) {
+//         dictionary = DictionaryFactory.createWrappedCsv( dictionaryElement,
+//                                                                  implementationElement,
+//                                                                  externalResourceKey,
+//                                                                  externalResource,
+//                                                                  entityTypeId );
+      } else {
+         throw new AnnotatorContextException( "Unsupported dictionary implementation " + implementationName,
+                                              new Object[0] );
+      }
+      if ( dictionary == null ) {
+         throw new AnnotatorContextException( "No appropriate dictionary defined", new Object[0] );
+      }
+      // Deprecated -
+//      if ( dictionary instanceof Dictionary ) {
+//         final Collection metaFields = dictionaryElement.getChild( "metaFields" ).getChildren();
+//         for ( Object value : metaFields ) {
+//            String metaFieldName = ((Element) value).getAttributeValue( "fieldName" );
+//            ((Dictionary) dictionary).retainMetaData( metaFieldName );
+//         }
+//      }
+      return dictionary;
+   }
+
+
+
+   /**
+    * Creates a term consumer by parsing section defined by {@link #CONSUMER_KEY}.
+    * The consumer class must be a {@link TermConsumer} with a public constructor taking either
+    * {@code (UimaContext, Properties, int)} or {@code (UimaContext, Properties)}.  When both exist the
+    * three argument constructor is used and is given {@link #MAX_LIST_SIZE}.
+    *
+    * @param uimaContext           -
+    * @param lookupConsumerElement contains the definition of the term consumer
+    * @return a term consumer
+    * @throws AnnotatorContextException if the {@code className} attribute is missing, the class is unknown or is
+    *                                   not a {@link TermConsumer}, a property is malformed, no supported
+    *                                   constructor exists, or construction fails
+    */
+   static private TermConsumer parseConsumerXml( final UimaContext uimaContext,
+                                                 final Element lookupConsumerElement )
+         throws AnnotatorContextException {
+      final Class<?>[] listSizeSignature = {UimaContext.class, Properties.class, int.class};//ohnlp-Bugs-3296301
+      final Class<?>[] plainSignature = {UimaContext.class, Properties.class};
+
+      final String consumerClassName = getRequiredAttribute( lookupConsumerElement, "className" );
+      final Properties consumerProperties = parsePropertiesXml( lookupConsumerElement.getChild( "properties" ) );
+      final Class<?> consumerClass;
+      try {
+         consumerClass = Class.forName( consumerClassName );
+      } catch ( ClassNotFoundException cnfE ) {
+         throw new AnnotatorContextException( "Unknown class " + consumerClassName, new Object[0], cnfE );
+      }
+      if ( !TermConsumer.class.isAssignableFrom( consumerClass ) ) {
+         throw new AnnotatorContextException( consumerClassName + " is not a TermConsumer",
+                                              new Object[0] );
+      }
+      // Collect both supported constructors first so that the choice does not depend on reflection order
+      Constructor<?> listSizeConstructor = null;
+      Constructor<?> plainConstructor = null;
+      for ( Constructor<?> constructor : consumerClass.getConstructors() ) {
+         final Class<?>[] parameterTypes = constructor.getParameterTypes();
+         if ( Arrays.equals( listSizeSignature, parameterTypes ) ) {
+            listSizeConstructor = constructor;
+         } else if ( Arrays.equals( plainSignature, parameterTypes ) ) {
+            plainConstructor = constructor;
+         }
+      }
+      if ( listSizeConstructor == null && plainConstructor == null ) {
+         throw new AnnotatorContextException( "No Constructor for " + consumerClassName, new Object[0] );
+      }
+      try {
+         if ( listSizeConstructor != null ) {
+            final Object[] args = new Object[]{uimaContext, consumerProperties, MAX_LIST_SIZE}; //ohnlp-Bugs-3296301
+            return (TermConsumer) listSizeConstructor.newInstance( args );
+         }
+         final Object[] args = new Object[]{uimaContext, consumerProperties};
+         return (TermConsumer) plainConstructor.newInstance( args );
+      } catch ( InstantiationException inE ) {
+         throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], inE );
+      } catch ( IllegalAccessException iaE ) {
+         throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], iaE );
+      } catch ( InvocationTargetException itE ) {
+         throw new AnnotatorContextException( "Could not construct " + consumerClassName, new Object[0], itE );
+      }
+   }
+
+   /**
+    * Builds a collection of key, value properties
+    *
+    * @param propertiesElement element with key, value pairs, or null if the descriptor has no properties section
+    * @return Properties, empty if {@code propertiesElement} is null or has no children
+    * @throws AnnotatorContextException if a property element lacks its {@code key} or {@code value} attribute
+    */
+   static private Properties parsePropertiesXml( final Element propertiesElement )
+         throws AnnotatorContextException {
+      final Properties properties = new Properties();
+      if ( propertiesElement == null ) {
+         // A missing properties section simply means there are no properties
+         return properties;
+      }
+      for ( Object child : propertiesElement.getChildren() ) {
+         if ( !(child instanceof Element) ) {
+            continue;
+         }
+         final Element propertyElement = (Element) child;
+         // Properties cannot hold a null key or value, so report which attribute is missing
+         final String key = getRequiredAttribute( propertyElement, "key" );
+         final String propertyValue = getRequiredAttribute( propertyElement, "value" );
+         properties.setProperty( key, propertyValue );
+      }
+      return properties;
+   }
+
+   /**
+    * Fetches a child element that the descriptor must contain
+    *
+    * @param parent    element that should contain the child
+    * @param childName name of the wanted child element
+    * @return the first child element with the given name, never null
+    * @throws AnnotatorContextException if the parent has no child element with the given name
+    */
+   static private Element getRequiredChild( final Element parent, final String childName )
+         throws AnnotatorContextException {
+      final Element child = parent.getChild( childName );
+      if ( child == null ) {
+         throw new AnnotatorContextException( "Descriptor element <" + parent.getName()
+                                              + "> is missing required child element <" + childName + ">",
+                                              new Object[0] );
+      }
+      return child;
+   }
+
+   /**
+    * Fetches the value of an attribute that the descriptor must contain
+    *
+    * @param element       element that should carry the attribute
+    * @param attributeName name of the wanted attribute
+    * @return the attribute value, never null
+    * @throws AnnotatorContextException if the element does not have the attribute
+    */
+   static private String getRequiredAttribute( final Element element, final String attributeName )
+         throws AnnotatorContextException {
+      final String value = element.getAttributeValue( attributeName );
+      if ( value == null ) {
+         throw new AnnotatorContextException( "Descriptor element <" + element.getName()
+                                              + "> is missing required attribute '" + attributeName + "'",
+                                              new Object[0] );
+      }
+      return value;
+   }
+
+}

Propchange: ctakes/sandbox/ctakes-dictionary-lookup2/src/main/java/org/apache/ctakes/dictionary/lookup2/dictionary/DictionaryDescriptorParser.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message