ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From seanfi...@apache.org
Subject svn commit: r1714634 - in /ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2: ae/DefaultJCasTermAnnotator.java ae/OverlapJCasTermAnnotator.java term/RareWordTerm.java util/TokenMatchUtil.java
Date Mon, 16 Nov 2015 17:22:03 GMT
Author: seanfinan
Date: Mon Nov 16 17:22:03 2015
New Revision: 1714634

URL: http://svn.apache.org/viewvc?rev=1714634&view=rev
Log:
CTAKES-389 : fix for erroneous terms returned when last token is a partial match

Removed:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/util/TokenMatchUtil.java
Modified:
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
    ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java?rev=1714634&r1=1714633&r2=1714634&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/DefaultJCasTermAnnotator.java Mon Nov 16 17:22:03 2015
@@ -23,7 +23,6 @@ import org.apache.ctakes.dictionary.look
 import org.apache.ctakes.dictionary.lookup2.textspan.DefaultTextSpan;
 import org.apache.ctakes.dictionary.lookup2.textspan.TextSpan;
 import org.apache.ctakes.dictionary.lookup2.util.FastLookupToken;
-import org.apache.ctakes.dictionary.lookup2.util.TokenMatchUtil;
 import org.apache.ctakes.dictionary.lookup2.util.collection.CollectionMap;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.fit.factory.AnalysisEngineFactory;
@@ -70,7 +69,7 @@ final public class DefaultJCasTermAnnota
                continue;
             }
             final int termEndIndex = termStartIndex + rareWordHit.getTokenCount() - 1;
-            if ( TokenMatchUtil.isTermMatch( rareWordHit, allTokens, termStartIndex, termEndIndex
) ) {
+            if ( isTermMatch( rareWordHit, allTokens, termStartIndex, termEndIndex ) ) {
                final int spanStart = allTokens.get( termStartIndex ).getStart();
                final int spanEnd = allTokens.get( termEndIndex ).getEnd();
                termsFromDictionary.placeValue( new DefaultTextSpan( spanStart, spanEnd ),
rareWordHit.getCuiCode() );
@@ -79,6 +78,34 @@ final public class DefaultJCasTermAnnota
       }
    }
 
+   /**
+    * Hopefully the jit will inline this method
+    *
+    * @param rareWordHit    rare word term to check for match
+    * @param allTokens      all tokens in a window
+    * @param termStartIndex index of first token in allTokens to check
+    * @param termEndIndex   index of last token in allTokens to check
+    * @return true if the rare word term exists in allTokens within the given indices
+    */
+   public static boolean isTermMatch( final RareWordTerm rareWordHit, final List<FastLookupToken>
allTokens,
+                                      final int termStartIndex, final int termEndIndex )
{
+      final String[] hitTokens = rareWordHit.getTokens();
+      int hit = 0;
+      for ( int i = termStartIndex; i < termEndIndex + 1; i++ ) {
+         if ( hitTokens[ hit ].equals( allTokens.get( i ).getText() )
+              || hitTokens[ hit ].equals( allTokens.get( i ).getVariant() ) ) {
+            // the normal token or variant matched, move to the next token
+            hit++;
+            continue;
+         }
+         // the token normal didn't match and there is no matching variant
+         return false;
+      }
+      // some combination of token and variant matched
+      return true;
+   }
+
+
    static public AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException
{
       return AnalysisEngineFactory.createEngineDescription( DefaultJCasTermAnnotator.class
);
    }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java?rev=1714634&r1=1714633&r2=1714634&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/ae/OverlapJCasTermAnnotator.java Mon Nov 16 17:22:03 2015
@@ -129,7 +129,7 @@ final public class OverlapJCasTermAnnota
    static private TextSpan getOverlapTerm( final List<FastLookupToken> allTokens, final
int lookupTokenIndex,
                                            final RareWordTerm rareWordHit,
                                            final int consecutiveSkipMax, final int totalSkipMax
) {
-      final String[] rareWordTokens = fastSplit( rareWordHit.getText(), rareWordHit.getTokenCount()
);
+      final String[] hitTokens = rareWordHit.getTokens();
       final List<TextSpan> missingSpanKeys = new ArrayList<>();
       int consecutiveSkips = 0;
       int totalSkips = 0;
@@ -139,8 +139,8 @@ final public class OverlapJCasTermAnnota
       } else {
          int nextRareWordIndex = rareWordHit.getRareWordIndex() - 1;
          for ( int allTokensIndex = lookupTokenIndex - 1; allTokensIndex >= 0; allTokensIndex--
) {
-            if ( rareWordTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getText() )
-                 || rareWordTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getVariant() ) ) {
+            if ( hitTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex ).getText()
)
+                 || hitTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getVariant() ) ) {
                nextRareWordIndex--;
                if ( nextRareWordIndex < 0 ) {
                   firstWordIndex = allTokensIndex;
@@ -173,8 +173,8 @@ final public class OverlapJCasTermAnnota
          consecutiveSkips = 0;
          int nextRareWordIndex = rareWordHit.getRareWordIndex() + 1;
          for ( int allTokensIndex = lookupTokenIndex + 1; allTokensIndex < allTokens.size();
allTokensIndex++ ) {
-            if ( rareWordTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getText() )
-                 || rareWordTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getVariant() ) ) {
+            if ( hitTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex ).getText()
)
+                 || hitTokens[ nextRareWordIndex ].equals( allTokens.get( allTokensIndex
).getVariant() ) ) {
                nextRareWordIndex++;
                if ( nextRareWordIndex >= rareWordHit.getTokenCount() ) {
                   lastWordIndex = allTokensIndex;
@@ -206,23 +206,6 @@ final public class OverlapJCasTermAnnota
    }
 
 
-   static private String[] fastSplit( final String line, final int tokenCount ) {
-      final String[] tokens = new String[ tokenCount ];
-      int tokenIndex = 0;
-      int previousSpaceIndex = -1;
-      int spaceIndex = line.indexOf( ' ' );
-      while ( spaceIndex > 0 && tokenIndex < tokenCount ) {
-         tokens[ tokenIndex ] = line.substring( previousSpaceIndex + 1, spaceIndex );
-         tokenIndex++;
-         previousSpaceIndex = spaceIndex;
-         spaceIndex = line.indexOf( ' ', previousSpaceIndex + 1 );
-      }
-      if ( previousSpaceIndex + 1 < line.length() ) {
-         tokens[ tokenCount - 1 ] = line.substring( previousSpaceIndex + 1 );
-      }
-      return tokens;
-   }
-
    static public AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException
{
       return AnalysisEngineFactory.createEngineDescription( OverlapJCasTermAnnotator.class
);
    }

Modified: ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java?rev=1714634&r1=1714633&r2=1714634&view=diff
==============================================================================
--- ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java (original)
+++ ctakes/trunk/ctakes-dictionary-lookup-fast/src/main/java/org/apache/ctakes/dictionary/lookup2/term/RareWordTerm.java Mon Nov 16 17:22:03 2015
@@ -90,6 +90,26 @@ final public class RareWordTerm {
    }
 
    /**
+    * @return each token in the term as a separate String
+    */
+   public String[] getTokens() {
+      final String[] tokens = new String[ _tokenCount ];
+      int tokenIndex = 0;
+      int previousSpaceIndex = -1;
+      int spaceIndex = _text.indexOf( ' ' );
+      while ( spaceIndex > 0 && tokenIndex < _tokenCount ) {
+         tokens[ tokenIndex ] = _text.substring( previousSpaceIndex + 1, spaceIndex );
+         tokenIndex++;
+         previousSpaceIndex = spaceIndex;
+         spaceIndex = _text.indexOf( ' ', previousSpaceIndex + 1 );
+      }
+      if ( previousSpaceIndex + 1 < _text.length() ) {
+         tokens[ _tokenCount - 1 ] = _text.substring( previousSpaceIndex + 1 );
+      }
+      return tokens;
+   }
+
+   /**
     * {@inheritDoc}
     */
    @Override



Mime
View raw message