ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1541553 - /ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
Date Wed, 13 Nov 2013 15:26:30 GMT
Author: tmill
Date: Wed Nov 13 15:26:29 2013
New Revision: 1541553

URL: http://svn.apache.org/r1541553
Log:
Fixes CTAKES-266. Checks that the word token preceding a contraction is not zero-length before creating it.

Modified:
    ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java

Modified: ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java?rev=1541553&r1=1541552&r2=1541553&view=diff
==============================================================================
--- ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java	(original)
+++ ctakes/trunk/ctakes-core/src/main/java/org/apache/ctakes/core/nlp/tokenizer/TokenizerPTB.java	Wed Nov 13 15:26:29 2013
@@ -343,12 +343,13 @@ public class TokenizerPTB {
 			        	char c = lowerCasedText.charAt(currentPosition+len);
 			        	if (c=='n' || c==APOSTROPHE) { // if a "n't" contraction or a contraction where
contraction token starts with '
 			        	    if (tokenLen < 0) throw new RuntimeException("c = " + c + "tokenLen =
" + tokenLen + " currentPosition = " + currentPosition);
-			        	    // First create the WordToken (no apostrophe)
-			        	    bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen,
offsetAdjustment);
-			        	    //System.out.println("bta = " + bta + " class = " + bta.getClass() + " tokenLen
= " + tokenLen + " currentPosition = " + currentPosition);
-			        	    tokens.add(bta);
-			        	    currentPosition+=tokenLen; // currentPosition
-
+			        	    // First create the WordToken (no apostrophe)
+			        	    if(tokenLen > 0){
+			        	      bta = createToken(tokenClass, textSegment, jcas, currentPosition, currentPosition+tokenLen,
offsetAdjustment);
+			        	      //System.out.println("bta = " + bta + " class = " + bta.getClass() + "
tokenLen = " + tokenLen + " currentPosition = " + currentPosition);
+			        	      tokens.add(bta);
+			        	      currentPosition+=tokenLen; // currentPosition
+			        	    }
 			        	    // Set up to create the second token, for other contractions, the next token
will start with an 
 			        	    // apostrophe and be handled above... but for "n't" contractions, next token
won't start with apostrophe
 			        	    // so just go ahead and handle it here instead of having to keep track of
previous 



Mime
View raw message