ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1631587 - /ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java
Date Mon, 13 Oct 2014 22:23:58 GMT
Author: tmill
Date: Mon Oct 13 22:23:57 2014
New Revision: 1631587

URL: http://svn.apache.org/r1631587
Log:
CTAKES-82: Added enum for central handling of different token types.

Modified:
    ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java

Modified: ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java?rev=1631587&r1=1631586&r2=1631587&view=diff
==============================================================================
--- ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java	(original)
+++ ctakes/trunk/ctakes-temporal/src/main/java/org/apache/ctakes/temporal/ae/THYMETreebankReader.java	Mon Oct 13 22:23:57 2014
@@ -24,14 +24,21 @@ import java.io.IOException;
 import java.net.URI;
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.HashMap;
 import java.util.List;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
+import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
+import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
 import org.apache.ctakes.typesystem.type.syntax.TerminalTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TopTreebankNode;
 import org.apache.ctakes.typesystem.type.syntax.TreebankNode;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
@@ -46,6 +53,7 @@ import org.apache.uima.fit.util.JCasUtil
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
 import org.apache.uima.jcas.cas.StringArray;
+import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.util.FileUtils;
 import org.apache.uima.util.Level;
@@ -62,6 +70,8 @@ public class THYMETreebankReader extends
 	protected File treebankDirectory;
 	File[] subdirs = null;
 
+	enum TOKEN_TYPE {WORD, PUNCT, SYMBOL, NUM, NEWLINE, CONTRACTION }
+	
 	@Override
 	public void initialize(UimaContext aContext) throws ResourceInitializationException {
 		super.initialize(aContext);
@@ -127,8 +137,24 @@ public class THYMETreebankReader extends
 		for(Sentence sent : sents){
 			sent.removeFromIndexes();
 		}
+		HashMap<String,TOKEN_TYPE> tokMap = new HashMap<>();
 		List<BaseToken> toks = new ArrayList<BaseToken>(JCasUtil.select(jcas, BaseToken.class));
 		for(BaseToken tok : toks){
+		  String key = getAnnotationKey(tok);
+		  
+		  if(tok instanceof WordToken){
+		    tokMap.put(key, TOKEN_TYPE.WORD);
+		  }else if(tok instanceof PunctuationToken){
+		    tokMap.put(key, TOKEN_TYPE.PUNCT);
+		  }else if(tok instanceof SymbolToken){
+		    tokMap.put(key, TOKEN_TYPE.SYMBOL);
+		  }else if(tok instanceof NumToken){
+		    tokMap.put(key,  TOKEN_TYPE.NUM);
+		  }else if(tok instanceof NewlineToken){
+		    tokMap.put(key, TOKEN_TYPE.NEWLINE);
+		  }else if(tok instanceof ContractionToken){
+		    tokMap.put(key, TOKEN_TYPE.CONTRACTION);
+		  }
 			tok.removeFromIndexes();
 		}
 		
@@ -144,11 +170,39 @@ public class THYMETreebankReader extends
 			// create the Tokens and add them to the Sentence
 			for (int i = 0; i < tree.getTerminals().size(); i++) {
 				TreebankNode leaf = tree.getTerminals(i);
-				if (leaf.getBegin() != leaf.getEnd()) {
-					BaseToken token = new BaseToken(jcas, leaf.getBegin(), leaf.getEnd());
-					token.setPartOfSpeech(leaf.getNodeType());
-					token.addToIndexes();
-				}
+        if (leaf.getBegin() != leaf.getEnd()) {
+          String key = getAnnotationKey(leaf);
+          BaseToken token = null;
+          if(tokMap.containsKey(key)){
+            TOKEN_TYPE tokType = tokMap.get(key);
+            switch(tokType){            
+            case CONTRACTION:
+              token = new ContractionToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            case NEWLINE:
+              token = new NewlineToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            case NUM:
+              token = new NumToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            case PUNCT:
+              token = new PunctuationToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            case SYMBOL:
+              token = new SymbolToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            case WORD:
+              token = new WordToken(jcas, leaf.getBegin(), leaf.getEnd());
+              break;
+            default:
+              token = new BaseToken(jcas, leaf.getBegin(), leaf.getEnd());
+            }
+          }else{
+            token = new BaseToken(jcas, leaf.getBegin(), leaf.getEnd());
+          }
+          token.setPartOfSpeech(leaf.getNodeType());
+          token.addToIndexes();
+        }
 			}
 		}
 	}
@@ -264,4 +318,8 @@ public class THYMETreebankReader extends
 			System.out.println("FOund match at: " + m.start() + "-" + m.end());
 		}
 	}
+	
+	public static final String getAnnotationKey(Annotation a){
+	  return a.getBegin() + "-" + a.getEnd();
+	}
 }



Mime
View raw message