incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From james-mas...@apache.org
Subject svn commit: r1403989 [12/28] - in /incubator/ctakes/branches/SHARPn-cTAKES: Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency Parser/src/org/chboston/cnlp/ctakes/parse...
Date Wed, 31 Oct 2012 05:26:55 GMT
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/Tokenizer.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/Tokenizer.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/Tokenizer.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/Tokenizer.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,567 +14,567 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.nlp.tokenizer;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-/**
- * A class used to break natural text into tokens. The token markup is external
- * to the text and is not embedded like XML. Character offset location is used
- * to identify the boundaries of a token.
- * 
- * @author Mayo Clinic
- */
-public class Tokenizer {
-	private OffsetComparator iv_offsetComp = new OffsetComparator();
-
-	// key = hypenated String obj, value = freq Integer obj
-	private Map<String, Integer> iv_hyphMap;
-
-	private int iv_freqCutoff;
-
-	/**
-	 * Constructor
-	 */
-	public Tokenizer() {
-	}
-
-	/**
-	 * Constructor
-	 * 
-	 * @param hyphMap
-	 *            Map where key=hyphenated string (lower cased) value=freq
-	 *            Integer
-	 * 
-	 * @param freqCutoff
-	 *            frequency cutoff
-	 */
-	public Tokenizer(Map<String, Integer> hyphMap, int freqCutoff) {
-		iv_hyphMap = hyphMap;
-		iv_freqCutoff = freqCutoff;
-	}
-
-	/**
-	 * Validate the structure of the hyphen map.
-	 * 
-	 * @param hyphMap
-	 */
-	public static void validateHyphenMap(Map<String, Integer> hyphMap)
-			throws Exception {
-		Iterator<String> keyItr = hyphMap.keySet().iterator();
-		while (keyItr.hasNext()) {
-			String key = keyItr.next();
-			Object val = hyphMap.get(key);
-			if (val == null) {
-				throw new Exception(
-						"Hyphen map is missing frequency data for key=" + key);
-			}
-			if ((val instanceof Integer) == false) {
-				throw new Exception(
-						"Hyphen map has non java.lang.Integer frequency data for key=" + key);
-			}
-		}
-	}
-
-	/**
-	 * Tokenizes a string of text and outputs a list of Token objects in sorted
-	 * order.
-	 * 
-	 * @param text
-	 *            The text to tokenize.
-	 * @return A list of Token objects sorted by the order they appear in the
-	 *         text.
-	 * @throws Exception
-	 *             Thrown if an error occurs while tokenizing.
-	 */
-	public List<Token> tokenizeAndSort(String text) throws Exception {
-		List<Token> tokenList = tokenize(text);
-
-		// sort tokens by offset
-		Collections.sort(tokenList, iv_offsetComp);
-
-		return tokenList;
-	}
-
-	/**
-	 * Tokenizes a string of text and outputs a list of Token objects. The list
-	 * is not guaranteed to be sorted.
-	 * 
-	 * @param text The text to tokenize.
-	 * @return A list of Token objects.
-	 * @throws Exception
-	 */
-	public List<Token> tokenize(String text) throws Exception {
-		try {
-			List<Token> eolTokens = getEndOfLineTokens(text);
-
-			// Break text into raw tokens (whitespace-delimited text)
-			List<Token> tokens = getRawTokens(text);
-
-			// Detect punctuation and symbols inside the raw tokens
-			applyPunctSymbolRules(tokens, text);
-
-			for (int i = 0; i < tokens.size(); i++) {
-				Token token = tokens.get(i);
-				String tokenText = text.substring(token.getStartOffset(), token
-						.getEndOffset());
-				if (token.getType() != Token.TYPE_PUNCT) {
-					if (isNumber(tokenText)) {
-						token.setType(Token.TYPE_NUMBER);
-
-						token.setIsInteger(isInteger(tokenText));
-					}
-
-					if (token.getType() == Token.TYPE_UNKNOWN) {
-						// token must be a word if it's not classified yet
-						token.setType(Token.TYPE_WORD);
-					}
-
-					if (token.getType() == Token.TYPE_WORD) {
-						applyCapitalizationRules(token, tokenText);
-						applyWordNumRules(token, tokenText);
-					}
-				}
-			}
-			tokens.addAll(eolTokens);
-
-			// set text for tokens
-			for (int i = 0; i < tokens.size(); i++) {
-				Token t = tokens.get(i);
-				t.setText(text.substring(t.getStartOffset(), t.getEndOffset()));
-			}
-
-			return tokens;
-			
-		} catch (Exception e) {
-			e.printStackTrace();
-			throw new Exception("Internal Error with Tokenizer.");
-		}
-	}
-
-	/**
-	 * Applies punctuation/symbol rules to the given list of tokens. Tokens that
-	 * are punctuation/symbols are marked as such. Tokens that contain
-	 * punctuation/symbols inside them are split into multiple tokens, one of
-	 * which is the inner punctuation/symbol token.
-	 * 
-	 * @param tokens List of tokens to apply rules to.
-	 * @param text The original text.
-	 */
-	private void applyPunctSymbolRules(List<Token> tokens, String text) {
-		List<Token> newTokenList = new ArrayList<Token>();
-		List<Token> removeTokenList = new ArrayList<Token>();
-
-		for (int tIndex = 0; tIndex < tokens.size(); tIndex++) {
-			Token token = tokens.get(tIndex);
-			String tokenText = text.substring(token.getStartOffset(), token
-					.getEndOffset());
-
-			if (tokenText.length() == 1) {
-				char currentChar = tokenText.charAt(0);
-				// token is only 1 character long, check if it's a symbol
-				if (!isAlphabetLetterOrDigit(currentChar)) {
-					if (isPunctuation(currentChar)) {
-						token.setType(Token.TYPE_PUNCT);
-					} 
-					else {
-						token.setType(Token.TYPE_SYMBOL);
-					}
-				}
-				continue;
-			}
-
-			// punctuation/symbol at start of token
-			int startCnt = processStartPunctSymbol(newTokenList, token,
-					tokenText);
-			// adjust original token to no longer include the punctuation/symbol
-			token.setStartOffset(token.getStartOffset() + startCnt);
-
-			// punctuation at end of token
-			tokenText = text.substring(token.getStartOffset(), token
-					.getEndOffset());
-			int endCnt = processEndPunctSymbol(newTokenList, token, tokenText);
-			// adjust original token to no longer include the punctuation/symbol
-			token.setEndOffset(token.getEndOffset() - endCnt);
-
-			// If the original token was only a punctuation or symbol,
-			// and the start and end punctuation/symbol
-			// has been stripped off, it's possible to now have an empty token
-			// In that case, remove the empty token
-			if (token.getStartOffset() == token.getEndOffset()) {
-				removeTokenList.add(token);
-			}
-
-			// contractions
-			tokenText = text.substring(token.getStartOffset(), token
-					.getEndOffset());
-			int aposIndex = tokenText.indexOf('\'');
-			if (aposIndex != -1) {
-				Token cpToken = null;
-				String afterAposStr = tokenText.substring(aposIndex + 1,
-						tokenText.length());
-				if (afterAposStr.length() == 1) {
-					// handle xxx'd (e.g. we'd)
-					// handle xxx'm (e.g. I'm)
-					// handle xxx's (e.g. it's)
-					if (afterAposStr.equalsIgnoreCase("d")
-							|| afterAposStr.equalsIgnoreCase("m")
-							|| afterAposStr.equalsIgnoreCase("s")) {
-						cpToken = new Token(token.getStartOffset() + aposIndex,
-								token.getEndOffset());
-					}
-					// handle xxxn't (e.g. won't don't)
-					else if (afterAposStr.equalsIgnoreCase("t")) {
-						String beforeAposChar = tokenText.substring(
-								aposIndex - 1, aposIndex);
-						if (beforeAposChar.equalsIgnoreCase("n")) {
-							cpToken = new Token(token.getStartOffset()
-									+ aposIndex - 1, token.getEndOffset());
-						}
-					}
-				} else if (afterAposStr.length() == 2) {
-					// handle xxx're (e.g. they're)
-					// handle xxx've (e.g. they've)
-					// handle xxx'll (e.g. they'll)
-					if (afterAposStr.equalsIgnoreCase("re")
-							|| afterAposStr.equalsIgnoreCase("ve")
-							|| afterAposStr.equalsIgnoreCase("ll")) {
-						cpToken = new Token(token.getStartOffset() + aposIndex,
-								token.getEndOffset());
-					}
-				}
-				if (cpToken != null) {
-					cpToken.setType(Token.TYPE_CONTRACTION);
-					newTokenList.add(cpToken);
-					// adjust original token to no longer include the
-					// contraction
-					// or possessive
-					token.setEndOffset(cpToken.getStartOffset());
-				}
-			} else if (tokenText.equalsIgnoreCase("cannot")) {
-				// special case where cannot needs to be split into can & not
-				Token notToken = new Token(token.getStartOffset() + 3, token
-						.getEndOffset());
-				notToken.setType(Token.TYPE_WORD);
-				newTokenList.add(notToken);
-				// adjust original token to no longer include "not"
-				token.setEndOffset(token.getStartOffset() + 3);
-			}
-
-			// punctuation inside the token
-			tokenText = text.substring(token.getStartOffset(), token
-					.getEndOffset());
-			boolean foundSomethingInside = findPunctSymbolInsideToken(tokens,
-					token, tokenText);
+package edu.mayo.bmi.nlp.tokenizer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * A class used to break natural text into tokens. The token markup is external
+ * to the text and is not embedded like XML. Character offset location is used
+ * to identify the boundaries of a token.
+ * 
+ * @author Mayo Clinic
+ */
+public class Tokenizer {
+	private OffsetComparator iv_offsetComp = new OffsetComparator();
+
+	// key = hypenated String obj, value = freq Integer obj
+	private Map<String, Integer> iv_hyphMap;
+
+	private int iv_freqCutoff;
+
+	/**
+	 * Constructor
+	 */
+	public Tokenizer() {
+	}
+
+	/**
+	 * Constructor
+	 * 
+	 * @param hyphMap
+	 *            Map where key=hyphenated string (lower cased) value=freq
+	 *            Integer
+	 * 
+	 * @param freqCutoff
+	 *            frequency cutoff
+	 */
+	public Tokenizer(Map<String, Integer> hyphMap, int freqCutoff) {
+		iv_hyphMap = hyphMap;
+		iv_freqCutoff = freqCutoff;
+	}
+
+	/**
+	 * Validate the structure of the hyphen map.
+	 * 
+	 * @param hyphMap
+	 */
+	public static void validateHyphenMap(Map<String, Integer> hyphMap)
+			throws Exception {
+		Iterator<String> keyItr = hyphMap.keySet().iterator();
+		while (keyItr.hasNext()) {
+			String key = keyItr.next();
+			Object val = hyphMap.get(key);
+			if (val == null) {
+				throw new Exception(
+						"Hyphen map is missing frequency data for key=" + key);
+			}
+			if ((val instanceof Integer) == false) {
+				throw new Exception(
+						"Hyphen map has non java.lang.Integer frequency data for key=" + key);
+			}
+		}
+	}
+
+	/**
+	 * Tokenizes a string of text and outputs a list of Token objects in sorted
+	 * order.
+	 * 
+	 * @param text
+	 *            The text to tokenize.
+	 * @return A list of Token objects sorted by the order they appear in the
+	 *         text.
+	 * @throws Exception
+	 *             Thrown if an error occurs while tokenizing.
+	 */
+	public List<Token> tokenizeAndSort(String text) throws Exception {
+		List<Token> tokenList = tokenize(text);
+
+		// sort tokens by offset
+		Collections.sort(tokenList, iv_offsetComp);
+
+		return tokenList;
+	}
+
+	/**
+	 * Tokenizes a string of text and outputs a list of Token objects. The list
+	 * is not guaranteed to be sorted.
+	 * 
+	 * @param text The text to tokenize.
+	 * @return A list of Token objects.
+	 * @throws Exception
+	 */
+	public List<Token> tokenize(String text) throws Exception {
+		try {
+			List<Token> eolTokens = getEndOfLineTokens(text);
+
+			// Break text into raw tokens (whitespace-delimited text)
+			List<Token> tokens = getRawTokens(text);
+
+			// Detect punctuation and symbols inside the raw tokens
+			applyPunctSymbolRules(tokens, text);
+
+			for (int i = 0; i < tokens.size(); i++) {
+				Token token = tokens.get(i);
+				String tokenText = text.substring(token.getStartOffset(), token
+						.getEndOffset());
+				if (token.getType() != Token.TYPE_PUNCT) {
+					if (isNumber(tokenText)) {
+						token.setType(Token.TYPE_NUMBER);
+
+						token.setIsInteger(isInteger(tokenText));
+					}
+
+					if (token.getType() == Token.TYPE_UNKNOWN) {
+						// token must be a word if it's not classified yet
+						token.setType(Token.TYPE_WORD);
+					}
+
+					if (token.getType() == Token.TYPE_WORD) {
+						applyCapitalizationRules(token, tokenText);
+						applyWordNumRules(token, tokenText);
+					}
+				}
+			}
+			tokens.addAll(eolTokens);
+
+			// set text for tokens
+			for (int i = 0; i < tokens.size(); i++) {
+				Token t = tokens.get(i);
+				t.setText(text.substring(t.getStartOffset(), t.getEndOffset()));
+			}
+
+			return tokens;
+			
+		} catch (Exception e) {
+			e.printStackTrace();
+			throw new Exception("Internal Error with Tokenizer.");
+		}
+	}
+
+	/**
+	 * Applies punctuation/symbol rules to the given list of tokens. Tokens that
+	 * are punctuation/symbols are marked as such. Tokens that contain
+	 * punctuation/symbols inside them are split into multiple tokens, one of
+	 * which is the inner punctuation/symbol token.
+	 * 
+	 * @param tokens List of tokens to apply rules to.
+	 * @param text The original text.
+	 */
+	private void applyPunctSymbolRules(List<Token> tokens, String text) {
+		List<Token> newTokenList = new ArrayList<Token>();
+		List<Token> removeTokenList = new ArrayList<Token>();
+
+		for (int tIndex = 0; tIndex < tokens.size(); tIndex++) {
+			Token token = tokens.get(tIndex);
+			String tokenText = text.substring(token.getStartOffset(), token
+					.getEndOffset());
+
+			if (tokenText.length() == 1) {
+				char currentChar = tokenText.charAt(0);
+				// token is only 1 character long, check if it's a symbol
+				if (!isAlphabetLetterOrDigit(currentChar)) {
+					if (isPunctuation(currentChar)) {
+						token.setType(Token.TYPE_PUNCT);
+					} 
+					else {
+						token.setType(Token.TYPE_SYMBOL);
+					}
+				}
+				continue;
+			}
+
+			// punctuation/symbol at start of token
+			int startCnt = processStartPunctSymbol(newTokenList, token,
+					tokenText);
+			// adjust original token to no longer include the punctuation/symbol
+			token.setStartOffset(token.getStartOffset() + startCnt);
+
+			// punctuation at end of token
+			tokenText = text.substring(token.getStartOffset(), token
+					.getEndOffset());
+			int endCnt = processEndPunctSymbol(newTokenList, token, tokenText);
+			// adjust original token to no longer include the punctuation/symbol
+			token.setEndOffset(token.getEndOffset() - endCnt);
+
+			// If the original token was only a punctuation or symbol,
+			// and the start and end punctuation/symbol
+			// has been stripped off, it's possible to now have an empty token
+			// In that case, remove the empty token
+			if (token.getStartOffset() == token.getEndOffset()) {
+				removeTokenList.add(token);
+			}
+
+			// contractions
+			tokenText = text.substring(token.getStartOffset(), token
+					.getEndOffset());
+			int aposIndex = tokenText.indexOf('\'');
+			if (aposIndex != -1) {
+				Token cpToken = null;
+				String afterAposStr = tokenText.substring(aposIndex + 1,
+						tokenText.length());
+				if (afterAposStr.length() == 1) {
+					// handle xxx'd (e.g. we'd)
+					// handle xxx'm (e.g. I'm)
+					// handle xxx's (e.g. it's)
+					if (afterAposStr.equalsIgnoreCase("d")
+							|| afterAposStr.equalsIgnoreCase("m")
+							|| afterAposStr.equalsIgnoreCase("s")) {
+						cpToken = new Token(token.getStartOffset() + aposIndex,
+								token.getEndOffset());
+					}
+					// handle xxxn't (e.g. won't don't)
+					else if (afterAposStr.equalsIgnoreCase("t")) {
+						String beforeAposChar = tokenText.substring(
+								aposIndex - 1, aposIndex);
+						if (beforeAposChar.equalsIgnoreCase("n")) {
+							cpToken = new Token(token.getStartOffset()
+									+ aposIndex - 1, token.getEndOffset());
+						}
+					}
+				} else if (afterAposStr.length() == 2) {
+					// handle xxx're (e.g. they're)
+					// handle xxx've (e.g. they've)
+					// handle xxx'll (e.g. they'll)
+					if (afterAposStr.equalsIgnoreCase("re")
+							|| afterAposStr.equalsIgnoreCase("ve")
+							|| afterAposStr.equalsIgnoreCase("ll")) {
+						cpToken = new Token(token.getStartOffset() + aposIndex,
+								token.getEndOffset());
+					}
+				}
+				if (cpToken != null) {
+					cpToken.setType(Token.TYPE_CONTRACTION);
+					newTokenList.add(cpToken);
+					// adjust original token to no longer include the
+					// contraction
+					// or possessive
+					token.setEndOffset(cpToken.getStartOffset());
+				}
+			} else if (tokenText.equalsIgnoreCase("cannot")) {
+				// special case where cannot needs to be split into can & not
+				Token notToken = new Token(token.getStartOffset() + 3, token
+						.getEndOffset());
+				notToken.setType(Token.TYPE_WORD);
+				newTokenList.add(notToken);
+				// adjust original token to no longer include "not"
+				token.setEndOffset(token.getStartOffset() + 3);
+			}
+
+			// punctuation inside the token
+			tokenText = text.substring(token.getStartOffset(), token
+					.getEndOffset());
+			boolean foundSomethingInside = findPunctSymbolInsideToken(tokens,
+					token, tokenText);
 			// sourceforge bug tracker #3072902
 			// if nothing left after remove the contraction, such as the line " n't "
 			// then all of token was turned into a contraction token
 			if (token.getEndOffset()== token.getStartOffset()) foundSomethingInside = true;
-			if (foundSomethingInside) {
-				removeTokenList.add(token);
-			}
-		}
-		tokens.addAll(newTokenList);
-		for (int i = 0; i < removeTokenList.size(); i++) {
-			Token tokenToBeRemoved = removeTokenList.get(i);
-			tokens.remove(tokenToBeRemoved);
-		}
-	}
-
-	private int processStartPunctSymbol(List<Token> newTokenList, Token token,
-			String tokenText) {
-		int count = 0;
-		for (int i = 0; i < tokenText.length(); i++) {
-			char currentChar = tokenText.charAt(i);
-			if (!isAlphabetLetterOrDigit(currentChar)) {
-				Token t = new Token(token.getStartOffset() + i, token
-						.getStartOffset()
-						+ i + 1);
-
-				if (isPunctuation(currentChar)) {
-					t.setType(Token.TYPE_PUNCT);
-				} else {
-					t.setType(Token.TYPE_SYMBOL);
-				}
-				newTokenList.add(t);
-				count++;
-			} else { // encountered a letter or digit, stop
-				return count;
-			}
-		}
-		return count;
-	}
-
-	private int processEndPunctSymbol(List<Token> newTokenList, Token token,
-			String tokenText) {
-		int count = 0;
-		for (int i = tokenText.length() - 1; i >= 0; i--) {
-			char currentChar = tokenText.charAt(i);
-			if (!isAlphabetLetterOrDigit(currentChar)) {
-				Token t = new Token(token.getStartOffset() + i, token
-						.getStartOffset()
-						+ i + 1);
-
-				if (isPunctuation(currentChar)) {
-					t.setType(Token.TYPE_PUNCT);
-				} else {
-					t.setType(Token.TYPE_SYMBOL);
-				}
-
-				newTokenList.add(t);
-				count++;
-			} else { // encountered a letter or digit, stop
-				return count;
-			}
-		}
-		return count;
-	}
-
-	private int getFirstInsidePunctSymbol(String tokenText) {
-		for (int i = 0; i < tokenText.length(); i++) {
-			char currentChar = tokenText.charAt(i);
-			
-			if (currentChar == ',' && !isNumber(tokenText)) {
-				return i;
-			}
-			if (currentChar == '.' && !isNumber(tokenText)) {
-				return i;
-			}
-			
-
-			if ((isAlphabetLetterOrDigit(currentChar) == false)
-					&& (currentChar != '.') && (currentChar != ',')
-					&& (currentChar != ':') && (currentChar != ';')) {
-				return i;
-			}
-		}
-		return -1;
-	}
-
-	/**
-	 * Finds punctuation/symbols located inside a token. If found, the token is
-	 * split into multiple Tokens. Note that the method is recursive.
-	 * 
-	 * @param tokens
-	 * @param token
-	 * @param tokenText
-	 * @return
-	 */
-	private boolean findPunctSymbolInsideToken(List<Token> tokens, Token token,
-			String tokenText) {
-		int startOffset = token.getStartOffset();
-		int punctSymbolOffset = getFirstInsidePunctSymbol(tokenText);
-		if (punctSymbolOffset != -1) {
-			char c = tokenText.charAt(punctSymbolOffset);
-
-			// logic for hypenation
-			if (c == '-') {
-				if ((iv_hyphMap != null)
-						&& iv_hyphMap.containsKey(tokenText.toLowerCase())) {
-					int freq = ((Integer) iv_hyphMap.get(tokenText
-							.toLowerCase())).intValue();
-					if (freq > iv_freqCutoff) {
-						if (!tokens.contains(token)) {
-							tokens.add(token);
-							return true;
-						}
-						return false;
-					}
-				}
-			}
-
-			Token t = new Token(startOffset + punctSymbolOffset, startOffset
-					+ punctSymbolOffset + 1);
-			if (isPunctuation(c)) {
-				t.setType(Token.TYPE_PUNCT);
-			} else {
-				t.setType(Token.TYPE_SYMBOL);
-			}
-
-			tokens.add(t);
-			if (startOffset != t.getStartOffset()) {
-				Token leftToken = new Token(startOffset, t.getStartOffset());
-				tokens.add(leftToken);
-			}
-
-			Token rightToken = new Token(t.getEndOffset(), token.getEndOffset());
-			String rightTokenText = tokenText.substring(punctSymbolOffset + 1,
-					tokenText.length());
-			// recurse
-			return findPunctSymbolInsideToken(tokens, rightToken,
-					rightTokenText);
-		} else {
-			if (!tokens.contains(token)) {
-				tokens.add(token);
-				return true;
-			}
-			return false;
-		}
-	}
-
-	private boolean isPunctuation(char c) {
-		if ((c == ';') || (c == ':') || (c == ',') || (c == '.') || (c == '(')
-				|| (c == ')') || (c == '[') || (c == ']') || (c == '{')
-				|| (c == '}') || (c == '<') || (c == '>') || (c == '\'')
-				|| (c == '"') || (c == '/') || (c == '\\') || (c == '-')) {
-			return true;
-		} else {
-			return false;
-		}
-	}
-
-	private boolean isAlphabetLetterOrDigit(char c) {
-		if (isAlphabetLetter(c))
-			return true;
-		if (isDigit(c))
-			return true; // otherwise
-		return false;
-	}
-
-	public boolean isAlphabetLetter(char c) {
-		int unicode = Character.getNumericValue(c);
-		if ((unicode >= 10) && (unicode <= 35))
-			return true;
-		else
-			return false;
-	}
-
-	private boolean isDigit(char c) {
-		int unicode = Character.getNumericValue(c);
-		if ((unicode >= 0) && (unicode <= 9))
-			return true;
-		else
-			return false;
-	}
-
-	/**
-	 * Applies number rules to the given token.
-	 * 
-	 * @param tokenText
-	 * @return True if the token is a number, false otherwise.
-	 */
-	public static boolean isNumber(String tokenText) {
-		final char decimalPoint = '.';
-		boolean foundDecimalPoint = false;
-		int charsBeforeDecimal = 0;
-		for (int i = tokenText.length() - 1; i >= 0; i--) {
-			char currentChar = tokenText.charAt(i);
-			if (Character.isDigit(currentChar) == false) {
-				if ((currentChar == decimalPoint)
-						&& (foundDecimalPoint == false)) {
-					foundDecimalPoint = true;
-					charsBeforeDecimal = 0;
-					continue;
-				} else if (currentChar == ',') { // commas are valid only
-													// every 3 digits
-					if (charsBeforeDecimal % 3 == 0) {
-						continue;
-					} else {
-						return false;
-					}
-				} // otherwise it's a letter or punct
-				return false;
-			}
-			charsBeforeDecimal++;
-		}
-		return true;
-	}
-
-	/**
-	 * Given that the token text is a number, this method will determine if the
-	 * number is an integer or not.
-	 * 
-	 * @param tokenText
-	 * @return
-	 */
-	private boolean isInteger(String tokenText) {
-		if (tokenText.indexOf('.') != -1) {
-			return false;
-		} else {
-			return true;
-		}
-	}
-
-	/**
-	 * Applies capitalization rules to the given token. This should normally
-	 * only be used for tokens containing strictly text, but mixtures of
-	 * letters, numbers, and symbols are allowed too.
-	 * 
-	 * @param token
-	 * @param tokenText
-	 */
-	private void applyCapitalizationRules(Token token, String tokenText) {
-		// true = upper case, false = lower case
-		boolean[] uppercaseMask = new boolean[tokenText.length()];
-		boolean isAllUppercase = true;
-		boolean isAllLowercase = true;
-		for (int i = 0; i < tokenText.length(); i++) {
-			char currentChar = tokenText.charAt(i);
-			uppercaseMask[i] = Character.isUpperCase(currentChar);
-			if (uppercaseMask[i] == false)
-				isAllUppercase = false;
-			else
-				isAllLowercase = false;
-		}
-
-		if (isAllLowercase) {
-			token.setCaps(Token.CAPS_NONE);
-		} else if (isAllUppercase) {
-			token.setCaps(Token.CAPS_ALL);
-		} else if (uppercaseMask[0] == true) {
-			if (uppercaseMask.length == 1) {
-				token.setCaps(Token.CAPS_FIRST_ONLY);
-				return;
-			}
-			boolean isRestLowercase = true;
-			for (int i = 1; i < uppercaseMask.length; i++) {
-				if (uppercaseMask[i] == true)
-					isRestLowercase = false;
-			}
-			if (isRestLowercase) {
-				token.setCaps(Token.CAPS_FIRST_ONLY);
-			} else {
-				token.setCaps(Token.CAPS_MIXED);
-			}
-		} else {
-			token.setCaps(Token.CAPS_MIXED);
-		}
-		return;
-	}
-
-	private void applyWordNumRules(Token token, String tokenText) {
-		boolean[] digitMask = new boolean[tokenText.length()];
-		boolean isAllLetters = true;
-		for (int i = 0; i < tokenText.length(); i++) {
-			char currentChar = tokenText.charAt(i);
-			digitMask[i] = Character.isDigit(currentChar);
-			if (digitMask[i] == true) {
-				isAllLetters = false;
-			}
-		}
-
-		if (isAllLetters) {
-			token.setNumPosition(Token.NUM_NONE);
-		} else if (digitMask[0] == true) {
-			token.setNumPosition(Token.NUM_FIRST);
-		} else if (digitMask[tokenText.length() - 1]) {
-			token.setNumPosition(Token.NUM_LAST);
-		} else {
-			token.setNumPosition(Token.NUM_MIDDLE);
-		}
-		return;
-	}
-
-	/**
-	 * Gets a list of tokens that mark end of a line.
-	 * 
-	 * @param text
-	 * @return
-	 */
+			if (foundSomethingInside) {
+				removeTokenList.add(token);
+			}
+		}
+		tokens.addAll(newTokenList);
+		for (int i = 0; i < removeTokenList.size(); i++) {
+			Token tokenToBeRemoved = removeTokenList.get(i);
+			tokens.remove(tokenToBeRemoved);
+		}
+	}
+
+	private int processStartPunctSymbol(List<Token> newTokenList, Token token,
+			String tokenText) {
+		int count = 0;
+		for (int i = 0; i < tokenText.length(); i++) {
+			char currentChar = tokenText.charAt(i);
+			if (!isAlphabetLetterOrDigit(currentChar)) {
+				Token t = new Token(token.getStartOffset() + i, token
+						.getStartOffset()
+						+ i + 1);
+
+				if (isPunctuation(currentChar)) {
+					t.setType(Token.TYPE_PUNCT);
+				} else {
+					t.setType(Token.TYPE_SYMBOL);
+				}
+				newTokenList.add(t);
+				count++;
+			} else { // encountered a letter or digit, stop
+				return count;
+			}
+		}
+		return count;
+	}
+
+	private int processEndPunctSymbol(List<Token> newTokenList, Token token,
+			String tokenText) {
+		int count = 0;
+		for (int i = tokenText.length() - 1; i >= 0; i--) {
+			char currentChar = tokenText.charAt(i);
+			if (!isAlphabetLetterOrDigit(currentChar)) {
+				Token t = new Token(token.getStartOffset() + i, token
+						.getStartOffset()
+						+ i + 1);
+
+				if (isPunctuation(currentChar)) {
+					t.setType(Token.TYPE_PUNCT);
+				} else {
+					t.setType(Token.TYPE_SYMBOL);
+				}
+
+				newTokenList.add(t);
+				count++;
+			} else { // encountered a letter or digit, stop
+				return count;
+			}
+		}
+		return count;
+	}
+
+	private int getFirstInsidePunctSymbol(String tokenText) {
+		for (int i = 0; i < tokenText.length(); i++) {
+			char currentChar = tokenText.charAt(i);
+			
+			if (currentChar == ',' && !isNumber(tokenText)) {
+				return i;
+			}
+			if (currentChar == '.' && !isNumber(tokenText)) {
+				return i;
+			}
+			
+
+			if ((isAlphabetLetterOrDigit(currentChar) == false)
+					&& (currentChar != '.') && (currentChar != ',')
+					&& (currentChar != ':') && (currentChar != ';')) {
+				return i;
+			}
+		}
+		return -1;
+	}
+
+	/**
+	 * Finds punctuation/symbols located inside a token. If found, the token is
+	 * split into multiple Tokens. Note that the method is recursive.
+	 * 
+	 * @param tokens
+	 * @param token
+	 * @param tokenText
+	 * @return
+	 */
+	private boolean findPunctSymbolInsideToken(List<Token> tokens, Token token,
+			String tokenText) {
+		int startOffset = token.getStartOffset();
+		int punctSymbolOffset = getFirstInsidePunctSymbol(tokenText);
+		if (punctSymbolOffset != -1) {
+			char c = tokenText.charAt(punctSymbolOffset);
+
+			// logic for hypenation
+			if (c == '-') {
+				if ((iv_hyphMap != null)
+						&& iv_hyphMap.containsKey(tokenText.toLowerCase())) {
+					int freq = ((Integer) iv_hyphMap.get(tokenText
+							.toLowerCase())).intValue();
+					if (freq > iv_freqCutoff) {
+						if (!tokens.contains(token)) {
+							tokens.add(token);
+							return true;
+						}
+						return false;
+					}
+				}
+			}
+
+			Token t = new Token(startOffset + punctSymbolOffset, startOffset
+					+ punctSymbolOffset + 1);
+			if (isPunctuation(c)) {
+				t.setType(Token.TYPE_PUNCT);
+			} else {
+				t.setType(Token.TYPE_SYMBOL);
+			}
+
+			tokens.add(t);
+			if (startOffset != t.getStartOffset()) {
+				Token leftToken = new Token(startOffset, t.getStartOffset());
+				tokens.add(leftToken);
+			}
+
+			Token rightToken = new Token(t.getEndOffset(), token.getEndOffset());
+			String rightTokenText = tokenText.substring(punctSymbolOffset + 1,
+					tokenText.length());
+			// recurse
+			return findPunctSymbolInsideToken(tokens, rightToken,
+					rightTokenText);
+		} else {
+			if (!tokens.contains(token)) {
+				tokens.add(token);
+				return true;
+			}
+			return false;
+		}
+	}
+
+	private boolean isPunctuation(char c) {
+		if ((c == ';') || (c == ':') || (c == ',') || (c == '.') || (c == '(')
+				|| (c == ')') || (c == '[') || (c == ']') || (c == '{')
+				|| (c == '}') || (c == '<') || (c == '>') || (c == '\'')
+				|| (c == '"') || (c == '/') || (c == '\\') || (c == '-')) {
+			return true;
+		} else {
+			return false;
+		}
+	}
+
+	private boolean isAlphabetLetterOrDigit(char c) {
+		if (isAlphabetLetter(c))
+			return true;
+		if (isDigit(c))
+			return true; // otherwise
+		return false;
+	}
+
+	public boolean isAlphabetLetter(char c) {
+		int unicode = Character.getNumericValue(c);
+		if ((unicode >= 10) && (unicode <= 35))
+			return true;
+		else
+			return false;
+	}
+
+	private boolean isDigit(char c) {
+		int unicode = Character.getNumericValue(c);
+		if ((unicode >= 0) && (unicode <= 9))
+			return true;
+		else
+			return false;
+	}
+
+	/**
+	 * Applies number rules to the given token.
+	 * 
+	 * @param tokenText
+	 * @return True if the token is a number, false otherwise.
+	 */
+	public static boolean isNumber(String tokenText) {
+		final char decimalPoint = '.';
+		boolean foundDecimalPoint = false;
+		int charsBeforeDecimal = 0;
+		for (int i = tokenText.length() - 1; i >= 0; i--) {
+			char currentChar = tokenText.charAt(i);
+			if (Character.isDigit(currentChar) == false) {
+				if ((currentChar == decimalPoint)
+						&& (foundDecimalPoint == false)) {
+					foundDecimalPoint = true;
+					charsBeforeDecimal = 0;
+					continue;
+				} else if (currentChar == ',') { // commas are valid only
+													// every 3 digits
+					if (charsBeforeDecimal % 3 == 0) {
+						continue;
+					} else {
+						return false;
+					}
+				} // otherwise it's a letter or punct
+				return false;
+			}
+			charsBeforeDecimal++;
+		}
+		return true;
+	}
+
+	/**
+	 * Given that the token text is a number, this method will determine if the
+	 * number is an integer or not.
+	 * 
+	 * @param tokenText
+	 * @return
+	 */
+	private boolean isInteger(String tokenText) {
+		if (tokenText.indexOf('.') != -1) {
+			return false;
+		} else {
+			return true;
+		}
+	}
+
+	/**
+	 * Applies capitalization rules to the given token. This should normally
+	 * only be used for tokens containing strictly text, but mixtures of
+	 * letters, numbers, and symbols are allowed too.
+	 * 
+	 * @param token
+	 * @param tokenText
+	 */
+	private void applyCapitalizationRules(Token token, String tokenText) {
+		// true = upper case, false = lower case
+		boolean[] uppercaseMask = new boolean[tokenText.length()];
+		boolean isAllUppercase = true;
+		boolean isAllLowercase = true;
+		for (int i = 0; i < tokenText.length(); i++) {
+			char currentChar = tokenText.charAt(i);
+			uppercaseMask[i] = Character.isUpperCase(currentChar);
+			if (uppercaseMask[i] == false)
+				isAllUppercase = false;
+			else
+				isAllLowercase = false;
+		}
+
+		if (isAllLowercase) {
+			token.setCaps(Token.CAPS_NONE);
+		} else if (isAllUppercase) {
+			token.setCaps(Token.CAPS_ALL);
+		} else if (uppercaseMask[0] == true) {
+			if (uppercaseMask.length == 1) {
+				token.setCaps(Token.CAPS_FIRST_ONLY);
+				return;
+			}
+			boolean isRestLowercase = true;
+			for (int i = 1; i < uppercaseMask.length; i++) {
+				if (uppercaseMask[i] == true)
+					isRestLowercase = false;
+			}
+			if (isRestLowercase) {
+				token.setCaps(Token.CAPS_FIRST_ONLY);
+			} else {
+				token.setCaps(Token.CAPS_MIXED);
+			}
+		} else {
+			token.setCaps(Token.CAPS_MIXED);
+		}
+		return;
+	}
+
+	private void applyWordNumRules(Token token, String tokenText) {
+		boolean[] digitMask = new boolean[tokenText.length()];
+		boolean isAllLetters = true;
+		for (int i = 0; i < tokenText.length(); i++) {
+			char currentChar = tokenText.charAt(i);
+			digitMask[i] = Character.isDigit(currentChar);
+			if (digitMask[i] == true) {
+				isAllLetters = false;
+			}
+		}
+
+		if (isAllLetters) {
+			token.setNumPosition(Token.NUM_NONE);
+		} else if (digitMask[0] == true) {
+			token.setNumPosition(Token.NUM_FIRST);
+		} else if (digitMask[tokenText.length() - 1]) {
+			token.setNumPosition(Token.NUM_LAST);
+		} else {
+			token.setNumPosition(Token.NUM_MIDDLE);
+		}
+		return;
+	}
+
+	/**
+	 * Gets a list of tokens that mark end of a line.
+	 * 
+	 * @param text
+	 * @return
+	 */
 	private List<Token> getEndOfLineTokens(String text) {
 		final char crChar = '\r';
 		final char nlChar = '\n';
@@ -615,15 +608,15 @@ public class Tokenizer {
 
 		}
 		return eolTokens;
-	}
-
-	/**
-	 * Text is split based on whitespace into raw tokens. A raw token is defined
-	 * as a span of text with no identified type.
-	 * 
-	 * @param text
-	 * @return
-	 */
+	}
+
+	/**
+	 * Text is split based on whitespace into raw tokens. A raw token is defined
+	 * as a span of text with no identified type.
+	 * 
+	 * @param text
+	 * @return
+	 */
 	private List<Token> getRawTokens(String text) {
 		final char wsChar = ' ';
 		final char tabChar = '\t';
@@ -658,6 +651,6 @@ public class Tokenizer {
 		}
 
 		return rawTokens;
-	}
-
-}
+	}
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerHelper.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerHelper.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerHelper.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerHelper.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2011   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerPTB.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerPTB.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerPTB.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/nlp/tokenizer/TokenizerPTB.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2011   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,8 +14,8 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.nlp.tokenizer;
-
+package edu.mayo.bmi.nlp.tokenizer;
+
 import static edu.mayo.bmi.nlp.tokenizer.TokenizerHelper.APOSTROPHE;
 import static edu.mayo.bmi.nlp.tokenizer.TokenizerHelper.COMMA;
 import static edu.mayo.bmi.nlp.tokenizer.TokenizerHelper.CR;
@@ -49,30 +42,30 @@ import edu.mayo.bmi.uima.core.type.synta
 import edu.mayo.bmi.uima.core.type.syntax.PunctuationToken;
 import edu.mayo.bmi.uima.core.type.syntax.SymbolToken;
 import edu.mayo.bmi.uima.core.type.syntax.WordToken;
-
-/**
+
+/**
  * A class used to break natural text into tokens following PTB rules.
  * See Supplementary Guidelines for ETTB 2.0
  * dated April 6th, 2009. 
  * The token markup is external to the text and is not embedded.
- * Character offset location is used to identify the boundaries of a token.
- * 
- * @author Mayo Clinic
- */
+ * Character offset location is used to identify the boundaries of a token.
+ * 
+ * @author Mayo Clinic
+ */
 public class TokenizerPTB {
-    
-    	/**
-	 * Constructor
-	 */
-	public TokenizerPTB() {
-	}
+    
+    	/**
+	 * Constructor
+	 */
+	public TokenizerPTB() {
+	}
 
 	
 	static final String [] emptyStringList = new String[0];
 	static final ArrayList<BaseToken> emptyTokenList = new ArrayList<BaseToken>();
 	    
-
-
+
+
 	/**
 	 * Tokenize text that starts at offset offsetAdjustment within the complete text
 	 * @param jcas
@@ -1287,9 +1280,9 @@ public class TokenizerPTB {
 
 	    }
 
-	}
+	}
 
-}
+}
 
 
 // createToken(Class clas, String s, JCas jcas, int begin, int end, int offsetAdjustment) {

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/CopyAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/CopyAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/CopyAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/CopyAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -137,4 +130,4 @@ public class CopyAnnotator extends JCasA
 		}
 	}
 
-}
\ No newline at end of file
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/FilterAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/FilterAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/FilterAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/FilterAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/NullAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/NullAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/NullAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/NullAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,25 +14,25 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.ae;
-
+package edu.mayo.bmi.uima.core.ae;
+
 import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
-
-/**
- * This annotator does nothing.  The process method is overridden but is empty.
- * None of the other methods are overriden.  This annotator may be useful if
- * you are using the CPE GUI and you are required to specify an analysis engine
- * but you don't actually want to specify one.  
- * 
- * @author Mayo Clinic
- *
- */
-public class NullAnnotator extends JCasAnnotator_ImplBase 
-{
-
-	   public void process(JCas jcas)
-       throws AnalysisEngineProcessException
-       {}
-}
\ No newline at end of file
+
+/**
+ * This annotator does nothing.  The process method is overridden but is empty.
+ * None of the other methods are overridden.  This annotator may be useful if
+ * you are using the CPE GUI and you are required to specify an analysis engine
+ * but you don't actually want to specify one.  
+ * 
+ * @author Mayo Clinic
+ *
+ */
+public class NullAnnotator extends JCasAnnotator_ImplBase 
+{
+
+	   /**
+	    * Intentionally empty: the given CAS is neither read nor modified.
+	    * 
+	    * @param jcas
+	    *            the CAS handed to this annotator; ignored
+	    * @throws AnalysisEngineProcessException
+	    *             declared by the signature, never thrown by this body
+	    */
+	   public void process(JCas jcas)
+       throws AnalysisEngineProcessException
+       {}
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/OverlapAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/OverlapAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/OverlapAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/OverlapAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -485,4 +478,4 @@ public class OverlapAnnotator extends JC
 			}
 		}
 	}
-}
\ No newline at end of file
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SectionSegmentAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SectionSegmentAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SectionSegmentAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SectionSegmentAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SentenceDetector.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SentenceDetector.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SentenceDetector.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -68,4 +61,4 @@ public class SimpleSegmentAnnotator exte
 		segment.setId(segmentId);
 		segment.addToIndexes();
 	}
-}
\ No newline at end of file
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentWithTagsAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentWithTagsAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentWithTagsAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/SimpleSegmentWithTagsAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -137,4 +130,4 @@ public class SimpleSegmentWithTagsAnnota
 		}
 
 	}
-}
\ No newline at end of file
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenConverter.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenConverter.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenConverter.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenConverter.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,209 +14,209 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.ae;
-
-import org.apache.uima.jcas.JCas;
-
-import edu.mayo.bmi.nlp.tokenizer.Token;
-import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
-import edu.mayo.bmi.uima.core.type.syntax.ContractionToken;
-import edu.mayo.bmi.uima.core.type.syntax.NewlineToken;
-import edu.mayo.bmi.uima.core.type.syntax.NumToken;
-import edu.mayo.bmi.uima.core.type.syntax.PunctuationToken;
-import edu.mayo.bmi.uima.core.type.syntax.SymbolToken;
-import edu.mayo.bmi.uima.core.type.syntax.WordToken;
-
-/**
- * Utilities methods for converting between Java Tokenizer objects and their
- * equivalent JCas objects.
- * 
- * @author Mayo Clinic
- */
-public class TokenConverter
-{
-    /**
-     * Converts from Java Tokenizer object into a JCas object.
-     * 
-     * @param tokenArr
-     * @param jcas
-     * @param beginPos
-     * @return
-     */
-    public static BaseToken convert(Token t, JCas jcas, int beginPos)
-    {
-        BaseToken bta = null;
-
-        int begin = beginPos + t.getStartOffset();
-        int end = beginPos + t.getEndOffset();
-
-        switch (t.getType())
-        {
-        case Token.TYPE_WORD:
-            WordToken wta = new WordToken(jcas);
-            wta.setBegin(begin);
-            wta.setEnd(end);
-            int cap = -1;
-            switch (t.getCaps())
-            {
-            case Token.CAPS_ALL:
-                cap = TokenizerAnnotator.TOKEN_CAP_ALL;
-                break;
-            case Token.CAPS_FIRST_ONLY:
-                cap = TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY;
-                break;
-            case Token.CAPS_MIXED:
-                cap = TokenizerAnnotator.TOKEN_CAP_MIXED;
-                break;
-            case Token.CAPS_NONE:
-                cap = TokenizerAnnotator.TOKEN_CAP_NONE;
-                break;
-            }
-
-            int numPos = -1;
-            switch (t.getNumPosition())
-            {
-            case Token.NUM_FIRST:
-                numPos = TokenizerAnnotator.TOKEN_NUM_POS_FIRST;
-                break;
-            case Token.NUM_MIDDLE:
-                numPos = TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE;
-                break;
-            case Token.NUM_LAST:
-                numPos = TokenizerAnnotator.TOKEN_NUM_POS_LAST;
-                break;
-            case Token.NUM_NONE:
-                numPos = TokenizerAnnotator.TOKEN_NUM_POS_NONE;
-                break;
-            }
-            wta.setCapitalization(cap);
-            wta.setNumPosition(numPos);
-            bta = wta;
-            break;
-        case Token.TYPE_NUMBER:
-            NumToken nta = new NumToken(jcas);
-            nta.setBegin(begin);
-            nta.setEnd(end);
-            if (t.isInteger())
-            {
-                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER);
-            }
-            else
-            {
-                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_DECIMAL);
-            }
-            bta = nta;
-            break;
-        case Token.TYPE_PUNCT:
-            PunctuationToken pta = new PunctuationToken(jcas);
-            pta.setBegin(begin);
-            pta.setEnd(end);
-            bta = pta;
-            break;
-        case Token.TYPE_EOL:
-            NewlineToken nlta = new NewlineToken(jcas);
-            nlta.setBegin(begin);
-            nlta.setEnd(end);
-            bta = nlta;
-            break;
-        case Token.TYPE_CONTRACTION:
-            ContractionToken cta = new ContractionToken(
-                    jcas);
-            cta.setBegin(begin);
-            cta.setEnd(end);
-            bta = cta;
-            break;
-        case Token.TYPE_SYMBOL:
-            SymbolToken sta = new SymbolToken(jcas);
-            sta.setBegin(begin);
-            sta.setEnd(end);
-            bta = sta;
-            break;
-        default:
-        }
-
-        return bta;
-    }
-
-    /**
-     * Convert from a JCas object into Java Tokenizer object.
-     * 
-     * @param bta
-     * @return
-     */
-    public static Token convert(BaseToken bta)
-    {
-        Token token = new Token(bta.getBegin(), bta.getEnd());
-        token.setText(bta.getCoveredText());
-
-        if (bta instanceof WordToken)
-        {
-            WordToken wta = (WordToken) bta;
-            token.setType(Token.TYPE_WORD);
-
-            switch (wta.getCapitalization())
-            {
-            case TokenizerAnnotator.TOKEN_CAP_ALL:
-                token.setCaps(Token.CAPS_ALL);
-                break;
-            case TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY:
-                token.setCaps(Token.CAPS_FIRST_ONLY);
-                break;
-            case TokenizerAnnotator.TOKEN_CAP_MIXED:
-                token.setCaps(Token.CAPS_MIXED);
-                break;
-            case TokenizerAnnotator.TOKEN_CAP_NONE:
-                token.setCaps(Token.CAPS_NONE);
-                break;
-            }
-
-            switch (wta.getNumPosition())
-            {
-            case TokenizerAnnotator.TOKEN_NUM_POS_FIRST:
-                token.setNumPosition(Token.NUM_FIRST);
-                break;
-            case TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE:
-                token.setNumPosition(Token.NUM_MIDDLE);
-                break;
-            case TokenizerAnnotator.TOKEN_NUM_POS_LAST:
-                token.setNumPosition(Token.NUM_LAST);
-                break;
-            case TokenizerAnnotator.TOKEN_NUM_POS_NONE:
-                token.setNumPosition(Token.NUM_NONE);
-                break;
-            }
-        }
-        else if (bta instanceof NumToken)
-        {
-            NumToken nta = (NumToken) bta;
-            token.setType(Token.TYPE_NUMBER);
-
-            if (nta.getNumType() == TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER)
-            {
-                token.setIsInteger(true);
-            }
-            else
-            {
-                token.setIsInteger(false);
-            }
-        }
-        else if (bta instanceof PunctuationToken)
-        {
-            token.setType(Token.TYPE_PUNCT);
-        }
-        else if (bta instanceof NewlineToken)
-        {
-            token.setType(Token.TYPE_EOL);
-        }
-        else if (bta instanceof ContractionToken)
-        {
-            token.setType(Token.TYPE_CONTRACTION);
-        }
-        else if (bta instanceof SymbolToken)
-        {
-            token.setType(Token.TYPE_SYMBOL);
-        }
-
-        return token;
-    }    
-}
\ No newline at end of file
+package edu.mayo.bmi.uima.core.ae;
+
+import org.apache.uima.jcas.JCas;
+
+import edu.mayo.bmi.nlp.tokenizer.Token;
+import edu.mayo.bmi.uima.core.type.syntax.BaseToken;
+import edu.mayo.bmi.uima.core.type.syntax.ContractionToken;
+import edu.mayo.bmi.uima.core.type.syntax.NewlineToken;
+import edu.mayo.bmi.uima.core.type.syntax.NumToken;
+import edu.mayo.bmi.uima.core.type.syntax.PunctuationToken;
+import edu.mayo.bmi.uima.core.type.syntax.SymbolToken;
+import edu.mayo.bmi.uima.core.type.syntax.WordToken;
+
+/**
+ * Utility methods for converting between Java Tokenizer objects and their
+ * equivalent JCas objects.
+ * 
+ * @author Mayo Clinic
+ */
+public class TokenConverter
+{
+    /**
+     * Converts from Java Tokenizer object into a JCas object.
+     * 
+     * @param t
+     * @param jcas
+     * @param beginPos
+     * @return
+     */
+    public static BaseToken convert(Token t, JCas jcas, int beginPos)
+    {
+        BaseToken bta = null;
+
+        int begin = beginPos + t.getStartOffset();
+        int end = beginPos + t.getEndOffset();
+
+        switch (t.getType())
+        {
+        case Token.TYPE_WORD:
+            WordToken wta = new WordToken(jcas);
+            wta.setBegin(begin);
+            wta.setEnd(end);
+            int cap = -1;
+            switch (t.getCaps())
+            {
+            case Token.CAPS_ALL:
+                cap = TokenizerAnnotator.TOKEN_CAP_ALL;
+                break;
+            case Token.CAPS_FIRST_ONLY:
+                cap = TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY;
+                break;
+            case Token.CAPS_MIXED:
+                cap = TokenizerAnnotator.TOKEN_CAP_MIXED;
+                break;
+            case Token.CAPS_NONE:
+                cap = TokenizerAnnotator.TOKEN_CAP_NONE;
+                break;
+            }
+
+            int numPos = -1;
+            switch (t.getNumPosition())
+            {
+            case Token.NUM_FIRST:
+                numPos = TokenizerAnnotator.TOKEN_NUM_POS_FIRST;
+                break;
+            case Token.NUM_MIDDLE:
+                numPos = TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE;
+                break;
+            case Token.NUM_LAST:
+                numPos = TokenizerAnnotator.TOKEN_NUM_POS_LAST;
+                break;
+            case Token.NUM_NONE:
+                numPos = TokenizerAnnotator.TOKEN_NUM_POS_NONE;
+                break;
+            }
+            wta.setCapitalization(cap);
+            wta.setNumPosition(numPos);
+            bta = wta;
+            break;
+        case Token.TYPE_NUMBER:
+            NumToken nta = new NumToken(jcas);
+            nta.setBegin(begin);
+            nta.setEnd(end);
+            if (t.isInteger())
+            {
+                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER);
+            }
+            else
+            {
+                nta.setNumType(TokenizerAnnotator.TOKEN_NUM_TYPE_DECIMAL);
+            }
+            bta = nta;
+            break;
+        case Token.TYPE_PUNCT:
+            PunctuationToken pta = new PunctuationToken(jcas);
+            pta.setBegin(begin);
+            pta.setEnd(end);
+            bta = pta;
+            break;
+        case Token.TYPE_EOL:
+            NewlineToken nlta = new NewlineToken(jcas);
+            nlta.setBegin(begin);
+            nlta.setEnd(end);
+            bta = nlta;
+            break;
+        case Token.TYPE_CONTRACTION:
+            ContractionToken cta = new ContractionToken(
+                    jcas);
+            cta.setBegin(begin);
+            cta.setEnd(end);
+            bta = cta;
+            break;
+        case Token.TYPE_SYMBOL:
+            SymbolToken sta = new SymbolToken(jcas);
+            sta.setBegin(begin);
+            sta.setEnd(end);
+            bta = sta;
+            break;
+        default:
+        }
+
+        return bta;
+    }
+
+    /**
+     * Convert from a JCas object into Java Tokenizer object.
+     * 
+     * @param bta
+     * @return
+     */
+    public static Token convert(BaseToken bta)
+    {
+        Token token = new Token(bta.getBegin(), bta.getEnd());
+        token.setText(bta.getCoveredText());
+
+        if (bta instanceof WordToken)
+        {
+            WordToken wta = (WordToken) bta;
+            token.setType(Token.TYPE_WORD);
+
+            switch (wta.getCapitalization())
+            {
+            case TokenizerAnnotator.TOKEN_CAP_ALL:
+                token.setCaps(Token.CAPS_ALL);
+                break;
+            case TokenizerAnnotator.TOKEN_CAP_FIRST_ONLY:
+                token.setCaps(Token.CAPS_FIRST_ONLY);
+                break;
+            case TokenizerAnnotator.TOKEN_CAP_MIXED:
+                token.setCaps(Token.CAPS_MIXED);
+                break;
+            case TokenizerAnnotator.TOKEN_CAP_NONE:
+                token.setCaps(Token.CAPS_NONE);
+                break;
+            }
+
+            switch (wta.getNumPosition())
+            {
+            case TokenizerAnnotator.TOKEN_NUM_POS_FIRST:
+                token.setNumPosition(Token.NUM_FIRST);
+                break;
+            case TokenizerAnnotator.TOKEN_NUM_POS_MIDDLE:
+                token.setNumPosition(Token.NUM_MIDDLE);
+                break;
+            case TokenizerAnnotator.TOKEN_NUM_POS_LAST:
+                token.setNumPosition(Token.NUM_LAST);
+                break;
+            case TokenizerAnnotator.TOKEN_NUM_POS_NONE:
+                token.setNumPosition(Token.NUM_NONE);
+                break;
+            }
+        }
+        else if (bta instanceof NumToken)
+        {
+            NumToken nta = (NumToken) bta;
+            token.setType(Token.TYPE_NUMBER);
+
+            if (nta.getNumType() == TokenizerAnnotator.TOKEN_NUM_TYPE_INTEGER)
+            {
+                token.setIsInteger(true);
+            }
+            else
+            {
+                token.setIsInteger(false);
+            }
+        }
+        else if (bta instanceof PunctuationToken)
+        {
+            token.setType(Token.TYPE_PUNCT);
+        }
+        else if (bta instanceof NewlineToken)
+        {
+            token.setType(Token.TYPE_EOL);
+        }
+        else if (bta instanceof ContractionToken)
+        {
+            token.setType(Token.TYPE_CONTRACTION);
+        }
+        else if (bta instanceof SymbolToken)
+        {
+            token.setType(Token.TYPE_SYMBOL);
+        }
+
+        return token;
+    }    
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotator.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotator.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotator.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotator.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -172,4 +165,4 @@ public class TokenizerAnnotator extends 
 			tokenCount++;
 		}
 	}
-}
\ No newline at end of file
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotatorPTB.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotatorPTB.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotatorPTB.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ae/TokenizerAnnotatorPTB.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2011   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -22,8 +15,8 @@
  * limitations under the License. 
  */
 
-package edu.mayo.bmi.uima.core.ae;
-
+package edu.mayo.bmi.uima.core.ae;
+
 import java.util.Date;
 import java.util.Iterator;
 import java.util.List;
@@ -53,78 +46,78 @@ import edu.mayo.bmi.uima.core.type.synta
 import edu.mayo.bmi.uima.core.type.textspan.Segment;
 import edu.mayo.bmi.uima.core.type.textspan.Sentence;
 import edu.mayo.bmi.uima.core.util.ParamUtil;
-
-/**
- * UIMA annotator that tokenizes based on Penn Treebank rules.
- * 
- * @author Mayo Clinic
- */
-public class TokenizerAnnotatorPTB extends JCasAnnotator_ImplBase
-{
-	// LOG4J logger based on class name
-	private Logger logger = Logger.getLogger(getClass().getName());
-
-	/**
-	 * Value is "SegmentsToSkip".  This parameter specifies which segments to skip.  The parameter should be
-	 * of type String, should be multi-valued and optional. 
-	 */
-	public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";
-
-
-	private UimaContext context;
-	private Set<String> skipSegmentsSet;
-
-	private TokenizerPTB tokenizer;
-
-	private int tokenCount = 0;
-
-	public void initialize(UimaContext aContext) throws ResourceInitializationException {
 
-		super.initialize(aContext);
-
+/**
+ * UIMA annotator that tokenizes based on Penn Treebank rules.
+ * 
+ * @author Mayo Clinic
+ */
+public class TokenizerAnnotatorPTB extends JCasAnnotator_ImplBase
+{
+	// LOG4J logger based on class name
+	private Logger logger = Logger.getLogger(getClass().getName());
+
+	/**
+	 * Value is "SegmentsToSkip".  This parameter specifies which segments to skip.  The parameter should be
+	 * of type String, should be multi-valued and optional. 
+	 */
+	public static final String PARAM_SEGMENTS_TO_SKIP = "SegmentsToSkip";
+
+
+	private UimaContext context;
+	private Set<String> skipSegmentsSet;
+
+	private TokenizerPTB tokenizer;
+
+	private int tokenCount = 0;
+
+	public void initialize(UimaContext aContext) throws ResourceInitializationException {
+
+		super.initialize(aContext);
+
 		logger.info("Initializing " + this.getClass().getName());
-		context = aContext;
-		try {
-			configInit();
+		context = aContext;
+		try {
+			configInit();
 		} catch (ResourceAccessException e) {
 			throw new ResourceInitializationException(e);
-		} finally {};
-	}
-
-	/**
+		} finally {};
+	}
+
+	/**
 	 * Reads configuration parameters.
-	 * @throws ResourceAccessException 
-	 */
-	private void configInit() throws ResourceAccessException {
+	 * @throws ResourceAccessException 
+	 */
+	private void configInit() throws ResourceAccessException {
 
 		skipSegmentsSet = ParamUtil.getStringParameterValuesSet(PARAM_SEGMENTS_TO_SKIP, context); 
-
+
 		tokenizer = new TokenizerPTB();
 
-	}
-
-	/**
-	 * Entry point for processing.
-	 */
-	public void process(JCas jcas) throws AnalysisEngineProcessException {
-
-		logger.info("process(JCas) in " + this.getClass().getName());
-
-		tokenCount = 0;
-
-		JFSIndexRepository indexes = jcas.getJFSIndexRepository();
-		FSIterator<Annotation> segmentItr = indexes.getAnnotationIndex(Segment.type).iterator();
-		while (segmentItr.hasNext()) {
-			Segment sa = (Segment) segmentItr.next();
-			String segmentID = sa.getId();
+	}
+
+	/**
+	 * Entry point for processing.
+	 */
+	public void process(JCas jcas) throws AnalysisEngineProcessException {
+
+		logger.info("process(JCas) in " + this.getClass().getName());
+
+		tokenCount = 0;
+
+		JFSIndexRepository indexes = jcas.getJFSIndexRepository();
+		FSIterator<Annotation> segmentItr = indexes.getAnnotationIndex(Segment.type).iterator();
+		while (segmentItr.hasNext()) {
+			Segment sa = (Segment) segmentItr.next();
+			String segmentID = sa.getId();
 			if (!skipSegmentsSet.contains(segmentID)) { 
-				try {
+				try {
 					annotateRange(jcas, sa.getBegin(), sa.getEnd());
 				} catch (AnnotatorProcessException e) {
 					throw new AnalysisEngineProcessException(e);
-				}
-			}
-		}
+				}
+			}
+		}
 	}
 
 
@@ -331,4 +324,4 @@ public class TokenizerAnnotatorPTB exten
 //    uni-
 //    vice-
 //    -wise
-
+

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cc/CASConsumerTestDriver.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cc/CASConsumerTestDriver.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cc/CASConsumerTestDriver.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cc/CASConsumerTestDriver.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,56 +14,56 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.cc;
-
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.InputStream;
-
-import org.apache.uima.UIMAFramework;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.analysis_engine.AnalysisEngineDescription;
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.impl.XCASDeserializer;
-import org.apache.uima.collection.CasConsumer;
-import org.apache.uima.collection.CasConsumerDescription;
-import org.apache.uima.util.XMLInputSource;
-
-
-public class CASConsumerTestDriver 
-{
-    public static void main(String[] args) 
-    {
-	try
-	{  
-	    String xCasLocation = args[0];
-	    String taeDescriptionLocation = args[1];
-	    String casConsumerDescriptorLocation = args[2];
-
-	    InputStream xCasStream = new FileInputStream(xCasLocation);
-
-	    AnalysisEngineDescription taeDescription = UIMAFramework.getXMLParser().parseAnalysisEngineDescription(
-		    new XMLInputSource(new File(taeDescriptionLocation)));
-
-	    AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(taeDescription);
-	    System.out.println("analysis engine created.");
-
-	    CAS cas = ae.newCAS();
-
-	    XCASDeserializer.deserialize(xCasStream, cas);
-	    System.out.println("XCAS deserialized");
-
-	    ae.process(cas);
-	    CasConsumerDescription casConsumerDescription = UIMAFramework.getXMLParser().parseCasConsumerDescription(
-		    new XMLInputSource(new File(casConsumerDescriptorLocation)));
-	    CasConsumer casConsumer = UIMAFramework.produceCasConsumer(casConsumerDescription);
-	    System.out.println("CasConsumer initialized.  Calling processCas....");
-	    casConsumer.processCas(cas);
+package edu.mayo.bmi.uima.core.cc;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+
+import org.apache.uima.UIMAFramework;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.impl.XCASDeserializer;
+import org.apache.uima.collection.CasConsumer;
+import org.apache.uima.collection.CasConsumerDescription;
+import org.apache.uima.util.XMLInputSource;
+
+
+public class CASConsumerTestDriver 
+{
+    public static void main(String[] args) 
+    {
+	try
+	{  
+	    String xCasLocation = args[0];
+	    String taeDescriptionLocation = args[1];
+	    String casConsumerDescriptorLocation = args[2];
+
+	    InputStream xCasStream = new FileInputStream(xCasLocation);
+
+	    AnalysisEngineDescription taeDescription = UIMAFramework.getXMLParser().parseAnalysisEngineDescription(
+		    new XMLInputSource(new File(taeDescriptionLocation)));
+
+	    AnalysisEngine ae = UIMAFramework.produceAnalysisEngine(taeDescription);
+	    System.out.println("analysis engine created.");
+
+	    CAS cas = ae.newCAS();
+
+	    XCASDeserializer.deserialize(xCasStream, cas);
+	    System.out.println("XCAS deserialized");
+
+	    ae.process(cas);
+	    CasConsumerDescription casConsumerDescription = UIMAFramework.getXMLParser().parseCasConsumerDescription(
+		    new XMLInputSource(new File(casConsumerDescriptorLocation)));
+	    CasConsumer casConsumer = UIMAFramework.produceCasConsumer(casConsumerDescription);
+	    System.out.println("CasConsumer initialized.  Calling processCas....");
+	    casConsumer.processCas(cas);
 	    System.out.println("processCas completed....");
-	}
-	catch(Exception e)
-	{
-	    e.printStackTrace();
-	}
-    }
-}
+	}
+	catch(Exception e)
+	{
+	    e.printStackTrace();
+	}
+    }
+}



Mime
View raw message