incubator-ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From james-mas...@apache.org
Subject svn commit: r1403989 [14/28] - in /incubator/ctakes/branches/SHARPn-cTAKES: Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency Parser/src/org/chboston/cnlp/ctakes/parse...
Date Wed, 31 Oct 2012 05:26:55 GMT
Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,287 +14,287 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-/*
- * Created on May 23, 2005
- *
- * To change the template for this generated file go to
- * Window>Preferences>Java>Code Generation>Code and Comments
- */
-package edu.mayo.bmi.uima.core.ci;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import edu.mayo.bmi.nlp.tokenizer.Token;
-import edu.mayo.bmi.nlp.tokenizer.Tokenizer;
-
-/**
- * @author Mayo Clinic
- * 
- */
-public class HyphenTextModifierImpl implements TextModifier {
-
-	private Map iv_shouldbeHyphenMap = null;
-	private int iv_windowSize = 3; // default lookahead window
-	private Tokenizer iv_tokenizer = null;
-
-	/**
-	 * Default constructor takes a name of the file containing hyphenated
-	 * phrases, with their frequency.
-	 * Currently the frequency is unused.<br>
-	 * The case of the words in the file is unimportant - we lowercase
-	 * everything when doing compares.<br>
-	 * The file is delimited with "|" and has two fields:<br>
-	 * hyphen-term|frequency
-	 */
-	public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
-		iv_windowSize = windowSize;
-		iv_tokenizer = new Tokenizer();
-		BufferedReader br;
-		try {
-			br = new BufferedReader(new FileReader(new File(hyphenfilename)));
-
-			String line = "";
-
-			iv_shouldbeHyphenMap = new HashMap();
-			while ((line = br.readLine()) != null) {
-				String[] toks = line.split("\\|");
-				String[] unh = toks[0].split("\\-");
-				String shouldbehyphen = "";
-				for (int i = 0; i < unh.length; i++) {
-					shouldbehyphen += " " + unh[i];
-				}
-				shouldbehyphen = shouldbehyphen.trim().toLowerCase();
-				iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
-			}
-		} catch (FileNotFoundException e) {
-			System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
-			e.printStackTrace();
-		} catch (IOException e) {
-			System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
-			e.printStackTrace();
-		}
-
-	}
-
-	/**
-	 * Filters out unwanted tokens - newlines.
-	 * 
-	 * @param tokenList
-	 */
-	private void filterTokens(List tokenList) {
-
-		List removalList = new ArrayList();
-		Iterator tokenItr = tokenList.iterator();
-
-		while (tokenItr.hasNext()) {
-			Token token = (Token) tokenItr.next();
-			if (token.getType() == Token.TYPE_EOL) {
-				removalList.add(token);
-			}
-		}
-
-		tokenList.removeAll(removalList);
-	}
-
-	/*
-	 * (non-Javadoc)
-	 * 
-	 * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String)
-	 */
-	public TextModification[] modify(String in) throws Exception {
-
-		// intermediate data structure to use for easy adding of new
-		// TextModification objects
-		ArrayList textmods = new ArrayList();
-
-		// Tokenize the input to get offset information
-		List inputtoks = iv_tokenizer.tokenizeAndSort(in);
-
-		filterTokens(inputtoks);
-
-		int orig_startOffset = 0;
-		int orig_endOffset = 0;
-		int new_startOffset = 0;
-		int new_endOffset = 0;
-
-		int i = 0;
-		int j = 0;
-		int end_offset_adj = 0;
-		int start_offset_adj = 0;
-
-		while (i < inputtoks.size()) {
-
-			if (inputtoks.size() - (i + 1) < iv_windowSize) {
-				j = inputtoks.size() - 1;
-			} else {
-				j = i + iv_windowSize;
-			}
-
-			while (j > i) {
-
-				StringBuffer candSB = new StringBuffer();
-				for (int k = i; k <= j; k++) {
-					Token currtok = (Token) inputtoks.get(k);
-					candSB.append(" ");
-					candSB.append(currtok.getText());
-				}
-				String cand = candSB.toString().trim();
-
-				// Attempt to look up the candidate in the hyphen map
-				if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {
-
-					// set the initial offsets
-					orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
-					orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
-					new_startOffset = orig_startOffset;
-					new_endOffset = orig_endOffset;
-
-					// compile new text
-					String newText = "";
-					for (int k = i; k <= j; k++) {
-						Token currtok = (Token) inputtoks.get(k);
-						newText += currtok.getText() + "-";
-					}
-					newText = newText.substring(0, newText.length() - 1);
-
-					// Get the new and old lengths of hyphenated spans
-					int new_Length = newText.length();
-					int orig_Length = orig_endOffset - orig_startOffset;
-
-					// Pad the end offset adjuster by the new amount
-					end_offset_adj += orig_Length - new_Length;
-
-					// Create a new modification object
-					TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
-							- start_offset_adj, new_endOffset - end_offset_adj, newText);
-
-					// Adjust the start offset on the next Text Modification
-					// object
-					start_offset_adj += orig_Length - new_Length;
-
-					// Put the newly created TextMod object into a temporary
-					// holder
-					textmods.add(tm);
-
-					i = j;
-				}
-				j--;
-			}
-
-			i++;
-		}
-
-		// generate the expected return as an array of TextModification objects
-		TextModification[] tma = new TextModification[textmods.size()];
-		for (int y = 0; y < tma.length; y++) {
-			tma[y] = (TextModification) textmods.get(y);
-		}
-
-		return tma;
-	}
-
-	
-    /**
-     * Apply text modifier to the text <br>
-     * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
-     * 		See <code>HyphenTextModifierImpl</code>
-	 * @param tm TextModifier to apply
-	 * @param text Original text
-	 * @param sb Buffer containing text to apply modifier to
-     * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method 
-	 * @throws Exception
-     */
-    private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
-    	boolean unableToModifyText = false;
-        TextModification[] textModArr = tm.modify(text);
-        for (int i = 0; i < textModArr.length; i++) {
-
-        	TextModification textMod = textModArr[i];
-            
-            if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
-                    || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
-                System.err.println("UNSUPPORTED: TextModification with offset changes.");
-                unableToModifyText = true;
-            }
-            else {
-            	sb.replace(textMod.getOrigStartOffset(), 
-        				textMod.getOrigEndOffset(), 
-        				textMod.getNewText());
-            }
-        }  
-        return unableToModifyText;
-    }
-	
-    public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
-    	ArrayList<String> messages = new ArrayList<String>();
-    	try {
-			TextModification[] tma = tm.modify(text);
-			StringBuffer sb = new StringBuffer(text);
-			boolean errorModifyingText = applyTextModifier(tm,text,sb);
-			messages.add("Orig: " + text);
-			if (!errorModifyingText) {
-				messages.add("New:  " + sb);
-			}
-			else {
-				System.err.println("New:  (new text not generated, see previous messages)");				
-			}
-			// Regardless of whether was able to modify the text
-			// without
-			// (_apply_ the TextModifier), output the  
-			// the 
-			for (int u = 0; u < tma.length; u++) {
-				TextModification tmo = (TextModification) tma[u];
-				messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
-						+ tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
-			}
-		} catch (Exception e) {
-			// TODO Auto-generated catch block
-			e.printStackTrace();
-		}
-		return messages;
-    	
-    }
-	/**
-	 * Simple tests of <code>TextModification</code>
-	 * <br>
-	 * Output expected:<br>
-	 * 		UNSUPPORTED: TextModification with offset changes.<br>
-	 * 		UNSUPPORTED: TextModification with offset changes.<br>
-	 * 		UNSUPPORTED: TextModification with offset changes.<br>
-	 *      Orig: Non  Hodgkin's the x  ray without any non small  cell complications.<br>
-	 *      New:  (new text not generated, see previous messages)
-	 * 		Non-Hodgkin Orig: 0-12 New: 0-11<br>
-	 * 		x-ray Orig: 19-25 New: 18-23<br>
-	 * 		non-small-cell Orig: 38-53 New: 36-50<br>
-	 * 
-	 * 		Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
-	 * 		New:  Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
-	 * 		Non-Hodgkin Orig: 0-11 New: 0-11<br>
-	 * 		x-ray Orig: 18-23 New: 18-23<br>
-	 * 		non-small-cell Orig: 36-50 New: 36-50<br>
-	 * Note the case of the words doesn't matter. 
-	 * @param args hyphen text filename (each line: hyphenated-word|freq)
-	 */
-	public static void main(String[] args) {
-		ArrayList<String> messages;
-		HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);
-
-		String t = "Non  Hodgkin's the x  ray without any non small  cell complications.";
-		messages = test(tm, t); // extra blanks
-		for (String s : messages) {	System.out.println(s); }
-
-		t = t.replace("  ", " "); // change text to only have single blanks between words
-		messages = test(tm, t); // single blanks
-		for (String s : messages) {	System.out.println(s); }
-	}
-
-}
+/*
+ * Created on May 23, 2005
+ *
+ * To change the template for this generated file go to
+ * Window>Preferences>Java>Code Generation>Code and Comments
+ */
+package edu.mayo.bmi.uima.core.ci;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileReader;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+
+import edu.mayo.bmi.nlp.tokenizer.Token;
+import edu.mayo.bmi.nlp.tokenizer.Tokenizer;
+
+/**
+ * @author Mayo Clinic
+ * 
+ */
+public class HyphenTextModifierImpl implements TextModifier {
+
+	private Map iv_shouldbeHyphenMap = null;
+	private int iv_windowSize = 3; // default lookahead window
+	private Tokenizer iv_tokenizer = null;
+
+	/**
+	 * Default constructor takes a name of the file containing hyphenated
+	 * phrases, with their frequency.
+	 * Currently the frequency is unused.<br>
+	 * The case of the words in the file is unimportant - we lowercase
+	 * everything when doing compares.<br>
+	 * The file is delimited with "|" and has two fields:<br>
+	 * hyphen-term|frequency
+	 */
+	public HyphenTextModifierImpl(String hyphenfilename, int windowSize) {
+		iv_windowSize = windowSize;
+		iv_tokenizer = new Tokenizer();
+		BufferedReader br;
+		try {
+			br = new BufferedReader(new FileReader(new File(hyphenfilename)));
+
+			String line = "";
+
+			iv_shouldbeHyphenMap = new HashMap();
+			while ((line = br.readLine()) != null) {
+				String[] toks = line.split("\\|");
+				String[] unh = toks[0].split("\\-");
+				String shouldbehyphen = "";
+				for (int i = 0; i < unh.length; i++) {
+					shouldbehyphen += " " + unh[i];
+				}
+				shouldbehyphen = shouldbehyphen.trim().toLowerCase();
+				iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1));
+			}
+		} catch (FileNotFoundException e) {
+			System.err.println("Cannot find the hyphenation file:" + hyphenfilename);
+			e.printStackTrace();
+		} catch (IOException e) {
+			System.err.println("IOException accessing the hyphenation file:" + hyphenfilename);
+			e.printStackTrace();
+		}
+
+	}
+
+	/**
+	 * Filters out unwanted tokens - newlines.
+	 * 
+	 * @param tokenList
+	 */
+	private void filterTokens(List tokenList) {
+
+		List removalList = new ArrayList();
+		Iterator tokenItr = tokenList.iterator();
+
+		while (tokenItr.hasNext()) {
+			Token token = (Token) tokenItr.next();
+			if (token.getType() == Token.TYPE_EOL) {
+				removalList.add(token);
+			}
+		}
+
+		tokenList.removeAll(removalList);
+	}
+
+	/*
+	 * (non-Javadoc)
+	 * 
+	 * @see edu.mayo.bmi.uima.core.ci.TextModifier#modify(java.lang.String)
+	 */
+	public TextModification[] modify(String in) throws Exception {
+
+		// intermediate data structure to use for easy adding of new
+		// TextModification objects
+		ArrayList textmods = new ArrayList();
+
+		// Tokenize the input to get offset information
+		List inputtoks = iv_tokenizer.tokenizeAndSort(in);
+
+		filterTokens(inputtoks);
+
+		int orig_startOffset = 0;
+		int orig_endOffset = 0;
+		int new_startOffset = 0;
+		int new_endOffset = 0;
+
+		int i = 0;
+		int j = 0;
+		int end_offset_adj = 0;
+		int start_offset_adj = 0;
+
+		while (i < inputtoks.size()) {
+
+			if (inputtoks.size() - (i + 1) < iv_windowSize) {
+				j = inputtoks.size() - 1;
+			} else {
+				j = i + iv_windowSize;
+			}
+
+			while (j > i) {
+
+				StringBuffer candSB = new StringBuffer();
+				for (int k = i; k <= j; k++) {
+					Token currtok = (Token) inputtoks.get(k);
+					candSB.append(" ");
+					candSB.append(currtok.getText());
+				}
+				String cand = candSB.toString().trim();
+
+				// Attempt to look up the candidate in the hyphen map
+				if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) {
+
+					// set the initial offsets
+					orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset();
+					orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset();
+					new_startOffset = orig_startOffset;
+					new_endOffset = orig_endOffset;
+
+					// compile new text
+					String newText = "";
+					for (int k = i; k <= j; k++) {
+						Token currtok = (Token) inputtoks.get(k);
+						newText += currtok.getText() + "-";
+					}
+					newText = newText.substring(0, newText.length() - 1);
+
+					// Get the new and old lengths of hyphenated spans
+					int new_Length = newText.length();
+					int orig_Length = orig_endOffset - orig_startOffset;
+
+					// Pad the end offset adjuster by the new amount
+					end_offset_adj += orig_Length - new_Length;
+
+					// Create a new modification object
+					TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset
+							- start_offset_adj, new_endOffset - end_offset_adj, newText);
+
+					// Adjust the start offset on the next Text Modification
+					// object
+					start_offset_adj += orig_Length - new_Length;
+
+					// Put the newly created TextMod object into a temporary
+					// holder
+					textmods.add(tm);
+
+					i = j;
+				}
+				j--;
+			}
+
+			i++;
+		}
+
+		// generate the expected return as an array of TextModification objects
+		TextModification[] tma = new TextModification[textmods.size()];
+		for (int y = 0; y < tma.length; y++) {
+			tma[y] = (TextModification) textmods.get(y);
+		}
+
+		return tma;
+	}
+
+	
+    /**
+     * Apply text modifier to the text <br>
+     * TODO - move this to <code>TextModifier</code> and take a <code>Logger</code>
+     * 		See <code>HyphenTextModifierImpl</code>
+	 * @param tm TextModifier to apply
+	 * @param text Original text
+	 * @param sb Buffer containing text to apply modifier to
+     * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method 
+	 * @throws Exception
+     */
+    private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception {
+    	boolean unableToModifyText = false;
+        TextModification[] textModArr = tm.modify(text);
+        for (int i = 0; i < textModArr.length; i++) {
+
+        	TextModification textMod = textModArr[i];
+            
+            if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset())
+                    || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) {
+                System.err.println("UNSUPPORTED: TextModification with offset changes.");
+                unableToModifyText = true;
+            }
+            else {
+            	sb.replace(textMod.getOrigStartOffset(), 
+        				textMod.getOrigEndOffset(), 
+        				textMod.getNewText());
+            }
+        }  
+        return unableToModifyText;
+    }
+	
+    public static ArrayList<String> test(HyphenTextModifierImpl tm, String text) {
+    	ArrayList<String> messages = new ArrayList<String>();
+    	try {
+			TextModification[] tma = tm.modify(text);
+			StringBuffer sb = new StringBuffer(text);
+			boolean errorModifyingText = applyTextModifier(tm,text,sb);
+			messages.add("Orig: " + text);
+			if (!errorModifyingText) {
+				messages.add("New:  " + sb);
+			}
+			else {
+				System.err.println("New:  (new text not generated, see previous messages)");				
+			}
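+			// Regardless of whether we were able to modify the text
+			// (i.e. _apply_ the TextModifier without offset changes),
+			// output the new text and the original/new offsets of
+			// each TextModification that was generated.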
+			for (int u = 0; u < tma.length; u++) {
+				TextModification tmo = (TextModification) tma[u];
+				messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-"
+						+ tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset());
+			}
+		} catch (Exception e) {
+			// TODO Auto-generated catch block
+			e.printStackTrace();
+		}
+		return messages;
+    	
+    }
+	/**
+	 * Simple tests of <code>TextModification</code>
+	 * <br>
+	 * Output expected:<br>
+	 * 		UNSUPPORTED: TextModification with offset changes.<br>
+	 * 		UNSUPPORTED: TextModification with offset changes.<br>
+	 * 		UNSUPPORTED: TextModification with offset changes.<br>
+	 *      Orig: Non  Hodgkin's the x  ray without any non small  cell complications.<br>
+	 *      New:  (new text not generated, see previous messages)<br>
+	 * 		Non-Hodgkin Orig: 0-12 New: 0-11<br>
+	 * 		x-ray Orig: 19-25 New: 18-23<br>
+	 * 		non-small-cell Orig: 38-53 New: 36-50<br>
+	 * 
+	 * 		Orig: Non Hodgkin's the x ray without any non small cell complications.<br>
+	 * 		New:  Non-Hodgkin's the x-ray without any non-small-cell complications.<br>
+	 * 		Non-Hodgkin Orig: 0-11 New: 0-11<br>
+	 * 		x-ray Orig: 18-23 New: 18-23<br>
+	 * 		non-small-cell Orig: 36-50 New: 36-50<br>
+	 * Note the case of the words doesn't matter. 
+	 * @param args hyphen text filename (each line: hyphenated-word|freq)
+	 */
+	public static void main(String[] args) {
+		ArrayList<String> messages;
+		HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7);
+
+		String t = "Non  Hodgkin's the x  ray without any non small  cell complications.";
+		messages = test(tm, t); // extra blanks
+		for (String s : messages) {	System.out.println(s); }
+
+		t = t.replace("  ", " "); // change text to only have single blanks between words
+		messages = test(tm, t); // single blanks
+		for (String s : messages) {	System.out.println(s); }
+	}
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,62 +14,62 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.ci;
-
-/**
- * Value object class that describes a modification of document text. This
- * object tracks the original text and the new replacement text.
- */
-public class TextModification
-{
-    private int iv_origStartOffset;
-    private int iv_origEndOffset;
-
-    private int iv_newStartOffset;
-    private int iv_newEndOffset;
-    private String iv_newText;
- 
-    /**
-     * Constructor
-     * 
-     * @param origStartOffset
-     * @param origEndOffset
-     * @param newStartOffset
-     * @param newEndOffset
-     * @param newText
-     */
-    public TextModification(int origStartOffset, int origEndOffset,
-            int newStartOffset, int newEndOffset, String newText)
-    {
-        iv_origStartOffset = origStartOffset;
-        iv_origEndOffset = origEndOffset;
-        iv_newStartOffset = newStartOffset;
-        iv_newEndOffset = newEndOffset;
-        iv_newText = newText;
-    }
-
-    public int getNewEndOffset()
-    {
-        return iv_newEndOffset;
-    }
-
-    public int getNewStartOffset()
-    {
-        return iv_newStartOffset;
-    }
-
-    public String getNewText()
-    {
-        return iv_newText;
-    }
-
-    public int getOrigEndOffset()
-    {
-        return iv_origEndOffset;
-    }
-
-    public int getOrigStartOffset()
-    {
-        return iv_origStartOffset;
-    }
-}
\ No newline at end of file
+package edu.mayo.bmi.uima.core.ci;
+
+/**
+ * Value object class that describes a modification of document text. This
+ * object tracks the original text and the new replacement text.
+ */
+public class TextModification
+{
+    private int iv_origStartOffset;
+    private int iv_origEndOffset;
+
+    private int iv_newStartOffset;
+    private int iv_newEndOffset;
+    private String iv_newText;
+ 
+    /**
+     * Constructor
+     * 
+     * @param origStartOffset
+     * @param origEndOffset
+     * @param newStartOffset
+     * @param newEndOffset
+     * @param newText
+     */
+    public TextModification(int origStartOffset, int origEndOffset,
+            int newStartOffset, int newEndOffset, String newText)
+    {
+        iv_origStartOffset = origStartOffset;
+        iv_origEndOffset = origEndOffset;
+        iv_newStartOffset = newStartOffset;
+        iv_newEndOffset = newEndOffset;
+        iv_newText = newText;
+    }
+
+    public int getNewEndOffset()
+    {
+        return iv_newEndOffset;
+    }
+
+    public int getNewStartOffset()
+    {
+        return iv_newStartOffset;
+    }
+
+    public String getNewText()
+    {
+        return iv_newText;
+    }
+
+    public int getOrigEndOffset()
+    {
+        return iv_origEndOffset;
+    }
+
+    public int getOrigStartOffset()
+    {
+        return iv_origStartOffset;
+    }
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,21 +14,21 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.ci;
-
-/**
- *  Defines a generic interface for modifying text.
- */
-public interface TextModifier
-{
-    /**
-     * Generates modifications for the specified text.
-     * 
-     * @param text
-     *            Original document text.
-     * @return Array of TextModification objects that describe the
-     *         modifications. Offset values are relative to the String object.
-     * @throws Exception
-     */
-    public TextModification[] modify(String text) throws Exception;
-}
\ No newline at end of file
+package edu.mayo.bmi.uima.core.ci;
+
+/**
+ *  Defines a generic interface for modifying text.
+ */
+public interface TextModifier
+{
+    /**
+     * Generates modifications for the specified text.
+     * 
+     * @param text
+     *            Original document text.
+     * @return Array of TextModification objects that describe the
+     *         modifications. Offset values are relative to the String object.
+     * @throws Exception
+     */
+    public TextModification[] modify(String text) throws Exception;
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,183 +14,183 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.cr;
-
-/**
- * @author Mayo Clinic
- * @version 1.0
- * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
- * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds
- * the capability to specify the number of documents to process.
- * 
- * A simple collection reader that reads documents from a directory 
- * in the filesystem.  It can be configured with the following parameters:
- * <ul>
- *   <li><code>InputDirectory</code> - path to directory containing files</li>
- *   <li><code>Encoding</code> (optional) - character encoding of the input 
- *      files</li>
- *   <li><code>Language</code> (optional) - language of the input documents</li>
- *   <li><code>Extensions</code> (optional) - Name of optional configuration 
- *   parameter that specifies the extensions of the files that the 
- *   collection reader will read.  </li>
- *   <li><code>NumberOfIterations</code> (optional) - actual number of files to be processed</li>
- * </ul> 
- * 
- * TODO We may need to provide a way to specify some portion of the path of the file
- * to be included in the id of the document especially if we extend to recursively 
- * gather files in the directory from sub directories.    
- */
-
-import java.io.IOException;
-
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-
-public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader 
-{
-	/**
-	   * Name of configuration parameter that must be set to the path of
-	   * a directory containing input files.
-	   */
-	  public static final String PARAM_INPUTDIR = "InputDirectory";
-
-	  /**
-	   * Name of configuration parameter that contains the character encoding used
-	   * by the input files.  If not specified, the default system encoding will
-	   * be used.
-	   */
-	  public static final String PARAM_ENCODING = "Encoding";
-
-	  /**
-	   * Name of optional configuration parameter that contains the language of
-	   * the documents in the input directory.  If specified this information will
-	   * be added to the CAS.
-	   */
-	  public static final String PARAM_LANGUAGE = "Language";
-
-	  /**Name of optional configuration parameter that specifies the extensions
-	     * of the files that the collection reader will read.  Values for this
-	     * parameter should not begin with a dot <code>'.'</code>.
-	     */
-
-	  public static final String PARAM_EXTENSIONS = "Extensions";
-	  
-	  /**Arguement to equate to # of times it should read the files.
-	   * Takes this argument to equate to # of times it should read the files. 
-	   */  
-	  
-	  public static final String PARAM_NUMREADS = "NumberOfIterations";
-	     
-	  public static final String PARAM_RECURSE = "Recurse";
-	  private int iv_iteration;
-      private int scaleTime, totalNumFiles, remainTimes;
-      
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
-	   */
-	public void initialize() throws ResourceInitializationException {
-
-		super.initialize();
-		totalNumFiles = iv_files.size();
-		iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS))
-				.intValue();
-		if (iv_iteration > totalNumFiles) {
-			scaleTime = iv_iteration / totalNumFiles;
-			remainTimes = iv_iteration % totalNumFiles;
-		} else
-			scaleTime = -1;
-
-	}
-	
-    /**
-     * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except
-     * interations represents the actual number of documents to be processed, so if the 
-     * total number of documents in a queue is more than the 'Iterations' value then only 
-     * the iteration amount will be processed.  Multiples of the total available documents
-     * will be provided to supplement the list required to meet the total iteration value.
-     */
-	public boolean hasNext()
-	{
-		
-		// If hasNext false then start over only if count that has been passed to the contructor hasn't been reached. 
-		boolean doNext = iv_currentIndex < totalNumFiles;
-	
-		if ((!doNext) && (scaleTime > 0)) {
-			scaleTime--;
-			if (scaleTime > 0) {
-				iv_currentIndex = 0;
-				doNext = true;
-			}
-			else if (remainTimes > 0){
-				iv_currentIndex = 0;
-				totalNumFiles = remainTimes;
-				remainTimes=0;
-				doNext = true;
-			}
-
-		}
-		if (scaleTime == -1) {
-			if (iv_currentIndex < iv_iteration)
-				doNext = true;
-			else
-				doNext = false;
-
-		}
-		
-		return doNext;
-	}
-
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
-	   */
-	  public void getNext(CAS aCAS) throws IOException, CollectionException
-	  {
-
-		super.getNext(aCAS);
-		
-			  	
-	  }
-
-
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
-	   */
-	  public void close() throws IOException
-	  {
-		  super.close();
-	  }
-
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
-	   */
-	  public Progress[] getProgress() {
-		  int offSet = iv_currentIndex;
-		  if ((scaleTime > 0) && (iv_currentIndex > 0)) 
-			  offSet = iv_currentIndex*(1/scaleTime);
-		  if (scaleTime == 0){
-			  offSet = iv_iteration + remainTimes;
-		  }
-
-		return new Progress[] { new ProgressImpl( offSet , 
-				iv_iteration, Progress.ENTITIES) };
-	}
-
-	  /**
-		 * Gets the total number of documents that will be returned by this
-		 * collection reader. This is not part of the general collection reader
-		 * interface.
-		 * 
-		 * @return the number of documents in the collection
-		 */
-	  public int getNumberOfDocuments()
-	  {
-	    return iv_files.size();
-	  }
-
-
-}
+package edu.mayo.bmi.uima.core.cr;
+
+/**
+ * @author Mayo Clinic
+ * @version 1.0
+ * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
+ * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds
+ * the capability to specify the number of documents to process.
+ * 
+ * A simple collection reader that reads documents from a directory 
+ * in the filesystem.  It can be configured with the following parameters:
+ * <ul>
+ *   <li><code>InputDirectory</code> - path to directory containing files</li>
+ *   <li><code>Encoding</code> (optional) - character encoding of the input 
+ *      files</li>
+ *   <li><code>Language</code> (optional) - language of the input documents</li>
+ *   <li><code>Extensions</code> (optional) - Name of optional configuration 
+ *   parameter that specifies the extensions of the files that the 
+ *   collection reader will read.  </li>
+ *   <li><code>NumberOfIterations</code> (optional) - actual number of files to be processed</li>
+ * </ul> 
+ * 
+ * TODO We may need to provide a way to specify some portion of the path of the file
+ * to be included in the id of the document especially if we extend to recursively 
+ * gather files in the directory from sub directories.    
+ */
+
+import java.io.IOException;
+
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+
+public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader 
+{
+	/**
+	   * Name of configuration parameter that must be set to the path of
+	   * a directory containing input files.
+	   */
+	  public static final String PARAM_INPUTDIR = "InputDirectory";
+
+	  /**
+	   * Name of configuration parameter that contains the character encoding used
+	   * by the input files.  If not specified, the default system encoding will
+	   * be used.
+	   */
+	  public static final String PARAM_ENCODING = "Encoding";
+
+	  /**
+	   * Name of optional configuration parameter that contains the language of
+	   * the documents in the input directory.  If specified this information will
+	   * be added to the CAS.
+	   */
+	  public static final String PARAM_LANGUAGE = "Language";
+
+	  /**Name of optional configuration parameter that specifies the extensions
+	     * of the files that the collection reader will read.  Values for this
+	     * parameter should not begin with a dot <code>'.'</code>.
+	     */
+
+	  public static final String PARAM_EXTENSIONS = "Extensions";
+	  
+	  /**Arguement to equate to # of times it should read the files.
+	   * Takes this argument to equate to # of times it should read the files. 
+	   */  
+	  
+	  public static final String PARAM_NUMREADS = "NumberOfIterations";
+	     
+	  public static final String PARAM_RECURSE = "Recurse";
+	  private int iv_iteration;
+      private int scaleTime, totalNumFiles, remainTimes;
+      
+	  /**
+	   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+	   */
+	public void initialize() throws ResourceInitializationException {
+
+		super.initialize();
+		totalNumFiles = iv_files.size();
+		iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS))
+				.intValue();
+		if (iv_iteration > totalNumFiles) {
+			scaleTime = iv_iteration / totalNumFiles;
+			remainTimes = iv_iteration % totalNumFiles;
+		} else
+			scaleTime = -1;
+
+	}
+	
+    /**
+     * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except
+     * interations represents the actual number of documents to be processed, so if the 
+     * total number of documents in a queue is more than the 'Iterations' value then only 
+     * the iteration amount will be processed.  Multiples of the total available documents
+     * will be provided to supplement the list required to meet the total iteration value.
+     */
+	public boolean hasNext()
+	{
+		
+		// If hasNext false then start over only if count that has been passed to the contructor hasn't been reached. 
+		boolean doNext = iv_currentIndex < totalNumFiles;
+	
+		if ((!doNext) && (scaleTime > 0)) {
+			scaleTime--;
+			if (scaleTime > 0) {
+				iv_currentIndex = 0;
+				doNext = true;
+			}
+			else if (remainTimes > 0){
+				iv_currentIndex = 0;
+				totalNumFiles = remainTimes;
+				remainTimes=0;
+				doNext = true;
+			}
+
+		}
+		if (scaleTime == -1) {
+			if (iv_currentIndex < iv_iteration)
+				doNext = true;
+			else
+				doNext = false;
+
+		}
+		
+		return doNext;
+	}
+
+	  /**
+	   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+	   */
+	  public void getNext(CAS aCAS) throws IOException, CollectionException
+	  {
+
+		super.getNext(aCAS);
+		
+			  	
+	  }
+
+
+	  /**
+	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+	   */
+	  public void close() throws IOException
+	  {
+		  super.close();
+	  }
+
+	  /**
+	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+	   */
+	  public Progress[] getProgress() {
+		  int offSet = iv_currentIndex;
+		  if ((scaleTime > 0) && (iv_currentIndex > 0)) 
+			  offSet = iv_currentIndex*(1/scaleTime);
+		  if (scaleTime == 0){
+			  offSet = iv_iteration + remainTimes;
+		  }
+
+		return new Progress[] { new ProgressImpl( offSet , 
+				iv_iteration, Progress.ENTITIES) };
+	}
+
+	  /**
+		 * Gets the total number of documents that will be returned by this
+		 * collection reader. This is not part of the general collection reader
+		 * interface.
+		 * 
+		 * @return the number of documents in the collection
+		 */
+	  public int getNumberOfDocuments()
+	  {
+	    return iv_files.size();
+	  }
+
+
+}

Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java
URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java?rev=1403989&r1=1403988&r2=1403989&view=diff
==============================================================================
--- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java (original)
+++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java Wed Oct 31 05:26:43 2012
@@ -1,18 +1,11 @@
 /*
- * Copyright: (c) 2009   Mayo Foundation for Medical Education and 
- * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the
- * triple-shield Mayo logo are trademarks and service marks of MFMER.
- *
- * Except as contained in the copyright notice above, or as used to identify 
- * MFMER as the author of this software, the trade names, trademarks, service
- * marks, or product names of the copyright holder shall not be used in
- * advertising, promotion or otherwise in connection with this software without
- * prior written authorization of the copyright holder.
- * 
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- * 
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
  * http://www.apache.org/licenses/LICENSE-2.0 
  * 
  * Unless required by applicable law or agreed to in writing, software
@@ -21,295 +14,295 @@
  * See the License for the specific language governing permissions and 
  * limitations under the License. 
  */
-package edu.mayo.bmi.uima.core.cr;
-
-/**
- * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
- * and modified for Mayo use.
- * 
- * A simple collection reader that reads documents from a directory 
- * in the filesystem.  It can be configured with the following parameters:
- * <ul>
- *   <li><code>InputDirectory</code> - path to directory containing files</li>
- *   <li><code>Encoding</code> (optional) - character encoding of the input 
- *      files</li>
- *   <li><code>Language</code> (optional) - language of the input documents</li>
- *   <li><code>Extensions</code> (optional) - Name of optional configuration 
- *   parameter that specifies the extensions of the files that the 
- *   collection reader will read.  
- * </ul> 
- * 
- * TODO We may need to provide a way to specify some portion of the path of the file
- * to be included in the id of the document especially if we extend to recursively 
- * gather files in the directory from sub directories.    
- */
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.Reader;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.uima.cas.CAS;
-import org.apache.uima.cas.CASException;
-import org.apache.uima.collection.CollectionException;
-import org.apache.uima.collection.CollectionReader_ImplBase;
-import org.apache.uima.jcas.JCas;
-//import org.apache.uima.jcas.tcas.DocumentAnnotation;
-import org.apache.uima.resource.ResourceConfigurationException;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.apache.uima.util.Progress;
-import org.apache.uima.util.ProgressImpl;
-
-import edu.mayo.bmi.uima.core.type.structured.DocumentID;
-
-public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase 
-{
-	/**
-	   * Name of configuration parameter that must be set to the path of
-	   * a directory containing input files.
-	   */
-	  public static final String PARAM_INPUTDIR = "InputDirectory";
-
-	  /**
-	   * Name of configuration parameter that contains the character encoding used
-	   * by the input files.  If not specified, the default system encoding will
-	   * be used.
-	   */
-	  public static final String PARAM_ENCODING = "Encoding";
-
-	  /**
-	   * Name of optional configuration parameter that contains the language of
-	   * the documents in the input directory.  If specified this information will
-	   * be added to the CAS.
-	   */
-	  public static final String PARAM_LANGUAGE = "Language";
-
-	  /**Name of optional configuration parameter that specifies the extensions
-	     * of the files that the collection reader will read.  Values for this
-	     * parameter should not begin with a dot <code>'.'</code>.
-	     */
-	    
-	  public static final String PARAM_EXTENSIONS = "Extensions";
-	    
-	  public static final String PARAM_RECURSE = "Recurse";
-	  
-	  protected ArrayList iv_files;
-	  private String iv_encoding;
-	  private String iv_language;
-	  private static String[] iv_extensions; 
-
-      protected int iv_currentIndex;
-	  
-      private boolean iv_recurse = false;
-      
-      private String iv_rootPath = "";
-      
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
-	   */
-	public void initialize() throws ResourceInitializationException
-	{
-	    File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR));
-	    iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING);
-	    iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
-	    iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
-	    
-	    iv_currentIndex = 0;
-
-	    iv_recurse = false;
-	    Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
-	    if(recurse != null)
-	    	iv_recurse = recurse.booleanValue();
-	    iv_rootPath = directory.getPath();
-    	
-		//if input directory does not exist or is not a directory, throw exception
-		if (!directory.exists() || !directory.isDirectory())
-		{
-			throw new ResourceInitializationException(
-				ResourceConfigurationException.DIRECTORY_NOT_FOUND,
-				new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()});
-		}
-		
-		
-	    //get list of files (not subdirectories) in the specified directory
-	    iv_files = new ArrayList();
-	    if(!iv_recurse)
-	    {
-	    	File[] files = directory.listFiles();
-	    	for (int i = 0; i < files.length; i++)
-	    	{
-	    		if (!files[i].isDirectory() && hasValidExtension(files[i]))
-	    		{
-	    			iv_files.add(files[i]);  
-	    		}
-	    	}
-	    }
-	    else
-	    {
-	    	try
-	    	{
-	    		collectFiles(directory, iv_files);
-	    		System.out.println("iv_files.size()="+iv_files.size());
-	    	}
-	    	catch(IOException ioe)
-	    	{
-	    		throw new ResourceInitializationException(ioe);
-	    	}
-	    }
-    }
-	
-    private void collectFiles(File directory, List files) throws IOException
-    {
-        File[] dirFiles = directory.listFiles();
-        for(int i=0; i<dirFiles.length;i++)
-        {
-        	if(dirFiles[i].isDirectory())
-        	{
-                collectFiles(dirFiles[i], files);
-            }
-        	else if(hasValidExtension(dirFiles[i]))
-        	{
-        		files.add(dirFiles[i]);	
-        	}
-        }
-    }
-
-	
-    private boolean hasValidExtension(File file)
-    {
-	    if(iv_extensions == null) return true;
-	    for (int i = 0; i < iv_extensions.length; i++) 
-	    {
-		    if(file.getName().endsWith("."+iv_extensions[i]))
-		    {
-			    return true;
-		    }
-	    }
-	    return false;
-    }
-	 
-	
-    /**
-     * @see org.apache.uima.collection.CollectionReader#hasNext()
-     */
-	public boolean hasNext()
-	{
-		return iv_currentIndex < iv_files.size();
-	}
-
-	  /**
-	   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
-	   */
-	  public void getNext(CAS aCAS) throws IOException, CollectionException
-	  {
-	  	JCas jcas;
-	  	InputStream fileInputStream = null;
-	  	Reader fileReader = null;
-	  	
-	  	try
-	    {
-	        jcas = aCAS.getJCas();
-	   	
-	  		//open input stream to file
-	      	File file = (File)iv_files.get(iv_currentIndex);
-	      	fileInputStream = new FileInputStream(file);
-	      	fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
-
-	      	DocumentID documentIDAnnotation = new DocumentID(jcas);
-		    String docID = createDocID(file);
-		    documentIDAnnotation.setDocumentID(docID);
-		    documentIDAnnotation.addToIndexes();
-
-	      	//if there's a CAS Initializer, call it	
-			if (getCasInitializer() != null)
-			{
-				getCasInitializer().initializeCas(fileReader, aCAS);	
-			}
-			else  //No CAS Initializer, so read file and set document text ourselves
-			{				
-				byte[] contents = new byte[(int)file.length() ];
-				fileInputStream.read( contents );   
-				String text;
-				if (iv_encoding != null)
-				{   
-					text = new String(contents, iv_encoding);
-				}
-				else
-				{ 
-					text = new String(contents); 
-				}
-				if(text == null)
-				{
-					System.out.println("text ==null!");
-					System.out.println("docID = "+docID);
-				}
-				//put document in CAS (assume CAS)
-				jcas.setDocumentText(text);
-			}
-	   
-		    //set language if it was explicitly specified as a configuration parameter
-		    if (iv_language != null)
-		    {
-		//      ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
-		    }
-
-	    }		
-	    catch (CASException e)
-	    {
-	      throw new CollectionException(e);
-	    }
-	  	finally
-		{
-			if (fileInputStream != null)
-				fileInputStream.close();
-			iv_currentIndex++;	
-		}  
-	  }
-
-	  private String createDocID(File file)
-	  {
-		    String docID = file.getPath();
-		    if(iv_rootPath.endsWith(""+File.separator) ||
-	           iv_rootPath.equals(""))
-	        {
-	            docID = docID.substring(iv_rootPath.length());
-	        }
-	        else
-	            docID = docID.substring(iv_rootPath.length()+1);
-		   return docID;
-	  }
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
-	   */
-	  public void close() throws IOException
-	  {
-	  }
-
-	  /**
-	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
-	   */
-	  public Progress[] getProgress()
-	  {
-	    return new Progress[]{
-	       new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
-	  }
-
-	  /**
-	   * Gets the total number of documents that will be returned by this
-	   * collection reader.  This is not part of the general collection reader
-	   * interface.
-	   * 
-	   * @return the number of documents in the collection
-	   */
-	  public int getNumberOfDocuments()
-	  {
-	    return iv_files.size();
-	  }
-
-
-}
+package edu.mayo.bmi.uima.core.cr;
+
+/**
+ * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader
+ * and modified for Mayo use.
+ * 
+ * A simple collection reader that reads documents from a directory 
+ * in the filesystem.  It can be configured with the following parameters:
+ * <ul>
+ *   <li><code>InputDirectory</code> - path to directory containing files</li>
+ *   <li><code>Encoding</code> (optional) - character encoding of the input 
+ *      files</li>
+ *   <li><code>Language</code> (optional) - language of the input documents</li>
+ *   <li><code>Extensions</code> (optional) - Name of optional configuration 
+ *   parameter that specifies the extensions of the files that the 
+ *   collection reader will read.  
+ * </ul> 
+ * 
+ * TODO We may need to provide a way to specify some portion of the path of the file
+ * to be included in the id of the document especially if we extend to recursively 
+ * gather files in the directory from sub directories.    
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
+import org.apache.uima.collection.CollectionException;
+import org.apache.uima.collection.CollectionReader_ImplBase;
+import org.apache.uima.jcas.JCas;
+//import org.apache.uima.jcas.tcas.DocumentAnnotation;
+import org.apache.uima.resource.ResourceConfigurationException;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.Progress;
+import org.apache.uima.util.ProgressImpl;
+
+import edu.mayo.bmi.uima.core.type.structured.DocumentID;
+
+public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase 
+{
+	/**
+	   * Name of configuration parameter that must be set to the path of
+	   * a directory containing input files.
+	   */
+	  public static final String PARAM_INPUTDIR = "InputDirectory";
+
+	  /**
+	   * Name of configuration parameter that contains the character encoding used
+	   * by the input files.  If not specified, the default system encoding will
+	   * be used.
+	   */
+	  public static final String PARAM_ENCODING = "Encoding";
+
+	  /**
+	   * Name of optional configuration parameter that contains the language of
+	   * the documents in the input directory.  If specified this information will
+	   * be added to the CAS.
+	   */
+	  public static final String PARAM_LANGUAGE = "Language";
+
+	  /**Name of optional configuration parameter that specifies the extensions
+	     * of the files that the collection reader will read.  Values for this
+	     * parameter should not begin with a dot <code>'.'</code>.
+	     */
+	    
+	  public static final String PARAM_EXTENSIONS = "Extensions";
+	    
+	  public static final String PARAM_RECURSE = "Recurse";
+	  
+	  protected ArrayList iv_files;
+	  private String iv_encoding;
+	  private String iv_language;
+	  private static String[] iv_extensions; 
+
+      protected int iv_currentIndex;
+	  
+      private boolean iv_recurse = false;
+      
+      private String iv_rootPath = "";
+      
+	  /**
+	   * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize()
+	   */
+	public void initialize() throws ResourceInitializationException
+	{
+	    File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR));
+	    iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING);
+	    iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE);
+	    iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS);
+	    
+	    iv_currentIndex = 0;
+
+	    iv_recurse = false;
+	    Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE);
+	    if(recurse != null)
+	    	iv_recurse = recurse.booleanValue();
+	    iv_rootPath = directory.getPath();
+    	
+		//if input directory does not exist or is not a directory, throw exception
+		if (!directory.exists() || !directory.isDirectory())
+		{
+			throw new ResourceInitializationException(
+				ResourceConfigurationException.DIRECTORY_NOT_FOUND,
+				new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()});
+		}
+		
+		
+	    //get list of files (not subdirectories) in the specified directory
+	    iv_files = new ArrayList();
+	    if(!iv_recurse)
+	    {
+	    	File[] files = directory.listFiles();
+	    	for (int i = 0; i < files.length; i++)
+	    	{
+	    		if (!files[i].isDirectory() && hasValidExtension(files[i]))
+	    		{
+	    			iv_files.add(files[i]);  
+	    		}
+	    	}
+	    }
+	    else
+	    {
+	    	try
+	    	{
+	    		collectFiles(directory, iv_files);
+	    		System.out.println("iv_files.size()="+iv_files.size());
+	    	}
+	    	catch(IOException ioe)
+	    	{
+	    		throw new ResourceInitializationException(ioe);
+	    	}
+	    }
+    }
+	
+    private void collectFiles(File directory, List files) throws IOException
+    {
+        File[] dirFiles = directory.listFiles();
+        for(int i=0; i<dirFiles.length;i++)
+        {
+        	if(dirFiles[i].isDirectory())
+        	{
+                collectFiles(dirFiles[i], files);
+            }
+        	else if(hasValidExtension(dirFiles[i]))
+        	{
+        		files.add(dirFiles[i]);	
+        	}
+        }
+    }
+
+	
+    private boolean hasValidExtension(File file)
+    {
+	    if(iv_extensions == null) return true;
+	    for (int i = 0; i < iv_extensions.length; i++) 
+	    {
+		    if(file.getName().endsWith("."+iv_extensions[i]))
+		    {
+			    return true;
+		    }
+	    }
+	    return false;
+    }
+	 
+	
+    /**
+     * @see org.apache.uima.collection.CollectionReader#hasNext()
+     */
+	public boolean hasNext()
+	{
+		return iv_currentIndex < iv_files.size();
+	}
+
+	  /**
+	   * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS)
+	   */
+	  public void getNext(CAS aCAS) throws IOException, CollectionException
+	  {
+	  	JCas jcas;
+	  	InputStream fileInputStream = null;
+	  	Reader fileReader = null;
+	  	
+	  	try
+	    {
+	        jcas = aCAS.getJCas();
+	   	
+	  		//open input stream to file
+	      	File file = (File)iv_files.get(iv_currentIndex);
+	      	fileInputStream = new FileInputStream(file);
+	      	fileReader = new BufferedReader(new InputStreamReader(fileInputStream));
+
+	      	DocumentID documentIDAnnotation = new DocumentID(jcas);
+		    String docID = createDocID(file);
+		    documentIDAnnotation.setDocumentID(docID);
+		    documentIDAnnotation.addToIndexes();
+
+	      	//if there's a CAS Initializer, call it	
+			if (getCasInitializer() != null)
+			{
+				getCasInitializer().initializeCas(fileReader, aCAS);	
+			}
+			else  //No CAS Initializer, so read file and set document text ourselves
+			{				
+				byte[] contents = new byte[(int)file.length() ];
+				fileInputStream.read( contents );   
+				String text;
+				if (iv_encoding != null)
+				{   
+					text = new String(contents, iv_encoding);
+				}
+				else
+				{ 
+					text = new String(contents); 
+				}
+				if(text == null)
+				{
+					System.out.println("text ==null!");
+					System.out.println("docID = "+docID);
+				}
+				//put document in CAS (assume CAS)
+				jcas.setDocumentText(text);
+			}
+	   
+		    //set language if it was explicitly specified as a configuration parameter
+		    if (iv_language != null)
+		    {
+		//      ((DocumentAnnotation)jcas.getDocumentAnnotationFs()).setLanguage(iv_language);
+		    }
+
+	    }		
+	    catch (CASException e)
+	    {
+	      throw new CollectionException(e);
+	    }
+	  	finally
+		{
+			if (fileInputStream != null)
+				fileInputStream.close();
+			iv_currentIndex++;	
+		}  
+	  }
+
+	  private String createDocID(File file)
+	  {
+		    String docID = file.getPath();
+		    if(iv_rootPath.endsWith(""+File.separator) ||
+	           iv_rootPath.equals(""))
+	        {
+	            docID = docID.substring(iv_rootPath.length());
+	        }
+	        else
+	            docID = docID.substring(iv_rootPath.length()+1);
+		   return docID;
+	  }
+	  /**
+	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close()
+	   */
+	  public void close() throws IOException
+	  {
+	  }
+
+	  /**
+	   * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress()
+	   */
+	  public Progress[] getProgress()
+	  {
+	    return new Progress[]{
+	       new ProgressImpl(iv_currentIndex, iv_files.size(),Progress.ENTITIES)};
+	  }
+
+	  /**
+	   * Gets the total number of documents that will be returned by this
+	   * collection reader.  This is not part of the general collection reader
+	   * interface.
+	   * 
+	   * @return the number of documents in the collection
+	   */
+	  public int getNumberOfDocuments()
+	  {
+	    return iv_files.size();
+	  }
+
+
+}



Mime
View raw message