Return-Path: X-Original-To: apmail-incubator-ctakes-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-ctakes-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 9676FDC88 for ; Wed, 31 Oct 2012 05:28:51 +0000 (UTC) Received: (qmail 51967 invoked by uid 500); 31 Oct 2012 05:28:51 -0000 Delivered-To: apmail-incubator-ctakes-commits-archive@incubator.apache.org Received: (qmail 51936 invoked by uid 500); 31 Oct 2012 05:28:51 -0000 Mailing-List: contact ctakes-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: ctakes-dev@incubator.apache.org Delivered-To: mailing list ctakes-commits@incubator.apache.org Received: (qmail 51919 invoked by uid 99); 31 Oct 2012 05:28:51 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 31 Oct 2012 05:28:51 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 31 Oct 2012 05:28:37 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id CF5552388C63; Wed, 31 Oct 2012 05:27:18 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1403989 [14/28] - in /incubator/ctakes/branches/SHARPn-cTAKES: Constituency Parser/src/org/chboston/cnlp/ctakes/parser/ Constituency Parser/src/org/chboston/cnlp/ctakes/parser/uima/ae/ Constituency Parser/src/org/chboston/cnlp/ctakes/parse... Date: Wed, 31 Oct 2012 05:26:55 -0000 To: ctakes-commits@incubator.apache.org From: james-masanz@apache.org X-Mailer: svnmailer-1.0.8-patched Message-Id: <20121031052718.CF5552388C63@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/HyphenTextModifierImpl.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,287 +14,287 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -/* - * Created on May 23, 2005 - * - * To change the template for this generated file go to - * Window>Preferences>Java>Code Generation>Code and Comments - */ -package edu.mayo.bmi.uima.core.ci; - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileNotFoundException; -import java.io.FileReader; -import java.io.IOException; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import edu.mayo.bmi.nlp.tokenizer.Token; -import edu.mayo.bmi.nlp.tokenizer.Tokenizer; - -/** - * @author Mayo Clinic - * - */ -public class HyphenTextModifierImpl implements TextModifier { - - private Map iv_shouldbeHyphenMap = null; - private int iv_windowSize = 3; // default lookahead window - private Tokenizer iv_tokenizer = null; - - /** - * Default constructor takes a name of the file containing hyphenated - * phrases, with their frequency. - * Currently the frequency is unused.
- * The case of the words in the file is unimportant - we lowercase - * everything when doing compares.
- * The file is delimited with "|" and has two fields:
- * hyphen-term|frequency - */ - public HyphenTextModifierImpl(String hyphenfilename, int windowSize) { - iv_windowSize = windowSize; - iv_tokenizer = new Tokenizer(); - BufferedReader br; - try { - br = new BufferedReader(new FileReader(new File(hyphenfilename))); - - String line = ""; - - iv_shouldbeHyphenMap = new HashMap(); - while ((line = br.readLine()) != null) { - String[] toks = line.split("\\|"); - String[] unh = toks[0].split("\\-"); - String shouldbehyphen = ""; - for (int i = 0; i < unh.length; i++) { - shouldbehyphen += " " + unh[i]; - } - shouldbehyphen = shouldbehyphen.trim().toLowerCase(); - iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1)); - } - } catch (FileNotFoundException e) { - System.err.println("Cannot find the hyphenation file:" + hyphenfilename); - e.printStackTrace(); - } catch (IOException e) { - System.err.println("IOException accessing the hyphenation file:" + hyphenfilename); - e.printStackTrace(); - } - - } - - /** - * Filters out unwanted tokens - newlines. - * - * @param tokenList - */ - private void filterTokens(List tokenList) { - - List removalList = new ArrayList(); - Iterator tokenItr = tokenList.iterator(); - - while (tokenItr.hasNext()) { - Token token = (Token) tokenItr.next(); - if (token.getType() == Token.TYPE_EOL) { - removalList.add(token); - } - } - - tokenList.removeAll(removalList); - } - - /* - * (non-Javadoc) - * - * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String) - */ - public TextModification[] modify(String in) throws Exception { - - // intermediate data structure to use for easy adding of new - // TextModification objects - ArrayList textmods = new ArrayList(); - - // Tokenize the input to get offset information - List inputtoks = iv_tokenizer.tokenizeAndSort(in); - - filterTokens(inputtoks); - - int orig_startOffset = 0; - int orig_endOffset = 0; - int new_startOffset = 0; - int new_endOffset = 0; - - int i = 0; - int j = 0; - int end_offset_adj = 0; - int start_offset_adj = 0; - - while (i < inputtoks.size()) { - - if (inputtoks.size() - (i + 1) < iv_windowSize) { - j = inputtoks.size() - 1; - } else { - j = i + iv_windowSize; - } - - while (j > i) { - - StringBuffer candSB = new StringBuffer(); - for (int k = i; k <= j; k++) { - Token currtok = (Token) inputtoks.get(k); - candSB.append(" "); - candSB.append(currtok.getText()); - } - String cand = candSB.toString().trim(); - - // Attempt to look up the candidate in the hyphen map - if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) { - - // set the initial offsets - orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset(); - orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset(); - new_startOffset = orig_startOffset; - new_endOffset = orig_endOffset; - - // compile new text - String newText = ""; - for (int k = i; k <= j; k++) { - Token currtok = (Token) inputtoks.get(k); - newText += currtok.getText() + "-"; - } - newText = newText.substring(0, newText.length() - 1); - - // Get the new and old lengths of hyphenated spans - int new_Length = newText.length(); - int orig_Length = orig_endOffset - orig_startOffset; - - // Pad the end offset adjuster by the new amount - end_offset_adj += orig_Length - new_Length; - - // Create a new modification object - TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset - - start_offset_adj, new_endOffset - end_offset_adj, newText); - - // Adjust the start offset on the next Text Modification - // object - start_offset_adj += orig_Length - new_Length; - - // Put the newly created TextMod object into a temporary - // holder - textmods.add(tm); - - i = j; - } - j--; - } - - i++; - } - - // generate the expected return as an array of TextModification objects - TextModification[] tma = new TextModification[textmods.size()]; - for (int y = 0; y < tma.length; y++) { - tma[y] = (TextModification) textmods.get(y); - } - - return tma; - } - - - /** - * Apply text modifier to the text
- * TODO - move this to TextModifier and take a Logger - * See HyphenTextModifierImpl - * @param tm TextModifier to apply - * @param text Original text - * @param sb Buffer containing text to apply modifier to - * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method - * @throws Exception - */ - private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception { - boolean unableToModifyText = false; - TextModification[] textModArr = tm.modify(text); - for (int i = 0; i < textModArr.length; i++) { - - TextModification textMod = textModArr[i]; - - if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset()) - || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) { - System.err.println("UNSUPPORTED: TextModification with offset changes."); - unableToModifyText = true; - } - else { - sb.replace(textMod.getOrigStartOffset(), - textMod.getOrigEndOffset(), - textMod.getNewText()); - } - } - return unableToModifyText; - } - - public static ArrayList test(HyphenTextModifierImpl tm, String text) { - ArrayList messages = new ArrayList(); - try { - TextModification[] tma = tm.modify(text); - StringBuffer sb = new StringBuffer(text); - boolean errorModifyingText = applyTextModifier(tm,text,sb); - messages.add("Orig: " + text); - if (!errorModifyingText) { - messages.add("New: " + sb); - } - else { - System.err.println("New: (new text not generated, see previous messages)"); - } - // Regardless of whether was able to modify the text - // without - // (_apply_ the TextModifier), output the - // the - for (int u = 0; u < tma.length; u++) { - TextModification tmo = (TextModification) tma[u]; - messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-" - + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset()); - } - } catch (Exception e) { - // TODO Auto-generated catch block - e.printStackTrace(); - } - return messages; - - } - /** - * Simple tests of TextModification - *
- * Output expected:
- * UNSUPPORTED: TextModification with offset changes.
- * UNSUPPORTED: TextModification with offset changes.
- * UNSUPPORTED: TextModification with offset changes.
- * Orig: Non Hodgkin's the x ray without any non small cell complications.
- * New: (new text not generated, see previous messages) - * Non-Hodgkin Orig: 0-12 New: 0-11
- * x-ray Orig: 19-25 New: 18-23
- * non-small-cell Orig: 38-53 New: 36-50
- * - * Orig: Non Hodgkin's the x ray without any non small cell complications.
- * New: Non-Hodgkin's the x-ray without any non-small-cell complications.
- * Non-Hodgkin Orig: 0-11 New: 0-11
- * x-ray Orig: 18-23 New: 18-23
- * non-small-cell Orig: 36-50 New: 36-50
- * Note the case of the words doesn't matter. - * @param args hyphen text filename (each line: hyphenated-word|freq) - */ - public static void main(String[] args) { - ArrayList messages; - HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7); - - String t = "Non Hodgkin's the x ray without any non small cell complications."; - messages = test(tm, t); // extra blanks - for (String s : messages) { System.out.println(s); } - - t = t.replace(" ", " "); // change text to only have single blanks between words - messages = test(tm, t); // single blanks - for (String s : messages) { System.out.println(s); } - } - -} +/* + * Created on May 23, 2005 + * + * To change the template for this generated file go to + * Window>Preferences>Java>Code Generation>Code and Comments + */ +package edu.mayo.bmi.uima.core.ci; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +import edu.mayo.bmi.nlp.tokenizer.Token; +import edu.mayo.bmi.nlp.tokenizer.Tokenizer; + +/** + * @author Mayo Clinic + * + */ +public class HyphenTextModifierImpl implements TextModifier { + + private Map iv_shouldbeHyphenMap = null; + private int iv_windowSize = 3; // default lookahead window + private Tokenizer iv_tokenizer = null; + + /** + * Default constructor takes a name of the file containing hyphenated + * phrases, with their frequency. + * Currently the frequency is unused.
+ * The case of the words in the file is unimportant - we lowercase + * everything when doing compares.
+ * The file is delimited with "|" and has two fields:
+ * hyphen-term|frequency + */ + public HyphenTextModifierImpl(String hyphenfilename, int windowSize) { + iv_windowSize = windowSize; + iv_tokenizer = new Tokenizer(); + BufferedReader br; + try { + br = new BufferedReader(new FileReader(new File(hyphenfilename))); + + String line = ""; + + iv_shouldbeHyphenMap = new HashMap(); + while ((line = br.readLine()) != null) { + String[] toks = line.split("\\|"); + String[] unh = toks[0].split("\\-"); + String shouldbehyphen = ""; + for (int i = 0; i < unh.length; i++) { + shouldbehyphen += " " + unh[i]; + } + shouldbehyphen = shouldbehyphen.trim().toLowerCase(); + iv_shouldbeHyphenMap.put(shouldbehyphen, new Integer(1)); + } + } catch (FileNotFoundException e) { + System.err.println("Cannot find the hyphenation file:" + hyphenfilename); + e.printStackTrace(); + } catch (IOException e) { + System.err.println("IOException accessing the hyphenation file:" + hyphenfilename); + e.printStackTrace(); + } + + } + + /** + * Filters out unwanted tokens - newlines. + * + * @param tokenList + */ + private void filterTokens(List tokenList) { + + List removalList = new ArrayList(); + Iterator tokenItr = tokenList.iterator(); + + while (tokenItr.hasNext()) { + Token token = (Token) tokenItr.next(); + if (token.getType() == Token.TYPE_EOL) { + removalList.add(token); + } + } + + tokenList.removeAll(removalList); + } + + /* + * (non-Javadoc) + * + * @see edu.mayo.bmi.uima.util.ci.TextModifier#modify(java.lang.String) + */ + public TextModification[] modify(String in) throws Exception { + + // intermediate data structure to use for easy adding of new + // TextModification objects + ArrayList textmods = new ArrayList(); + + // Tokenize the input to get offset information + List inputtoks = iv_tokenizer.tokenizeAndSort(in); + + filterTokens(inputtoks); + + int orig_startOffset = 0; + int orig_endOffset = 0; + int new_startOffset = 0; + int new_endOffset = 0; + + int i = 0; + int j = 0; + int end_offset_adj = 0; + int start_offset_adj = 0; + + while (i < inputtoks.size()) { + + if (inputtoks.size() - (i + 1) < iv_windowSize) { + j = inputtoks.size() - 1; + } else { + j = i + iv_windowSize; + } + + while (j > i) { + + StringBuffer candSB = new StringBuffer(); + for (int k = i; k <= j; k++) { + Token currtok = (Token) inputtoks.get(k); + candSB.append(" "); + candSB.append(currtok.getText()); + } + String cand = candSB.toString().trim(); + + // Attempt to look up the candidate in the hyphen map + if (iv_shouldbeHyphenMap.containsKey(cand.toLowerCase())) { + + // set the initial offsets + orig_startOffset = ((Token) inputtoks.get(i)).getStartOffset(); + orig_endOffset = ((Token) inputtoks.get(j)).getEndOffset(); + new_startOffset = orig_startOffset; + new_endOffset = orig_endOffset; + + // compile new text + String newText = ""; + for (int k = i; k <= j; k++) { + Token currtok = (Token) inputtoks.get(k); + newText += currtok.getText() + "-"; + } + newText = newText.substring(0, newText.length() - 1); + + // Get the new and old lengths of hyphenated spans + int new_Length = newText.length(); + int orig_Length = orig_endOffset - orig_startOffset; + + // Pad the end offset adjuster by the new amount + end_offset_adj += orig_Length - new_Length; + + // Create a new modification object + TextModification tm = new TextModification(orig_startOffset, orig_endOffset, new_startOffset + - start_offset_adj, new_endOffset - end_offset_adj, newText); + + // Adjust the start offset on the next Text Modification + // object + start_offset_adj += orig_Length - new_Length; + + // Put the newly created TextMod object into a temporary + // holder + textmods.add(tm); + + i = j; + } + j--; + } + + i++; + } + + // generate the expected return as an array of TextModification objects + TextModification[] tma = new TextModification[textmods.size()]; + for (int y = 0; y < tma.length; y++) { + tma[y] = (TextModification) textmods.get(y); + } + + return tma; + } + + + /** + * Apply text modifier to the text
+ * TODO - move this to TextModifier and take a Logger + * See HyphenTextModifierImpl + * @param tm TextModifier to apply + * @param text Original text + * @param sb Buffer containing text to apply modifier to + * @return unableToModifyText true if modifier would require offset changes, which is not supported by this method + * @throws Exception + */ + private static boolean applyTextModifier(TextModifier tm, String text, StringBuffer sb) throws Exception { + boolean unableToModifyText = false; + TextModification[] textModArr = tm.modify(text); + for (int i = 0; i < textModArr.length; i++) { + + TextModification textMod = textModArr[i]; + + if ((textMod.getOrigStartOffset() != textMod.getNewStartOffset()) + || (textMod.getOrigEndOffset() != textMod.getNewEndOffset())) { + System.err.println("UNSUPPORTED: TextModification with offset changes."); + unableToModifyText = true; + } + else { + sb.replace(textMod.getOrigStartOffset(), + textMod.getOrigEndOffset(), + textMod.getNewText()); + } + } + return unableToModifyText; + } + + public static ArrayList test(HyphenTextModifierImpl tm, String text) { + ArrayList messages = new ArrayList(); + try { + TextModification[] tma = tm.modify(text); + StringBuffer sb = new StringBuffer(text); + boolean errorModifyingText = applyTextModifier(tm,text,sb); + messages.add("Orig: " + text); + if (!errorModifyingText) { + messages.add("New: " + sb); + } + else { + System.err.println("New: (new text not generated, see previous messages)"); + } + // Regardless of whether was able to modify the text + // without + // (_apply_ the TextModifier), output the + // the + for (int u = 0; u < tma.length; u++) { + TextModification tmo = (TextModification) tma[u]; + messages.add(tmo.getNewText() + " Orig: " + tmo.getOrigStartOffset() + "-" + + tmo.getOrigEndOffset() + " New: " + tmo.getNewStartOffset() + "-" + tmo.getNewEndOffset()); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return messages; + + } + /** + * Simple tests of TextModification + *
+ * Output expected:
+ * UNSUPPORTED: TextModification with offset changes.
+ * UNSUPPORTED: TextModification with offset changes.
+ * UNSUPPORTED: TextModification with offset changes.
+ * Orig: Non Hodgkin's the x ray without any non small cell complications.
+ * New: (new text not generated, see previous messages) + * Non-Hodgkin Orig: 0-12 New: 0-11
+ * x-ray Orig: 19-25 New: 18-23
+ * non-small-cell Orig: 38-53 New: 36-50
+ * + * Orig: Non Hodgkin's the x ray without any non small cell complications.
+ * New: Non-Hodgkin's the x-ray without any non-small-cell complications.
+ * Non-Hodgkin Orig: 0-11 New: 0-11
+ * x-ray Orig: 18-23 New: 18-23
+ * non-small-cell Orig: 36-50 New: 36-50
+ * Note the case of the words doesn't matter. + * @param args hyphen text filename (each line: hyphenated-word|freq) + */ + public static void main(String[] args) { + ArrayList messages; + HyphenTextModifierImpl tm = new HyphenTextModifierImpl(args[0], 7); + + String t = "Non Hodgkin's the x ray without any non small cell complications."; + messages = test(tm, t); // extra blanks + for (String s : messages) { System.out.println(s); } + + t = t.replace(" ", " "); // change text to only have single blanks between words + messages = test(tm, t); // single blanks + for (String s : messages) { System.out.println(s); } + } + +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModification.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,62 +14,62 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.uima.core.ci; - -/** - * Value object class that describes a modification of document text. This - * object tracks the original text and the new replacement text. - */ -public class TextModification -{ - private int iv_origStartOffset; - private int iv_origEndOffset; - - private int iv_newStartOffset; - private int iv_newEndOffset; - private String iv_newText; - - /** - * Constructor - * - * @param origStartOffset - * @param origEndOffset - * @param newStartOffset - * @param newEndOffset - * @param newText - */ - public TextModification(int origStartOffset, int origEndOffset, - int newStartOffset, int newEndOffset, String newText) - { - iv_origStartOffset = origStartOffset; - iv_origEndOffset = origEndOffset; - iv_newStartOffset = newStartOffset; - iv_newEndOffset = newEndOffset; - iv_newText = newText; - } - - public int getNewEndOffset() - { - return iv_newEndOffset; - } - - public int getNewStartOffset() - { - return iv_newStartOffset; - } - - public String getNewText() - { - return iv_newText; - } - - public int getOrigEndOffset() - { - return iv_origEndOffset; - } - - public int getOrigStartOffset() - { - return iv_origStartOffset; - } -} \ No newline at end of file +package edu.mayo.bmi.uima.core.ci; + +/** + * Value object class that describes a modification of document text. This + * object tracks the original text and the new replacement text. + */ +public class TextModification +{ + private int iv_origStartOffset; + private int iv_origEndOffset; + + private int iv_newStartOffset; + private int iv_newEndOffset; + private String iv_newText; + + /** + * Constructor + * + * @param origStartOffset + * @param origEndOffset + * @param newStartOffset + * @param newEndOffset + * @param newText + */ + public TextModification(int origStartOffset, int origEndOffset, + int newStartOffset, int newEndOffset, String newText) + { + iv_origStartOffset = origStartOffset; + iv_origEndOffset = origEndOffset; + iv_newStartOffset = newStartOffset; + iv_newEndOffset = newEndOffset; + iv_newText = newText; + } + + public int getNewEndOffset() + { + return iv_newEndOffset; + } + + public int getNewStartOffset() + { + return iv_newStartOffset; + } + + public String getNewText() + { + return iv_newText; + } + + public int getOrigEndOffset() + { + return iv_origEndOffset; + } + + public int getOrigStartOffset() + { + return iv_origStartOffset; + } +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/ci/TextModifier.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,21 +14,21 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.uima.core.ci; - -/** - * Defines a generic interface for modifying text. - */ -public interface TextModifier -{ - /** - * Generates modifications for the specified text. - * - * @param text - * Original document text. - * @return Array of TextModification objects that describe the - * modifications. Offset values are relative to the String object. - * @throws Exception - */ - public TextModification[] modify(String text) throws Exception; -} \ No newline at end of file +package edu.mayo.bmi.uima.core.ci; + +/** + * Defines a generic interface for modifying text. + */ +public interface TextModifier +{ + /** + * Generates modifications for the specified text. + * + * @param text + * Original document text. + * @return Array of TextModification objects that describe the + * modifications. Offset values are relative to the String object. + * @throws Exception + */ + public TextModification[] modify(String text) throws Exception; +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionCyclicalReads.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,183 +14,183 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.uima.core.cr; - -/** - * @author Mayo Clinic - * @version 1.0 - * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader - * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds - * the capability to specify the number of documents to process. - * - * A simple collection reader that reads documents from a directory - * in the filesystem. It can be configured with the following parameters: - *
    - *
  • InputDirectory - path to directory containing files
  • - *
  • Encoding (optional) - character encoding of the input - * files
  • - *
  • Language (optional) - language of the input documents
  • - *
  • Extensions (optional) - Name of optional configuration - * parameter that specifies the extensions of the files that the - * collection reader will read.
  • - *
  • NumberOfIterations (optional) - actual number of files to be processed
  • - *
- * - * TODO We may need to provide a way to specify some portion of the path of the file - * to be included in the id of the document especially if we extend to recursively - * gather files in the directory from sub directories. - */ - -import java.io.IOException; - - -import org.apache.uima.cas.CAS; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - - -public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader -{ - /** - * Name of configuration parameter that must be set to the path of - * a directory containing input files. - */ - public static final String PARAM_INPUTDIR = "InputDirectory"; - - /** - * Name of configuration parameter that contains the character encoding used - * by the input files. If not specified, the default system encoding will - * be used. - */ - public static final String PARAM_ENCODING = "Encoding"; - - /** - * Name of optional configuration parameter that contains the language of - * the documents in the input directory. If specified this information will - * be added to the CAS. - */ - public static final String PARAM_LANGUAGE = "Language"; - - /**Name of optional configuration parameter that specifies the extensions - * of the files that the collection reader will read. Values for this - * parameter should not begin with a dot '.'. - */ - - public static final String PARAM_EXTENSIONS = "Extensions"; - - /**Arguement to equate to # of times it should read the files. - * Takes this argument to equate to # of times it should read the files. - */ - - public static final String PARAM_NUMREADS = "NumberOfIterations"; - - public static final String PARAM_RECURSE = "Recurse"; - private int iv_iteration; - private int scaleTime, totalNumFiles, remainTimes; - - /** - * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() - */ - public void initialize() throws ResourceInitializationException { - - super.initialize(); - totalNumFiles = iv_files.size(); - iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS)) - .intValue(); - if (iv_iteration > totalNumFiles) { - scaleTime = iv_iteration / totalNumFiles; - remainTimes = iv_iteration % totalNumFiles; - } else - scaleTime = -1; - - } - - /** - * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except - * interations represents the actual number of documents to be processed, so if the - * total number of documents in a queue is more than the 'Iterations' value then only - * the iteration amount will be processed. Multiples of the total available documents - * will be provided to supplement the list required to meet the total iteration value. - */ - public boolean hasNext() - { - - // If hasNext false then start over only if count that has been passed to the contructor hasn't been reached. - boolean doNext = iv_currentIndex < totalNumFiles; - - if ((!doNext) && (scaleTime > 0)) { - scaleTime--; - if (scaleTime > 0) { - iv_currentIndex = 0; - doNext = true; - } - else if (remainTimes > 0){ - iv_currentIndex = 0; - totalNumFiles = remainTimes; - remainTimes=0; - doNext = true; - } - - } - if (scaleTime == -1) { - if (iv_currentIndex < iv_iteration) - doNext = true; - else - doNext = false; - - } - - return doNext; - } - - /** - * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) - */ - public void getNext(CAS aCAS) throws IOException, CollectionException - { - - super.getNext(aCAS); - - - } - - - /** - * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close() - */ - public void close() throws IOException - { - super.close(); - } - - /** - * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress() - */ - public Progress[] getProgress() { - int offSet = iv_currentIndex; - if ((scaleTime > 0) && (iv_currentIndex > 0)) - offSet = iv_currentIndex*(1/scaleTime); - if (scaleTime == 0){ - offSet = iv_iteration + remainTimes; - } - - return new Progress[] { new ProgressImpl( offSet , - iv_iteration, Progress.ENTITIES) }; - } - - /** - * Gets the total number of documents that will be returned by this - * collection reader. This is not part of the general collection reader - * interface. - * - * @return the number of documents in the collection - */ - public int getNumberOfDocuments() - { - return iv_files.size(); - } - - -} +package edu.mayo.bmi.uima.core.cr; + +/** + * @author Mayo Clinic + * @version 1.0 + * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader + * and modified for Mayo use. This inherits from FilesInDirectoryCollectionReader and adds + * the capability to specify the number of documents to process. + * + * A simple collection reader that reads documents from a directory + * in the filesystem. It can be configured with the following parameters: + *
    + *
  • InputDirectory - path to directory containing files
  • + *
  • Encoding (optional) - character encoding of the input + * files
  • + *
  • Language (optional) - language of the input documents
  • + *
  • Extensions (optional) - Name of optional configuration + * parameter that specifies the extensions of the files that the + * collection reader will read.
  • + *
  • NumberOfIterations (optional) - actual number of files to be processed
  • + *
+ * + * TODO We may need to provide a way to specify some portion of the path of the file + * to be included in the id of the document especially if we extend to recursively + * gather files in the directory from sub directories. + */ + +import java.io.IOException; + + +import org.apache.uima.cas.CAS; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; + + +public class FilesInDirectoryCollectionCyclicalReads extends FilesInDirectoryCollectionReader +{ + /** + * Name of configuration parameter that must be set to the path of + * a directory containing input files. + */ + public static final String PARAM_INPUTDIR = "InputDirectory"; + + /** + * Name of configuration parameter that contains the character encoding used + * by the input files. If not specified, the default system encoding will + * be used. + */ + public static final String PARAM_ENCODING = "Encoding"; + + /** + * Name of optional configuration parameter that contains the language of + * the documents in the input directory. If specified this information will + * be added to the CAS. + */ + public static final String PARAM_LANGUAGE = "Language"; + + /**Name of optional configuration parameter that specifies the extensions + * of the files that the collection reader will read. Values for this + * parameter should not begin with a dot '.'. + */ + + public static final String PARAM_EXTENSIONS = "Extensions"; + + /**Arguement to equate to # of times it should read the files. + * Takes this argument to equate to # of times it should read the files. + */ + + public static final String PARAM_NUMREADS = "NumberOfIterations"; + + public static final String PARAM_RECURSE = "Recurse"; + private int iv_iteration; + private int scaleTime, totalNumFiles, remainTimes; + + /** + * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() + */ + public void initialize() throws ResourceInitializationException { + + super.initialize(); + totalNumFiles = iv_files.size(); + iv_iteration = ((Integer) getConfigParameterValue(PARAM_NUMREADS)) + .intValue(); + if (iv_iteration > totalNumFiles) { + scaleTime = iv_iteration / totalNumFiles; + remainTimes = iv_iteration % totalNumFiles; + } else + scaleTime = -1; + + } + + /** + * Similar to 'org.apache.uima.collection.CollectionReader' method hasNext() except + * interations represents the actual number of documents to be processed, so if the + * total number of documents in a queue is more than the 'Iterations' value then only + * the iteration amount will be processed. Multiples of the total available documents + * will be provided to supplement the list required to meet the total iteration value. + */ + public boolean hasNext() + { + + // If hasNext false then start over only if count that has been passed to the contructor hasn't been reached. + boolean doNext = iv_currentIndex < totalNumFiles; + + if ((!doNext) && (scaleTime > 0)) { + scaleTime--; + if (scaleTime > 0) { + iv_currentIndex = 0; + doNext = true; + } + else if (remainTimes > 0){ + iv_currentIndex = 0; + totalNumFiles = remainTimes; + remainTimes=0; + doNext = true; + } + + } + if (scaleTime == -1) { + if (iv_currentIndex < iv_iteration) + doNext = true; + else + doNext = false; + + } + + return doNext; + } + + /** + * @see org.apache.uima.collection.CollectionReader#getNext(org.apache.uima.cas.CAS) + */ + public void getNext(CAS aCAS) throws IOException, CollectionException + { + + super.getNext(aCAS); + + + } + + + /** + * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#close() + */ + public void close() throws IOException + { + super.close(); + } + + /** + * @see org.apache.uima.collection.base_cpm.BaseCollectionReader#getProgress() + */ + public Progress[] getProgress() { + int offSet = iv_currentIndex; + if ((scaleTime > 0) && (iv_currentIndex > 0)) + offSet = iv_currentIndex*(1/scaleTime); + if (scaleTime == 0){ + offSet = iv_iteration + remainTimes; + } + + return new Progress[] { new ProgressImpl( offSet , + iv_iteration, Progress.ENTITIES) }; + } + + /** + * Gets the total number of documents that will be returned by this + * collection reader. This is not part of the general collection reader + * interface. + * + * @return the number of documents in the collection + */ + public int getNumberOfDocuments() + { + return iv_files.size(); + } + + +} Modified: incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java URL: http://svn.apache.org/viewvc/incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java?rev=1403989&r1=1403988&r2=1403989&view=diff ============================================================================== --- incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java (original) +++ incubator/ctakes/branches/SHARPn-cTAKES/core/src/edu/mayo/bmi/uima/core/cr/FilesInDirectoryCollectionReader.java Wed Oct 31 05:26:43 2012 @@ -1,18 +1,11 @@ /* - * Copyright: (c) 2009 Mayo Foundation for Medical Education and - * Research (MFMER). All rights reserved. MAYO, MAYO CLINIC, and the - * triple-shield Mayo logo are trademarks and service marks of MFMER. - * - * Except as contained in the copyright notice above, or as used to identify - * MFMER as the author of this software, the trade names, trademarks, service - * marks, or product names of the copyright holder shall not be used in - * advertising, promotion or otherwise in connection with this software without - * prior written authorization of the copyright holder. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software @@ -21,295 +14,295 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package edu.mayo.bmi.uima.core.cr; - -/** - * The original code was copied from org.apache.uima.examples.cpe.FileSystemCollectionReader - * and modified for Mayo use. - * - * A simple collection reader that reads documents from a directory - * in the filesystem. It can be configured with the following parameters: - *
    - *
  • InputDirectory - path to directory containing files
  • - *
  • Encoding (optional) - character encoding of the input - * files
  • - *
  • Language (optional) - language of the input documents
  • - *
  • Extensions (optional) - Name of optional configuration - * parameter that specifies the extensions of the files that the - * collection reader will read. - *
- * - * TODO We may need to provide a way to specify some portion of the path of the file - * to be included in the id of the document especially if we extend to recursively - * gather files in the directory from sub directories. - */ - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.Reader; -import java.util.ArrayList; -import java.util.List; - -import org.apache.uima.cas.CAS; -import org.apache.uima.cas.CASException; -import org.apache.uima.collection.CollectionException; -import org.apache.uima.collection.CollectionReader_ImplBase; -import org.apache.uima.jcas.JCas; -//import org.apache.uima.jcas.tcas.DocumentAnnotation; -import org.apache.uima.resource.ResourceConfigurationException; -import org.apache.uima.resource.ResourceInitializationException; -import org.apache.uima.util.Progress; -import org.apache.uima.util.ProgressImpl; - -import edu.mayo.bmi.uima.core.type.structured.DocumentID; - -public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase -{ - /** - * Name of configuration parameter that must be set to the path of - * a directory containing input files. - */ - public static final String PARAM_INPUTDIR = "InputDirectory"; - - /** - * Name of configuration parameter that contains the character encoding used - * by the input files. If not specified, the default system encoding will - * be used. - */ - public static final String PARAM_ENCODING = "Encoding"; - - /** - * Name of optional configuration parameter that contains the language of - * the documents in the input directory. If specified this information will - * be added to the CAS. - */ - public static final String PARAM_LANGUAGE = "Language"; - - /**Name of optional configuration parameter that specifies the extensions - * of the files that the collection reader will read. Values for this - * parameter should not begin with a dot '.'. - */ - - public static final String PARAM_EXTENSIONS = "Extensions"; - - public static final String PARAM_RECURSE = "Recurse"; - - protected ArrayList iv_files; - private String iv_encoding; - private String iv_language; - private static String[] iv_extensions; - - protected int iv_currentIndex; - - private boolean iv_recurse = false; - - private String iv_rootPath = ""; - - /** - * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() - */ - public void initialize() throws ResourceInitializationException - { - File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR)); - iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING); - iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE); - iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS); - - iv_currentIndex = 0; - - iv_recurse = false; - Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE); - if(recurse != null) - iv_recurse = recurse.booleanValue(); - iv_rootPath = directory.getPath(); - - //if input directory does not exist or is not a directory, throw exception - if (!directory.exists() || !directory.isDirectory()) - { - throw new ResourceInitializationException( - ResourceConfigurationException.DIRECTORY_NOT_FOUND, - new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()}); - } - - - //get list of files (not subdirectories) in the specified directory - iv_files = new ArrayList(); - if(!iv_recurse) - { - File[] files = directory.listFiles(); - for (int i = 0; i < files.length; i++) - { - if (!files[i].isDirectory() && hasValidExtension(files[i])) - { - iv_files.add(files[i]); - } - } - } - else - { - try - { - collectFiles(directory, iv_files); - System.out.println("iv_files.size()="+iv_files.size()); - } - catch(IOException ioe) - { - throw new ResourceInitializationException(ioe); - } - } - } - - private void collectFiles(File directory, List files) throws IOException - { - File[] dirFiles = directory.listFiles(); - for(int i=0; i + *
  • InputDirectory - path to directory containing files
  • + *
  • Encoding (optional) - character encoding of the input + * files
  • + *
  • Language (optional) - language of the input documents
  • + *
  • Extensions (optional) - Name of optional configuration + * parameter that specifies the extensions of the files that the + * collection reader will read. + * + * + * TODO We may need to provide a way to specify some portion of the path of the file + * to be included in the id of the document especially if we extend to recursively + * gather files in the directory from sub directories. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; +import org.apache.uima.collection.CollectionException; +import org.apache.uima.collection.CollectionReader_ImplBase; +import org.apache.uima.jcas.JCas; +//import org.apache.uima.jcas.tcas.DocumentAnnotation; +import org.apache.uima.resource.ResourceConfigurationException; +import org.apache.uima.resource.ResourceInitializationException; +import org.apache.uima.util.Progress; +import org.apache.uima.util.ProgressImpl; + +import edu.mayo.bmi.uima.core.type.structured.DocumentID; + +public class FilesInDirectoryCollectionReader extends CollectionReader_ImplBase +{ + /** + * Name of configuration parameter that must be set to the path of + * a directory containing input files. + */ + public static final String PARAM_INPUTDIR = "InputDirectory"; + + /** + * Name of configuration parameter that contains the character encoding used + * by the input files. If not specified, the default system encoding will + * be used. + */ + public static final String PARAM_ENCODING = "Encoding"; + + /** + * Name of optional configuration parameter that contains the language of + * the documents in the input directory. If specified this information will + * be added to the CAS. + */ + public static final String PARAM_LANGUAGE = "Language"; + + /**Name of optional configuration parameter that specifies the extensions + * of the files that the collection reader will read. Values for this + * parameter should not begin with a dot '.'. + */ + + public static final String PARAM_EXTENSIONS = "Extensions"; + + public static final String PARAM_RECURSE = "Recurse"; + + protected ArrayList iv_files; + private String iv_encoding; + private String iv_language; + private static String[] iv_extensions; + + protected int iv_currentIndex; + + private boolean iv_recurse = false; + + private String iv_rootPath = ""; + + /** + * @see org.apache.uima.collection.CollectionReader_ImplBase#initialize() + */ + public void initialize() throws ResourceInitializationException + { + File directory = new File((String)getConfigParameterValue(PARAM_INPUTDIR)); + iv_encoding = (String)getConfigParameterValue(PARAM_ENCODING); + iv_language = (String)getConfigParameterValue(PARAM_LANGUAGE); + iv_extensions = (String[]) getConfigParameterValue(PARAM_EXTENSIONS); + + iv_currentIndex = 0; + + iv_recurse = false; + Boolean recurse = (Boolean) getConfigParameterValue(PARAM_RECURSE); + if(recurse != null) + iv_recurse = recurse.booleanValue(); + iv_rootPath = directory.getPath(); + + //if input directory does not exist or is not a directory, throw exception + if (!directory.exists() || !directory.isDirectory()) + { + throw new ResourceInitializationException( + ResourceConfigurationException.DIRECTORY_NOT_FOUND, + new Object[]{PARAM_INPUTDIR, this.getMetaData().getName(), directory.getPath()}); + } + + + //get list of files (not subdirectories) in the specified directory + iv_files = new ArrayList(); + if(!iv_recurse) + { + File[] files = directory.listFiles(); + for (int i = 0; i < files.length; i++) + { + if (!files[i].isDirectory() && hasValidExtension(files[i])) + { + iv_files.add(files[i]); + } + } + } + else + { + try + { + collectFiles(directory, iv_files); + System.out.println("iv_files.size()="+iv_files.size()); + } + catch(IOException ioe) + { + throw new ResourceInitializationException(ioe); + } + } + } + + private void collectFiles(File directory, List files) throws IOException + { + File[] dirFiles = directory.listFiles(); + for(int i=0; i