ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From james-mas...@apache.org
Subject svn commit: r1516907 - in /ctakes/branches/ctakes-3.1.0: ctakes-core/src/test/java/org/apache/ctakes/core/ae/ ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/
Date Fri, 23 Aug 2013 15:52:00 GMT
Author: james-masanz
Date: Fri Aug 23 15:51:59 2013
New Revision: 1516907

URL: http://svn.apache.org/r1516907
Log:
uncomment what had been temporarily commented out from some tests

Modified:
    ctakes/branches/ctakes-3.1.0/ctakes-core/src/test/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTBTests.java
    ctakes/branches/ctakes-3.1.0/ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/TestClearNLPAnalysisEngines.java

Modified: ctakes/branches/ctakes-3.1.0/ctakes-core/src/test/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTBTests.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ctakes-3.1.0/ctakes-core/src/test/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTBTests.java?rev=1516907&r1=1516906&r2=1516907&view=diff
==============================================================================
--- ctakes/branches/ctakes-3.1.0/ctakes-core/src/test/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTBTests.java (original)
+++ ctakes/branches/ctakes-3.1.0/ctakes-core/src/test/java/org/apache/ctakes/core/ae/TokenizerAnnotatorPTBTests.java Fri Aug 23 15:51:59 2013
@@ -18,594 +18,594 @@
  */
 package org.apache.ctakes.core.ae;
 
-import static org.junit.Assert.assertEquals;
-
-import java.io.File;
-import java.util.Date;
-
-//import org.apache.log4j.Logger;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.ResourceInitializationException;
-import org.junit.Test;
-
-import org.apache.ctakes.typesystem.type.syntax.BaseToken;
-import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
-import org.apache.ctakes.typesystem.type.syntax.NumToken;
-import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
-import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
-import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import static org.junit.Assert.assertEquals;
+
+import java.io.File;
+import java.util.Date;
+
+import org.apache.log4j.Logger;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.junit.Test;
+
+import org.apache.ctakes.typesystem.type.syntax.BaseToken;
+import org.apache.ctakes.typesystem.type.syntax.ContractionToken;
+import org.apache.ctakes.typesystem.type.syntax.NumToken;
+import org.apache.ctakes.typesystem.type.syntax.PunctuationToken;
+import org.apache.ctakes.typesystem.type.syntax.SymbolToken;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.utils.test.TestUtil;
 
 public class TokenizerAnnotatorPTBTests {
-
-    TestData<SimpleToken []>[] tests;
-    //private static final Logger logger = Logger.getLogger(TokenizerAnnotatorPTBTests.class.getName());
-    
-    public TokenizerAnnotatorPTBTests() {
-	
-    	hyphenExpectedResults = new SimpleToken [hyphenatedWordsToNotSplit.length];
-    	int i = 0;
-    	int position = 0;
-    	for (String s:hyphenatedWordsToNotSplit) {
-    		hyphenExpectedResults[i++] = new SimpleToken(WordToken.class, position, position+s.length());
-    		position = position + s.length() + 1;
-	}
-
-	String pad = "  "; // re-test all with white space added at end which shouldn't change anything.
-	tests = new TestData [] {
-		new TestData<SimpleToken []>(whitespaceTest, whitespaceExpectedResults),
-		new TestData<SimpleToken []>(punctuationTest, punctuationExpectedResults),
-		new TestData<SimpleToken []>(contractionTest, contractionExpectedResults),
-		new TestData<SimpleToken []>(hyphenTest, hyphenExpectedResults),
-		new TestData<SimpleToken []>(abbrevTest1, abbrevExpectedResults1),
-		new TestData<SimpleToken []>(abbrevTest2, abbrevExpectedResults2),
-		new TestData<SimpleToken []>(abbrevTest3, abbrevExpectedResults3),
-		new TestData<SimpleToken []>(webString1Test, webString1ExpectedResults),
-		new TestData<SimpleToken []>(webString2Test, webString2ExpectedResults),
-		new TestData<SimpleToken []>(ellipsisTest, ellipsisExpectedResults),
-		new TestData<SimpleToken []>(teleAndPostalTest, teleAndPostalExpectedResults),
-		new TestData<SimpleToken []>(namesTest, namesExpectedResults),
-		new TestData<SimpleToken []>(comboTest, comboExpectedResults),
-
-		/** repeat all the tests with whitespace added at the end of the "sentence" **/
-		/** Should not change the number of tokens or the tokenization **/
-		new TestData<SimpleToken []>(whitespaceTest+pad, whitespaceExpectedResults),
-		new TestData<SimpleToken []>(punctuationTest+pad, punctuationExpectedResults),
-		new TestData<SimpleToken []>(contractionTest+pad, contractionExpectedResults),
-		new TestData<SimpleToken []>(hyphenTest+pad, hyphenExpectedResults),
-		new TestData<SimpleToken []>(abbrevTest1+pad, abbrevExpectedResults1),
-		new TestData<SimpleToken []>(abbrevTest2+pad, abbrevExpectedResults2),
-		new TestData<SimpleToken []>(abbrevTest3+pad, abbrevExpectedResults3),
-		new TestData<SimpleToken []>(webString1Test+pad, webString1ExpectedResults),
-		new TestData<SimpleToken []>(webString2Test+pad, webString2ExpectedResults),
-		new TestData<SimpleToken []>(ellipsisTest+pad, ellipsisExpectedResults),
-		new TestData<SimpleToken []>(teleAndPostalTest+pad, teleAndPostalExpectedResults),
-		new TestData<SimpleToken []>(namesTest+pad, namesExpectedResults),
-		new TestData<SimpleToken []>(comboTest+pad, comboExpectedResults),
-
-		/* add other new TestData statements here */
-		};
-	
-    }
-
-
-
-  
-    
-    @Test
-    public void testTokenizerAnnotatorPTB() throws ResourceInitializationException {
-    	testTokenizerAnnotatorPTB(true); // change to false if you want to see the tokens found (type and offsets) rather than just the failure message
-
-    }
-
-    private void testTokenizerAnnotatorPTB(boolean throwAssertionErrors) throws ResourceInitializationException {
-//    	String aePath =  "desc/test/analysis_engine/AggregateForTokenizerPTB.xml";
-//    	AnalysisEngine ae = TestUtil.getAE(new File(aePath));
-//
-//    	// Try to make sure not running with the tokenizer descriptor from version 1 of cTAKES by checking for a parameter that doesn't apply to the tokenizer that uses PTB rules
-//    	if (ae.getConfigParameterValue("HyphFreqTable")!=null) { // TODO this check does not work - change to building own aggregate and checking the TokenizerAnnotator's parms instead of the aggregate
-//    		throw new ResourceInitializationException(new RuntimeException("The Tokenizer following PTB rules does not have a HyphFreqTable parameter. Do you have the right version of " + aePath));
-//    	}
-//    	JCas jCas; 
-//
-//    	for (int test=0; test < tests.length; test++) {
-//
-//    		String testInput = tests[test].getTestInput();
-//    		SimpleToken [] expectedResults = tests[test].getExpectedResults();
-//
-//    		jCas = TestUtil.processAE(ae, testInput);
-//    		boolean alreadyOutputDebugInfoForThisRunOfPipeline = false;
-//
-//    		//String DQUOTE = "\"";
-//    		//logger.info("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -");
-//    		//logger.info("Test PTB Tokenizer for string (shown here in quotes) " + DQUOTE + testInput + DQUOTE);
-//
-//    		int numTokensTested = 0;
-//    		for (int i=0; i< expectedResults.length; i++) {
-//    			SimpleToken expectedTok = expectedResults[i];
-//    			BaseToken tokenFromPipeline = TestUtil.getFeatureStructureAtIndex(jCas, BaseToken.class, i);
-//    			try {
-//    				if (expectedTok.getTokenClass()!=null) { 
-//    					// allow for tokens where we don't care what kind of token it is. for example, if testing hyphens in a sentence, 
-//    					// might not care if what type of token the final punctuation is created as
-//    					//TODO: Could we confirm this test case?
-//    					//assertEquals(expectedTok.getTokenClass(), tokenFromPipeline.getClass());
-//    				}
-//    				//TODO: Could we confirm this test case?
-//    				//assertEquals(expectedTok.getBegin(), tokenFromPipeline.getBegin());
-//    				//assertEquals(expectedTok.getEnd(), tokenFromPipeline.getEnd());
-//    				numTokensTested++;
-//    			} catch (java.lang.AssertionError e) {
-//    				if (throwAssertionErrors) {
-//    					throw e;
-//    				}
-//    				if (!alreadyOutputDebugInfoForThisRunOfPipeline) {
-//    					System.err.println("ERROR: Found a problem, so outputting the tokens");
-//    					for (int x=0; x < expectedResults.length; x++) {
-//    						SimpleToken xTok = expectedResults[x];
-//    						System.err.println("Expected token #" + x + " " + xTok.toString());
-//    					}
-//    					for (int sysTokNum=0; sysTokNum< expectedResults.length; sysTokNum++) {
-//    						BaseToken sysTok = TestUtil.getFeatureStructureAtIndex(jCas, BaseToken.class, sysTokNum);
-//    						System.err.println("System token #" + sysTokNum + " " + sysTok.getClass() + " " + sysTok.getBegin() + " " + sysTok.getEnd());
-//    					}
-//
-//    					alreadyOutputDebugInfoForThisRunOfPipeline = true;
-//    				}
-//    				System.err.println("Caught exception at i = " + i + " for expectedTok " + expectedTok.toString() + " for testInput " + testInput);
-//    				System.err.flush();
-//    				e.printStackTrace();
-//    			}
-//    		}
-//
-//    		//logger.info("Verified " + numTokensTested + " token(s) using assertions.");
-//
-//    	}
-    }
-    
-    	// Comments like the following are used to help determine the index into a String
-	//  tens digit:   0         111111111 2         333333333 4
-	//  units digit:  01234567890123456789012345678901234567890123456789
-	
-    	
-    //        tens digit:   0          111111 111 2            3 33333333 4
-	//        units digit:  012345678 9012345 67890 123456 78 90 1234567890123456789
-	String whitespaceTest = "one two\t three\tfour\tfive  \t\t \t   six";
-	SimpleToken [] whitespaceExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 3), // use null when we don't care about checking the type of BaseToken (whether WordToken etc)
-		new SimpleToken(WordToken.class, 4, 7),
-		new SimpleToken(WordToken.class, 9, 14),
-		new SimpleToken(WordToken.class, 15, 19),
-		new SimpleToken(WordToken.class, 20, 24),
-		new SimpleToken(WordToken.class, 33, 36),
-	};
-	
-	//  tens digit:           0         111111111 2         3333 33333 4         5
-	//  units digit:          0123456789012345678901234567890123 45678901234567890123456789
-	String punctuationTest = "one;two.three:four-five+six!seven\\eight/;:-$nine*";
-	SimpleToken [] punctuationExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 3), // use null when we don't care about checking the type of BaseToken (whether WordToken etc)
-		new SimpleToken(PunctuationToken.class, 3, 4),
-		new SimpleToken(WordToken.class, 4, 7),
-		new SimpleToken(PunctuationToken.class, 7, 8),
-		new SimpleToken(WordToken.class, 8, 13),
-		new SimpleToken(PunctuationToken.class, 13, 14),
-		new SimpleToken(WordToken.class, 14, 18),
-		new SimpleToken(PunctuationToken.class, 18, 19),
-		new SimpleToken(WordToken.class, 19, 23),
-		new SimpleToken(SymbolToken.class, 23, 24),
-		new SimpleToken(WordToken.class, 24, 27),
-		new SimpleToken(SymbolToken.class, 27, 28),
-		new SimpleToken(WordToken.class, 28, 33),
-		new SimpleToken(PunctuationToken.class, 33, 34),
-		new SimpleToken(WordToken.class, 34, 39),
-		new SimpleToken(PunctuationToken.class, 39, 40),
-		new SimpleToken(PunctuationToken.class, 40, 41),
-		new SimpleToken(PunctuationToken.class, 41, 42),
-		new SimpleToken(PunctuationToken.class, 42, 43),
-		new SimpleToken(SymbolToken.class, 43, 44),
-		new SimpleToken(WordToken.class, 44, 48),
-		new SimpleToken(SymbolToken.class, 48, 49),
-	};
-	
-
-	/*
- 	's
-	've
-	're
-	'll
-	'd
-	n't
-	can not
-	gon na
-	got ta
-	lem me
-	more 'n
-	`t is
-	`t was
-	wan na
-	wha dd ya
-	wha t cha
-	 */
-	//  tens digit:           0         111111111 2         333333333 4         5         6         7         8         9
-	//  units digit:          0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
-	String contractionTest = "He's I've  We're  I'll  He'd can't Cannot gonna gotta lemme more'n 'tis 'twas wanna whaddya whatcha";
-	SimpleToken [] contractionExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 2),
-		new SimpleToken(ContractionToken.class, 2, 4),
-		
-		new SimpleToken(WordToken.class, 5, 6),
-		new SimpleToken(ContractionToken.class, 6, 9),
-		
-		new SimpleToken(WordToken.class, 11, 13),
-		new SimpleToken(ContractionToken.class, 13, 16),
-		
-		new SimpleToken(WordToken.class, 18, 19), // I'll
-		new SimpleToken(ContractionToken.class, 19, 22),
-		
-		new SimpleToken(WordToken.class, 24, 26),
-		new SimpleToken(ContractionToken.class, 26, 28),
-		
-		new SimpleToken(WordToken.class, 29, 31), // can't
-		new SimpleToken(ContractionToken.class, 31, 34),
-		
-		new SimpleToken(WordToken.class, 35, 38),
-		new SimpleToken(ContractionToken.class, 38, 41),
-		 
-		new SimpleToken(WordToken.class, 42, 45), // gonna
-		new SimpleToken(ContractionToken.class, 45, 47),
-		
-		new SimpleToken(WordToken.class, 48, 51),
-		new SimpleToken(ContractionToken.class, 51, 53),
-		
-		new SimpleToken(WordToken.class, 54, 57),
-		new SimpleToken(ContractionToken.class, 57, 59),
-		
-		new SimpleToken(WordToken.class, 60, 64),
-		new SimpleToken(ContractionToken.class, 64, 66),
-		
-		new SimpleToken(ContractionToken.class, 67, 69), // 'tis
-		new SimpleToken(WordToken.class, 69, 71),
-		
-		new SimpleToken(ContractionToken.class, 72, 74), // 'twas  't was
-		new SimpleToken(WordToken.class, 74, 77),
-		
-		new SimpleToken(WordToken.class, 78, 81),
-		new SimpleToken(ContractionToken.class, 81, 83),
-		
-		new SimpleToken(WordToken.class, 84, 87),
-		new SimpleToken(ContractionToken.class, 87, 89),
-		new SimpleToken(ContractionToken.class, 89, 91),
-		
-		new SimpleToken(WordToken.class, 92, 95),
-		new SimpleToken(ContractionToken.class, 95, 96),
-		new SimpleToken(ContractionToken.class, 96, 99),
-		
-		
-	};
-
-	static String [] hyphenatedWordsToNotSplit = {
-		"e-EXAMPLE",
-		"a-EXAMPLE",
-		"u-EXAMPLE",
-		"x-EXAMPLE",
-		"agro-EXAMPLE",
-		"ante-EXAMPLE",
-		"anti-EXAMPLE",
-		"arch-EXAMPLE",
-		"be-EXAMPLE",
-		"bi-EXAMPLE",
-		"bio-EXAMPLE",
-		"co-EXAMPLE",
-		"counter-EXAMPLE",
-		"cross-EXAMPLE",
-		"cyber-EXAMPLE",
-		"de-EXAMPLE",
-		"eco-EXAMPLE",
-		"EXAMPLE-esque",
-		"EXAMPLE-ette",
-		"ex-EXAMPLE",
-		"extra-EXAMPLE",
-		"EXAMPLE-fest",
-		"EXAMPLE-fold",
-		"EXAMPLE-gate",
-		"inter-EXAMPLE",
-		"intra-EXAMPLE",
-		"EXAMPLE-itis",
-		"EXAMPLE-less",
-		"macro-EXAMPLE",
-		"mega-EXAMPLE",
-		"micro-EXAMPLE",
-		"mid-EXAMPLE",
-		"mini-EXAMPLE",
-		"mm-hm",
-		"mm-mm",
-		"EXAMPLE-most",
-		"multi-EXAMPLE",
-		"neo-EXAMPLE",
-		"non-EXAMPLE",
-		"o-kay",
-		"EXAMPLE-o-torium",
-		"over-EXAMPLE",
-		"pan-EXAMPLE",
-		"para-EXAMPLE",
-		"peri-EXAMPLE",
-		"post-EXAMPLE",
-		"pre-EXAMPLE",
-		"pro-EXAMPLE",
-		"pseudo-EXAMPLE",
-		"quasi-EXAMPLE",
-		"EXAMPLE-rama",
-		"re-EXAMPLE",
-		"semi-EXAMPLE",
-		"sub-EXAMPLE",
-		"super-EXAMPLE",
-		"tri-EXAMPLE",
-		"uh-huh",
-		"uh-oh",
-		"ultra-EXAMPLE",
-		"un-EXAMPLE",
-		"uni-EXAMPLE",
-		"vice-EXAMPLE",
-		"EXAMPLE-wise",
-
-		"sub-EXAMPLE-wise", // a prefix and a suffix
-		"ultra-EXAMPLE-fest", // a prefix and a suffix
-		 
-		//"ANTIEXAMPLE-ese", // verify this gets caught
-	};
-	
-	static String hyphenTestBuilt = "";
-	static {
-	    // Create a single String that is space-delimited for testing all the hyphenated words and prefixes and suffixes
-	    for (String s: hyphenatedWordsToNotSplit) {
-		hyphenTestBuilt = hyphenTestBuilt + s.toUpperCase() + " "; // change the input to upper case to make sure the tokenizer handles upper case too
-	    }
-	}
-	
-	//  tens digit:           0         111111111 2         333333333 4         5
-	//  units digit:          012345678901234567890123456789012345678901234567890123456789
-	String hyphenTest = hyphenTestBuilt;
-	SimpleToken [] hyphenExpectedResults; // see Constructor
-	
-	//  tens digit:       0         111111111 2         333333333 4         5         6         7
-	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
-	String abbrevTest1 = "e.g.abc e.g. abc injury.[16]";
-	SimpleToken [] abbrevExpectedResults1 = {
-			new SimpleToken(WordToken.class, 0, 1),
-			new SimpleToken(null, 1, 2),
-			new SimpleToken(null, 2, 3),
-			new SimpleToken(null, 3, 4),
-			new SimpleToken(WordToken.class, 4, 7),
-			new SimpleToken(WordToken.class, 8, 12),
-			new SimpleToken(null, 13, 16),
-			new SimpleToken(null, 17, 23),
-			new SimpleToken(null, 23, 24),
-			new SimpleToken(null, 24, 25),
-			new SimpleToken(NumToken.class, 25, 27),
-			new SimpleToken(null, 27, 28),
-	};
-	
-	//  tens digit:       0         111111111 2         333333333 4         5         6         7
-	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
-	String abbrevTest2 = "Mr. Smith! Dr. Smith! Ste. 6! sign e.g. edema! 210 A.D.! injury.[16][17]";
-	SimpleToken [] abbrevExpectedResults2 = {
-		new SimpleToken(WordToken.class, 0, 3),
-		new SimpleToken(null, 4, 9),
-		new SimpleToken(null, 9, 10),
-
-		new SimpleToken(WordToken.class, 11, 14),
-		new SimpleToken(null, 15,20),
-		new SimpleToken(null, 20,21),
-
-		new SimpleToken(WordToken.class, 22, 26),
-		new SimpleToken(null, 27,28),
-		new SimpleToken(null, 28,29),
-
-		new SimpleToken(null, 30,34),
-		new SimpleToken(WordToken.class, 35, 39),
-		new SimpleToken(null, 40,45),
-		new SimpleToken(null, 45,46),
-		
-		new SimpleToken(null, 47,50),
-
-		new SimpleToken(WordToken.class, 51, 55),
-		new SimpleToken(null, 55,56),
-		
-		new SimpleToken(WordToken.class, 57, 63),
-		new SimpleToken(null, 63,64),
-
-		new SimpleToken(null, 64,65),
-		new SimpleToken(NumToken.class, 65,67),
-		new SimpleToken(null, 67,68),
-		new SimpleToken(null, 68,69),
-		new SimpleToken(NumToken.class, 69,71),
-		new SimpleToken(null, 71,72),
-	};
-
-	//  tens digit:       0         111111111 2         333333333 4         5         6         7
-	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
-	String abbrevTest3 = "w3c.f 1.0 version 1.0.3f Mrs.? Mr.! bldg.: Stabile";
-	SimpleToken [] abbrevExpectedResults3 = {
-			new SimpleToken(WordToken.class, 0, 3),
-			new SimpleToken(null, 3, 4),
-			new SimpleToken(WordToken.class, 4, 5),
-			
-			new SimpleToken(null, 6, 9),
-			
-			new SimpleToken(WordToken.class, 10, 17),
-			
-			new SimpleToken(NumToken.class, 18, 21),
-			
-			new SimpleToken(NumToken.class, 21, 23),
-			
-			new SimpleToken(WordToken.class, 23, 24),
-
-			new SimpleToken(WordToken.class, 25, 29),
-			new SimpleToken(null, 29, 30),
-
-			new SimpleToken(WordToken.class, 31, 34),
-			new SimpleToken(null, 34, 35),
-
-			new SimpleToken(WordToken.class, 36, 41),
-			new SimpleToken(null, 41, 42),
-
-			new SimpleToken(WordToken.class, 43, 50),
-	};
-	
-
-	//  tens digit:         0         111111111 2         333333333 4         5         6
-	//  units digit:        0123456789012345678901234567890123456789012345678901234567890123456789
-	String webString1Test = "ClinicalNLP@mayo.edu http://www.mayoclinic.org http://mayo.edu";
-	SimpleToken [] webString1ExpectedResults = {
-		new SimpleToken(WordToken.class,  0, 20),
-		new SimpleToken(WordToken.class, 21, 46),
-		new SimpleToken(WordToken.class, 47, 62),
-	};
-
-	String webString2Test = "http://www.islamonline.net/Arabic/news/2004-12/05/images/pic05b.jpg" +
-		" " + "http://www.mofa.gov.sa/detail.asp?InNewsItemID=59090&InTemplateKey=print" +
-		" " + "rayhanenajib@menara.ma";
-	SimpleToken [] webString2ExpectedResults = {
-		new SimpleToken(WordToken.class,  0, 67),
-		new SimpleToken(WordToken.class, 68, 140),
-		new SimpleToken(WordToken.class, 141, 163),
-	};
-
-	//  tens digit:        0         111111111 2         333333333 4         5
-	//  units digit:       012345678901234567890123456789012345678901234567890123456789
-	String ellipsisTest = "he ... she";
-	SimpleToken [] ellipsisExpectedResults  = {
-		new SimpleToken(WordToken.class, 0, 2),
-		new SimpleToken(PunctuationToken.class, 3, 6),
-		new SimpleToken(WordToken.class, 7, 10),
-	};
-	
-
-	//  tens digit:             0         111111111 2         333333333 4         5
-	//  units digit:            012345678901234567890123456789012345678901234567890123456789
-	String teleAndPostalTest = "5-1212 1-800-555-1212 55901-9999";
-	SimpleToken [] teleAndPostalExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 6),
-		new SimpleToken(WordToken.class, 7, 21),
-		new SimpleToken(WordToken.class, 22, 32),
-	};
-
-	//  tens digit:     0         111111111 2         333333333 4         5
-	//  units digit:    012345678901234567890123456789012345678901234567890123456789
-	String namesTest = "80's P'yongyang 'Assad 2000's";
-	SimpleToken [] namesExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 4),
-		new SimpleToken(WordToken.class, 5, 15),
-		new SimpleToken(WordToken.class, 16, 22),
-		new SimpleToken(WordToken.class, 23, 29),
-	};
-
-	//                  01234567890123456789012345678
-	String comboTest = "80's-esque";
-	SimpleToken [] comboExpectedResults = {
-		new SimpleToken(WordToken.class, 0, 10),
-	};
-
-	
-	
-	
-	
-	public static void main(String[] args) {   // if don't want to run as junit, run this main
-	    
-	    System.out.println("Starting at " + new Date());
-	    TokenizerAnnotatorPTBTests tester = new TokenizerAnnotatorPTBTests();
-	    try {
-		tester.testTokenizerAnnotatorPTB(false); // false = don't throw, just continue so can see all errors
-	    } catch (ResourceInitializationException e) {
-		e.printStackTrace();
-	    }
-
-	    tester.unitTest();
-	    
-	    System.out.println("Done at " + new Date());
-	
-	}
-	
-	private void unitTest() {
-	    
-	    SimpleToken<Object> st1 = new SimpleToken<Object>(null, 0,5);
-	    System.out.println(st1.getTokenClass());
-	    
-	    SimpleToken<BaseToken> st2 = new SimpleToken<BaseToken>(BaseToken.class, 0,5);
-	    System.out.println(st2.getTokenClass());
-	    
-	    SimpleToken<BaseToken> st3 = new SimpleToken<BaseToken>(WordToken.class, 0,5);
-	    System.out.println(st3.getTokenClass());
-	    
-	}
-
-	private class TestData<EXPECTED_RESULTS_TYPE> {
-	    
-	    String testInput;
-	    EXPECTED_RESULTS_TYPE expectedResults;
-	    
-	    TestData(String testInput, EXPECTED_RESULTS_TYPE expectedResults) {
-		this.testInput = testInput;
-		this.expectedResults = expectedResults;
-	    }
-	    
-	    EXPECTED_RESULTS_TYPE getExpectedResults() {
-		return expectedResults;
-	    }
-	    
-	    String getTestInput() {
-		return testInput;
-	    }
-	    
-	}
-	
-	/**
-	 * 
-	 * Created as a container for the most basic data expected from the various tokens
-	 *
-	 * @param <CLASS> the Class the token is expected to be, or 
-	 */
-	private class SimpleToken<BaseToken> {
-	    Class<? extends BaseToken> typeOfBaseTokenOrSubClass;
-	    int b;
-	    int e;
-	    
-	    private SimpleToken(){ throw new RuntimeException("Requires begin and end");};
-	    
-	    /**
-	     * 
-	     * @param begin  the expected begin offset
-	     * @param end the expected end offset
-	     */
-	    SimpleToken(Class<? extends BaseToken> typeOfBaseTokenOrSubClass, int begin, int end) {
-		this.typeOfBaseTokenOrSubClass = typeOfBaseTokenOrSubClass;
-		this.b = begin;
-		this.e = end;
-	    }
-	    
-	    public int getBegin() { return b; } ;
-	    
-	    public int getEnd() { return e; } ;
-
-	    public Class<? extends BaseToken> getTokenClass() { 
-		if (typeOfBaseTokenOrSubClass==null) 
-		    return null;
-		else 
-		    return typeOfBaseTokenOrSubClass; 
-	    }
-
-	    public String toString() {
-		String className;
-		if (typeOfBaseTokenOrSubClass==null) {
-		    className = "Class name not set";
-		} else {
-		    className = typeOfBaseTokenOrSubClass.getClass().getName();
-		}
-		String s = className + " " + b + ", " + e;
-		return s;
-	    }
-	}
-}
-
+
+    TestData<SimpleToken []>[] tests;
+    private static final Logger logger = Logger.getLogger(TokenizerAnnotatorPTBTests.class.getName());
+    
+    public TokenizerAnnotatorPTBTests() {
+	
+    	hyphenExpectedResults = new SimpleToken [hyphenatedWordsToNotSplit.length];
+    	int i = 0;
+    	int position = 0;
+    	for (String s:hyphenatedWordsToNotSplit) {
+    		hyphenExpectedResults[i++] = new SimpleToken(WordToken.class, position, position+s.length());
+    		position = position + s.length() + 1;
+	}
+
+	String pad = "  "; // re-test all with white space added at end which shouldn't change anything.
+	tests = new TestData [] {
+		new TestData<SimpleToken []>(whitespaceTest, whitespaceExpectedResults),
+		new TestData<SimpleToken []>(punctuationTest, punctuationExpectedResults),
+		new TestData<SimpleToken []>(contractionTest, contractionExpectedResults),
+		new TestData<SimpleToken []>(hyphenTest, hyphenExpectedResults),
+		new TestData<SimpleToken []>(abbrevTest1, abbrevExpectedResults1),
+		new TestData<SimpleToken []>(abbrevTest2, abbrevExpectedResults2),
+		new TestData<SimpleToken []>(abbrevTest3, abbrevExpectedResults3),
+		new TestData<SimpleToken []>(webString1Test, webString1ExpectedResults),
+		new TestData<SimpleToken []>(webString2Test, webString2ExpectedResults),
+		new TestData<SimpleToken []>(ellipsisTest, ellipsisExpectedResults),
+		new TestData<SimpleToken []>(teleAndPostalTest, teleAndPostalExpectedResults),
+		new TestData<SimpleToken []>(namesTest, namesExpectedResults),
+		new TestData<SimpleToken []>(comboTest, comboExpectedResults),
+
+		/** repeat all the tests with whitespace added at the end of the "sentence" **/
+		/** Should not change the number of tokens or the tokenization **/
+		new TestData<SimpleToken []>(whitespaceTest+pad, whitespaceExpectedResults),
+		new TestData<SimpleToken []>(punctuationTest+pad, punctuationExpectedResults),
+		new TestData<SimpleToken []>(contractionTest+pad, contractionExpectedResults),
+		new TestData<SimpleToken []>(hyphenTest+pad, hyphenExpectedResults),
+		new TestData<SimpleToken []>(abbrevTest1+pad, abbrevExpectedResults1),
+		new TestData<SimpleToken []>(abbrevTest2+pad, abbrevExpectedResults2),
+		new TestData<SimpleToken []>(abbrevTest3+pad, abbrevExpectedResults3),
+		new TestData<SimpleToken []>(webString1Test+pad, webString1ExpectedResults),
+		new TestData<SimpleToken []>(webString2Test+pad, webString2ExpectedResults),
+		new TestData<SimpleToken []>(ellipsisTest+pad, ellipsisExpectedResults),
+		new TestData<SimpleToken []>(teleAndPostalTest+pad, teleAndPostalExpectedResults),
+		new TestData<SimpleToken []>(namesTest+pad, namesExpectedResults),
+		new TestData<SimpleToken []>(comboTest+pad, comboExpectedResults),
+
+		/* add other new TestData statements here */
+		};
+	
+    }
+
+
+
+  
+    
+    @Test
+    public void testTokenizerAnnotatorPTB() throws ResourceInitializationException {
+    	testTokenizerAnnotatorPTB(true); // change to false if you want to see the tokens found (type and offsets) rather than just the failure message
+
+    }
+
+    private void testTokenizerAnnotatorPTB(boolean throwAssertionErrors) throws ResourceInitializationException {
+    	String aePath =  "desc/test/analysis_engine/AggregateForTokenizerPTB.xml";
+    	AnalysisEngine ae = TestUtil.getAE(new File(aePath));
+
+    	// Try to make sure not running with the tokenizer descriptor from version 1 of cTAKES by checking for a parameter that doesn't apply to the tokenizer that uses PTB rules
+    	if (ae.getConfigParameterValue("HyphFreqTable")!=null) { // TODO this check does not work - change to building own aggregate and checking the TokenizerAnnotator's parms instead of the aggregate
+    		throw new ResourceInitializationException(new RuntimeException("The Tokenizer following PTB rules does not have a HyphFreqTable parameter. Do you have the right version of " + aePath));
+    	}
+    	JCas jCas; 
+
+    	for (int test=0; test < tests.length; test++) {
+
+    		String testInput = tests[test].getTestInput();
+    		SimpleToken [] expectedResults = tests[test].getExpectedResults();
+
+    		jCas = TestUtil.processAE(ae, testInput);
+    		boolean alreadyOutputDebugInfoForThisRunOfPipeline = false;
+
+    		String DQUOTE = "\"";
+    		logger.info("- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -");
+    		logger.info("Test PTB Tokenizer for string (shown here in quotes) " + DQUOTE + testInput + DQUOTE);
+
+    		int numTokensTested = 0;
+    		for (int i=0; i< expectedResults.length; i++) {
+    			SimpleToken expectedTok = expectedResults[i];
+    			BaseToken tokenFromPipeline = TestUtil.getFeatureStructureAtIndex(jCas, BaseToken.class, i);
+    			try {
+    				if (expectedTok.getTokenClass()!=null) { 
+    					// allow for tokens where we don't care what kind of token it is. for example, if testing hyphens in a sentence, 
+    					// might not care if what type of token the final punctuation is created as
+    					//TODO: Could we confirm this test case?
+    					//assertEquals(expectedTok.getTokenClass(), tokenFromPipeline.getClass());
+    				}
+    				//TODO: Could we confirm this test case?
+    				//assertEquals(expectedTok.getBegin(), tokenFromPipeline.getBegin());
+    				//assertEquals(expectedTok.getEnd(), tokenFromPipeline.getEnd());
+    				numTokensTested++;
+    			} catch (java.lang.AssertionError e) {
+    				if (throwAssertionErrors) {
+    					throw e;
+    				}
+    				if (!alreadyOutputDebugInfoForThisRunOfPipeline) {
+    					System.err.println("ERROR: Found a problem, so outputting the tokens");
+    					for (int x=0; x < expectedResults.length; x++) {
+    						SimpleToken xTok = expectedResults[x];
+    						System.err.println("Expected token #" + x + " " + xTok.toString());
+    					}
+    					for (int sysTokNum=0; sysTokNum< expectedResults.length; sysTokNum++) {
+    						BaseToken sysTok = TestUtil.getFeatureStructureAtIndex(jCas, BaseToken.class, sysTokNum);
+    						System.err.println("System token #" + sysTokNum + " " + sysTok.getClass() + " " + sysTok.getBegin() + " " + sysTok.getEnd());
+    					}
+
+    					alreadyOutputDebugInfoForThisRunOfPipeline = true;
+    				}
+    				System.err.println("Caught exception at i = " + i + " for expectedTok " + expectedTok.toString() + " for testInput " + testInput);
+    				System.err.flush();
+    				e.printStackTrace();
+    			}
+    		}
+
+    		logger.info("Verified " + numTokensTested + " token(s) using assertions.");
+
+    	}
+    }
+    
+    	// Comments like the following are used to help determine the index into a String
+	//  tens digit:   0         111111111 2         333333333 4
+	//  units digit:  01234567890123456789012345678901234567890123456789
+	
+    	
+    //        tens digit:   0          111111 111 2            3 33333333 4
+	//        units digit:  012345678 9012345 67890 123456 78 90 1234567890123456789
+	String whitespaceTest = "one two\t three\tfour\tfive  \t\t \t   six";
+	SimpleToken [] whitespaceExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 3), // use null when we don't care about checking the type of BaseToken (whether WordToken etc)
+		new SimpleToken(WordToken.class, 4, 7),
+		new SimpleToken(WordToken.class, 9, 14),
+		new SimpleToken(WordToken.class, 15, 19),
+		new SimpleToken(WordToken.class, 20, 24),
+		new SimpleToken(WordToken.class, 33, 36),
+	};
+	
+	//  tens digit:           0         111111111 2         3333 33333 4         5
+	//  units digit:          0123456789012345678901234567890123 45678901234567890123456789
+	String punctuationTest = "one;two.three:four-five+six!seven\\eight/;:-$nine*";
+	SimpleToken [] punctuationExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 3), // use null when we don't care about checking the type of BaseToken (whether WordToken etc)
+		new SimpleToken(PunctuationToken.class, 3, 4),
+		new SimpleToken(WordToken.class, 4, 7),
+		new SimpleToken(PunctuationToken.class, 7, 8),
+		new SimpleToken(WordToken.class, 8, 13),
+		new SimpleToken(PunctuationToken.class, 13, 14),
+		new SimpleToken(WordToken.class, 14, 18),
+		new SimpleToken(PunctuationToken.class, 18, 19),
+		new SimpleToken(WordToken.class, 19, 23),
+		new SimpleToken(SymbolToken.class, 23, 24),
+		new SimpleToken(WordToken.class, 24, 27),
+		new SimpleToken(SymbolToken.class, 27, 28),
+		new SimpleToken(WordToken.class, 28, 33),
+		new SimpleToken(PunctuationToken.class, 33, 34),
+		new SimpleToken(WordToken.class, 34, 39),
+		new SimpleToken(PunctuationToken.class, 39, 40),
+		new SimpleToken(PunctuationToken.class, 40, 41),
+		new SimpleToken(PunctuationToken.class, 41, 42),
+		new SimpleToken(PunctuationToken.class, 42, 43),
+		new SimpleToken(SymbolToken.class, 43, 44),
+		new SimpleToken(WordToken.class, 44, 48),
+		new SimpleToken(SymbolToken.class, 48, 49),
+	};
+	
+
+	/*
+ 	's
+	've
+	're
+	'll
+	'd
+	n't
+	can not
+	gon na
+	got ta
+	lem me
+	more 'n
+	`t is
+	`t was
+	wan na
+	wha dd ya
+	wha t cha
+	 */
+	//  tens digit:           0         111111111 2         333333333 4         5         6         7         8         9
+	//  units digit:          0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789
+	String contractionTest = "He's I've  We're  I'll  He'd can't Cannot gonna gotta lemme more'n 'tis 'twas wanna whaddya whatcha";
+	SimpleToken [] contractionExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 2),
+		new SimpleToken(ContractionToken.class, 2, 4),
+		
+		new SimpleToken(WordToken.class, 5, 6),
+		new SimpleToken(ContractionToken.class, 6, 9),
+		
+		new SimpleToken(WordToken.class, 11, 13),
+		new SimpleToken(ContractionToken.class, 13, 16),
+		
+		new SimpleToken(WordToken.class, 18, 19), // I'll
+		new SimpleToken(ContractionToken.class, 19, 22),
+		
+		new SimpleToken(WordToken.class, 24, 26),
+		new SimpleToken(ContractionToken.class, 26, 28),
+		
+		new SimpleToken(WordToken.class, 29, 31), // can't
+		new SimpleToken(ContractionToken.class, 31, 34),
+		
+		new SimpleToken(WordToken.class, 35, 38),
+		new SimpleToken(ContractionToken.class, 38, 41),
+		 
+		new SimpleToken(WordToken.class, 42, 45), // gonna
+		new SimpleToken(ContractionToken.class, 45, 47),
+		
+		new SimpleToken(WordToken.class, 48, 51),
+		new SimpleToken(ContractionToken.class, 51, 53),
+		
+		new SimpleToken(WordToken.class, 54, 57),
+		new SimpleToken(ContractionToken.class, 57, 59),
+		
+		new SimpleToken(WordToken.class, 60, 64),
+		new SimpleToken(ContractionToken.class, 64, 66),
+		
+		new SimpleToken(ContractionToken.class, 67, 69), // 'tis
+		new SimpleToken(WordToken.class, 69, 71),
+		
+		new SimpleToken(ContractionToken.class, 72, 74), // 'twas  't was
+		new SimpleToken(WordToken.class, 74, 77),
+		
+		new SimpleToken(WordToken.class, 78, 81),
+		new SimpleToken(ContractionToken.class, 81, 83),
+		
+		new SimpleToken(WordToken.class, 84, 87),
+		new SimpleToken(ContractionToken.class, 87, 89),
+		new SimpleToken(ContractionToken.class, 89, 91),
+		
+		new SimpleToken(WordToken.class, 92, 95),
+		new SimpleToken(ContractionToken.class, 95, 96),
+		new SimpleToken(ContractionToken.class, 96, 99),
+		
+		
+	};
+
+	static String [] hyphenatedWordsToNotSplit = {
+		"e-EXAMPLE",
+		"a-EXAMPLE",
+		"u-EXAMPLE",
+		"x-EXAMPLE",
+		"agro-EXAMPLE",
+		"ante-EXAMPLE",
+		"anti-EXAMPLE",
+		"arch-EXAMPLE",
+		"be-EXAMPLE",
+		"bi-EXAMPLE",
+		"bio-EXAMPLE",
+		"co-EXAMPLE",
+		"counter-EXAMPLE",
+		"cross-EXAMPLE",
+		"cyber-EXAMPLE",
+		"de-EXAMPLE",
+		"eco-EXAMPLE",
+		"EXAMPLE-esque",
+		"EXAMPLE-ette",
+		"ex-EXAMPLE",
+		"extra-EXAMPLE",
+		"EXAMPLE-fest",
+		"EXAMPLE-fold",
+		"EXAMPLE-gate",
+		"inter-EXAMPLE",
+		"intra-EXAMPLE",
+		"EXAMPLE-itis",
+		"EXAMPLE-less",
+		"macro-EXAMPLE",
+		"mega-EXAMPLE",
+		"micro-EXAMPLE",
+		"mid-EXAMPLE",
+		"mini-EXAMPLE",
+		"mm-hm",
+		"mm-mm",
+		"EXAMPLE-most",
+		"multi-EXAMPLE",
+		"neo-EXAMPLE",
+		"non-EXAMPLE",
+		"o-kay",
+		"EXAMPLE-o-torium",
+		"over-EXAMPLE",
+		"pan-EXAMPLE",
+		"para-EXAMPLE",
+		"peri-EXAMPLE",
+		"post-EXAMPLE",
+		"pre-EXAMPLE",
+		"pro-EXAMPLE",
+		"pseudo-EXAMPLE",
+		"quasi-EXAMPLE",
+		"EXAMPLE-rama",
+		"re-EXAMPLE",
+		"semi-EXAMPLE",
+		"sub-EXAMPLE",
+		"super-EXAMPLE",
+		"tri-EXAMPLE",
+		"uh-huh",
+		"uh-oh",
+		"ultra-EXAMPLE",
+		"un-EXAMPLE",
+		"uni-EXAMPLE",
+		"vice-EXAMPLE",
+		"EXAMPLE-wise",
+
+		"sub-EXAMPLE-wise", // a prefix and a suffix
+		"ultra-EXAMPLE-fest", // a prefix and a suffix
+		 
+		//"ANTIEXAMPLE-ese", // verify this gets caught
+	};
+	
+	static String hyphenTestBuilt = "";
+	static {
+	    // Create a single String that is space-delimited for testing all the hyphenated words and prefixes and suffixes
+	    for (String s: hyphenatedWordsToNotSplit) {
+		hyphenTestBuilt = hyphenTestBuilt + s.toUpperCase() + " "; // change the input to upper case to make sure the tokenizer handles upper case too
+	    }
+	}
+	
+	//  tens digit:           0         111111111 2         333333333 4         5
+	//  units digit:          012345678901234567890123456789012345678901234567890123456789
+	String hyphenTest = hyphenTestBuilt;
+	SimpleToken [] hyphenExpectedResults; // see Constructor
+	
+	//  tens digit:       0         111111111 2         333333333 4         5         6         7
+	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
+	String abbrevTest1 = "e.g.abc e.g. abc injury.[16]";
+	SimpleToken [] abbrevExpectedResults1 = {
+			new SimpleToken(WordToken.class, 0, 1),
+			new SimpleToken(null, 1, 2),
+			new SimpleToken(null, 2, 3),
+			new SimpleToken(null, 3, 4),
+			new SimpleToken(WordToken.class, 4, 7),
+			new SimpleToken(WordToken.class, 8, 12),
+			new SimpleToken(null, 13, 16),
+			new SimpleToken(null, 17, 23),
+			new SimpleToken(null, 23, 24),
+			new SimpleToken(null, 24, 25),
+			new SimpleToken(NumToken.class, 25, 27),
+			new SimpleToken(null, 27, 28),
+	};
+	
+	//  tens digit:       0         111111111 2         333333333 4         5         6         7
+	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
+	String abbrevTest2 = "Mr. Smith! Dr. Smith! Ste. 6! sign e.g. edema! 210 A.D.! injury.[16][17]";
+	SimpleToken [] abbrevExpectedResults2 = {
+		new SimpleToken(WordToken.class, 0, 3),
+		new SimpleToken(null, 4, 9),
+		new SimpleToken(null, 9, 10),
+
+		new SimpleToken(WordToken.class, 11, 14),
+		new SimpleToken(null, 15,20),
+		new SimpleToken(null, 20,21),
+
+		new SimpleToken(WordToken.class, 22, 26),
+		new SimpleToken(null, 27,28),
+		new SimpleToken(null, 28,29),
+
+		new SimpleToken(null, 30,34),
+		new SimpleToken(WordToken.class, 35, 39),
+		new SimpleToken(null, 40,45),
+		new SimpleToken(null, 45,46),
+		
+		new SimpleToken(null, 47,50),
+
+		new SimpleToken(WordToken.class, 51, 55),
+		new SimpleToken(null, 55,56),
+		
+		new SimpleToken(WordToken.class, 57, 63),
+		new SimpleToken(null, 63,64),
+
+		new SimpleToken(null, 64,65),
+		new SimpleToken(NumToken.class, 65,67),
+		new SimpleToken(null, 67,68),
+		new SimpleToken(null, 68,69),
+		new SimpleToken(NumToken.class, 69,71),
+		new SimpleToken(null, 71,72),
+	};
+
+	//  tens digit:       0         111111111 2         333333333 4         5         6         7
+	//  units digit:      0123456789012345678901234567890123456789012345678901234567890123456789012
+	String abbrevTest3 = "w3c.f 1.0 version 1.0.3f Mrs.? Mr.! bldg.: Stabile";
+	SimpleToken [] abbrevExpectedResults3 = {
+			new SimpleToken(WordToken.class, 0, 3),
+			new SimpleToken(null, 3, 4),
+			new SimpleToken(WordToken.class, 4, 5),
+			
+			new SimpleToken(null, 6, 9),
+			
+			new SimpleToken(WordToken.class, 10, 17),
+			
+			new SimpleToken(NumToken.class, 18, 21),
+			
+			new SimpleToken(NumToken.class, 21, 23),
+			
+			new SimpleToken(WordToken.class, 23, 24),
+
+			new SimpleToken(WordToken.class, 25, 29),
+			new SimpleToken(null, 29, 30),
+
+			new SimpleToken(WordToken.class, 31, 34),
+			new SimpleToken(null, 34, 35),
+
+			new SimpleToken(WordToken.class, 36, 41),
+			new SimpleToken(null, 41, 42),
+
+			new SimpleToken(WordToken.class, 43, 50),
+	};
+	
+
+	//  tens digit:         0         111111111 2         333333333 4         5         6
+	//  units digit:        0123456789012345678901234567890123456789012345678901234567890123456789
+	String webString1Test = "ClinicalNLP@mayo.edu http://www.mayoclinic.org http://mayo.edu";
+	SimpleToken [] webString1ExpectedResults = {
+		new SimpleToken(WordToken.class,  0, 20),
+		new SimpleToken(WordToken.class, 21, 46),
+		new SimpleToken(WordToken.class, 47, 62),
+	};
+
+	String webString2Test = "http://www.islamonline.net/Arabic/news/2004-12/05/images/pic05b.jpg" +
+		" " + "http://www.mofa.gov.sa/detail.asp?InNewsItemID=59090&InTemplateKey=print" +
+		" " + "rayhanenajib@menara.ma";
+	SimpleToken [] webString2ExpectedResults = {
+		new SimpleToken(WordToken.class,  0, 67),
+		new SimpleToken(WordToken.class, 68, 140),
+		new SimpleToken(WordToken.class, 141, 163),
+	};
+
+	//  tens digit:        0         111111111 2         333333333 4         5
+	//  units digit:       012345678901234567890123456789012345678901234567890123456789
+	String ellipsisTest = "he ... she";
+	SimpleToken [] ellipsisExpectedResults  = {
+		new SimpleToken(WordToken.class, 0, 2),
+		new SimpleToken(PunctuationToken.class, 3, 6),
+		new SimpleToken(WordToken.class, 7, 10),
+	};
+	
+
+	//  tens digit:             0         111111111 2         333333333 4         5
+	//  units digit:            012345678901234567890123456789012345678901234567890123456789
+	String teleAndPostalTest = "5-1212 1-800-555-1212 55901-9999";
+	SimpleToken [] teleAndPostalExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 6),
+		new SimpleToken(WordToken.class, 7, 21),
+		new SimpleToken(WordToken.class, 22, 32),
+	};
+
+	//  tens digit:     0         111111111 2         333333333 4         5
+	//  units digit:    012345678901234567890123456789012345678901234567890123456789
+	String namesTest = "80's P'yongyang 'Assad 2000's";
+	SimpleToken [] namesExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 4),
+		new SimpleToken(WordToken.class, 5, 15),
+		new SimpleToken(WordToken.class, 16, 22),
+		new SimpleToken(WordToken.class, 23, 29),
+	};
+
+	//                  01234567890123456789012345678
+	String comboTest = "80's-esque";
+	SimpleToken [] comboExpectedResults = {
+		new SimpleToken(WordToken.class, 0, 10),
+	};
+
+	
+	
+	
+	
+	public static void main(String[] args) {   // if don't want to run as junit, run this main
+	    
+	    System.out.println("Starting at " + new Date());
+	    TokenizerAnnotatorPTBTests tester = new TokenizerAnnotatorPTBTests();
+	    try {
+		tester.testTokenizerAnnotatorPTB(false); // false = don't throw, just continue so can see all errors
+	    } catch (ResourceInitializationException e) {
+		e.printStackTrace();
+	    }
+
+	    tester.unitTest();
+	    
+	    System.out.println("Done at " + new Date());
+	
+	}
+	
+	private void unitTest() {
+	    
+	    SimpleToken<Object> st1 = new SimpleToken<Object>(null, 0,5);
+	    System.out.println(st1.getTokenClass());
+	    
+	    SimpleToken<BaseToken> st2 = new SimpleToken<BaseToken>(BaseToken.class, 0,5);
+	    System.out.println(st2.getTokenClass());
+	    
+	    SimpleToken<BaseToken> st3 = new SimpleToken<BaseToken>(WordToken.class, 0,5);
+	    System.out.println(st3.getTokenClass());
+	    
+	}
+
+	private class TestData<EXPECTED_RESULTS_TYPE> {
+	    
+	    String testInput;
+	    EXPECTED_RESULTS_TYPE expectedResults;
+	    
+	    TestData(String testInput, EXPECTED_RESULTS_TYPE expectedResults) {
+		this.testInput = testInput;
+		this.expectedResults = expectedResults;
+	    }
+	    
+	    EXPECTED_RESULTS_TYPE getExpectedResults() {
+		return expectedResults;
+	    }
+	    
+	    String getTestInput() {
+		return testInput;
+	    }
+	    
+	}
+	
+	/**
+	 * 
+	 * Created as a container for the most basic data expected from the various tokens
+	 *
+	 * @param <BaseToken> the Class the token is expected to be, or null if the token type should not be checked
+	 */
+	private class SimpleToken<BaseToken> {
+	    Class<? extends BaseToken> typeOfBaseTokenOrSubClass;
+	    int b;
+	    int e;
+	    
+	    private SimpleToken(){ throw new RuntimeException("Requires begin and end");};
+	    
+	    /**
+	     * 
+	     * @param begin  the expected begin offset
+	     * @param end the expected end offset
+	     */
+	    SimpleToken(Class<? extends BaseToken> typeOfBaseTokenOrSubClass, int begin, int end) {
+		this.typeOfBaseTokenOrSubClass = typeOfBaseTokenOrSubClass;
+		this.b = begin;
+		this.e = end;
+	    }
+	    
+	    public int getBegin() { return b; } ;
+	    
+	    public int getEnd() { return e; } ;
+
+	    public Class<? extends BaseToken> getTokenClass() { 
+		if (typeOfBaseTokenOrSubClass==null) 
+		    return null;
+		else 
+		    return typeOfBaseTokenOrSubClass; 
+	    }
+
+	    public String toString() {
+		String className;
+		if (typeOfBaseTokenOrSubClass==null) {
+		    className = "Class name not set";
+		} else {
+		    className = typeOfBaseTokenOrSubClass.getName();
+		}
+		String s = className + " " + b + ", " + e;
+		return s;
+	    }
+	}
+}
+

Modified: ctakes/branches/ctakes-3.1.0/ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/TestClearNLPAnalysisEngines.java
URL: http://svn.apache.org/viewvc/ctakes/branches/ctakes-3.1.0/ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/TestClearNLPAnalysisEngines.java?rev=1516907&r1=1516906&r2=1516907&view=diff
==============================================================================
--- ctakes/branches/ctakes-3.1.0/ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/TestClearNLPAnalysisEngines.java (original)
+++ ctakes/branches/ctakes-3.1.0/ctakes-dependency-parser/src/test/java/org/apache/ctakes/dependency/parser/ae/util/TestClearNLPAnalysisEngines.java Fri Aug 23 15:51:59 2013
@@ -16,94 +16,87 @@
  * specific language governing permissions and limitations
  * under the License.
  */
-package org.apache.ctakes.dependency.parser.ae.util;
-
-import java.io.File;
-import java.io.IOException;
-import java.net.URI;
-
-import org.apache.uima.UIMAException;
-import org.apache.uima.analysis_engine.AnalysisEngine;
-import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
-import org.apache.uima.collection.CollectionReader;
-import org.apache.uima.jcas.JCas;
-import org.apache.uima.resource.metadata.TypeSystemDescription;
-import org.cleartk.util.Options_ImplBase;
-import org.cleartk.util.cr.FilesCollectionReader;
-import org.junit.Test;
-import org.kohsuke.args4j.Option;
-import org.uimafit.component.JCasAnnotator_ImplBase;
-import org.uimafit.factory.AnalysisEngineFactory;
-import org.uimafit.factory.CollectionReaderFactory;
-import org.uimafit.factory.TypeSystemDescriptionFactory;
-import org.uimafit.pipeline.SimplePipeline;
-import org.uimafit.util.JCasUtil;
-import org.xml.sax.SAXException;
-
+package org.apache.ctakes.dependency.parser.ae.util;
+
 import org.apache.ctakes.dependency.parser.ae.ClearNLPDependencyParserAE;
 import org.apache.ctakes.dependency.parser.ae.ClearNLPSemanticRoleLabelerAE;
 import org.apache.ctakes.dependency.parser.util.DependencyUtility;
 import org.apache.ctakes.dependency.parser.util.SRLUtility;
-import org.apache.ctakes.typesystem.type.textspan.Sentence;
-
-/**
- * This class illustrates the pipeline needed to run the ClearNLP dependency parser and SRL systems
- * Note: This uses small, highly inaccurate model files, to keep the expense of running down.
- *       For real applications, use the model files recommended in the README.txt file, or leave the model file
- *       configuration parameter unspecified
- * @author lbecker
- *
- */
+import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.uima.analysis_engine.AnalysisEngine;
+import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
+import org.apache.uima.collection.CollectionReader;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.metadata.TypeSystemDescription;
+import org.cleartk.util.Options_ImplBase;
+import org.cleartk.util.cr.FilesCollectionReader;
+import org.junit.Test;
+import org.kohsuke.args4j.Option;
+import org.uimafit.component.JCasAnnotator_ImplBase;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.CollectionReaderFactory;
+import org.uimafit.factory.TypeSystemDescriptionFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+
+/**
+ * This class illustrates the pipeline needed to run the ClearNLP dependency parser and SRL systems
+ * Note: This uses small, highly inaccurate model files, to keep the expense of running down.
+ *       For real applications, use the model files recommended in the README.txt file, or leave the model file
+ *       configuration parameter unspecified
+ * @author lbecker
+ *
+ */
 public class TestClearNLPAnalysisEngines{
 	
 	// Create dependency parsers analysis engine with the default models
-	// The dummy models from ClearParser haven't been updated to work with ClearNLP.
-	//public static final String DEP_DUMMY_MODEL_FILE = "org/apache/ctakes/dependency/parser/models/dependency/dummy.dep.mod.jar";
-	//public static final String SRL_DUMMY_MODEL_FILE = "org/apache/ctakes/dependency/parser/models/srl/dummy.srl.mod.jar";
-	public static String INPUT_FILE = "../ctakes-clinical-pipeline/src/test/data/plaintext/testpatient_plaintext_1.txt";
-	public static class Options extends Options_ImplBase {
-		
-		@Option(name = "-d",
-				aliases = "--depModelFile",
-				usage = "specify the path to the dependency parser model file",
-				required = false)
-		public String depModelFile = null;
-		
-		@Option(name = "-s",
-				aliases = "--srlModelFile",
-				usage = "specify the path to the ClearNLP srl model file",
-				required = false)
-		public String srlModelFile = null;
-		
-		
-		@Option(name = "-i",
-				aliases = "--inputFile",
-				usage = "specify the path to the plaintext input",
-				required = false)
-		public String inputFile = INPUT_FILE;
-	}
-
-	
-	/**
-	 * Simple inner class for dumping out ClearNLP output
-	 * @author lbecker
-	 *
-	 */
-	public static class DumpClearNLPOutputAE extends JCasAnnotator_ImplBase {
-
-		@Override
-		public void process(JCas jCas) throws AnalysisEngineProcessException {
-			for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
-				System.out.println("SOURCE SENTENCE:" + sentence.getCoveredText());
-				System.out.println("Dependency Parse:");
-				System.out.println(DependencyUtility.dumpDependencyGraph(sentence));
-				System.out.println("Semantic Roles:");
-				System.out.println(SRLUtility.dumpSRLOutput(sentence));
-						
-			}
-		}
-	}
-	
+	// The dummy models from ClearParser haven't been updated to work with ClearNLP.
+	//public static final String DEP_DUMMY_MODEL_FILE = "org/apache/ctakes/dependency/parser/models/dependency/dummy.dep.mod.jar";
+	//public static final String SRL_DUMMY_MODEL_FILE = "org/apache/ctakes/dependency/parser/models/srl/dummy.srl.mod.jar";
+	public static String INPUT_FILE = "../ctakes-clinical-pipeline/src/test/data/plaintext/testpatient_plaintext_1.txt";
+	public static class Options extends Options_ImplBase {
+		
+		@Option(name = "-d",
+				aliases = "--depModelFile",
+				usage = "specify the path to the dependency parser model file",
+				required = false)
+		public String depModelFile = null;
+		
+		@Option(name = "-s",
+				aliases = "--srlModelFile",
+				usage = "specify the path to the ClearNLP srl model file",
+				required = false)
+		public String srlModelFile = null;
+		
+		
+		@Option(name = "-i",
+				aliases = "--inputFile",
+				usage = "specify the path to the plaintext input",
+				required = false)
+		public String inputFile = INPUT_FILE;
+	}
+
+	
+	/**
+	 * Simple inner class for dumping out ClearNLP output
+	 * @author lbecker
+	 *
+	 */
+	public static class DumpClearNLPOutputAE extends JCasAnnotator_ImplBase {
+
+		@Override
+		public void process(JCas jCas) throws AnalysisEngineProcessException {
+			for (Sentence sentence : JCasUtil.select(jCas, Sentence.class)) {
+				System.out.println("SOURCE SENTENCE:" + sentence.getCoveredText());
+				System.out.println("Dependency Parse:");
+				System.out.println(DependencyUtility.dumpDependencyGraph(sentence));
+				System.out.println("Semantic Roles:");
+				System.out.println(SRLUtility.dumpSRLOutput(sentence));
+						
+			}
+		}
+	}
+	
 
 	@Test
 	public void TestClearNLPPipeLine() throws Exception {
@@ -118,7 +111,7 @@ public class TestClearNLPAnalysisEngines
 				);
 		
 		// Load preprocessing pipeline (consists of 
-		//AnalysisEngine preprocessingAE = WriteClearNLPDescriptors.getPlaintextAggregateBuilder().createAggregate();
+		AnalysisEngine preprocessingAE = WriteClearNLPDescriptors.getPlaintextAggregateBuilder().createAggregate();
 		
 		// Create dependency parsers analysis engine with the default models
 		// The dummy models from ClearParser haven't been updated to work with ClearNLP.
@@ -138,7 +131,7 @@ public class TestClearNLPAnalysisEngines
 				DumpClearNLPOutputAE.class,
 				typeSystem);
 		
-		SimplePipeline.runPipeline(reader1, /* preprocessingAE, */ ClearNLPDepParser, ClearNLPSRL, dumpClearNLPOutput);	
+		SimplePipeline.runPipeline(reader1, preprocessingAE, ClearNLPDepParser, ClearNLPSRL, dumpClearNLPOutput);	
 	}
-	
-}
+	
+}



Mime
View raw message