ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1590234 - in /ctakes/trunk/ctakes-lvg: ./ src/main/java/org/apache/ctakes/lvg/ae/ src/main/test/ src/main/test/org/ src/main/test/org/apache/ src/main/test/org/apache/ctakes/ src/main/test/org/apache/ctakes/lvg/ src/main/test/org/apache/ct...
Date Sat, 26 Apr 2014 13:10:07 GMT
Author: tmill
Date: Sat Apr 26 13:10:06 2014
New Revision: 1590234

URL: http://svn.apache.org/r1590234
Log:
CTAKES-297: Added method for getting descriptor for LvgAnnotator. Added unit test. Made some
parameters optional with sane defaults.

Added:
    ctakes/trunk/ctakes-lvg/src/main/test/
    ctakes/trunk/ctakes-lvg/src/main/test/org/
    ctakes/trunk/ctakes-lvg/src/main/test/org/apache/
    ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/
    ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/
    ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/
    ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/TestLvgAnnotator.java
Modified:
    ctakes/trunk/ctakes-lvg/.classpath
    ctakes/trunk/ctakes-lvg/src/main/java/org/apache/ctakes/lvg/ae/LvgAnnotator.java

Modified: ctakes/trunk/ctakes-lvg/.classpath
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-lvg/.classpath?rev=1590234&r1=1590233&r2=1590234&view=diff
==============================================================================
--- ctakes/trunk/ctakes-lvg/.classpath (original)
+++ ctakes/trunk/ctakes-lvg/.classpath Sat Apr 26 13:10:06 2014
@@ -12,6 +12,7 @@
 			<attribute name="maven.pomderived" value="true"/>
 		</attributes>
 	</classpathentry>
+	<classpathentry kind="src" path="src/main/test"/>
 	<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.7">
 		<attributes>
 			<attribute name="maven.pomderived" value="true"/>

Modified: ctakes/trunk/ctakes-lvg/src/main/java/org/apache/ctakes/lvg/ae/LvgAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-lvg/src/main/java/org/apache/ctakes/lvg/ae/LvgAnnotator.java?rev=1590234&r1=1590233&r2=1590234&view=diff
==============================================================================
--- ctakes/trunk/ctakes-lvg/src/main/java/org/apache/ctakes/lvg/ae/LvgAnnotator.java (original)
+++ ctakes/trunk/ctakes-lvg/src/main/java/org/apache/ctakes/lvg/ae/LvgAnnotator.java Sat Apr 26 13:10:06 2014
@@ -24,10 +24,12 @@ import gov.nih.nlm.nls.lvg.Lib.Category;
 import gov.nih.nlm.nls.lvg.Lib.LexItem;
 
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashMap;
@@ -40,11 +42,13 @@ import java.util.Vector;
 
 import org.apache.ctakes.core.util.ListFactory;
 import org.apache.ctakes.lvg.resource.LvgCmdApiResource;
+import org.apache.ctakes.lvg.resource.LvgCmdApiResourceImpl;
 import org.apache.ctakes.typesystem.type.syntax.Lemma;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.JFSIndexRepository;
@@ -53,6 +57,8 @@ import org.apache.uima.resource.Resource
 import org.uimafit.component.JCasAnnotator_ImplBase;
 import org.uimafit.descriptor.ConfigurationParameter;
 import org.uimafit.descriptor.ExternalResource;
+import org.uimafit.factory.AnalysisEngineFactory;
+import org.uimafit.factory.ExternalResourceFactory;
 
 /**
  * UIMA annotator that uses the UMLS LVG package to find the canonical form of
@@ -67,7 +73,10 @@ import org.uimafit.descriptor.ExternalRe
  *         misspelling is a word in the lexicon.
  */
 public class LvgAnnotator extends JCasAnnotator_ImplBase {
-	/**
+  public static final String[] defaultExclusionWords = {"And", "and", "By", "by", "For", "for", "In", "in", "Of", "of", "On", "on", "The", "the", "To", "to", "With", "with"};
+  public static final String[] defaultTreebankMap = {"adj|JJ", "adv|RB", "aux|AUX", "compl|CS", "conj|CC", "det|DET", "modal|MD", "noun|NN", "prep|IN", "pron|PRP", "verb|VB"};
+
+  /**
 	 * Value is "PostLemmas". This parameter determines whether the feature
 	 * lemmaEntries will be populated for word annotations.
 	 */
@@ -141,10 +150,10 @@ public class LvgAnnotator extends JCasAn
 	public static final String PARAM_XT_MAP = "XeroxTreebankMap";
 	@ConfigurationParameter(
 	    name = PARAM_XT_MAP,
-	    mandatory = true,
+	    mandatory = false,
 	    description = "Mapping from Xerox parts of speech to Treebank equivalents"
 	    )
-	private String[] xtMaps;
+	private String[] xtMaps = defaultTreebankMap;
   private Map<String, String> xeroxTreebankMap;
 	
 	public static final String PARAM_USE_CMD_CACHE = "UseCmdCache";
@@ -178,16 +187,15 @@ public class LvgAnnotator extends JCasAn
 	@ConfigurationParameter(
 	    name = PARAM_EXCLUSION_WORDS,
 	    mandatory = false,
-	    defaultValue = {"And", "and", "By", "by", "For", "for", "In", "in", "Of", "of", "On", "on", "The", "the", "To", "to", "With", "with"},
 	    description = "Words to exclude when doing LVG normalization"
 	    )
-	String[] wordsToExclude;
+	private String[] wordsToExclude = defaultExclusionWords;
   private Set<String> exclusionSet;
-	
+  
 	// LOG4J logger based on class name
 	private Logger logger = Logger.getLogger(getClass().getName());
 
-	private final String PARAM_LVGCMDAPI_RESRC_KEY = "LvgCmdApi";
+	public static final String PARAM_LVGCMDAPI_RESRC_KEY = "LvgCmdApi";
   @ExternalResource(
       key = PARAM_LVGCMDAPI_RESRC_KEY,
       mandatory = true
@@ -552,6 +560,28 @@ public class LvgAnnotator extends JCasAn
 		}
 	}
 
+	public static AnalysisEngineDescription createAnnotatorDescription() throws ResourceInitializationException, URISyntaxException{
+	  return AnalysisEngineFactory.createPrimitiveDescription(LvgAnnotator.class,
+        LvgAnnotator.PARAM_USE_CMD_CACHE,
+        false,
+        LvgAnnotator.PARAM_USE_LEMMA_CACHE,
+        false,
+        LvgAnnotator.PARAM_USE_SEGMENTS,
+        false,
+	      LvgAnnotator.PARAM_LEMMA_CACHE_FREQUENCY_CUTOFF,
+	      20,
+	      LvgAnnotator.PARAM_LEMMA_FREQ_CUTOFF,
+	      20,
+	      LvgAnnotator.PARAM_POST_LEMMAS,
+	      false,
+	      LvgAnnotator.PARAM_LVGCMDAPI_RESRC_KEY,
+	      ExternalResourceFactory.createExternalResourceDescription(
+            LvgCmdApiResourceImpl.class,
+            new File(LvgCmdApiResourceImpl.class.getResource(
+                "/org/apache/ctakes/lvg/data/config/lvg.properties").toURI()))
+	      );
+	}
+	
 	/**
 	 * Basic class to group a lemma word with its various parts of speech.
 	 * 

Added: ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/TestLvgAnnotator.java
URL: http://svn.apache.org/viewvc/ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/TestLvgAnnotator.java?rev=1590234&view=auto
==============================================================================
--- ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/TestLvgAnnotator.java (added)
+++ ctakes/trunk/ctakes-lvg/src/main/test/org/apache/ctakes/lvg/test/TestLvgAnnotator.java Sat Apr 26 13:10:06 2014
@@ -0,0 +1,63 @@
+package org.apache.ctakes.lvg.test;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.net.URISyntaxException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.ctakes.core.ae.SentenceDetector;
+import org.apache.ctakes.core.ae.SimpleSegmentAnnotator;
+import org.apache.ctakes.core.ae.TokenizerAnnotatorPTB;
+import org.apache.ctakes.lvg.ae.LvgAnnotator;
+import org.apache.ctakes.typesystem.type.syntax.WordToken;
+import org.apache.uima.UIMAException;
+import org.apache.uima.analysis_engine.AnalysisEngineDescription;
+import org.apache.uima.jcas.JCas;
+import org.apache.uima.resource.ResourceInitializationException;
+import org.junit.Test;
+import org.uimafit.factory.AggregateBuilder;
+import org.uimafit.factory.JCasFactory;
+import org.uimafit.pipeline.SimplePipeline;
+import org.uimafit.util.JCasUtil;
+
+public class TestLvgAnnotator {
+  public static final String note = "" +
+      "Medications:\n" +
+      "Hibernol, jamitol, triopenin, sproingo\n\n" +
+      "Physical exam:\n" +
+      "Patient is doing fine but probably taking too many fictional drugs. Cholesterol is acceptable. Heartrate is elevated. \n" +
+      "Instructions:\n" +
+      "Patient should quit smoking and taunting sharks.";
+
+  @Test
+  public void testLvgAnnotator() throws UIMAException, IOException, URISyntaxException{
+    JCas jcas = JCasFactory.createJCas();
+    jcas.setDocumentText(note);
+    
+    SimplePipeline.runPipeline(jcas, getDefaultPipeline());
+    List<WordToken> tokens = new ArrayList<>(JCasUtil.select(jcas, WordToken.class));
+    assertEquals("Incorrect canonical form!", "medication", tokens.get(0).getCanonicalForm());
+    
+    assertTrue(tokens.get(29).getCanonicalForm() == null);
+    
+  }
+  
+  public static AnalysisEngineDescription getPrerequisitePipeline() throws ResourceInitializationException{
+    AggregateBuilder builder = new AggregateBuilder();
+    builder.add(SimpleSegmentAnnotator.createAnnotatorDescription());
+    builder.add(SentenceDetector.createAnnotatorDescription());
+    builder.add(TokenizerAnnotatorPTB.createAnnotatorDescription());
+    return builder.createAggregateDescription();
+  }
+  
+  public static AnalysisEngineDescription getDefaultPipeline() throws ResourceInitializationException, URISyntaxException{
+    AggregateBuilder builder = new AggregateBuilder();
+    builder.add(getPrerequisitePipeline());
+    builder.add(LvgAnnotator.createAnnotatorDescription());
+    return builder.createAggregateDescription();
+  }
+  
+  
+}



Mime
View raw message