ctakes-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tm...@apache.org
Subject svn commit: r1651528 - in /ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core: ae/SentenceDetector.java sentence/SDContextGeneratorCtakes.java sentence/SentenceDetectorCtakes.java
Date Tue, 13 Jan 2015 22:37:14 GMT
Author: tmill
Date: Tue Jan 13 22:37:13 2015
New Revision: 1651528

URL: http://svn.apache.org/r1651528
Log:
Added hacky way around issue with newlines/newline representing tokens. 

Modified:
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java
    ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java?rev=1651528&r1=1651527&r2=1651528&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/ae/SentenceDetector.java Tue Jan 13 22:37:13 2015
@@ -27,9 +27,12 @@ import java.nio.charset.Charset;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Set;
 
+import javax.swing.JFileChooser;
+
 import opennlp.tools.cmdline.sentdetect.SentenceDetectorCrossValidatorTool;
 import opennlp.tools.cmdline.sentdetect.SentenceEvaluationErrorListener;
 import opennlp.tools.dictionary.Dictionary;
@@ -45,13 +48,16 @@ import opennlp.tools.util.PlainTextByLin
 import opennlp.tools.util.TrainingParameters;
 
 import org.apache.ctakes.core.resource.FileLocator;
+import org.apache.ctakes.core.sentence.PlainTextByLineStreamCtakes;
 import org.apache.ctakes.core.sentence.SDContextGeneratorCtakes;
 import org.apache.ctakes.core.sentence.EndOfSentenceScannerImpl;
 import org.apache.ctakes.core.sentence.SentenceDetectorCtakes;
 import org.apache.ctakes.core.sentence.SentenceDetectorFactoryCtakes;
+import org.apache.ctakes.core.sentence.SentenceSampleStreamCtakes;
 import org.apache.ctakes.core.sentence.SentenceSpan;
 import org.apache.ctakes.typesystem.type.textspan.Segment;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;
+import org.apache.ctakes.utils.struct.CounterMap;
 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineDescription;
@@ -63,6 +69,7 @@ import org.apache.uima.fit.factory.Analy
 import org.apache.uima.fit.util.JCasUtil;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;
+import org.apache.uima.util.FileUtils;
 
 /**
  * Wraps the OpenNLP sentence detector in a UIMA annotator
@@ -111,6 +118,9 @@ public class SentenceDetector extends JC
   // LOG4J logger based on class name
   private Logger logger = Logger.getLogger(getClass().getName());
 
+  private final static int WINDOW = 5;
+  private final static double SMOOTH = 1.0;
+
   @Override
   public void initialize(UimaContext aContext)
 			throws ResourceInitializationException {
@@ -142,14 +152,54 @@ public class SentenceDetector extends JC
 		logger.info("Starting processing.");
 
 		int sentenceCount = 0;
-
+		
 		String text = jcas.getDocumentText();
 
+    CounterMap<Integer> lenHist = new CounterMap<>();
+    HashMap<Integer,Double> smoothHist = new HashMap<>();
+    String[] lines = text.split("\n");
+
+    int maxLen = 0;
+    for(int i = 0; i < lines.length; i++){
+      int len = lines[i].length();
+      if(len == 0) continue;
+      if(len > maxLen){
+        maxLen = len;
+      }
+      lenHist.add(len);
+      for(int j = -WINDOW; j <= WINDOW; j++){
+        int ind = len + j;
+        if(ind < 0) continue;
+        if(!smoothHist.containsKey(ind)){
+          smoothHist.put(ind, SMOOTH);
+        }
+        smoothHist.put(ind, smoothHist.get(ind)+ (1-Math.abs(j)/(WINDOW+1.0)));
+      }
+    }
+
+    for(int i = 0; i <= maxLen+WINDOW; i++){
+      if(!smoothHist.containsKey(i)){
+        smoothHist.put(i, SMOOTH);
+      }
+    }
+
+    double sum = 0.0;
+    for(double val : smoothHist.values()){
+      sum += val;
+    }
+
+    for(int i = 0; i <= maxLen+WINDOW; i++){
+      double smoothed = smoothHist.get(i);
+//      String slope = (i==0 ? "-" : (smoothHist.get(i) > smoothHist.get(i-1) ? "+" : "-"));
+      smoothHist.put(i, smoothed / sum);
+    }
+
+    
 		Collection<Segment> segments = JCasUtil.select(jcas, Segment.class);
 		for(Segment segment : segments){
 			String sectionID = segment.getId();
 			if (!skipSegmentsSet.contains(sectionID)) {
-				sentenceCount = annotateRange(jcas, text, segment, sentenceCount);
+				sentenceCount = annotateRange(jcas, text, segment, sentenceCount, lenHist, smoothHist);
 			}
 		}
 	}
@@ -171,12 +221,14 @@ public class SentenceDetector extends JC
 	 * @param sentenceCount
 	 *            the number of sentences added already to the CAS (if
 	 *            processing one section at a time)
+	 * @param smoothHist 
+	 * @param lenHist 
 	 * @return count The sum of <code>sentenceCount</code> and the number of
 	 *         Sentence annotations added to the CAS for this section
 	 * @throws AnnotatorProcessException
 	 */
 	protected int annotateRange(JCas jcas, String text, Segment section,
-			int sentenceCount) {
+			int sentenceCount, CounterMap<Integer> lenHist, HashMap<Integer, Double> smoothHist) {
 
 		int b = section.getBegin();
 		int e = section.getEnd();
@@ -186,7 +238,7 @@ public class SentenceDetector extends JC
 		// detects
 		// within the string
 		int[] sentenceBreaks = sentenceDetector.sentPosDetect(text.substring(b,
-				e)); // OpenNLP tools 1.5 returns Spans rather than offsets that
+				e), lenHist, smoothHist); // OpenNLP tools 1.5 returns Spans rather than offsets that
 						// 1.4 did
 		int numSentences = sentenceBreaks.length;
 		// There might be text after the last sentence-ending found by detector,

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java?rev=1651528&r1=1651527&r2=1651528&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SDContextGeneratorCtakes.java Tue Jan 13 22:37:13 2015
@@ -1,6 +1,10 @@
 package org.apache.ctakes.core.sentence;
 
 import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.logging.Logger;
+
+import org.apache.ctakes.utils.struct.CounterMap;
 
 import opennlp.tools.sentdetect.DefaultSDContextGenerator;
 import opennlp.tools.util.StringUtil;
@@ -9,13 +13,31 @@ public class SDContextGeneratorCtakes ex
 
   // TODO -- is this threadsafe?? At the very least its not less thread-safe than existing data structures in parent class
   String ws = null;
+//  CounterMap<Integer> lenHist = null;
+//  HashMap<Integer,Double> smoothHist = null;
   
   public SDContextGeneratorCtakes(char[] eosCharacters) {
     super(eosCharacters);
   }
 
   @Override
-  public String[] getContext(CharSequence sb, int position) {
+  public String[] getContext(CharSequence sb, int position){
+    return getContext(sb, position, null, null);
+  }
+  
+  public String[] getContext(CharSequence sb, int position, CounterMap<Integer> lenHist, HashMap<Integer, Double> smoothHist) {
+    int ind = -1;
+    StringBuffer text = new StringBuffer(sb.toString());
+    
+    if(text.charAt(position)== '>' &&
+        text.charAt(position-1) == 'F' &&
+        text.charAt(position-2) == 'L' &&
+        text.charAt(position-3) == '<'){
+      text.replace(position-3, position+1, "\n");
+      position -= 3;
+    }
+    sb = text;
+    
     // add features to addlFeats string array:
     int lastIndex = sb.length() - 1;
     int wsEnd = nextNonspaceIndex(sb, position, lastIndex);
@@ -23,6 +45,38 @@ public class SDContextGeneratorCtakes ex
       ws = new StringBuilder(sb.subSequence(position + 1, wsEnd)).toString();
     }
 
+    /*
+    int lastBreak = position-1;
+    while(lastBreak > 0 && sb.charAt(lastBreak) != '\n'){
+      lastBreak--;
+    }
+    int lineLen = position - lastBreak;
+    char eosChar = sb.charAt(position);
+    
+    // line length-based features (requires document-level information)
+    if(lenHist != null && smoothHist != null){
+      if(eosChar == '\n'){
+        int nextWordLen = 0;
+        int nextNonWs = 0;
+        while(Character.isWhitespace(sb.charAt(position+nextNonWs))){
+          nextNonWs++;
+        }
+        while(Character.isLetterOrDigit(sb.charAt(position+nextNonWs+nextWordLen))){
+          nextWordLen++;
+        }
+        int potLen = lineLen + nextWordLen;
+        if(potLen >= 0){
+          this.collectFeats.add("othersOfThisLen=" + (lenHist.get(potLen) > 0));
+
+          boolean upSlope = (smoothHist.get(lineLen) > smoothHist.get(lineLen-1));
+          boolean downSlope = (smoothHist.get(potLen) < smoothHist.get(potLen-1));
+
+          this.collectFeats.add("upSlope="+upSlope);
+          this.collectFeats.add("downSlope="+downSlope);
+        }
+      }
+    }
+    */
     return super.getContext(sb, position);    
   }
   
@@ -42,6 +96,12 @@ public class SDContextGeneratorCtakes ex
   protected void collectFeatures(String prefix, String suffix, String previous, String next, Character eosChar) {
     super.collectFeatures(prefix, suffix, previous, next, eosChar);
 
+    for(int i = 0; i < collectFeats.size(); i++){
+      if(collectFeats.get(i).equals("eos=\n")){
+        collectFeats.set(i, "eos=<LF>");
+        break;
+      }
+    }
     if (!next.equals("")) {
       if(isAllUpper(next)) {
         collectFeats.add("nbold");
@@ -103,7 +163,7 @@ public class SDContextGeneratorCtakes ex
   }
 
   private static final String getCollapsedShape(String s){
-    return getShape(s).replaceAll("(.)\\1+", "$1+");
+    return getShape(s).replaceAll("(.)\\1+", "$1+").replaceAll("D\\+?", "D+");
   }
 
   private static final int nextNonspaceIndex(CharSequence sb, int seek, int lastIndex) {

Modified: ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java
URL: http://svn.apache.org/viewvc/ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java?rev=1651528&r1=1651527&r2=1651528&view=diff
==============================================================================
--- ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java (original)
+++ ctakes/branches/sent-detector-newline-fix/ctakes-core/src/main/java/org/apache/ctakes/core/sentence/SentenceDetectorCtakes.java Tue Jan 13 22:37:13 2015
@@ -29,6 +29,8 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import org.apache.ctakes.utils.struct.CounterMap;
+
 import opennlp.model.MaxentModel;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.sentdetect.EndOfSentenceScanner;
@@ -75,7 +77,7 @@ public class SentenceDetectorCtakes {
 	  /**
 	   * The feature context generator.
 	   */
-	  private final SDContextGenerator cgen;
+	  private final SDContextGeneratorCtakes cgen;
 
 	  /**
 	   * The {@link EndOfSentenceScanner} to use when scanning for end of sentence offsets.
@@ -94,7 +96,7 @@ public class SentenceDetectorCtakes {
 	   *
 	   * @param model the {@link SentenceModel}
 	   */
-	  public SentenceDetectorCtakes(MaxentModel model, SDContextGenerator cg, EndOfSentenceScanner eoss) {
+	  public SentenceDetectorCtakes(MaxentModel model, SDContextGeneratorCtakes cg, EndOfSentenceScanner eoss) {
 		  this.model = model;
 		  cgen = cg;
 		  scanner = eoss;
@@ -108,7 +110,7 @@ public class SentenceDetectorCtakes {
 	   * @param s  The string to be processed.
 	   *
 	   * @return   A string array containing individual sentences as elements.
-	   */
+	   *//*
 	  public String[] sentDetect(String s) {
 	    int[] endsOfSentences = sentPosDetect(s);
 	    String sentences[];
@@ -126,7 +128,7 @@ public class SentenceDetectorCtakes {
 	      sentences = new String[] {};
 	    }
 	    return sentences;
-	  }
+	  }*/
 
 	  private int getFirstWS(String s, int pos) {
 	    while (pos < s.length() && !StringUtil.isWhitespace(s.charAt(pos)))
@@ -144,18 +146,20 @@ public class SentenceDetectorCtakes {
 	   * Detect the position of the first words of sentences in a String.
 	   *
 	   * @param s  The string to be processed.
+	   * @param smoothHist 
+	   * @param lenHist 
 	   * @return   A integer array containing the positions of the end index of
 	   *          every sentence
 	   *
 	   * @see SentenceDetectorME#sentPosDetect(String)  
 	   */
-	  public int[] sentPosDetect(String s) { // return int[] to be line OpenNLP 1.4
+	  public int[] sentPosDetect(String s, CounterMap<Integer> lenHist, HashMap<Integer, Double> smoothHist) { // return int[] to be line OpenNLP 1.4
 	    double sentProb = 1;
 	    sentProbs.clear();
 	    StringBuffer sb = new StringBuffer(s);
 	    List<Integer> enders = scanner.getPositions(s);
 	    List<Integer> positions = new ArrayList<Integer>(enders.size());
-
+	    
 	    for (int i = 0, end = enders.size(), index = 0; i < end; i++) {
 	      Integer candidate = enders.get(i);
 	      int cint = candidate;
@@ -165,7 +169,7 @@ public class SentenceDetectorCtakes {
 	        continue;
 	      }
 
-	      double[] probs = model.eval(cgen.getContext(sb, cint));
+	      double[] probs = model.eval(cgen.getContext(sb, cint, lenHist, smoothHist));
 	      String bestOutcome = model.getBestOutcome(probs);
 	      sentProb *= probs[model.getIndex(bestOutcome)];
 



Mime
View raw message