opennlp-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From co...@apache.org
Subject svn commit: r1240701 - /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
Date Sun, 05 Feb 2012 11:39:24 GMT
Author: colen
Date: Sun Feb  5 11:39:23 2012
New Revision: 1240701

URL: http://svn.apache.org/viewvc?rev=1240701&view=rev
Log:
OPENNLP-422: Modified the sentence reader so it can better handle FlorestaVirgem.

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java?rev=1240701&r1=1240700&r2=1240701&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java	(original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java	Sun Feb  5 11:39:23 2012
@@ -130,7 +130,8 @@ public class ADSentenceStream extends
 	        // we should have the plain sentence
 	        // we remove the first token
 	        int start = line.indexOf(" ");
-	        text = line.substring(start + 1);
+	        text = line.substring(start + 1).trim();
+	        text = fixPunctuation(text);
 	        String titleTag = "";
 	        if(isTitle) titleTag = " title";
 	        String boxTag = "";
@@ -213,6 +214,12 @@ public class ADSentenceStream extends
       return sentence;
     }
 
+    private String fixPunctuation(String text) {
+      text = text.replaceAll("\\»\\s+\\.", "».");
+      text = text.replaceAll("\\»\\s+\\,", "»,");
+      return text;
+    }
+
     /**
      * Parse a tree element from a AD line
      * 



Mime
View raw message