lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sar...@apache.org
Subject svn commit: r1588137 - in /lucene/dev/branches/branch_4x: ./ lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/
Date Thu, 17 Apr 2014 06:15:31 GMT
Author: sarowe
Date: Thu Apr 17 06:15:31 2014
New Revision: 1588137

URL: http://svn.apache.org/r1588137
Log:
SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly (merged trunk r1588136)

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/CHANGES.txt
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java

Modified: lucene/dev/branches/branch_4x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/CHANGES.txt?rev=1588137&r1=1588136&r2=1588137&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/lucene/CHANGES.txt Thu Apr 17 06:15:31 2014
@@ -215,6 +215,9 @@ Bug fixes
 
 * LUCENE-5568: Benchmark module's "default.codec" option didn't work. (David Smiley)
 
+* SOLR-5983: HTMLStripCharFilter is treating CDATA sections incorrectly.
+  (Dan Funk, Steve Rowe)
+
 Test Framework
 
 * LUCENE-5592: Incorrectly reported uncloseable files. (Dawid Weiss)

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java?rev=1588137&r1=1588136&r2=1588137&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.java
Thu Apr 17 06:15:31 2014
@@ -31134,7 +31134,7 @@ public final class HTMLStripCharFilter e
       cumulativeDiff += inputSegment.length() - outputSegment.length();
       // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-      eofReturnValue = outputSegment.nextChar();
+      eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
       break;
     }
     case BANG:
@@ -31147,7 +31147,7 @@ public final class HTMLStripCharFilter e
     case LEFT_ANGLE_BRACKET_SLASH:
     case LEFT_ANGLE_BRACKET_SPACE: {        // Include
       outputSegment = inputSegment;
-      eofReturnValue = outputSegment.nextChar();
+      eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
       break;
     }
     default: {
@@ -31506,7 +31506,11 @@ public final class HTMLStripCharFilter e
           }
         case 84: break;
         case 32: 
-          { yybegin(COMMENT);
+          { if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this
is not a comment
+      inputSegment.append(yytext());
+    } else {
+      yybegin(COMMENT);
+    }
           }
         case 85: break;
         case 33: 
@@ -31611,12 +31615,16 @@ public final class HTMLStripCharFilter e
           }
         case 99: break;
         case 47: 
-          { // add (previously matched input length) + (this match length) [ - (substitution
length) = 0 ]
-    cumulativeDiff += inputSegment.length() + yylength();
-    // position the correction at (already output length) [ + (substitution length) = 0 ]
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(CDATA);
+          { if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" -
this is not a CDATA section
+      inputSegment.append(yytext());
+    } else {
+      // add (previously matched input length) + (this match length) [ - (substitution length)
= 0 ]
+      cumulativeDiff += inputSegment.length() + yylength();
+      // position the correction at (already output length) [ + (substitution length) = 0
]
+      addOffCorrectMap(outputCharCount, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(CDATA);
+    }
           }
         case 100: break;
         case 48: 

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex?rev=1588137&r1=1588136&r2=1588137&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/HTMLStripCharFilter.jflex
Thu Apr 17 06:15:31 2014
@@ -309,7 +309,7 @@ InlineElment = ( [aAbBiIqQsSuU]         
       cumulativeDiff += inputSegment.length() - outputSegment.length();
       // position the correction at (already output length) + (substitution length)
       addOffCorrectMap(outputCharCount + outputSegment.length(), cumulativeDiff);
-      eofReturnValue = outputSegment.nextChar();
+      eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
       break;
     }
     case BANG:
@@ -322,7 +322,7 @@ InlineElment = ( [aAbBiIqQsSuU]         
     case LEFT_ANGLE_BRACKET_SLASH:
     case LEFT_ANGLE_BRACKET_SPACE: {        // Include
       outputSegment = inputSegment;
-      eofReturnValue = outputSegment.nextChar();
+      eofReturnValue = outputSegment.length() > 0 ? outputSegment.nextChar() : -1;
       break;
     }
     default: {
@@ -754,7 +754,13 @@ InlineElment = ( [aAbBiIqQsSuU]         
 }
 
 <BANG> {
-  "--" { yybegin(COMMENT); }
+  "--" {
+    if (inputSegment.length() > 2) { // Chars between "<!" and "--" - this is not a
comment
+      inputSegment.append(yytext());
+    } else {
+      yybegin(COMMENT);
+    }
+  }
   ">" {
     // add (previously matched input length) + (this match length) [ - (substitution length)
= 0 ]
     cumulativeDiff += inputSegment.length() + yylength();
@@ -771,12 +777,16 @@ InlineElment = ( [aAbBiIqQsSuU]         
   // [21] CDEnd   ::= ']]>'
   //
   "[CDATA[" {
-    // add (previously matched input length) + (this match length) [ - (substitution length)
= 0 ]
-    cumulativeDiff += inputSegment.length() + yylength();
-    // position the correction at (already output length) [ + (substitution length) = 0 ]
-    addOffCorrectMap(outputCharCount, cumulativeDiff);
-    inputSegment.clear();
-    yybegin(CDATA);
+    if (inputSegment.length() > 2) { // Chars between "<!" and "[CDATA[" - this is
not a CDATA section
+      inputSegment.append(yytext());
+    } else {
+      // add (previously matched input length) + (this match length) [ - (substitution length)
= 0 ]
+      cumulativeDiff += inputSegment.length() + yylength();
+      // position the correction at (already output length) [ + (substitution length) = 0
]
+      addOffCorrectMap(outputCharCount, cumulativeDiff);
+      inputSegment.clear();
+      yybegin(CDATA);
+    }
   }
   [^] {
     inputSegment.append(zzBuffer[zzStartRead]);

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java?rev=1588137&r1=1588136&r2=1588137&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
(original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/test/org/apache/lucene/analysis/charfilter/HTMLStripCharFilterTest.java
Thu Apr 17 06:15:31 2014
@@ -18,7 +18,6 @@ package org.apache.lucene.analysis.charf
  */
 
 import java.io.BufferedReader;
-import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.io.Reader;
@@ -26,6 +25,7 @@ import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
 import org.apache.lucene.analysis.Analyzer;
@@ -53,27 +53,14 @@ public class HTMLStripCharFilterTest ext
 
   //this is some text  here is a  link  and another  link . This is an entity: & plus
a <.  Here is an &
   //
-  public void test() throws IOException {
+  public void test() throws Exception {
     String html = "<div class=\"foo\">this is some text</div> here is a <a
href=\"#bar\">link</a> and " +
             "another <a href=\"http://lucene.apache.org/\">link</a>. " +
             "This is an entity: &amp; plus a &lt;.  Here is an &. <!-- is
a comment -->";
     String gold = "\nthis is some text\n here is a link and " +
             "another link. " +
             "This is an entity: & plus a <.  Here is an &. ";
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(new StringReader(html));
-    StringBuilder builder = new StringBuilder();
-    int ch = -1;
-    char [] goldArray = gold.toCharArray();
-    int position = 0;
-    while ((ch = reader.read()) != -1){
-      char theChar = (char) ch;
-      builder.append(theChar);
-      assertTrue("\"" + theChar + "\"" + " at position: " + position + " does not equal:
" + goldArray[position]
-              + " Buffer so far: " + builder + "<EOB>", theChar == goldArray[position]);
-      position++;
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(html, gold, null);
   }
 
   //Some sanity checks, but not a full-fledged check
@@ -100,61 +87,28 @@ public class HTMLStripCharFilterTest ext
     String gold = "This is a test";
     StringBuilder builder = new StringBuilder();
     int ch = 0;
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
+    while ((ch = reader.read()) != -1){
+      builder.append((char)ch);
     }
+    // Compare trim()'d output to gold
     assertEquals("'" + builder.toString().trim() + "' is not equal to '" + gold + "'",
                  gold, builder.toString().trim());
   }
-  
-  
+
   public void testGamma() throws Exception {
-    String test = "&Gamma;";
-    String gold = "\u0393";
-    Set<String> set = new HashSet<>();
-    set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
-    StringBuilder builder = new StringBuilder();
-    int ch = 0;
-    while ((ch = reader.read()) != -1){
-      builder.append((char)ch);
-    }
-    String result = builder.toString();
-    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
+    assertHTMLStripsTo("&Gamma;", "\u0393", new HashSet<>(Arrays.asList("reserved")));
   }
 
   public void testEntities() throws Exception {
     String test = "&nbsp; &lt;foo&gt; &Uuml;bermensch &#61; &Gamma;
bar &#x393;";
     String gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";
-    Set<String> set = new HashSet<>();
-    set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
-    StringBuilder builder = new StringBuilder();
-    int ch = 0;
-    while ((ch = reader.read()) != -1){
-      builder.append((char)ch);
-    }
-    String result = builder.toString();
-    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
+    assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
   }
 
   public void testMoreEntities() throws Exception {
     String test = "&nbsp; &lt;junk/&gt; &nbsp; &#33; &#64; and &#8217;";
     String gold = "  <junk/>   ! @ and ’";
-    Set<String> set = new HashSet<>();
-    set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(new StringReader(test), set);
-    StringBuilder builder = new StringBuilder();
-    int ch = 0;
-    while ((ch = reader.read()) != -1){
-      builder.append((char)ch);
-    }
-    String result = builder.toString();
-    assertEquals("'" + result + "' is not equal to '" + gold + "<EOS>'", gold, result);
+    assertHTMLStripsTo(test, gold, new HashSet<>(Arrays.asList("reserved")));
   }
 
   public void testReserved() throws Exception {
@@ -344,16 +298,7 @@ public class HTMLStripCharFilterTest ext
         "\n\n\n\n\n\n\n\n",
     };
     for (int i = 0 ; i < testGold.length ; i += 2) {
-      String test = testGold[i];
-      String gold = testGold[i + 1];
-      Reader reader = new HTMLStripCharFilter(new StringReader(test));
-      StringBuilder builder = new StringBuilder();
-      int ch = 0;
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-      String result = builder.toString();
-      assertEquals("Test: '" + test + "'", gold, result);
+      assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
     }
   }
 
@@ -362,7 +307,9 @@ public class HTMLStripCharFilterTest ext
     StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.getInitialBufferSize()
+ 50);
     testBuilder.append("ah<?> ??????");
     appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
-    processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
+    Reader reader = new HTMLStripCharFilter
+        (new BufferedReader(new StringReader(testBuilder.toString()))); //force the use of
BufferedReader
+    assertHTMLStripsTo(reader, testBuilder.toString(), null);
 
     testBuilder.setLength(0);
     testBuilder.append("<!--");//comments
@@ -370,54 +317,21 @@ public class HTMLStripCharFilterTest ext
 
     testBuilder.append("-->foo");
     String gold = "foo";
-    Reader reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(testBuilder.toString(), gold, null);
 
     testBuilder.setLength(0);
     testBuilder.append("<?");
     appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("?>");
     gold = "";
-    reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
-    ch = 0;
-    builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
-    
+    assertHTMLStripsTo(testBuilder.toString(), gold, null);
+
     testBuilder.setLength(0);
     testBuilder.append("<b ");
     appendChars(testBuilder, HTMLStripCharFilter.getInitialBufferSize() + 500);
     testBuilder.append("/>");
     gold = "";
-    reader = new HTMLStripCharFilter(new StringReader(testBuilder.toString()));
-    ch = 0;
-    builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(testBuilder.toString(), gold, null);
   }
 
   private void appendChars(StringBuilder testBuilder, int numChars) {
@@ -427,39 +341,19 @@ public class HTMLStripCharFilterTest ext
     }
   }  
 
-
-  private void processBuffer(String test, String assertMsg) throws IOException {
-    // System.out.println("-------------------processBuffer----------");
-    Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force
the use of BufferedReader
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String (trimmed): " + builder.toString().trim() + "<EOS>");
-    }
-    assertEquals(assertMsg + "::: " + builder.toString() + " is not equal to " + test,
-        test, builder.toString());
-  }
-
   public void testComment() throws Exception {
-
     String test = "<!--- three dashes, still a valid comment ---> ";
     String gold = " ";
-    Reader reader = new HTMLStripCharFilter(new BufferedReader(new StringReader(test)));//force
the use of BufferedReader
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
+
+    test = "<! -- blah > "; // should not be recognized as a comment
+    gold = " ";
+    assertHTMLStripsTo(test, gold, null);
+
+    StringBuilder testBuilder = new StringBuilder("<!--");
+    appendChars(testBuilder, TestUtil.nextInt(random(), 0, 1000));
+    gold = "";
+    assertHTMLStripsTo(testBuilder.toString(), gold, null);
   }
 
 
@@ -526,83 +420,28 @@ public class HTMLStripCharFilterTest ext
         + " alt =  \"Alt: <!--#echo var='${IMAGE_CAPTION:<!--comment-->\\'Comment\\'}'
 -->\"\n\n"
         + " title=\"Title: <!--#echo var=\"IMAGE_CAPTION\"-->\">two";
     String gold = "onetwo";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertTrue(builder.toString() + " is not equal to " + gold, builder.toString().equals(gold));
+    assertHTMLStripsTo(test, gold, null);
 
     test = "one<script><!-- <!--#config comment=\"<!-- \\\"comment\\\"-->\"-->
--></script>two";
     gold = "one\ntwo";
-    reader = new HTMLStripCharFilter(new StringReader(test));
-    ch = 0;
-    builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testScriptQuotes() throws Exception {
     String test = "one<script attr= bare><!-- action('<!-- comment -->', \"\\\"-->\\\"\");
--></script>two";
     String gold = "one\ntwo";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-        gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
 
     test = "hello<script><!-- f('<!--internal--></script>'); --></script>";
     gold = "hello\n";
-    reader = new HTMLStripCharFilter(new StringReader(test));
-    ch = 0;
-    builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testEscapeScript() throws Exception {
     String test = "one<script no-value-attr>callSomeMethod();</script>two";
     String gold = "one<script no-value-attr></script>two";
     Set<String> escapedTags = new HashSet<>(Arrays.asList("SCRIPT"));
-    Reader reader = new HTMLStripCharFilter
-        (new StringReader(test), escapedTags);
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, escapedTags);
   }
 
   public void testStyle() throws Exception {
@@ -612,37 +451,14 @@ public class HTMLStripCharFilterTest ext
                 + "-->\n"
                 + "</style>two";
     String gold = "one\ntwo";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-        gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testEscapeStyle() throws Exception {
     String test = "one<style type=\"text/css\"> body,font,a { font-family:arial; }
</style>two";
     String gold = "one<style type=\"text/css\"></style>two";
     Set<String> escapedTags = new HashSet<>(Arrays.asList("STYLE"));
-    Reader reader = new HTMLStripCharFilter
-        (new StringReader(test), escapedTags);
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-        gold, builder.toString());
+    assertHTMLStripsTo(test, gold, escapedTags);
   }
 
   public void testBR() throws Exception {
@@ -654,135 +470,80 @@ public class HTMLStripCharFilterTest ext
         "one\ntwo\n",
     };
     for (int i = 0 ; i < testGold.length ; i += 2) {
-      String test = testGold[i];
-      String gold = testGold[i + 1];
-      Reader reader = new HTMLStripCharFilter(new StringReader(test));
-      StringBuilder builder = new StringBuilder();
-      int ch = 0;
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-      String result = builder.toString();
-      assertEquals("Test: '" + test + "'", gold, result);
+      assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
     }
   }
   public void testEscapeBR() throws Exception {
     String test = "one<BR class='whatever'>two</\nBR\n>";
     String gold = "one<BR class='whatever'>two</\nBR\n>";
     Set<String> escapedTags = new HashSet<>(Arrays.asList("BR"));
-    Reader reader = new HTMLStripCharFilter
-        (new StringReader(test), escapedTags);
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, escapedTags);
   }
   
   public void testInlineTagsNoSpace() throws Exception {
     String test = "one<sPAn class=\"invisible\">two<sup>2<sup>e</sup></sup>.</SpaN>three";
     String gold = "onetwo2e.three";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testCDATA() throws Exception {
-    String test = "one<![CDATA[<one><two>three<four></four></two></one>]]>two";
-    String gold = "one<one><two>three<four></four></two></one>two";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    int maxNumElems = 100;
+    String randomHtmlishString1 // Don't create a comment (disallow "<!--") and don't
include a closing ">"
+        = TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
+    String closedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[&]]>";
+
+    String randomHtmlishString2 // Don't create a comment (disallow "<!--") and don't
include a closing ">"
+        = TestUtil.randomHtmlishString(random(), maxNumElems).replaceAll(">", " ").replaceFirst("^--","__");
+    String unclosedAngleBangNonCDATA = "<!" + randomHtmlishString1 +"-[CDATA[";
 
-    test = "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five";
-    gold = "onetwo<![CDATA[three]]>fourfive";
-    reader = new HTMLStripCharFilter(new StringReader(test));
-    ch = 0;
-    builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
+    String[] testGold = {
+        "one<![CDATA[<one><two>three<four></four></two></one>]]>two",
+        "one<one><two>three<four></four></two></one>two",
+
+        "one<![CDATA[two<![CDATA[three]]]]><![CDATA[>four]]>five",
+        "onetwo<![CDATA[three]]>fourfive",
+
+        "<! [CDATA[&]]>", "",
+        "<! [CDATA[&] ] >", "",
+        "<! [CDATA[&]]", "<! [CDATA[&]]", // unclosed angle bang - all input
is output
+        "<!\u2009[CDATA[&]]>", "",
+        "<!\u2009[CDATA[&]\u2009]\u2009>", "",
+        "<!\u2009[CDATA[&]\u2009]\u2009", "<!\u2009[CDATA[&]\u2009]\u2009",
// unclosed angle bang - all input is output
+        closedAngleBangNonCDATA, "",
+        "<![CDATA[", "",
+        "<![CDATA[<br>", "<br>",
+        "<![CDATA[<br>]]", "<br>]]",
+        "<![CDATA[<br>]]>", "<br>",
+        "<![CDATA[<br>] ] >", "<br>] ] >",
+        "<![CDATA[<br>]\u2009]\u2009>", "<br>]\u2009]\u2009>",
+        "<!\u2009[CDATA[", "<!\u2009[CDATA[",
+        unclosedAngleBangNonCDATA, unclosedAngleBangNonCDATA
+    };
+    for (int i = 0 ; i < testGold.length ; i += 2) {
+      assertHTMLStripsTo(testGold[i], testGold[i + 1], null);
     }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+  }
+
+  public void testUnclosedAngleBang() throws Exception {
+    assertHTMLStripsTo("<![endif]", "<![endif]", null);
   }
 
   public void testUppercaseCharacterEntityVariants() throws Exception {
     String test = " &QUOT;-&COPY;&GT;>&LT;<&REG;&AMP;";
     String gold = " \"-\u00A9>><<\u00AE&";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
   
   public void testMSWordMalformedProcessingInstruction() throws Exception {
     String test = "one<?xml:namespace prefix = o ns = \"urn:schemas-microsoft-com:office:office\"
/>two";
     String gold = "onetwo";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-                 gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testSupplementaryCharsInTags() throws Exception {
     String test = "one<𩬅艱鍟䇹愯瀛>two<瀛愯𩬅>three
瀛愯𩬅</瀛愯𩬅>four</𩬅艱鍟䇹愯瀛>five<𠀀𠀀>six<𠀀𠀀/>seven";
     String gold = "one\ntwo\nthree 瀛愯𩬅\nfour\nfive\nsix\nseven";
-    Reader reader = new HTMLStripCharFilter(new StringReader(test));
-    int ch = 0;
-    StringBuilder builder = new StringBuilder();
-    try {
-      while ((ch = reader.read()) != -1){
-        builder.append((char)ch);
-      }
-    } finally {
-      // System.out.println("String: " + builder.toString());
-    }
-    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
-        gold, builder.toString());
+    assertHTMLStripsTo(test, gold, null);
   }
 
   public void testRandomBrokenHTML() throws Exception {
@@ -857,4 +618,33 @@ public class HTMLStripCharFilterTest ext
     assertAnalyzesTo(analyzer, " &#57209", new String[] { "\uFFFD" } );
     assertAnalyzesTo(analyzer, " &#57209<br>", new String[] { "&#57209" } );
   }
+
+
+  public static void assertHTMLStripsTo(String input, String gold, Set<String> escapedTags)
throws Exception {
+    assertHTMLStripsTo(new StringReader(input), gold, escapedTags);
+  }
+
+  public static void assertHTMLStripsTo(Reader input, String gold, Set<String> escapedTags)
throws Exception {
+    HTMLStripCharFilter reader;
+    if (null == escapedTags) {
+      reader = new HTMLStripCharFilter(input);
+    } else {
+      reader = new HTMLStripCharFilter(input, escapedTags);
+    }
+    int ch = 0;
+    StringBuilder builder = new StringBuilder();
+    try {
+      while ((ch = reader.read()) != -1) {
+        builder.append((char)ch);
+      }
+    } catch (Exception e) {
+      if (gold.equals(builder.toString())) {
+        throw e;
+      }
+      throw new Exception
+          ("('" + builder.toString() + "' is not equal to '" + gold + "').  " + e.getMessage(),
e);
+    }
+    assertEquals("'" + builder.toString() + "' is not equal to '" + gold + "'",
+                 gold, builder.toString());
+  }
 }



Mime
View raw message