lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From sar...@apache.org
Subject svn commit: r1235308 [5/5] - in /lucene/dev/branches/branch_3x: lucene/ lucene/contrib/analyzers/common/ lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/charfilter/ lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis...
Date Tue, 24 Jan 2012 15:51:57 GMT
Modified: lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
(original)
+++ lucene/dev/branches/branch_3x/solr/contrib/dataimporthandler/src/java/org/apache/solr/handler/dataimport/HTMLStripTransformer.java
Tue Jan 24 15:51:55 2012
@@ -16,8 +16,8 @@
  */
 package org.apache.solr.handler.dataimport;
 
-import org.apache.solr.analysis.HTMLStripCharFilter;
 import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
 
 import java.io.IOException;
 import java.io.StringReader;

Modified: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
Tue Jan 24 15:51:55 2012
@@ -19,22 +19,52 @@ package org.apache.solr.analysis;
  */
 
 import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.charfilter.HTMLStripCharFilter;
+
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 /**
-* Factory for {@link HTMLStripCharFilter}. 
+* Factory for {@link HTMLStripCharFilter}.
  * <pre class="prettyprint" >
  * &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
  *   &lt;analyzer&gt;
- *     &lt;charFilter class="solr.HTMLStripCharFilterFactory"/&gt;
+ *     &lt;charFilter class="solr.HTMLStripCharFilterFactory" escapedTags="a, title" /&gt;
  *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
  *   &lt;/analyzer&gt;
  * &lt;/fieldType&gt;</pre
- * @version $Id$  
+ *
  */
- public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
+public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
+  
+  Set<String> escapedTags = null;
+  Pattern TAG_NAME_PATTERN = Pattern.compile("[^\\s,]+");
 
   public HTMLStripCharFilter create(CharStream input) {
-    return new HTMLStripCharFilter(input);
+    HTMLStripCharFilter charFilter;
+    if (null == escapedTags) {
+      charFilter = new HTMLStripCharFilter(input);
+    } else {
+      charFilter = new HTMLStripCharFilter(input, escapedTags);
+    }
+    return charFilter;
+  }
+  
+  @Override
+  public void init(Map<String,String> args) {
+    super.init(args);
+    String escapedTagsArg = args.get("escapedTags");
+    if (null != escapedTagsArg) {
+      Matcher matcher = TAG_NAME_PATTERN.matcher(escapedTagsArg);
+      while (matcher.find()) {
+        if (null == escapedTags) {
+          escapedTags = new HashSet<String>();
+        }
+        escapedTags.add(matcher.group(0));
+      }
+    }
   }
-
 }

Copied: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
(from r1234452, lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java?p2=lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java&p1=lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java&r1=1234452&r2=1235308&rev=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilter.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilter.java
Tue Jan 24 15:51:55 2012
@@ -29,11 +29,26 @@ import org.apache.lucene.analysis.CharRe
 import org.apache.lucene.analysis.CharStream;
 
 /**
- * A CharFilter that wraps another Reader and attempts to strip out HTML constructs.
- * 
- * @version $Id$
+ * <p>
+ * This class is <b>NOT</b> recommended for new users and should be
+ * considered <b>UNSUPPORTED</b>.
+ * </p>
+ * <p>
+ * In Solr version 3.5 and earlier, <tt>HTMLStripCharFilter(Factory)</tt>
+ * had known bugs in the offsets it provided, triggering e.g. exceptions in
+ * highlighting.
+ * </p>
+ * <p>
+ * This class is provided as a possible alternative for people who depend on
+ * the "broken" behavior of <tt>HTMLStripCharFilter</tt> in Solr version 3.5
+ * and earlier, and/or who don't like the changes introduced by the Solr 3.6+
+ * version of <tt>HTMLStripCharFilterFactory</tt>.  (See the 3.6.0 release
+ * section of solr/CHANGES.txt for a list of differences in behavior.)
+ * </p>
+ * @deprecated use {@link org.apache.lucene.analysis.charfilter.HTMLStripCharFilter}
  */
-public class HTMLStripCharFilter extends BaseCharFilter {
+@Deprecated
+public class LegacyHTMLStripCharFilter extends BaseCharFilter {
   private int readAheadLimit = DEFAULT_READ_AHEAD;
   private int safeReadAheadLimit = readAheadLimit - 3;
   private int numWhitespace = 0;
@@ -55,22 +70,22 @@ public class HTMLStripCharFilter extends
 
 
   public static void main(String[] args) throws IOException {
-    Reader in = new HTMLStripCharFilter(
+    Reader in = new LegacyHTMLStripCharFilter(
             CharReader.get(new InputStreamReader(System.in)));
     int ch;
     while ( (ch=in.read()) != -1 ) System.out.print((char)ch);
   }
 
-  public HTMLStripCharFilter(CharStream source) {
+  public LegacyHTMLStripCharFilter(CharStream source) {
     super(source.markSupported() ? source : CharReader.get(new BufferedReader(source)));
   }
 
-  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags){
+  public LegacyHTMLStripCharFilter(CharStream source, Set<String> escapedTags){
     this(source);
     this.escapedTags = escapedTags;
   }
 
-  public HTMLStripCharFilter(CharStream source, Set<String> escapedTags, int readAheadLimit){
+  public LegacyHTMLStripCharFilter(CharStream source, Set<String> escapedTags, int readAheadLimit){
     this(source);
     this.escapedTags = escapedTags;
     this.readAheadLimit = readAheadLimit;

Copied: lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
(from r1234452, lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java?p2=lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java&p1=lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java&r1=1234452&r2=1235308&rev=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/HTMLStripCharFilterFactory.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/java/org/apache/solr/analysis/LegacyHTMLStripCharFilterFactory.java
Tue Jan 24 15:51:55 2012
@@ -21,20 +21,38 @@ package org.apache.solr.analysis;
 import org.apache.lucene.analysis.CharStream;
 
 /**
-* Factory for {@link HTMLStripCharFilter}. 
+ * Factory for {@link LegacyHTMLStripCharFilter}.
  * <pre class="prettyprint" >
- * &lt;fieldType name="text_html" class="solr.TextField" positionIncrementGap="100"&gt;
- *   &lt;analyzer&gt;
- *     &lt;charFilter class="solr.HTMLStripCharFilterFactory"/&gt;
- *     &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
- *   &lt;/analyzer&gt;
- * &lt;/fieldType&gt;</pre
- * @version $Id$  
+ * &lt;fieldType name="text_html_legacy" class="solr.TextField" positionIncrementGap="100"&gt;
+ * &lt;analyzer&gt;
+ *     &lt;charFilter class="solr.LegacyHTMLStripCharFilterFactory"/&gt;
+ * &lt;tokenizer class="solr.WhitespaceTokenizerFactory"/&gt;
+ * &lt;/analyzer&gt;
+ * &lt;/fieldType&gt;
+ * </pre>
+ * <p>
+ * This factory is <b>NOT</b> recommended for new users and should be
+ * considered <b>UNSUPPORTED</b>.
+ * </p>
+ * <p>
+ * In Solr version 3.5 and earlier, <tt>HTMLStripCharFilter(Factory)</tt>
+ * had known bugs in the offsets it provided, triggering e.g. exceptions in
+ * highlighting.
+ * </p>
+ * <p>
+ * This class is provided as a possible alternative for people who depend on
+ * the "broken" behavior of <tt>HTMLStripCharFilter</tt> in Solr version 3.5
+ * and earlier, and/or who don't like the changes introduced by the Solr 3.6+
+ * version of <tt>HTMLStripCharFilterFactory</tt>.  (See the 3.6.0 release
+ * section of solr/CHANGES.txt for a list of differences in behavior.)
+ * </p>
+ * @deprecated use {@link HTMLStripCharFilterFactory}
  */
- public class HTMLStripCharFilterFactory extends BaseCharFilterFactory {
+@Deprecated
+public class LegacyHTMLStripCharFilterFactory extends BaseCharFilterFactory {
 
-  public HTMLStripCharFilter create(CharStream input) {
-    return new HTMLStripCharFilter(input);
+  public LegacyHTMLStripCharFilter create(CharStream input) {
+    return new LegacyHTMLStripCharFilter(input);
   }
 
 }

Copied: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
(from r1234452, lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java)
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java?p2=lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java&p1=lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java&r1=1234452&r2=1235308&rev=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/HTMLStripCharFilterTest.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/LegacyHTMLStripCharFilterTest.java
Tue Jan 24 15:51:55 2012
@@ -33,11 +33,12 @@ import org.apache.lucene.analysis.Reusab
 
 import org.apache.lucene.analysis.MockTokenizer;
 import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.util._TestUtil;
 import org.junit.Ignore;
 
 import org.apache.solr.SolrTestCaseJ4;
 
-public class HTMLStripCharFilterTest extends BaseTokenStreamTestCase {
+public class LegacyHTMLStripCharFilterTest extends BaseTokenStreamTestCase {
 
   //this is some text  here is a  link  and another  link . This is an entity: & plus a <.  Here is an &
   //
@@ -48,7 +49,7 @@ public class HTMLStripCharFilterTest ext
     String gold = " this is some text  here is a  link  and " +
             "another  link . " +
             "This is an entity: & plus a <.  Here is an &.  ";
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new StringReader(html)));
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(html)));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     char [] goldArray = gold.toCharArray();
@@ -66,7 +67,7 @@ public class HTMLStripCharFilterTest ext
   //Some sanity checks, but not a full-fledged check
   public void testHTML() throws Exception {
 
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new FileReader(SolrTestCaseJ4.getFile("htmlStripReaderTest.html"))));
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new FileReader(SolrTestCaseJ4.getFile("htmlStripReaderTest.html"))));
     StringBuilder builder = new StringBuilder();
     int ch = -1;
     while ((ch = reader.read()) != -1){
@@ -86,7 +87,7 @@ public class HTMLStripCharFilterTest ext
     String gold = "\u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -103,7 +104,7 @@ public class HTMLStripCharFilterTest ext
     String gold = "  <foo> \u00DCbermensch = \u0393 bar \u0393";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -120,7 +121,7 @@ public class HTMLStripCharFilterTest ext
     String gold = "  <junk/>   ! @ and ’";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -136,7 +137,7 @@ public class HTMLStripCharFilterTest ext
     String test = "aaa bbb <reserved ccc=\"ddddd\"> eeee </reserved> ffff <reserved ggg=\"hhhh\"/> <other/>";
     Set<String> set = new HashSet<String>();
     set.add("reserved");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)), set);
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -153,7 +154,7 @@ public class HTMLStripCharFilterTest ext
   public void testMalformedHTML() throws Exception {
     String test = "a <a hr<ef=aa<a>> </close</a>";
     String gold = "a <a hr<ef=aa > </close ";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new StringReader(test)));
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(test)));
     StringBuilder builder = new StringBuilder();
     int ch = 0;
     while ((ch = reader.read()) != -1){
@@ -166,27 +167,27 @@ public class HTMLStripCharFilterTest ext
   }
 
   public void testBufferOverflow() throws Exception {
-    StringBuilder testBuilder = new StringBuilder(HTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
+    StringBuilder testBuilder = new StringBuilder(LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 50);
     testBuilder.append("ah<?> ??????");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
     processBuffer(testBuilder.toString(), "Failed on pseudo proc. instr.");//processing instructions
 
     testBuilder.setLength(0);
     testBuilder.append("<!--");//comments
-    appendChars(testBuilder, 3*HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
+    appendChars(testBuilder, 3*LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);//comments have two lookaheads
 
     testBuilder.append("-->foo");
     processBuffer(testBuilder.toString(), "Failed w/ comment");
 
     testBuilder.setLength(0);
     testBuilder.append("<?");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
     testBuilder.append("?>");
     processBuffer(testBuilder.toString(), "Failed with proc. instr.");
     
     testBuilder.setLength(0);
     testBuilder.append("<b ");
-    appendChars(testBuilder, HTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
+    appendChars(testBuilder, LegacyHTMLStripCharFilter.DEFAULT_READ_AHEAD + 500);
     testBuilder.append("/>");
     processBuffer(testBuilder.toString(), "Failed on tag");
 
@@ -195,14 +196,14 @@ public class HTMLStripCharFilterTest ext
   private void appendChars(StringBuilder testBuilder, int numChars) {
     int i1 = numChars / 2;
     for (int i = 0; i < i1; i++){
-      testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes HTMLStripCharFilter think it is a processing instruction
+      testBuilder.append('a').append(' ');//tack on enough to go beyond the mark readahead limit, since <?> makes LegacyHTMLStripCharFilter think it is a processing instruction
     }
   }  
 
 
   private void processBuffer(String test, String assertMsg) throws IOException {
     // System.out.println("-------------------processBuffer----------");
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -219,7 +220,7 @@ public class HTMLStripCharFilterTest ext
 
     String test = "<!--- three dashes, still a valid comment ---> ";
     String gold = "  ";
-    Reader reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
+    Reader reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(test))));//force the use of BufferedReader
     int ch = 0;
     StringBuilder builder = new StringBuilder();
     try {
@@ -234,7 +235,7 @@ public class HTMLStripCharFilterTest ext
 
 
   public void doTestOffsets(String in) throws Exception {
-    HTMLStripCharFilter reader = new HTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
+    LegacyHTMLStripCharFilter reader = new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(new StringReader(in))));
     int ch = 0;
     int off = 0;     // offset in the reader
     int strOff = -1; // offset in the original string
@@ -271,11 +272,54 @@ public class HTMLStripCharFilterTest ext
 
       @Override
       protected Reader initReader(Reader reader) {
-        return new HTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
+        return new LegacyHTMLStripCharFilter(CharReader.get(new BufferedReader(reader)));
       }
     };
     
     int numRounds = RANDOM_MULTIPLIER * 10000;
     checkRandomData(random, analyzer, numRounds);
   }
+
+  public void testRandomBrokenHTML() throws Exception {
+    int maxNumElements = 10000;
+    String text = _TestUtil.randomHtmlishString(random, maxNumElements);
+    Reader reader
+        = new LegacyHTMLStripCharFilter(CharReader.get(new StringReader(text)));
+    while (reader.read() != -1);
+  }
+
+  public void testRandomText() throws Exception {
+    StringBuilder text = new StringBuilder();
+    int minNumWords = 10;
+    int maxNumWords = 10000;
+    int minWordLength = 3;
+    int maxWordLength = 20;
+    int numWords = _TestUtil.nextInt(random, minNumWords, maxNumWords);
+    switch (_TestUtil.nextInt(random, 0, 4)) {
+      case 0: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomUnicodeString(random, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      case 1: {
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomRealisticUnicodeString
+              (random, minWordLength, maxWordLength));
+          text.append(' ');
+        }
+        break;
+      }
+      default: { // ASCII 60% of the time (nextInt bounds are inclusive, so 3 of the 5 values land here)
+        for (int wordNum = 0 ; wordNum < numWords ; ++wordNum) {
+          text.append(_TestUtil.randomSimpleString(random));
+          text.append(' ');
+        }
+      }
+    }
+    Reader reader = new LegacyHTMLStripCharFilter
+        (CharReader.get(new StringReader(text.toString())));
+    while (reader.read() != -1);
+  }
 }

Added: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java?rev=1235308&view=auto
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
(added)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/analysis/TestHTMLStripCharFilterFactory.java
Tue Jan 24 15:51:55 2012
@@ -0,0 +1,130 @@
+package org.apache.solr.analysis;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.apache.lucene.analysis.CharReader;
+import org.apache.lucene.analysis.CharStream;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.TokenStream;
+
+/**
+ * Simple tests to ensure this factory is working
+ */
+public class TestHTMLStripCharFilterFactory extends BaseTokenTestCase {
+
+
+  public void testNothingChanged() throws IOException {
+    //                             11111111112
+    //                   012345678901234567890
+    final String text = "this is only a test.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "a, Title");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] { 0, 5,  8, 13, 15 },
+        new int[] { 4, 7, 12, 14, 20 });
+  }
+
+  public void testNoEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "U i");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "<u>this</u>", "is", "only", "a", "<I>test</I>." },
+        new int[] {  0, 12, 18, 27, 29 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testSeparatorOnlyEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", ",, , ");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testEmptyEscapedTags() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", "");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "only", "a", "test." },
+        new int[] {  3, 12, 18, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+
+  public void testSingleEscapedTag() throws IOException {
+    //                             11111111112222222222333333333344
+    //                   012345678901234567890123456789012345678901
+    final String text = "<u>this</u> is <b>only</b> a <I>test</I>.";
+    HTMLStripCharFilterFactory factory = new HTMLStripCharFilterFactory();
+    Map<String,String> args = new HashMap<String,String>();
+    args.put("escapedTags", ", B\r\n\t");
+    factory.init(args);
+    CharStream cs = factory.create(CharReader.get(new StringReader(text)));
+    TokenStream ts = new MockTokenizer(cs, MockTokenizer.WHITESPACE, false);
+    assertTokenStreamContents(ts,
+        new String[] { "this", "is", "<b>only</b>", "a", "test." },
+        new int[] {  3, 12, 15, 27, 32 },
+        new int[] { 11, 14, 26, 28, 41 });
+  }
+}

Modified: lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java?rev=1235308&r1=1235307&r2=1235308&view=diff
==============================================================================
--- lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
(original)
+++ lucene/dev/branches/branch_3x/solr/core/src/test/org/apache/solr/handler/FieldAnalysisRequestHandlerTest.java
Tue Jan 24 15:51:55 2012
@@ -326,8 +326,8 @@ public class FieldAnalysisRequestHandler
     NamedList indexPart = textType.get("index");
     assertNotNull("expecting an index token analysis for field type 'charfilthtmlmap'", indexPart);
     
-    assertEquals("  whátëvêr  ", indexPart.get("org.apache.solr.analysis.HTMLStripCharFilter"));
-    assertEquals("  whatever  ", indexPart.get("org.apache.lucene.analysis.MappingCharFilter"));
+    assertEquals("\n\nwhátëvêr\n\n", indexPart.get("org.apache.lucene.analysis.charfilter.HTMLStripCharFilter"));
+    assertEquals("\n\nwhatever\n\n", indexPart.get("org.apache.lucene.analysis.MappingCharFilter"));
 
     List<NamedList> tokenList = (List<NamedList>)indexPart.get(MockTokenizer.class.getName());
     assertNotNull("Expecting MockTokenizer analysis breakdown", tokenList);



Mime
View raw message