lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From gol...@apache.org
Subject cvs commit: jakarta-lucene/src/demo/org/apache/lucene/demo/html Tags.java HTMLParser.java HTMLParser.jj
Date Sun, 23 Nov 2003 17:24:32 GMT
goller      2003/11/23 09:24:32

  Modified:    src/demo/org/apache/lucene/demo/html HTMLParser.java
                        HTMLParser.jj
  Added:       src/demo/org/apache/lucene/demo/html Tags.java
  Log:
  Insert whitespace for certain HTML tags (see Tags.WS_ELEMS) so that the surrounding text is tokenized properly. This is a fix for bug 19253.
  I committed a slightly modified version of Daniel's patch.
  
  Revision  Changes    Path
  1.3       +16 -12    jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.java
  
  Index: HTMLParser.java
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.java,v
  retrieving revision 1.2
  retrieving revision 1.3
  diff -u -r1.2 -r1.3
  --- HTMLParser.java	23 Nov 2003 15:37:26 -0000	1.2
  +++ HTMLParser.java	23 Nov 2003 17:24:32 -0000	1.3
  @@ -224,14 +224,18 @@
     Token t1, t2;
     boolean inImg = false;
       t1 = jj_consume_token(TagName);
  -    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  -    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
  -    inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
  -    inImg = t1.image.equalsIgnoreCase("<img");    // keep track if in <IMG>
  +   String tagName = t1.image.toLowerCase();
  +   if(Tags.WS_ELEMS.contains(tagName) ) {
  +      addSpace();
  +    }
  +    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  +    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
  +    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
  +    inImg = tagName.equalsIgnoreCase("<img");     // keep track if in <IMG>
       if (inScript) {                               // keep track if in <SCRIPT>
  -      inScript = !t1.image.equalsIgnoreCase("</script");
  +      inScript = !tagName.equalsIgnoreCase("</script");
       } else {
  -      inScript = t1.image.equalsIgnoreCase("<script");
  +      inScript = tagName.equalsIgnoreCase("<script");
       }
       label_2:
       while (true) {
  @@ -424,15 +428,15 @@
       finally { jj_save(1, xla); }
     }
   
  -  final private boolean jj_3_2() {
  -    if (jj_scan_token(ArgQuote2)) return true;
  -    if (jj_scan_token(CloseQuote2)) return true;
  -    return false;
  -  }
  -
     final private boolean jj_3_1() {
       if (jj_scan_token(ArgQuote1)) return true;
       if (jj_scan_token(CloseQuote1)) return true;
  +    return false;
  +  }
  +
  +  final private boolean jj_3_2() {
  +    if (jj_scan_token(ArgQuote2)) return true;
  +    if (jj_scan_token(CloseQuote2)) return true;
       return false;
     }
   
  
  
  
  1.4       +10 -6     jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj
  
  Index: HTMLParser.jj
  ===================================================================
  RCS file: /home/cvs/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
  retrieving revision 1.3
  retrieving revision 1.4
  diff -u -r1.3 -r1.4
  --- HTMLParser.jj	23 Nov 2003 15:37:26 -0000	1.3
  +++ HTMLParser.jj	23 Nov 2003 17:24:32 -0000	1.4
  @@ -265,14 +265,18 @@
   }
   {
     t1=<TagName> {
  -    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  -    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in <META>
  -    inStyle = t1.image.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
  -    inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in <IMG>
  +   String tagName = t1.image.toLowerCase();
  +   if(Tags.WS_ELEMS.contains(tagName) ) {
  +      addSpace();
  +    }
  +    inTitle = tagName.equalsIgnoreCase("<title"); // keep track if in <TITLE>
  +    inMetaTag = tagName.equalsIgnoreCase("<META"); // keep track if in <META>
  +    inStyle = tagName.equalsIgnoreCase("<STYLE"); // keep track if in <STYLE>
  +    inImg = tagName.equalsIgnoreCase("<img");	  // keep track if in <IMG>
       if (inScript) {				  // keep track if in <SCRIPT>
  -      inScript = !t1.image.equalsIgnoreCase("</script");
  +      inScript = !tagName.equalsIgnoreCase("</script");
       } else {
  -      inScript = t1.image.equalsIgnoreCase("<script");
  +      inScript = tagName.equalsIgnoreCase("<script");
       }
     }
     (t1=<ArgName>
  
  
  
  1.1                  jakarta-lucene/src/demo/org/apache/lucene/demo/html/Tags.java
  
  Index: Tags.java
  ===================================================================
  package org.apache.lucene.demo.html;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2003 The Apache Software Foundation. All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
   
  import java.util.Collections;
  import java.util.HashSet;
  import java.util.Set;
  
  
  /**
   * Registry of the HTML tag tokens for which whitespace has to be inserted
   * into the extracted text so that it is tokenized properly.
   *
   * Entries are stored in lower case and include the leading "&lt;" (and, for
   * closed empty elements, the trailing "/"), e.g. "&lt;br" and "&lt;br/".
   */
  public final class Tags {

    /**
     * The tag tokens that {@link #WS_ELEMS} is seeded with. Each token is
     * listed exactly once.
     */
    private static final String[] WS_TAG_TOKENS = {
      "<p",
      "<div",
      "<hr",
      "<hr/",  // note that "<hr />" does not need to be listed explicitly
      "<br",
      "<br/",
      "<td",
      "<li",
      "<q",
      "<blockquote",
      "<dt",
      "<h1",
      "<h2",
      "<h3",
      "<h4",
      "<h5",
      "<h6"
    };

    /**
     * Contains all tags for which whitespace has to be inserted for proper
     * tokenization.
     *
     * The set is a synchronized wrapper around a HashSet and remains
     * modifiable, so further tag tokens can still be added to it.
     */
    public final static Set WS_ELEMS = Collections.synchronizedSet(new HashSet());

    static {
      // Seed the set from the table above; runs after WS_TAG_TOKENS has been
      // initialized because static initializers execute in textual order.
      for (int i = 0; i < WS_TAG_TOKENS.length; i++) {
        WS_ELEMS.add(WS_TAG_TOKENS[i]);
      }
    }

  }
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: lucene-dev-unsubscribe@jakarta.apache.org
For additional commands, e-mail: lucene-dev-help@jakarta.apache.org


Mime
View raw message