lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Strittmatter Stephan (external)" <Stephan.Strittmatter....@kst.siemens.de>
Subject RE: HTMLParser
Date Tue, 19 Feb 2002 08:05:36 GMT
Hi,

I have some JSP pages that are indexed locally and contain "<% ... %>" tags.
I am not familiar with JavaCC. How can I filter these
JSP parts out of the page? Could you add this feature as well? This would
be great for me!

Regards,

Stephan

> -----Original Message-----
> From: Daniel Calvo [mailto:dcalvo@task.com.br]
> Sent: Friday, February 15, 2002 10:42 PM
> To: Lucene Developers List
> Subject: HTMLParser
> 
> 
> Hi,
> 
> I was playing with HTMLParser.jj and made some changes you 
> might be interested in. What I did was start handling <META> 
> tags (added
> new methods: getAuthor, getKeywords and getMetadata and 
> changed getSummary to check if there's any metadata item with
> name=="description"). I'm also filtering out any text inside 
> <STYLE>...</STYLE> (like <SCRIPT> is being handled).
> I've performed some tests and I believe I didn't break anything ;-)
> 
> The patch is as follows
> 
> Best regards,
> 
> --Daniel
> 
> Index: HTMLParser.jj
> ===================================================================
> RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
> retrieving revision 1.1
> diff -u -r1.1 HTMLParser.jj
> --- HTMLParser.jj	26 Jan 2002 15:01:31 -0000	1.1
> +++ HTMLParser.jj	15 Feb 2002 20:39:49 -0000
> @@ -66,6 +66,8 @@
>  package org.apache.lucene.demo.html;
> 
>  import java.io.*;
> +import java.util.Map;
> +import java.util.HashMap;
> 
>  public class HTMLParser {
>    public static int SUMMARY_LENGTH = 200;
> @@ -76,11 +78,13 @@
>    boolean titleComplete = false;
>    boolean inTitle = false;
>    boolean inScript = false;
> +  boolean inStyle = false;
>    boolean afterTag = false;
>    boolean afterSpace = false;
>    String eol = System.getProperty("line.separator");
>    PipedReader pipeIn = null;
>    PipedWriter pipeOut;
> +  HashMap metadata = new HashMap(7);
> 
>    public HTMLParser(File file) throws FileNotFoundException {
>      this(new FileInputStream(file));
> @@ -109,15 +113,60 @@
>  	wait(10);
>        }
>      }
> -    if (summary.length() > SUMMARY_LENGTH)
> -      summary.setLength(SUMMARY_LENGTH);
> +    // look in metadata
> +    String description = (String) metadata.get("description");
> +    if (description != null)
> +      return description;
> +    else {
> +      if (summary.length() > SUMMARY_LENGTH)
> +        summary.setLength(SUMMARY_LENGTH);
> +
> +      String sum = summary.toString().trim();
> +      String tit = getTitle();
> +      if (sum.startsWith(tit))
> +        return sum.substring(tit.length());
> +      else
> +        return sum;
> +    }
> +  }
> +
> +  public String getAuthor() throws IOException, 
> InterruptedException {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return (String)metadata.get("author");
> +  }
> +
> +  public String getKeywords() throws IOException, 
> InterruptedException {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return (String)metadata.get("keywords");
> +  }
> 
> -    String sum = summary.toString().trim();
> -    String tit = getTitle();
> -    if (sum.startsWith(tit))
> -      return sum.substring(tit.length());
> -    else
> -      return sum;
> +  public Map getMetadata() throws IOException, InterruptedException {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return metadata;
>    }
> 
>    public Reader getReader() throws IOException {
> @@ -144,7 +193,7 @@
>    }
> 
>    void addText(String text) throws IOException {
> -    if (inScript)
> +        if (inScript || inStyle)
>        return;
>      if (inTitle)
>        title.append(text);
> @@ -165,7 +214,7 @@
>    }
> 
>    void addSpace() throws IOException {
> -    if (inScript)
> +        if (inScript || inStyle)
>        return;
>      if (!afterSpace) {
>        if (inTitle)
> @@ -216,23 +265,38 @@
>  {
>    Token t1, t2;
>    boolean inImg = false;
> +  boolean inMeta = false;
> +  String name = null;
> +  String content = null;
>  }
>  {
>    t1=<TagName> {
> -    inTitle = t1.image.equalsIgnoreCase("<title"); // keep 
> track if in <TITLE>
> -    inImg = t1.image.equalsIgnoreCase("<img");	  // 
> keep track if in <IMG>
> -    if (inScript) {				  // keep track 
> if in <SCRIPT>
> +    inTitle = t1.image.equalsIgnoreCase("<title");     // 
> keep track if in <TITLE>
> +    inImg = t1.image.equalsIgnoreCase("<img");         // 
> keep track if in <IMG>
> +    inMeta = t1.image.equalsIgnoreCase("<meta");       // 
> keep track if in <META>
> +    if (inScript) {                                    // 
> keep track if in <SCRIPT>
>        inScript = !t1.image.equalsIgnoreCase("</script");
>      } else {
>        inScript = t1.image.equalsIgnoreCase("<script");
>      }
> +    if (inStyle) {                                     // 
> keep track if in <STYLE>
> +      inStyle = !t1.image.equalsIgnoreCase("</style");
> +    } else {
> +      inStyle = t1.image.equalsIgnoreCase("<style");
> +    }
>    }
>    (t1=<ArgName>
>     (<ArgEquals>
> -    (t2=ArgValue()				  // save ALT 
> text in IMG tag
> +     (t2=ArgValue()
>       {
>         if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
> -         addText("[" + t2.image + "]");
> +         addText("[" + t2.image + "]");       // save ALT 
> text in IMG tag
> +       if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
> +         name = t2.image.toLowerCase();       // save name 
> in META tag
> +       if (inMeta && t1.image.equalsIgnoreCase("content") && 
> t2 != null)
> +         content = t2.image;                  // save 
> content in META tag
> +       if (inMeta && name != null && content != null)
> +         metadata.put(name, content);             // save metadata
>       }
>      )?
>     )?
> 
> 
> --
> To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message