lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Otis Gospodnetic <otis_gospodne...@yahoo.com>
Subject Re: HTMLParser
Date Fri, 15 Feb 2002 20:47:10 GMT
While you are at it, perhaps it would be good to add support for
other META tags, such as "robots", especially since people are working
on adding a web crawler component to Lucene.

Thanks,
Otis

--- Daniel Calvo <dcalvo@task.com.br> wrote:
> Hi,
> 
> I was playing with HTMLParser.jj and made some changes you might be
> interested in. What I did was start handling <META> tags (added
> new methods: getAuthor, getKeywords and getMetadata and changed
> getSummary to check if there's any metadata item with
> name=="description"). I'm also filtering out any text inside
> <STYLE>...</STYLE> (like <SCRIPT> is being handled).
> I've performed some tests and I believe I didn't break anything ;-)
> 
> The patch is as follows
> 
> Best regards,
> 
> --Daniel
> 
> Index: HTMLParser.jj
> ===================================================================
> RCS file:
>
/home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
> retrieving revision 1.1
> diff -u -r1.1 HTMLParser.jj
> --- HTMLParser.jj	26 Jan 2002 15:01:31 -0000	1.1
> +++ HTMLParser.jj	15 Feb 2002 20:39:49 -0000
> @@ -66,6 +66,8 @@
>  package org.apache.lucene.demo.html;
> 
>  import java.io.*;
> +import java.util.Map;
> +import java.util.HashMap;
> 
>  public class HTMLParser {
>    public static int SUMMARY_LENGTH = 200;
> @@ -76,11 +78,13 @@
>    boolean titleComplete = false;
>    boolean inTitle = false;
>    boolean inScript = false;
> +  boolean inStyle = false;
>    boolean afterTag = false;
>    boolean afterSpace = false;
>    String eol = System.getProperty("line.separator");
>    PipedReader pipeIn = null;
>    PipedWriter pipeOut;
> +  HashMap metadata = new HashMap(7);
> 
>    public HTMLParser(File file) throws FileNotFoundException {
>      this(new FileInputStream(file));
> @@ -109,15 +113,60 @@
>  	wait(10);
>        }
>      }
> -    if (summary.length() > SUMMARY_LENGTH)
> -      summary.setLength(SUMMARY_LENGTH);
> +    // look in metadata
> +    String description = (String) metadata.get("description");
> +    if (description != null)
> +      return description;
> +    else {
> +      if (summary.length() > SUMMARY_LENGTH)
> +        summary.setLength(SUMMARY_LENGTH);
> +
> +      String sum = summary.toString().trim();
> +      String tit = getTitle();
> +      if (sum.startsWith(tit))
> +        return sum.substring(tit.length());
> +      else
> +        return sum;
> +    }
> +  }
> +
> +  public String getAuthor() throws IOException, InterruptedException
> {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return (String)metadata.get("author");
> +  }
> +
> +  public String getKeywords() throws IOException,
> InterruptedException {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return (String)metadata.get("keywords");
> +  }
> 
> -    String sum = summary.toString().trim();
> -    String tit = getTitle();
> -    if (sum.startsWith(tit))
> -      return sum.substring(tit.length());
> -    else
> -      return sum;
> +  public Map getMetadata() throws IOException, InterruptedException
> {
> +    if (pipeIn == null)
> +      getReader();                // spawn parsing thread
> +    while (true) {
> +      synchronized(this) {
> +        if (summary.length() > 0) // assume that all metadata
> +          break;                  // has already been collected
> +        wait(10);
> +      }
> +    }
> +    return metadata;
>    }
> 
>    public Reader getReader() throws IOException {
> @@ -144,7 +193,7 @@
>    }
> 
>    void addText(String text) throws IOException {
> -    if (inScript)
> +        if (inScript || inStyle)
>        return;
>      if (inTitle)
>        title.append(text);
> @@ -165,7 +214,7 @@
>    }
> 
>    void addSpace() throws IOException {
> -    if (inScript)
> +        if (inScript || inStyle)
>        return;
>      if (!afterSpace) {
>        if (inTitle)
> @@ -216,23 +265,38 @@
>  {
>    Token t1, t2;
>    boolean inImg = false;
> +  boolean inMeta = false;
> +  String name = null;
> +  String content = null;
>  }
>  {
>    t1=<TagName> {
> -    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if
> in <TITLE>
> -    inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in
> <IMG>
> -    if (inScript) {				  // keep track if in <SCRIPT>
> +    inTitle = t1.image.equalsIgnoreCase("<title");     // keep track
> if in <TITLE>
> +    inImg = t1.image.equalsIgnoreCase("<img");         // keep track
> if in <IMG>
> +    inMeta = t1.image.equalsIgnoreCase("<meta");       // keep track
> if in <META>
> +    if (inScript) {                                    // keep track
> if in <SCRIPT>
>        inScript = !t1.image.equalsIgnoreCase("</script");
>      } else {
>        inScript = t1.image.equalsIgnoreCase("<script");
>      }
> +    if (inStyle) {                                     // keep track
> if in <STYLE>
> +      inStyle = !t1.image.equalsIgnoreCase("</style");
> +    } else {
> +      inStyle = t1.image.equalsIgnoreCase("<style");
> +    }
>    }
>    (t1=<ArgName>
>     (<ArgEquals>
> -    (t2=ArgValue()				  // save ALT text in IMG tag
> +     (t2=ArgValue()
>       {
>         if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
> -         addText("[" + t2.image + "]");
> +         addText("[" + t2.image + "]");       // save ALT text in
> IMG tag
> +       if (inMeta && t1.image.equalsIgnoreCase("name") && t2 !=
> null)
> +         name = t2.image.toLowerCase();       // save name in META
> tag
> +       if (inMeta && t1.image.equalsIgnoreCase("content") && t2 !=
> null)
> +         content = t2.image;                  // save content in
> META tag
> +       if (inMeta && name != null && content != null)
> +         metadata.put(name, content);             // save metadata
>       }
>      )?
>     )?
> 
> 
> --
> To unsubscribe, e-mail:  
> <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
> For additional commands, e-mail:
> <mailto:lucene-dev-help@jakarta.apache.org>
> 


__________________________________________________
Do You Yahoo!?
Got something to say? Say it better with Yahoo! Video Mail 
http://mail.yahoo.com

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message