lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Daniel Calvo" <dca...@task.com.br>
Subject RE: HTMLParser
Date Sat, 16 Feb 2002 04:11:27 GMT
> From: harwoods [mailto:harwoods@ntlworld.com]
> >>While you are at it, perhaps it would be good to add support for
> >>other META tags
> I posted that a while back.
> Here it is again. See the getMetaTags() method.....
>
> Mark Harwood

Hi,

I haven't seen this code before (I hadn't checked the list archives :-(). I've created a
getMetadata() method as well, but I forgot to convert character references, which I'm doing
now. Besides that, I'm also collecting URIs (href=...) and making them available via
getURIs() (which can be useful for people writing crawlers, as Otis has mentioned). There's
a problem with this method, though. Since the original parser relies on piped
readers/writers and the URI set won't be available until the whole file has been parsed, the
output generated during parsing (the file contents) must be consumed before one tries to get
the URIs, or else the method will block forever. It took me some time to figure that out,
but it does make sense.

BTW, is there any special reason to have implemented this parser with pipes? Wouldn't it be
easier if the parser were single-threaded? Of course, you can get some metadata (title,
summary) before the whole file is parsed, but at some point you'll have to wait until the
process ends. OTOH, the contents would have to be stored in a String or some other kind of
buffer, which isn't good either...

Here's the new patch

Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj	26 Jan 2002 15:01:31 -0000	1.1
+++ HTMLParser.jj	16 Feb 2002 03:08:04 -0000
@@ -66,6 +66,7 @@
 package org.apache.lucene.demo.html;

 import java.io.*;
+import java.util.*;

 public class HTMLParser {
   public static int SUMMARY_LENGTH = 200;
@@ -74,13 +75,17 @@
   StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
   int length = 0;
   boolean titleComplete = false;
+  boolean parseComplete = false;
   boolean inTitle = false;
   boolean inScript = false;
+  boolean inStyle = false;
   boolean afterTag = false;
   boolean afterSpace = false;
   String eol = System.getProperty("line.separator");
   PipedReader pipeIn = null;
   PipedWriter pipeOut;
+  HashMap metadata = new HashMap(7);
+    ArrayList uri = new ArrayList(10);

   public HTMLParser(File file) throws FileNotFoundException {
     this(new FileInputStream(file));
@@ -109,17 +114,77 @@
 	wait(10);
       }
     }
-    if (summary.length() > SUMMARY_LENGTH)
-      summary.setLength(SUMMARY_LENGTH);
+    // look in metadata
+    String description = (String) metadata.get("description");
+    if (description != null)
+      return description;
+    else {
+      if (summary.length() > SUMMARY_LENGTH)
+        summary.setLength(SUMMARY_LENGTH);
+
+      String sum = summary.toString().trim();
+      String tit = getTitle();
+      if (sum.startsWith(tit))
+        return sum.substring(tit.length());
+      else
+        return sum;
+    }
+  }
+
+  public String getAuthor() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return (String)metadata.get("author");
+  }
+
+  public String getKeywords() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return (String)metadata.get("keywords");
+  }

-    String sum = summary.toString().trim();
-    String tit = getTitle();
-    if (sum.startsWith(tit))
-      return sum.substring(tit.length());
-    else
-      return sum;
+  public Map getMetadata() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return metadata;
   }

+  public String[] getURIs() throws IOException, InterruptedException{
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+      while (true) {
+        synchronized(this) {
+          if (parseComplete)
+            break;
+            wait();
+          }
+        }
+      if (uri.size() == 0)
+        return new String[0];
+      else return (String[]) uri.toArray(new String[0]);
+  }
+
   public Reader getReader() throws IOException {
     if (pipeIn == null) {
       pipeIn = new PipedReader();
@@ -144,7 +209,7 @@
   }

   void addText(String text) throws IOException {
-    if (inScript)
+        if (inScript || inStyle)
       return;
     if (inTitle)
       title.append(text);
@@ -165,7 +230,7 @@
   }

   void addSpace() throws IOException {
-    if (inScript)
+        if (inScript || inStyle)
       return;
     if (!afterSpace) {
       if (inTitle)
@@ -180,6 +245,28 @@
     }
   }

+  String decode(String txt) {
+    if (txt == null)
+      return txt;
+    StringBuffer buf = new StringBuffer(txt);
+    for (int i=0; i<buf.length(); ++i) {
+      if (buf.charAt(i) == '&') {
+        int j;
+        for (j=i+1; j<buf.length() && buf.charAt(j) != ';'; ++j);
+        String decoded = Entities.decode(buf.substring(i, j+1));
+        buf.replace(i, j+1, decoded);
+      }
+    }
+    return buf.toString();
+  }
+
+  void endParse() {
+    synchronized(this) {
+      parseComplete = true;
+      notifyAll();
+    }
+  }
+
 //    void handleException(Exception e) {
 //      System.out.println(e.toString());  // print the error message
 //      System.out.println("Skipping...");
@@ -206,7 +293,7 @@
     | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
     | t=<Punct>     { addText(t.image); afterTag = false; }
     | <Space>       { addSpace(); afterTag = false; }
-    )* <EOF>
+    )* <EOF>        { endParse(); }
 //  } catch (ParseException e) {
 //    handleException(e);
 //  }
@@ -216,23 +303,44 @@
 {
   Token t1, t2;
   boolean inImg = false;
+  boolean inMeta = false;
+    boolean inA = false;
+  String name = null;
+  String content = null;
 }
 {
   t1=<TagName> {
-    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
-    inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in <IMG>
-    if (inScript) {				  // keep track if in <SCRIPT>
+    inTitle = t1.image.equalsIgnoreCase("<title");  // keep track if in <TITLE>
+    inImg = t1.image.equalsIgnoreCase("<img");      // keep track if in <IMG>
+    inMeta = t1.image.equalsIgnoreCase("<meta");    // keep track if in <META>
+    inA = t1.image.equalsIgnoreCase("<a");          // keep track if in <A>
+    if (inScript) {                                 // keep track if in <SCRIPT>
       inScript = !t1.image.equalsIgnoreCase("</script");
     } else {
       inScript = t1.image.equalsIgnoreCase("<script");
     }
+    if (inStyle) {                                  // keep track if in <STYLE>
+      inStyle = !t1.image.equalsIgnoreCase("</style");
+    } else {
+      inStyle = t1.image.equalsIgnoreCase("<style");
+    }
   }
   (t1=<ArgName>
    (<ArgEquals>
-    (t2=ArgValue()				  // save ALT text in IMG tag
+     (t2=ArgValue()
      {
        if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
-         addText("[" + t2.image + "]");
+         addText("[" + t2.image + "]");       // save ALT text in IMG tag
+       if (inMeta) {
+         if (t1.image.equalsIgnoreCase("name") && t2 != null)
+           name = t2.image.toLowerCase();       // save name in META tag
+         if (t1.image.equalsIgnoreCase("content") && t2 != null)
+           content = decode(t2.image);          // save content in META tag
+         if (name != null && content != null)
+           metadata.put(name, content.trim());  // save metadata
+       }
+       if (inA && t1.image.equalsIgnoreCase("href") && t2 != null)
+         uri.add(t2.image);
      }
     )?
    )?


--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message