lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Daniel Calvo" <dca...@task.com.br>
Subject HTMLParser
Date Fri, 15 Feb 2002 21:42:24 GMT
Hi,

I was playing with HTMLParser.jj and made some changes you might be interested in.
I started handling <META> tags: I added the new methods getAuthor, getKeywords and
getMetadata, and changed getSummary to first check whether there is a metadata item
with name=="description". I'm also filtering out any text inside <STYLE>...</STYLE>
(the same way <SCRIPT> is already handled).
I've performed some tests and I believe I didn't break anything ;-)

The patch is as follows

Best regards,

--Daniel

Index: HTMLParser.jj
===================================================================
RCS file: /home/cvspublic/jakarta-lucene/src/demo/org/apache/lucene/demo/html/HTMLParser.jj,v
retrieving revision 1.1
diff -u -r1.1 HTMLParser.jj
--- HTMLParser.jj	26 Jan 2002 15:01:31 -0000	1.1
+++ HTMLParser.jj	15 Feb 2002 20:39:49 -0000
@@ -66,6 +66,8 @@
 package org.apache.lucene.demo.html;

 import java.io.*;
+import java.util.Map;
+import java.util.HashMap;

 public class HTMLParser {
   public static int SUMMARY_LENGTH = 200;
@@ -76,11 +78,13 @@
   boolean titleComplete = false;
   boolean inTitle = false;
   boolean inScript = false;
+  boolean inStyle = false;
   boolean afterTag = false;
   boolean afterSpace = false;
   String eol = System.getProperty("line.separator");
   PipedReader pipeIn = null;
   PipedWriter pipeOut;
+  HashMap metadata = new HashMap(7);

   public HTMLParser(File file) throws FileNotFoundException {
     this(new FileInputStream(file));
@@ -109,15 +113,60 @@
 	wait(10);
       }
     }
-    if (summary.length() > SUMMARY_LENGTH)
-      summary.setLength(SUMMARY_LENGTH);
+    // look in metadata
+    String description = (String) metadata.get("description");
+    if (description != null)
+      return description;
+    else {
+      if (summary.length() > SUMMARY_LENGTH)
+        summary.setLength(SUMMARY_LENGTH);
+
+      String sum = summary.toString().trim();
+      String tit = getTitle();
+      if (sum.startsWith(tit))
+        return sum.substring(tit.length());
+      else
+        return sum;
+    }
+  }
+
+  public String getAuthor() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return (String)metadata.get("author");
+  }
+
+  public String getKeywords() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return (String)metadata.get("keywords");
+  }

-    String sum = summary.toString().trim();
-    String tit = getTitle();
-    if (sum.startsWith(tit))
-      return sum.substring(tit.length());
-    else
-      return sum;
+  public Map getMetadata() throws IOException, InterruptedException {
+    if (pipeIn == null)
+      getReader();                // spawn parsing thread
+    while (true) {
+      synchronized(this) {
+        if (summary.length() > 0) // assume that all metadata
+          break;                  // has already been collected
+        wait(10);
+      }
+    }
+    return metadata;
   }

   public Reader getReader() throws IOException {
@@ -144,7 +193,7 @@
   }

   void addText(String text) throws IOException {
-    if (inScript)
+        if (inScript || inStyle)
       return;
     if (inTitle)
       title.append(text);
@@ -165,7 +214,7 @@
   }

   void addSpace() throws IOException {
-    if (inScript)
+        if (inScript || inStyle)
       return;
     if (!afterSpace) {
       if (inTitle)
@@ -216,23 +265,38 @@
 {
   Token t1, t2;
   boolean inImg = false;
+  boolean inMeta = false;
+  String name = null;
+  String content = null;
 }
 {
   t1=<TagName> {
-    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in <TITLE>
-    inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in <IMG>
-    if (inScript) {				  // keep track if in <SCRIPT>
+    inTitle = t1.image.equalsIgnoreCase("<title");     // keep track if in <TITLE>
+    inImg = t1.image.equalsIgnoreCase("<img");         // keep track if in <IMG>
+    inMeta = t1.image.equalsIgnoreCase("<meta");       // keep track if in <META>
+    if (inScript) {                                    // keep track if in <SCRIPT>
       inScript = !t1.image.equalsIgnoreCase("</script");
     } else {
       inScript = t1.image.equalsIgnoreCase("<script");
     }
+    if (inStyle) {                                     // keep track if in <STYLE>
+      inStyle = !t1.image.equalsIgnoreCase("</style");
+    } else {
+      inStyle = t1.image.equalsIgnoreCase("<style");
+    }
   }
   (t1=<ArgName>
    (<ArgEquals>
-    (t2=ArgValue()				  // save ALT text in IMG tag
+     (t2=ArgValue()
      {
        if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
-         addText("[" + t2.image + "]");
+         addText("[" + t2.image + "]");       // save ALT text in IMG tag
+       if (inMeta && t1.image.equalsIgnoreCase("name") && t2 != null)
+         name = t2.image.toLowerCase();       // save name in META tag
+       if (inMeta && t1.image.equalsIgnoreCase("content") && t2 != null)
+         content = t2.image;                  // save content in META tag
+       if (inMeta && name != null && content != null)
+         metadata.put(name, content);             // save metadata
      }
     )?
    )?


--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message