lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "harwoods" <harwo...@ntlworld.com>
Subject Re: HTMLParser
Date Fri, 15 Feb 2002 22:10:38 GMT
>>While you are at it, perhaps it would be good to add support for add
>>other META tags
I posted that a while back.
Here it is again. See the getMetaTags() method.....

Mark Harwood


/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

// HTMLParser.jj

options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  file://DEBUG_LOOKAHEAD = true;
  file://DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.HTMLParser;

import java.io.*;
import java.util.Properties;

public class HTMLParser {
  public static int SUMMARY_LENGTH = 200;

  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags=new Properties();
  String currentMetaTag="";
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inScript = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  PipedReader pipeIn = null;
  PipedWriter pipeOut;

  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();      // spawn parsing thread
    while (true) {
      synchronized(this) {
 if (titleComplete || (length > SUMMARY_LENGTH))
   break;
 wait(10);
      }
    }
    return title.toString().trim();
  }

  public Properties getMetaTags() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();      // spawn parsing thread
    while (true) {
      synchronized(this) {
 if (titleComplete || (length > SUMMARY_LENGTH))
   break;
 wait(10);
      }
    }
    return metaTags;
  }


  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();      // spawn parsing thread
    while (true) {
      synchronized(this) {
 if (summary.length() >= SUMMARY_LENGTH)
   break;
 wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit))
      return sum.substring(tit.length());
    else
      return sum;
  }

  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeIn = new PipedReader();
      pipeOut = new PipedWriter(pipeIn);

      Thread thread = new ParserThread(this);
      thread.start();      // start parsing
    }

    return pipeIn;
  }

  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
 synchronized(this) {
   notifyAll();
 }
      }
    }
  }

  void addText(String text) throws IOException {
    if (inScript)
      return;
    if (inMetaTag)
    {
 metaTags.setProperty(currentMetaTag, text);
       return;
    }
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      if (!titleComplete && !title.equals("")) {  // finished title
 synchronized(this) {
   titleComplete = true;     // tell waiting threads
   notifyAll();
 }
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }

  void addSpace() throws IOException {
    if (inScript)
      return;
    if (!afterSpace) {
      if (inTitle)
 title.append(" ");
      else
 addToSummary(" ");

      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

//    void handleException(Exception e) {
//      System.out.println(e.toString());  // print the error message
//      System.out.println("Skipping...");
//      Token t;
//      do {
//        t = getNextToken();
//      } while (t.kind != TagEnd);
//    }
}

PARSER_END(HTMLParser)


void HTMLDocument() throws IOException :
{
  Token t;
}
{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }
    | CommentTag()  { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}

void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
}
{
  t1=<TagName> {
    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in
<TITLE>
    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in
<META>
    inImg = t1.image.equalsIgnoreCase("<img");   // keep track if in <IMG>
    if (inScript) {      // keep track if in <SCRIPT>
      inScript = !t1.image.equalsIgnoreCase("</script");
    } else {
      inScript = t1.image.equalsIgnoreCase("<script");
    }
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()      // save ALT text in IMG tag
     {
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");

     if(inMetaTag &&
   (  t1.image.equalsIgnoreCase("name") ||
      t1.image.equalsIgnoreCase("HTTP-EQUIV")
   )
    && t2 != null)
 {
  currentMetaTag=t2.image.toLowerCase();
 }
     if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null)
 {
  addText(t2.image);
 }
     }
    )?
   )?
  )*
  <TagEnd>
}

Token ArgValue() :
{
  Token t = null;
}
{
  t=<ArgValue>                              { return t; }
| LOOKAHEAD(2)
  <ArgQuote1> <CloseQuote1>                 { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
| LOOKAHEAD(2)
  <ArgQuote2> <CloseQuote2>                 { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
}


Token Decl() :
{
  Token t;
}
{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }
}


void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
}


TOKEN :
{
  < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1:  "<!--" > : WithinComment1
| < Comment2:  "<!" >   : WithinComment2

| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM>
)+ >
| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM:     ["0"-"9"] >

| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")?
)
>

| < Space:    (<SP>)+ >
| < #SP:      [" ","\t","\r","\n"] >

| < Punct:    ~[] > // Keep this last.  It is a catch-all.
}


<WithinTag> TOKEN:
{
  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" >  : AfterEquals
| < TagEnd:    ">" | "=>" >  : DEFAULT
}

<AfterEquals> TOKEN:
{
  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
        (~[" ","\t","\r","\n",">"])* > : WithinTag
}

<WithinTag, AfterEquals> TOKEN:
{
  < ArgQuote1: "'"  > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}

<WithinTag, AfterEquals> SKIP:
{
  < <Space> >
}

<WithinQuote1> TOKEN:
{
  < Quote1Text:  (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}

<WithinQuote2> TOKEN:
{
  < Quote2Text:  (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}


<WithinComment1> TOKEN :
{
  < CommentText1:  (~["-"])+ | "-" >
| < CommentEnd1:   "-->" > : DEFAULT
}

<WithinComment2> TOKEN :
{
  < CommentText2:  (~[">"])+ >
| < CommentEnd2:   ">" > : DEFAULT
}





Mime
  • Unnamed multipart/alternative (inline, None, 0 bytes)
View raw message