lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Mark Harwood <mark.harw...@rubus.com>
Subject Contribution - updated HTMLParser now parses metatags
Date Tue, 04 Dec 2001 10:19:37 GMT
Apologies if this is not the correct forum for contributing code, couldn't
find a "how to.." anywhere.
I have adapted the .jj file for HTMLParser so that it parses and stores all
metatag data in a java.util.Properties object with lowercased keys.
I'm not sure why all that threading code is needed in this class but I've
followed the general pattern used for other data and this works for me. The
IndexHTML demo code could now be adapted to store metatags such as
"description" or "keywords" in the index too.

Cheers
Mark Harwood

========Code begins below===================
/* ====================================================================
 * The Apache Software License, Version 1.1
 *
 * Copyright (c) 2001 The Apache Software Foundation.  All rights
 * reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. The end-user documentation included with the redistribution,
 *    if any, must include the following acknowledgment:
 *       "This product includes software developed by the
 *        Apache Software Foundation (http://www.apache.org/)."
 *    Alternately, this acknowledgment may appear in the software itself,
 *    if and wherever such third-party acknowledgments normally appear.
 *
 * 4. The names "Apache" and "Apache Software Foundation" and
 *    "Apache Lucene" must not be used to endorse or promote products
 *    derived from this software without prior written permission. For
 *    written permission, please contact apache@apache.org.
 *
 * 5. Products derived from this software may not be called "Apache",
 *    "Apache Lucene", nor may "Apache" appear in their name, without
 *    prior written permission of the Apache Software Foundation.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
 * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
 * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 * ====================================================================
 *
 * This software consists of voluntary contributions made by many
 * individuals on behalf of the Apache Software Foundation.  For more
 * information on the Apache Software Foundation, please see
 * <http://www.apache.org/>.
 */

// HTMLParser.jj

options {
  STATIC = false;
  OPTIMIZE_TOKEN_MANAGER = true;
  //DEBUG_LOOKAHEAD = true;
  //DEBUG_TOKEN_MANAGER = true;
}

PARSER_BEGIN(HTMLParser)

package org.apache.lucene.HTMLParser;

import java.io.*;
import java.util.Properties;

public class HTMLParser {
  public static int SUMMARY_LENGTH = 200;
  
  StringBuffer title = new StringBuffer(SUMMARY_LENGTH);
  StringBuffer summary = new StringBuffer(SUMMARY_LENGTH * 2);
  Properties metaTags=new Properties();
  String currentMetaTag="";
  int length = 0;
  boolean titleComplete = false;
  boolean inTitle = false;
  boolean inMetaTag = false;
  boolean inScript = false;
  boolean afterTag = false;
  boolean afterSpace = false;
  String eol = System.getProperty("line.separator");
  PipedReader pipeIn = null;
  PipedWriter pipeOut;

  public HTMLParser(File file) throws FileNotFoundException {
    this(new FileInputStream(file));
  }

  public String getTitle() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();				  // spawn parsing thread
    while (true) {
      synchronized(this) {
	if (titleComplete || (length > SUMMARY_LENGTH))
	  break;
	wait(10);
      }
    }
    return title.toString().trim();
  }

  public Properties getMetaTags() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();				  // spawn parsing thread
    while (true) {
      synchronized(this) {
	if (titleComplete || (length > SUMMARY_LENGTH))
	  break;
	wait(10);
      }
    }
    return metaTags;
  }


  public String getSummary() throws IOException, InterruptedException {
    if (pipeIn == null)
      getReader();				  // spawn parsing thread
    while (true) {
      synchronized(this) {
	if (summary.length() >= SUMMARY_LENGTH)
	  break;
	wait(10);
      }
    }
    if (summary.length() > SUMMARY_LENGTH)
      summary.setLength(SUMMARY_LENGTH);

    String sum = summary.toString().trim();
    String tit = getTitle();
    if (sum.startsWith(tit))
      return sum.substring(tit.length());
    else
      return sum;
  }

  public Reader getReader() throws IOException {
    if (pipeIn == null) {
      pipeIn = new PipedReader();
      pipeOut = new PipedWriter(pipeIn);
      
      Thread thread = new ParserThread(this);
      thread.start();				  // start parsing
    }

    return pipeIn;
  }

  void addToSummary(String text) {
    if (summary.length() < SUMMARY_LENGTH) {
      summary.append(text);
      if (summary.length() >= SUMMARY_LENGTH) {
	synchronized(this) {
	  notifyAll();
	}
      }
    }
  }

  void addText(String text) throws IOException {
    if (inScript)
      return;
    if (inMetaTag)
    {
	metaTags.setProperty(currentMetaTag, text);
      	return;
    }
    if (inTitle)
      title.append(text);
    else {
      addToSummary(text);
      if (!titleComplete && !title.equals("")) {  // finished title
	synchronized(this) {
	  titleComplete = true;			  // tell waiting threads
	  notifyAll();
	}
      }
    }

    length += text.length();
    pipeOut.write(text);

    afterSpace = false;
  }
  
  void addSpace() throws IOException {
    if (inScript)
      return;
    if (!afterSpace) {
      if (inTitle)
	title.append(" ");
      else
	addToSummary(" ");
      
      String space = afterTag ? eol : " ";
      length += space.length();
      pipeOut.write(space);
      afterSpace = true;
    }
  }

//    void handleException(Exception e) {
//      System.out.println(e.toString());  // print the error message
//      System.out.println("Skipping...");
//      Token t;
//      do {
//        t = getNextToken();
//      } while (t.kind != TagEnd);
//    }
}

PARSER_END(HTMLParser)


void HTMLDocument() throws IOException :
{
  Token t;
}
{
//  try {
    ( Tag()         { afterTag = true; }
    | t=Decl()      { afterTag = true; }
    | CommentTag()  { afterTag = true; }
    | t=<Word>      { addText(t.image); afterTag = false; }
    | t=<Entity>    { addText(Entities.decode(t.image)); afterTag = false; }
    | t=<Punct>     { addText(t.image); afterTag = false; }
    | <Space>       { addSpace(); afterTag = false; }
    )* <EOF>
//  } catch (ParseException e) {
//    handleException(e);
//  }
}

void Tag() throws IOException :
{
  Token t1, t2;
  boolean inImg = false;
}
{
  t1=<TagName> {
    inTitle = t1.image.equalsIgnoreCase("<title"); // keep track if in
<TITLE>
    inMetaTag = t1.image.equalsIgnoreCase("<META"); // keep track if in
<META>
    inImg = t1.image.equalsIgnoreCase("<img");	  // keep track if in <IMG>
    if (inScript) {				  // keep track if in
<SCRIPT>
      inScript = !t1.image.equalsIgnoreCase("</script");
    } else {
      inScript = t1.image.equalsIgnoreCase("<script");
    }
  }
  (t1=<ArgName>
   (<ArgEquals>
    (t2=ArgValue()				  // save ALT text in IMG
tag
     {
       if (inImg && t1.image.equalsIgnoreCase("alt") && t2 != null)
         addText("[" + t2.image + "]");

    	if(inMetaTag && 
			(  t1.image.equalsIgnoreCase("name") ||
			   t1.image.equalsIgnoreCase("HTTP-EQUIV")
			)
	   && t2 != null)
	{
		currentMetaTag=t2.image.toLowerCase();
	}
    	if(inMetaTag && t1.image.equalsIgnoreCase("content") && t2 != null)
	{
		addText(t2.image);
	}
     }
    )?
   )?
  )*
  <TagEnd>
}

Token ArgValue() :
{
  Token t = null;
}
{
  t=<ArgValue>                              { return t; }
| LOOKAHEAD(2)
  <ArgQuote1> <CloseQuote1>                 { return t; }
| <ArgQuote1> t=<Quote1Text> <CloseQuote1>  { return t; }
| LOOKAHEAD(2)
  <ArgQuote2> <CloseQuote2>                 { return t; }
| <ArgQuote2> t=<Quote2Text> <CloseQuote2>  { return t; }
}


Token Decl() :
{
  Token t;
}
{
  t=<DeclName> ( <ArgName> | ArgValue() | <ArgEquals> )* <TagEnd>
  { return t; }
}


void CommentTag() :
{}
{
  (<Comment1> ( <CommentText1> )* <CommentEnd1>)
 |
  (<Comment2> ( <CommentText2> )* <CommentEnd2>)
}
  

TOKEN :
{
  < TagName:  "<" ("/")? ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag
| < DeclName: "<"  "!"   ["A"-"Z","a"-"z"] (<ArgName>)? > : WithinTag

| < Comment1:  "<!--" > : WithinComment1
| < Comment2:  "<!" >   : WithinComment2

| < Word:     ( <LET> | <LET> (["+","/"])+ | <NUM> ["\""] |
                <LET> ["-","'"] <LET> | ("$")? <NUM> [",","."] <NUM>
)+ >
| < #LET:     ["A"-"Z","a"-"z","0"-"9"] >
| < #NUM:     ["0"-"9"] >

| < Entity:   ( "&" (["A"-"Z","a"-"z"])+ (";")? | "&" "#" (<NUM>)+ (";")?
)
>

| < Space:    (<SP>)+ >
| < #SP:      [" ","\t","\r","\n"] >

| < Punct:    ~[] > // Keep this last.  It is a catch-all.
}


<WithinTag> TOKEN:
{
  < ArgName:   (~[" ","\t","\r","\n","=",">","'","\""])
               (~[" ","\t","\r","\n","=",">"])* >
| < ArgEquals: "=" >  : AfterEquals
| < TagEnd:    ">" | "=>" >  : DEFAULT
}

<AfterEquals> TOKEN:
{
  < ArgValue:  (~[" ","\t","\r","\n","=",">","'","\""])
	       (~[" ","\t","\r","\n",">"])* > : WithinTag
}

<WithinTag, AfterEquals> TOKEN:
{
  < ArgQuote1: "'"  > : WithinQuote1
| < ArgQuote2: "\"" > : WithinQuote2
}

<WithinTag, AfterEquals> SKIP:
{
  < <Space> >
}

<WithinQuote1> TOKEN:
{
  < Quote1Text:  (~["'"])+ >
| < CloseQuote1: <ArgQuote1> > : WithinTag
}

<WithinQuote2> TOKEN:
{
  < Quote2Text:  (~["\""])+ >
| < CloseQuote2: <ArgQuote2> > : WithinTag
}


<WithinComment1> TOKEN :
{
  < CommentText1:  (~["-"])+ | "-" >
| < CommentEnd1:   "-->" > : DEFAULT
}

<WithinComment2> TOKEN :
{
  < CommentText2:  (~[">"])+ >
| < CommentEnd2:   ">" > : DEFAULT
}
==================Code ENDS============================================

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message