forrest-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From che...@apache.org
Subject cvs commit: xml-forrest/src/scratchpad/src/java/org/apache/forrest/search ForrestDocument.java ForrestDocumentSAXParser.java ForrestIndexer.java ForrestSearchRenderer.java ForrestSearchServlet.java ForrestSearcher.java
Date Sat, 13 Sep 2003 01:21:39 GMT
cheche      2003/09/12 18:21:39

  Modified:    src/scratchpad/src/java/org/apache/forrest/search
                        ForrestDocument.java ForrestDocumentSAXParser.java
                        ForrestIndexer.java ForrestSearchRenderer.java
                        ForrestSearchServlet.java ForrestSearcher.java
  Log:
  Remove all dos format. Tidy up with ImportScrubber.
  
  Revision  Changes    Path
  1.2       +122 -119  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestDocument.java
  
  Index: ForrestDocument.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestDocument.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestDocument.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestDocument.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,119 +1,122 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -package org.apache.forrest.search;
  -
  -import java.io.File;
  -import java.util.HashMap;
  -import org.apache.lucene.document.*;
  -
  -/**
  - * Utility class to make Lucene Documents from Forrest Documents
  - * @author Ramon Prades [RPR]
  - * @version $Id$
  - */
  -public class ForrestDocument {
  -
  -  /**
  -   * Makes the Lucene document asking the parser to extract
  -   * the relevant information.
  -   */
  -  public static Document document(File file) {
  -    // Instantiate a parser for this file
  -    Document doc = null;
  -    ForrestDocumentSAXParser parser = new ForrestDocumentSAXParser();
  -    try {
  -      HashMap results = parser.parseDocument(file);
  -      doc = processInfo(file, results);
  -    }
  -    catch (Exception ex) {
  -      // Not a forrest doc
  -    }
  -    return doc;
  -  }
  -
  -  /**
  -   * Process the results returned from the parser and creates the
  -   * Lucene document
  -   */
  -  private static Document processInfo(File file, HashMap results) {
  -    Document doc = new Document();
  -    // Get info
  -    String docTitle = (String) getFromResults("title", results);
  -    String docSummary = (String) getFromResults("abstract", results);
  -    String docAuthor = (String) getFromResults("author", results);
  -    String docContents = (String) getFromResults("body", results);
  -    // Index and store title and summary
  -    doc.add(Field.Text("title", docTitle));
  -    doc.add(Field.Text("summary", docSummary));
  -    doc.add(Field.Text("author", docAuthor));
  -    // Index but don't store contents
  -    doc.add(Field.UnStored("contents", docTitle + " " + docSummary + " " + docContents));
  -    return doc;
  -  } // document
  -
  -  /*
  -   * Utility method to extract a key from a hashmap
  -   */
  -  private static Object getFromResults(String key, HashMap results) {
  -    if (results.containsKey(key)) {
  -      return results.get(key);
  -    } else {
  -      return "";
  -    }
  -  } // getFromResults  }
  -
  -} // Class ForrestDocument
  \ No newline at end of file
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +package org.apache.forrest.search;
  +
  +import java.io.File;
  +
  +import java.util.HashMap;
  +
  +import org.apache.lucene.document.Document;
  +import org.apache.lucene.document.Field;
  +
  +/**
  + * Utility class to make Lucene Documents from Forrest Documents
  + * @author Ramon Prades [RPR]
  + * @version $Id$
  + */
  +public class ForrestDocument {
  +
  +  /**
  +   * Makes the Lucene document asking the parser to extract
  +   * the relevant information.
  +   */
  +  public static Document document(File file) {
  +    // Instantiate a parser for this file
  +    Document doc = null;
  +    ForrestDocumentSAXParser parser = new ForrestDocumentSAXParser();
  +    try {
  +      HashMap results = parser.parseDocument(file);
  +      doc = processInfo(file, results);
  +    }
  +    catch (Exception ex) {
  +      // Not a forrest doc
  +    }
  +    return doc;
  +  }
  +
  +  /**
  +   * Process the results returned from the parser and creates the
  +   * Lucene document
  +   */
  +  private static Document processInfo(File file, HashMap results) {
  +    Document doc = new Document();
  +    // Get info
  +    String docTitle = (String) getFromResults("title", results);
  +    String docSummary = (String) getFromResults("abstract", results);
  +    String docAuthor = (String) getFromResults("author", results);
  +    String docContents = (String) getFromResults("body", results);
  +    // Index and store title and summary
  +    doc.add(Field.Text("title", docTitle));
  +    doc.add(Field.Text("summary", docSummary));
  +    doc.add(Field.Text("author", docAuthor));
  +    // Index but don't store contents
  +    doc.add(Field.UnStored("contents", docTitle + " " + docSummary + " " + docContents));
  +    return doc;
  +  } // document
  +
  +  /*
  +   * Utility method to extract a key from a hashmap
  +   */
  +  private static Object getFromResults(String key, HashMap results) {
  +    if (results.containsKey(key)) {
  +      return results.get(key);
  +    } else {
  +      return "";
  +    }
  +  } // getFromResults  }
  +
  +} // Class ForrestDocument
  
  
  
  1.2       +274 -264  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestDocumentSAXParser.java
  
  Index: ForrestDocumentSAXParser.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestDocumentSAXParser.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestDocumentSAXParser.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestDocumentSAXParser.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,264 +1,274 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -package org.apache.forrest.search;
  -
  -import org.apache.xerces.parsers.SAXParser;
  -import java.io.*;
  -import org.xml.sax.*;
  -import org.xml.sax.helpers.*;
  -import org.xml.sax.ext.LexicalHandler;
  -import java.util.Vector;
  -import java.util.HashMap;
  -
  -/**
  - * <p>Parses a Forrest Document and extracts the information to use when
  - * generating Lucene indexes.</p>
  - * <p>The parser scans the document searching for a number of tags. When a match
  - * is found, it buffers all the text contained in the full subtree. When the parser
  - * is buffering text, it ignores all tags and just keeps the text.</p>
  - * <p>As an example consider the following document:</p>
  - * <code>
  - * <pre>
  - * &lt;document&gt;
  - * &lt;header&gt;
  - * &lt;title&gt;The title&lt;/title&gt;
  - * &lt;abstract&gt;An example&lt;/abstract&gt;
  - * &lt;/header&gt;
  - * &lt;body&gt;
  - * &lt;section&gt;
  - * &lt;title&gt;The Section&lt;/title&gt;
  - * &lt;p&gt;Some text with &lt;strong&gt;embedded&lt;/strong&gt; tags&lt;/p&gt;
  - * &lt;section&gt;
  - * &lt;/body&gt;
  - * &lt;/document&gt;
  - * </pre>
  - * </code>
  - * <p>If the parser is applied to <code>body</code> the result will be
  - * "The Section Some text with embedded tags". This permits the parser to generate
  - * fields with the full content of the body, so it can be indexed and searched later.</p>
  - * <p>If the parser now checks for <code>title</code> and <code>body</code> the
  - * results will be "The title" for <code>title</code> and the same as above for <code>body</code>.
  - * This demosntrates the parser is ignoring the <code>title</code> inside the
  - * <code>body</code>, since while the parser is buffering <code>body</code> is
  - * ignoring all the tags. This feature is useful to capture information inside
  - * the header.</p>
  - * <p>This is all what is needed to pass the information to Lucene, and by using this
  - * algorithm the class gets quite simple.</p>
  - *
  - * <p><em>(Hope my English it's not too bad ;-)</em></p>
  - *
  - * @author Ramon Prades [RPR]
  - * @version $Id$
  - */
  -
  -public class ForrestDocumentSAXParser extends DefaultHandler {
  -
  -  // Parser configuration constants
  -  static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
  -  static final String VALIDATION_FEATURE_ID = "http://xml.org/sax/features/validation";
  -  static final String EXTERNAL_DTD_FEATURE_ID = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
  -
  -
  -  // List with the tags to capture
  -  static final String[] FORREST_HEADER_INDEXERS = {"title","abstract","body"};
  -  static String docAuthors = "";
  -  // Control variables
  -  XMLReader parser = null;
  -  HashMap results = null;
  -  String currentElement = "";
  -  StringBuffer textBuffer = new StringBuffer();
  -  Vector tags = null;
  -  boolean buffering = false;
  -  boolean isForrest = false;
  -
  -  /**
  -   * Constructor. Initiliazes the parser.
  -   */
  -  public ForrestDocumentSAXParser() {
  -    super();
  -    // Load the list of interesting tags in a vector for later processing
  -    tags = new Vector();
  -    for (int i=0; i<FORREST_HEADER_INDEXERS.length; i++) {
  -      tags.add(FORREST_HEADER_INDEXERS[i]);
  -    }
  -    // Instantiate the SAX parser
  -    try {
  -      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
  -      parser.setFeature(VALIDATION_FEATURE_ID, false);
  -      parser.setFeature(EXTERNAL_DTD_FEATURE_ID, false);
  -      parser.setContentHandler(this);
  -      parser.setErrorHandler(this);
  -    } catch (SAXException ex) {
  -      System.err.println("Error getting the parser (" + ex.getMessage() + ")");
  -    }
  -  } // Constructor
  -
  -  /**
  -   * Gets a parser and parses the selected document
  -   * @param fileName Forrest document file name
  -   */
  -  public HashMap parseDocument(String fileName) throws SAXException {
  -    try {
  -      parser.parse(new InputSource(fileName));
  -    } catch (IOException ex) {
  -      ex.printStackTrace();
  -    }
  -    return results;
  -  } // parseDocument
  -
  -  /**
  -   * Gets a parser and parses the selected document
  -   * @param file Forrest document file
  -   */
  -  public HashMap parseDocument(File file) throws SAXException {
  -    try {
  -      parser.parse(new InputSource(new java.io.FileInputStream(file)));
  -    } catch (IOException ex) {
  -      ex.printStackTrace();
  -    }
  -    return results;
  -  } // parseDocument
  -
  -  /**
  -   * Gets the results
  -   * @return
  -   */
  -  public HashMap getResults() {
  -      return results;
  -  } // getResults
  -
  -  /**
  -   * Triggered when a new document is about to be parsed
  -   */
  -  public void startDocument() {
  -    // Reset control variables
  -    textBuffer.setLength(0);
  -    results = new HashMap();
  -    isForrest = false;
  -    docAuthors = "";
  -  } // startDocument
  -
  -  /**
  -   * Saves authors when document fully parsed
  -   */
  -  public void endDocument() {
  -    results.put("author", docAuthors);
  -  }
  -
  -  /**
  -   * Triggered when a new element is about to be parsed
  -   */
  -  public void startElement(String uri, String localName, String qName, Attributes attributes)
  -      throws SAXException  {
  -    // Check the new tag only when not buffering
  -    if (!buffering) {
  -      // Check the root element to see if the document is a Forrest one
  -      if (!isForrest && (!localName.equals("document"))) {
  -        results = null;
  -        // If not forrest, throw an exception to stop parsing (speed matters!)
  -        throw new SAXException("The document is not a Forrest document!");
  -      }
  -      // Is Forrest, so carry on processing
  -      isForrest = true;
  -      // Check "person". Here we want the attribute "@name"
  -      if (localName.equals("person")) {
  -        String separator = "";
  -        if (docAuthors!=null && docAuthors.length()>0) {
  -          separator = ";";
  -        }
  -        docAuthors += separator + attributes.getValue("name");
  -      } else if (tags.contains(localName)) {
  -        currentElement = localName;
  -        buffering = true;
  -      }
  -    }
  -  } // startElement
  -
  -  /**
  -   * End of element detected. If the closing element is the one the parser is
  -   * bufferig, store the text, otherwise don't do anything
  -   */
  -  public void endElement(String uri, String localName, String qName) {
  -    if (buffering) {
  -     if (localName.equals(currentElement)) {
  -       buffering = false;
  -       results.put(currentElement, textBuffer.toString());
  -       textBuffer.setLength(0); // reset buffer
  -     } else {
  -       // add an extra space to avoid the following case:
  -       // <body>
  -       //   <section>
  -       //     <title>A title</title>
  -       //     <p>A paragraph</p>
  -       //   </section>
  -       // </body>
  -       // Unless an extra space is added the result would be: "A titleA paragraph"
  -       textBuffer.append(' ');
  -     }
  -    }
  -  } // endElement
  -
  -  /**
  -   * Buffer the parsed character when "doCapture" tells so.
  -   */
  -  public void characters(char[] cbuf, int start, int len) {
  -    if (buffering) {
  -      textBuffer.append(cbuf, start, len);
  -    }
  -  } // characters
  -
  -} // ForrestDocumentSAXParser
  \ No newline at end of file
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +package org.apache.forrest.search;
  +
  +import java.io.File;
  +import java.io.FileInputStream;
  +import java.io.InputStream;
  +import java.io.IOException;
  +
  +import java.util.HashMap;
  +import java.util.Vector;
  +
  +import org.xml.sax.Attributes;
  +import org.xml.sax.ContentHandler;
  +import org.xml.sax.ErrorHandler;
  +import org.xml.sax.InputSource;
  +import org.xml.sax.SAXException;
  +import org.xml.sax.XMLReader;
  +
  +import org.xml.sax.helpers.DefaultHandler;
  +import org.xml.sax.helpers.XMLReaderFactory;
  +
  +/**
  + * <p>Parses a Forrest Document and extracts the information to use when
  + * generating Lucene indexes.</p>
  + * <p>The parser scans the document searching for a number of tags. When a match
  + * is found, it buffers all the text contained in the full subtree. When the parser
  + * is buffering text, it ignores all tags and just keeps the text.</p>
  + * <p>As an example consider the following document:</p>
  + * <code>
  + * <pre>
  + * &lt;document&gt;
  + * &lt;header&gt;
  + * &lt;title&gt;The title&lt;/title&gt;
  + * &lt;abstract&gt;An example&lt;/abstract&gt;
  + * &lt;/header&gt;
  + * &lt;body&gt;
  + * &lt;section&gt;
  + * &lt;title&gt;The Section&lt;/title&gt;
  + * &lt;p&gt;Some text with &lt;strong&gt;embedded&lt;/strong&gt; tags&lt;/p&gt;
  + * &lt;/section&gt;
  + * &lt;/body&gt;
  + * &lt;/document&gt;
  + * </pre>
  + * </code>
  + * <p>If the parser is applied to <code>body</code> the result will be
  + * "The Section Some text with embedded tags". This permits the parser to generate
  + * fields with the full content of the body, so it can be indexed and searched later.</p>
  + * <p>If the parser now checks for <code>title</code> and <code>body</code> the
  + * results will be "The title" for <code>title</code> and the same as above for <code>body</code>.
  + * This demonstrates that the parser ignores the <code>title</code> inside the
  + * <code>body</code>, since while the parser is buffering <code>body</code> it
  + * ignores all the tags. This feature is useful to capture information inside
  + * the header.</p>
  + * <p>This is all that is needed to pass the information to Lucene, and by using this
  + * algorithm the class gets quite simple.</p>
  + *
  + * <p><em>(Hope my English it's not too bad ;-)</em></p>
  + *
  + * @author Ramon Prades [RPR]
  + * @version $Id$
  + */
  +
  +public class ForrestDocumentSAXParser extends DefaultHandler {
  +
  +  // Parser configuration constants
  +  static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";
  +  static final String VALIDATION_FEATURE_ID = "http://xml.org/sax/features/validation";
  +  static final String EXTERNAL_DTD_FEATURE_ID = "http://apache.org/xml/features/nonvalidating/load-external-dtd";
  +
  +
  +  // List with the tags to capture
  +  static final String[] FORREST_HEADER_INDEXERS = {"title","abstract","body"};
  +  static String docAuthors = "";
  +  // Control variables
  +  XMLReader parser = null;
  +  HashMap results = null;
  +  String currentElement = "";
  +  StringBuffer textBuffer = new StringBuffer();
  +  Vector tags = null;
  +  boolean buffering = false;
  +  boolean isForrest = false;
  +
  +  /**
  +   * Constructor. Initializes the parser.
  +   */
  +  public ForrestDocumentSAXParser() {
  +    super();
  +    // Load the list of interesting tags in a vector for later processing
  +    tags = new Vector();
  +    for (int i=0; i<FORREST_HEADER_INDEXERS.length; i++) {
  +      tags.add(FORREST_HEADER_INDEXERS[i]);
  +    }
  +    // Instantiate the SAX parser
  +    try {
  +      parser = XMLReaderFactory.createXMLReader(DEFAULT_PARSER_NAME);
  +      parser.setFeature(VALIDATION_FEATURE_ID, false);
  +      parser.setFeature(EXTERNAL_DTD_FEATURE_ID, false);
  +      parser.setContentHandler(this);
  +      parser.setErrorHandler(this);
  +    } catch (SAXException ex) {
  +      System.err.println("Error getting the parser (" + ex.getMessage() + ")");
  +    }
  +  } // Constructor
  +
  +  /**
  +   * Gets a parser and parses the selected document
  +   * @param fileName Forrest document file name
  +   */
  +  public HashMap parseDocument(String fileName) throws SAXException {
  +    try {
  +      parser.parse(new InputSource(fileName));
  +    } catch (IOException ex) {
  +      ex.printStackTrace();
  +    }
  +    return results;
  +  } // parseDocument
  +
  +  /**
  +   * Gets a parser and parses the selected document
  +   * @param file Forrest document file
  +   */
  +  public HashMap parseDocument(File file) throws SAXException {
  +    try {
  +      parser.parse(new InputSource(new java.io.FileInputStream(file)));
  +    } catch (IOException ex) {
  +      ex.printStackTrace();
  +    }
  +    return results;
  +  } // parseDocument
  +
  +  /**
  +   * Gets the results
  +   * @return
  +   */
  +  public HashMap getResults() {
  +      return results;
  +  } // getResults
  +
  +  /**
  +   * Triggered when a new document is about to be parsed
  +   */
  +  public void startDocument() {
  +    // Reset control variables
  +    textBuffer.setLength(0);
  +    results = new HashMap();
  +    isForrest = false;
  +    docAuthors = "";
  +  } // startDocument
  +
  +  /**
  +   * Saves authors when document fully parsed
  +   */
  +  public void endDocument() {
  +    results.put("author", docAuthors);
  +  }
  +
  +  /**
  +   * Triggered when a new element is about to be parsed
  +   */
  +  public void startElement(String uri, String localName, String qName, Attributes attributes)
  +      throws SAXException  {
  +    // Check the new tag only when not buffering
  +    if (!buffering) {
  +      // Check the root element to see if the document is a Forrest one
  +      if (!isForrest && (!localName.equals("document"))) {
  +        results = null;
  +        // If not forrest, throw an exception to stop parsing (speed matters!)
  +        throw new SAXException("The document is not a Forrest document!");
  +      }
  +      // Is Forrest, so carry on processing
  +      isForrest = true;
  +      // Check "person". Here we want the attribute "@name"
  +      if (localName.equals("person")) {
  +        String separator = "";
  +        if (docAuthors!=null && docAuthors.length()>0) {
  +          separator = ";";
  +        }
  +        docAuthors += separator + attributes.getValue("name");
  +      } else if (tags.contains(localName)) {
  +        currentElement = localName;
  +        buffering = true;
  +      }
  +    }
  +  } // startElement
  +
  +  /**
  +   * End of element detected. If the closing element is the one the parser is
  +   * buffering, store the text; otherwise, while buffering, append a separating space
  +   */
  +  public void endElement(String uri, String localName, String qName) {
  +    if (buffering) {
  +     if (localName.equals(currentElement)) {
  +       buffering = false;
  +       results.put(currentElement, textBuffer.toString());
  +       textBuffer.setLength(0); // reset buffer
  +     } else {
  +       // add an extra space to avoid the following case:
  +       // <body>
  +       //   <section>
  +       //     <title>A title</title>
  +       //     <p>A paragraph</p>
  +       //   </section>
  +       // </body>
  +       // Unless an extra space is added the result would be: "A titleA paragraph"
  +       textBuffer.append(' ');
  +     }
  +    }
  +  } // endElement
  +
  +  /**
  +   * Buffer the parsed characters while the "buffering" flag is set.
  +   */
  +  public void characters(char[] cbuf, int start, int len) {
  +    if (buffering) {
  +      textBuffer.append(cbuf, start, len);
  +    }
  +  } // characters
  +
  +} // ForrestDocumentSAXParser
  
  
  
  1.2       +212 -205  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestIndexer.java
  
  Index: ForrestIndexer.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestIndexer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestIndexer.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestIndexer.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,205 +1,212 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -package org.apache.forrest.search;
  -
  -import java.io.*;
  -import java.net.*;
  -import java.util.*;
  -import java.lang.StringBuffer;
  -
  -import org.apache.lucene.analysis.standard.*;
  -import org.apache.lucene.document.*;
  -import org.apache.lucene.index.*;
  -import org.apache.lucene.util.Arrays;
  -
  -/**
  - * <p>Indexes all xml forrest documents below a given directory.</p>
  - * <p>Parametres:</p>
  - * <ul>
  - * <li>
  - * <strong><code>-index index_directory</code></strong> Directory where
  - * the index is to be created
  - * </li>
  - * <li>
  - * <strong><code>root_directory</code></strong> forrest 'xdocs' directory
  - * </li>
  - * </ul>
  - * <h3>Current Limitations/todo</h3>
  - * <ul>
  - * <li>This version indexes Forrest XML documents only. Would be nice if PDF and
  - * HTML could be added.</li>
  - * <li>FAQ and TODO aren't indexed. Add support to that.</li>
  - * <li>Full index created every time. Create sort of incremental indexing.</li>
  - * <li>Could be a good idea to create a list of "reserved" filenames (i.e. book.xml
  - * or status.xml) and force the indexer to skip them.</li>
  - *
  - * @author Ramon Prades [RPR]
  - * @version CVS $Id$
  - */
  -public class ForrestIndexer {
  -
  -  // Info about the class itself
  -  private static final String VERSION = "Version 0.21 (2003-08-08)";
  -  private static final String DIVIDER =
  -      "==============================================================================";
  -  private static final String BANNER = "ForrestIndexer (Powered by Lucene) " + VERSION;
  -  private static final String COPYRIGHT =
  -      "Copyright (c) 2001, 2003 The Apache Software Foundation.  All rights reserved.";
  -  private static final String USAGE =
  -      "ForrestIndexer [-index <index_directory>] <root_directory>";
  -
  -  // Some vars
  -  private static IndexReader reader; // Existing index
  -  private static IndexWriter writer; // New index being built
  -  private static String rootPath = "";
  -
  -  /**
  -   * Main method. See parametres at class javadoc.
  -   */
  -  public static void main(String[] argv) {
  -    try {
  -      String index = "";
  -      boolean create = true;
  -      File root = null;
  -      if (argv.length == 0) {
  -        System.err.println("Usage: " + USAGE);
  -        return;
  -      }
  -
  -      // Get parametres from args
  -      for (int i = 0; i < argv.length; i++) {
  -        if (argv[i].equals("-index")) { // parse -index option
  -          index = argv[++i];
  -        } else if (i != argv.length - 1) {
  -          System.err.println("Usage: " + USAGE);
  -          return;
  -        } else {
  -          root = new File(argv[i]);
  -        }
  -      }
  -      // Debugging
  -      // index = "C:/dev/uimlsite/build/webapp/index";
  -      // root = new File("C:/dev/uimlsite/src/documentation/content/xdocs");
  -
  -      // Print banner
  -      System.out.println(DIVIDER);
  -      System.out.println(BANNER);
  -      System.out.println(COPYRIGHT);
  -      System.out.println(DIVIDER);
  -      System.out.println("");
  -      rootPath = root.getPath().trim();
  -      System.out.println("Source Directory: " + rootPath);
  -      System.out.println("Index Directory: " + index);
  -      System.out.println("");
  -
  -      Date start = new Date();
  -      writer = new IndexWriter(index, new StandardAnalyzer(), create);
  -      writer.maxFieldLength = 1000000;
  -      indexDocs(root); // add new docs
  -      System.out.print("Index created! - Total milliseconds ");
  -      System.out.println(new Date().getTime() - start.getTime());
  -      System.out.println("");
  -
  -      System.out.println("Optimizing index...");
  -      writer.optimize();
  -      writer.close();
  -      System.out.print("Index optimized! - Total milliseconds ");
  -      System.out.println(new Date().getTime() - start.getTime());
  -    } catch (Exception e) {
  -      System.err.println(" Exception in " + e.getClass() +
  -                         "\n with message: " + e.getMessage());
  -      e.printStackTrace();
  -    }
  -  } // main
  -
  -  /*
  -   * Create the index
  -   */
  -  private static void indexDocs(File file) {
  -    if (file.isDirectory()) { // if a directory
  -      String[] files = file.list(); // list its files
  -      Arrays.sort(files); // sort the files
  -      for (int i = 0; i < files.length; i++) { // recursively index them
  -        indexDocs(new File(file, files[i]));
  -      }
  -    } else if (file.getPath().endsWith(".xml")) { // index .txt files
  -      String filePath = getRelativePath(file.getPath(), rootPath);
  -      System.out.print("Indexing ... " + filePath);
  -      Document doc = ForrestDocument.document(file);
  -      if (doc == null) {
  -        System.out.println(" [Ignored]");
  -      } else {
  -        try {
  -          // Add last modified and path
  -          doc.add(Field.Keyword("modified", new Long(file.lastModified()).toString()));
  -          doc.add(Field.Keyword("path", filePath));
  -          writer.addDocument(doc); // add docs unconditionally
  -        } catch (IOException ex) {
  -          System.out.println(" [Error: " + ex.getMessage() + "]");
  -        }
  -        System.out.println(" [Done]");
  -      }
  -    }
  -  } // indexDocs
  -
  -  /*
  -   * Utility method to calculate the relative path of a file
  -   */
  -  private static String getRelativePath(String filePath, String rootPath) {
  -    return filePath.substring(rootPath.length()+1);
  -  } // getRelativePath
  -} // Class ForrestLuceneIndexer
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +package org.apache.forrest.search;
  +
  +import java.io.File;
  +import java.io.IOException;
  +
  +import java.util.Date;
  +
  +import org.apache.lucene.analysis.Analyzer;
  +
  +import org.apache.lucene.analysis.standard.StandardAnalyzer;
  +
  +import org.apache.lucene.document.Document;
  +import org.apache.lucene.document.Field;
  +
  +import org.apache.lucene.index.IndexReader;
  +import org.apache.lucene.index.IndexWriter;
  +
  +import org.apache.lucene.util.Arrays;
  +
  +/**
  + * <p>Indexes all xml forrest documents below a given directory.</p>
  + * <p>Parameters:</p>
  + * <ul>
  + * <li>
  + * <strong><code>-index index_directory</code></strong> Directory where
  + * the index is to be created
  + * </li>
  + * <li>
  + * <strong><code>root_directory</code></strong> forrest 'xdocs' directory
  + * </li>
  + * </ul>
  + * <h3>Current Limitations/todo</h3>
  + * <ul>
  + * <li>This version indexes Forrest XML documents only. Would be nice if PDF and
  + * HTML could be added.</li>
  + * <li>FAQ and TODO aren't indexed. Add support to that.</li>
  + * <li>Full index created every time. Create sort of incremental indexing.</li>
  + * <li>Could be a good idea to create a list of "reserved" filenames (e.g. book.xml
  + * or status.xml) and force the indexer to skip them.</li>
  + * </ul>
  + * @author Ramon Prades [RPR]
  + * @version CVS $Id$
  + */
  +public class ForrestIndexer {
  +
  +  // Info about the class itself
  +  private static final String VERSION = "Version 0.21 (2003-08-08)";
  +  private static final String DIVIDER =
  +      "==============================================================================";
  +  private static final String BANNER = "ForrestIndexer (Powered by Lucene) " + VERSION;
  +  private static final String COPYRIGHT =
  +      "Copyright (c) 2001, 2003 The Apache Software Foundation.  All rights reserved.";
  +  private static final String USAGE =
  +      "ForrestIndexer [-index <index_directory>] <root_directory>";
  +
  +  // Some vars
  +  private static IndexReader reader; // Existing index
  +  private static IndexWriter writer; // New index being built
  +  private static String rootPath = "";
  +
  +  /**
  +   * Main method. See parameters at class javadoc.
  +   */
  +  public static void main(String[] argv) {
  +    try {
  +      String index = "";
  +      boolean create = true;
  +      File root = null;
  +      if (argv.length == 0) {
  +        System.err.println("Usage: " + USAGE);
  +        return;
  +      }
  +
  +      // Get parameters from args
  +      for (int i = 0; i < argv.length; i++) {
  +        if (argv[i].equals("-index")) { // parse -index option
  +          index = argv[++i];
  +        } else if (i != argv.length - 1) {
  +          System.err.println("Usage: " + USAGE);
  +          return;
  +        } else {
  +          root = new File(argv[i]);
  +        }
  +      }
  +      // Debugging
  +      // index = "C:/dev/uimlsite/build/webapp/index";
  +      // root = new File("C:/dev/uimlsite/src/documentation/content/xdocs");
  +
  +      // Print banner
  +      System.out.println(DIVIDER);
  +      System.out.println(BANNER);
  +      System.out.println(COPYRIGHT);
  +      System.out.println(DIVIDER);
  +      System.out.println("");
  +      rootPath = root.getPath().trim();
  +      System.out.println("Source Directory: " + rootPath);
  +      System.out.println("Index Directory: " + index);
  +      System.out.println("");
  +
  +      Date start = new Date();
  +      writer = new IndexWriter(index, new StandardAnalyzer(), create);
  +      writer.maxFieldLength = 1000000;
  +      indexDocs(root); // add new docs
  +      System.out.print("Index created! - Total milliseconds ");
  +      System.out.println(new Date().getTime() - start.getTime());
  +      System.out.println("");
  +
  +      System.out.println("Optimizing index...");
  +      writer.optimize();
  +      writer.close();
  +      System.out.print("Index optimized! - Total milliseconds ");
  +      System.out.println(new Date().getTime() - start.getTime());
  +    } catch (Exception e) {
  +      System.err.println(" Exception in " + e.getClass() +
  +                         "\n with message: " + e.getMessage());
  +      e.printStackTrace();
  +    }
  +  } // main
  +
  +  /*
  +   * Create the index
  +   */
  +  private static void indexDocs(File file) {
  +    if (file.isDirectory()) { // if a directory
  +      String[] files = file.list(); // list its files
  +      Arrays.sort(files); // sort the files
  +      for (int i = 0; i < files.length; i++) { // recursively index them
  +        indexDocs(new File(file, files[i]));
  +      }
  +    } else if (file.getPath().endsWith(".xml")) { // index .xml files
  +      String filePath = getRelativePath(file.getPath(), rootPath);
  +      System.out.print("Indexing ... " + filePath);
  +      Document doc = ForrestDocument.document(file);
  +      if (doc == null) {
  +        System.out.println(" [Ignored]");
  +      } else {
  +        try {
  +          // Add last modified and path
  +          doc.add(Field.Keyword("modified", new Long(file.lastModified()).toString()));
  +          doc.add(Field.Keyword("path", filePath));
  +          writer.addDocument(doc); // add docs unconditionally
  +        } catch (IOException ex) {
  +          System.out.println(" [Error: " + ex.getMessage() + "]");
  +        }
  +        System.out.println(" [Done]");
  +      }
  +    }
  +  } // indexDocs
  +
  +  /*
  +   * Utility method to calculate the relative path of a file
  +   */
  +  private static String getRelativePath(String filePath, String rootPath) {
  +    return filePath.substring(rootPath.length()+1);
  +  } // getRelativePath
  +} // Class ForrestIndexer
  
  
  
  1.2       +138 -127  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearchRenderer.java
  
  Index: ForrestSearchRenderer.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearchRenderer.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestSearchRenderer.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestSearchRenderer.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,127 +1,138 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -
  -package org.apache.forrest.search;
  -
  -import org.w3c.dom.*;
  -import javax.xml.transform.*;
  -import javax.xml.transform.dom.*;
  -import java.io.*;
  -import org.apache.xerces.dom.*;
  -import javax.xml.transform.sax.*;
  -import javax.xml.transform.stream.StreamSource;
  -import org.xml.sax.XMLReader;
  -import org.xml.sax.helpers.XMLReaderFactory;
  -
  -
  -/**
  - * <p>Title: </p>
  - * <p>Description: </p>
  - * <p>Copyright: Copyright (c) 2003</p>
  - * <p>Company: </p>
  - * @author not attributable
  - * @version 1.0
  - */
  -
  -public class ForrestSearchRenderer {
  -  Transformer transformer = null;
  -  Transformer transformer2 = null;
  -  private String skinconf = "";
  -  private static final String doc2html = "document2html.xsl";
  -  private static final String site2xhtml = "site2xhtml.xsl";
  -
  -  public ForrestSearchRenderer(String rootPath, String skin) {
  -    String fullPath = rootPath + "/skins/" + skin + "/xslt/html/";
  -    // Instantiate  a TransformerFactory.
  -    TransformerFactory tFactory = TransformerFactory.newInstance();
  -    try {
  -      skinconf = rootPath + "/skinconf.xml";
  -      transformer = tFactory.newTransformer
  -          (new javax.xml.transform.stream.StreamSource(fullPath + doc2html));
  -      transformer.setParameter("config-file", skinconf);
  -      transformer.setParameter("notoc", "true");
  -      transformer.setParameter("dynamic-page", "true");
  -      transformer2 = tFactory.newTransformer
  -          (new javax.xml.transform.stream.StreamSource(fullPath + site2xhtml));
  -      transformer2.setParameter("config-file", skinconf);
  -    } catch (TransformerConfigurationException ex) {
  -      System.err.println("Transformer Config exception");
  -    }
  -  } // Constructor
  -
  -  public String render(Document dom) {
  -    String page = null;
  -
  -    try {
  -      Document doc = new DocumentImpl();
  -      Element root = doc.createElement("site");
  -      DOMResult domResult = new DOMResult(root);
  -      transformer.transform(new DOMSource(dom.getDocumentElement()), domResult);
  -
  -      OutputStream result = new ByteArrayOutputStream();
  -      javax.xml.transform.stream.StreamResult theResult = new javax.xml.transform.stream.StreamResult(result);
  -
  -      transformer2.transform(new DOMSource(domResult.getNode()), theResult);
  -      page = result.toString();
  -    } catch (TransformerException ex) {
  -      ex.printStackTrace();
  -    }
  -
  -    return page;
  -  } // render
  -
  -}
  \ No newline at end of file
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +
  +package org.apache.forrest.search;
  +
  +import java.io.ByteArrayOutputStream;
  +import java.io.OutputStream;
  +
  +import javax.xml.transform.Result;
  +import javax.xml.transform.Source;
  +import javax.xml.transform.Transformer;
  +import javax.xml.transform.TransformerConfigurationException;
  +import javax.xml.transform.TransformerException;
  +import javax.xml.transform.TransformerFactory;
  +
  +import javax.xml.transform.dom.DOMResult;
  +import javax.xml.transform.dom.DOMSource;
  +
  +import javax.xml.transform.stream.StreamResult;
  +import javax.xml.transform.stream.StreamSource;
  +
  +import org.apache.xerces.dom.DocumentImpl;
  +
  +import org.w3c.dom.Document;
  +import org.w3c.dom.Element;
  +import org.w3c.dom.Node;
  +
  +/**
  + * <p>Title: </p>
  + * <p>Description: </p>
  + * <p>Copyright: Copyright (c) 2003</p>
  + * <p>Company: </p>
  + * @author not attributable
  + * @version 1.0
  + */
  +
  +public class ForrestSearchRenderer {
  +  Transformer transformer = null;
  +  Transformer transformer2 = null;
  +  private String skinconf = "";
  +  private static final String doc2html = "document2html.xsl";
  +  private static final String site2xhtml = "site2xhtml.xsl";
  +
  +  public ForrestSearchRenderer(String rootPath, String skin) {
  +    String fullPath = rootPath + "/skins/" + skin + "/xslt/html/";
  +    // Instantiate  a TransformerFactory.
  +    TransformerFactory tFactory = TransformerFactory.newInstance();
  +    try {
  +      skinconf = rootPath + "/skinconf.xml";
  +      transformer = tFactory.newTransformer
  +          (new javax.xml.transform.stream.StreamSource(fullPath + doc2html));
  +      transformer.setParameter("config-file", skinconf);
  +      transformer.setParameter("notoc", "true");
  +      transformer.setParameter("dynamic-page", "true");
  +      transformer2 = tFactory.newTransformer
  +          (new javax.xml.transform.stream.StreamSource(fullPath + site2xhtml));
  +      transformer2.setParameter("config-file", skinconf);
  +    } catch (TransformerConfigurationException ex) {
  +      System.err.println("Transformer Config exception");
  +    }
  +  } // Constructor
  +
  +  public String render(Document dom) {
  +    String page = null;
  +
  +    try {
  +      Document doc = new DocumentImpl();
  +      Element root = doc.createElement("site");
  +      DOMResult domResult = new DOMResult(root);
  +      transformer.transform(new DOMSource(dom.getDocumentElement()), domResult);
  +
  +      OutputStream result = new ByteArrayOutputStream();
  +      javax.xml.transform.stream.StreamResult theResult = new javax.xml.transform.stream.StreamResult(result);
  +
  +      transformer2.transform(new DOMSource(domResult.getNode()), theResult);
  +      page = result.toString();
  +    } catch (TransformerException ex) {
  +      ex.printStackTrace();
  +    }
  +
  +    return page;
  +  } // render
  +
  +}
  
  
  
  1.2       +123 -123  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearchServlet.java
  
  Index: ForrestSearchServlet.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearchServlet.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestSearchServlet.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestSearchServlet.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,123 +1,123 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -package org.apache.forrest.search;
  -
  -import javax.servlet.*;
  -import javax.servlet.http.*;
  -import java.io.*;
  -import java.util.*;
  -import org.apache.xalan.transformer.*;
  -import org.w3c.dom.*;
  -import javax.xml.transform.*;
  -import javax.xml.transform.dom.*;
  -import javax.xml.transform.stream.*;
  -import javax.xml.transform.*;
  -import java.net.*;
  -
  -/**
  - * <p>This sevlet processes all search request inside a Forrest site.</p>
  - * @author Ramon Prades [RPR]
  - * @version $Id$
  - */
  -public class ForrestSearchServlet extends HttpServlet {
  -
  -  private static final String CONTENT_TYPE = "text/html";
  -  private ForrestSearcher searcher = null;
  -  private static ForrestSearchRenderer renderer = null;
  -  private String servletPath = "";
  -  private String indexDir = "";     // Full path to lucene index directory
  -  private String skin = "";         // Skin configured
  -  private String searchPage = "/search.html";
  -  private static StringBuffer cache = null;
  -
  -
  -  /**
  -   * Prepares the servlet
  -   * @throws ServletException
  -   */
  -  public void init() throws ServletException {
  -    servletPath = this.getServletContext().getRealPath("");
  -    // FIXME: indexDir is hardcoded
  -    indexDir = servletPath + "/lucene-index";
  -    searcher = new ForrestSearcher();
  -    String skin = this.getInitParameter("project-skin");
  -    renderer = new ForrestSearchRenderer(servletPath, skin);
  - } // init
  -
  -  /**
  -   * Process the HTTP Get request
  -   */
  -  public void doGet(HttpServletRequest request, HttpServletResponse response)
  -      throws ServletException, IOException {
  -    response.setContentType("text/html");
  -    PrintWriter out = response.getWriter();
  -    // Query string should be in parametre "query".
  -    // A valid forrest document is returned.
  -    String query = request.getParameter("query");
  -    // Render the resulting document. Ideally the document
  -    // should be passed to Cocoon, but for the time being
  -    // use the renderer
  -    Document doc = searcher.search(indexDir, query);
  -    String page = renderer.render(doc);
  -    out.print(page);
  -  }
  -
  -  //Clean up resources
  -  public void destroy() {
  -  }
  -
  -} // ForrestSearchServlet
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +package org.apache.forrest.search;
  +
  +import java.io.IOException;
  +import java.io.PrintWriter;
  +
  +import javax.servlet.ServletContext;
  +import javax.servlet.ServletException;
  +
  +import javax.servlet.http.HttpServlet;
  +import javax.servlet.http.HttpServletRequest;
  +import javax.servlet.http.HttpServletResponse;
  +
  +import org.w3c.dom.Document;
  +
  +/**
  + * <p>This servlet processes all search requests inside a Forrest site.</p>
  + * @author Ramon Prades [RPR]
  + * @version $Id$
  + */
  +public class ForrestSearchServlet extends HttpServlet {
  +
  +  private static final String CONTENT_TYPE = "text/html";
  +  private ForrestSearcher searcher = null;
  +  private static ForrestSearchRenderer renderer = null;
  +  private String servletPath = "";
  +  private String indexDir = "";     // Full path to lucene index directory
  +  private String skin = "";         // Skin configured
  +  private String searchPage = "/search.html";
  +  private static StringBuffer cache = null;
  +
  +
  +  /**
  +   * Prepares the servlet
  +   * @throws ServletException
  +   */
  +  public void init() throws ServletException {
  +    servletPath = this.getServletContext().getRealPath("");
  +    // FIXME: indexDir is hardcoded
  +    indexDir = servletPath + "/lucene-index";
  +    searcher = new ForrestSearcher();
  +    String skin = this.getInitParameter("project-skin");
  +    renderer = new ForrestSearchRenderer(servletPath, skin);
  + } // init
  +
  +  /**
  +   * Process the HTTP Get request
  +   */
  +  public void doGet(HttpServletRequest request, HttpServletResponse response)
  +      throws ServletException, IOException {
  +    response.setContentType("text/html");
  +    PrintWriter out = response.getWriter();
  +    // Query string should be in parameter "query".
  +    // A valid forrest document is returned.
  +    String query = request.getParameter("query");
  +    // Render the resulting document. Ideally the document
  +    // should be passed to Cocoon, but for the time being
  +    // use the renderer
  +    Document doc = searcher.search(indexDir, query);
  +    String page = renderer.render(doc);
  +    out.print(page);
  +  }
  +
  +  //Clean up resources
  +  public void destroy() {
  +  }
  +
  +} // ForrestSearchServlet
  
  
  
  1.2       +241 -225  xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearcher.java
  
  Index: ForrestSearcher.java
  ===================================================================
  RCS file: /home/cvs/xml-forrest/src/scratchpad/src/java/org/apache/forrest/search/ForrestSearcher.java,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -r1.1 -r1.2
  --- ForrestSearcher.java	12 Sep 2003 19:07:31 -0000	1.1
  +++ ForrestSearcher.java	13 Sep 2003 01:21:39 -0000	1.2
  @@ -1,225 +1,241 @@
  -/*
  - * The Apache Software License, Version 1.1
  - *
  - *
  - * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  - * reserved.
  - *
  - * Redistribution and use in source and binary forms, with or without
  - * modification, are permitted provided that the following conditions
  - * are met:
  - *
  - * 1. Redistributions of source code must retain the above copyright
  - *    notice, this list of conditions and the following disclaimer.
  - *
  - * 2. Redistributions in binary form must reproduce the above copyright
  - *    notice, this list of conditions and the following disclaimer in
  - *    the documentation and/or other materials provided with the
  - *    distribution.
  - *
  - * 3. The end-user documentation included with the redistribution,
  - *    if any, must include the following acknowledgment:
  - *       "This product includes software developed by the
  - *        Apache Software Foundation (http://www.apache.org/)."
  - *    Alternately, this acknowledgment may appear in the software itself,
  - *    if and wherever such third-party acknowledgments normally appear.
  - *
  - * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  - *    not be used to endorse or promote products derived from this
  - *    software without prior written permission. For written
  - *    permission, please contact apache@apache.org.
  - *
  - * 5. Products derived from this software may not be called "Apache",
  - *    nor may "Apache" appear in their name, without prior written
  - *    permission of the Apache Software Foundation.
  - *
  - * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  - * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  - * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  - * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  - * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  - * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  - * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  - * SUCH DAMAGE.
  - * ====================================================================
  - *
  - * This software consists of voluntary contributions made by many
  - * individuals on behalf of the Apache Software Foundation and was
  - * originally based on software copyright (c) 1999, International
  - * Business Machines, Inc., http://www.apache.org.  For more
  - * information on the Apache Software Foundation, please see
  - * <http://www.apache.org/>.
  - */
  -package org.apache.forrest.search;
  -
  -import java.io.IOException;
  -import java.io.StringReader;
  -import java.util.*;
  -import org.apache.lucene.analysis.standard.StandardAnalyzer;
  -import org.apache.lucene.queryParser.QueryParser;
  -import org.apache.lucene.queryParser.*;
  -import org.apache.lucene.search.*;
  -import org.apache.lucene.index.Term;
  -import org.apache.xerces.dom.DocumentImpl;
  -import org.apache.xerces.dom.DocumentTypeImpl;
  -import org.w3c.dom.*;
  -
  -/**
  - * <p>Searches the index for a given query string.</p>
  - * @author Ramon Prades [RPR]
  - * @version $Id$
  - */
  -public class ForrestSearcher {
  -  public ForrestSearcher() {
  -  }
  -
  -  /**
  -   * Searches "queryString" in "indexDir" and returns a Forrest Document (v1.2)
  -   * with the list of matches.
  -   * @param indexDir Directory with the Lucene index
  -   * @param queryString String to search
  -   * @return Forrest document
  -   */
  -  public Document search(String indexDir, String queryString) {
  -    // Create a Forrest document with the results
  -    DOMImplementation domImpl = new org.apache.xerces.dom.DOMImplementationImpl();
  -    DocumentType docType =
  -        domImpl.createDocumentType("document", "-//APACHE//DTD Documentation V1.1//EN", "document-v12.dtd");
  -    Document doc = domImpl.createDocument("", "document", docType);
  -    Element rootNode = doc.getDocumentElement();
  -    Element headerNode = doc.createElement("header");
  -    headerNode.appendChild(this.makeElement(doc, "title", "Search Results"));
  -    rootNode.appendChild(headerNode);
  -    Element bodyNode = doc.createElement("body");
  -    rootNode.appendChild(bodyNode);
  -
  -    // Element sectionNode = doc.createElement("section");
  -    // bodyNode.appendChild(sectionNode);
  -    // sectionNode.appendChild(makeElement(doc, "title", "List of Matches"));
  -
  -    IndexSearcher searcher = null;
  -    try {
  -      searcher = new IndexSearcher(indexDir);
  -    } catch (IOException ex) {
  -      System.err.println("Error: Index dir not found!");
  -      ex.printStackTrace();
  -    }
  -    Hits hits = null;
  -    int count = 0;
  -    if (queryString==null || queryString.length()==0) {
  -      Element pNode = doc.createElement("p");
  -      String txt = "Please enter a valid query";
  -      pNode.appendChild(doc.createTextNode(txt));
  -      bodyNode.appendChild(pNode);
  -    } else {
  -      Query query = null;
  -      try {
  -        query = QueryParser.parse(queryString, "contents", new StandardAnalyzer());
  -      } catch (ParseException ex3) {
  -        System.out.println("QueryParser error!");
  -        ex3.printStackTrace();
  -      }
  -      try {
  -        hits = searcher.search(query);
  -      } catch (IOException ex1) {
  -        System.err.println("Error in search");
  -        ex1.printStackTrace();
  -      }
  -
  -      // Build the section with the list of matches
  -      count = hits.length();
  -      Element pNode = doc.createElement("p");
  -      String txt = "";
  -      if (count == 0) {
  -        txt = "No documents found matching: ";
  -        pNode.appendChild(doc.createTextNode(txt));
  -        Element emNode = doc.createElement("em");
  -        pNode.appendChild(emNode);
  -        emNode.appendChild(doc.createTextNode(queryString));
  -        bodyNode.appendChild(pNode);
  -      } else {
  -        if (count == 1) {
  -          txt = count + " document found matching: ";
  -        } else {
  -          txt = count + " documents found matching: ";
  -        }
  -        pNode.appendChild(doc.createTextNode(txt));
  -        Element emNode = doc.createElement("em");
  -        pNode.appendChild(emNode);
  -        emNode.appendChild(doc.createTextNode(queryString));
  -        //pNode.appendChild(doc.createElement("em").appendChild(doc.createTextNode(queryString)));
  -        bodyNode.appendChild(pNode);
  -        Element listNode = doc.createElement("ul");
  -        // sectionNode.appendChild(listNode);
  -        bodyNode.appendChild(listNode);
  -
  -        for (int i = 0; i < count; i++) {
  -          try {
  -            String title = hits.doc(i).get("title");
  -            String summary = hits.doc(i).get("summary");
  -            String authors = hits.doc(i).get("author");
  -            String path = hits.doc(i).get("path").replaceAll(".xml", ".html");
  -            float score = hits.score(i);
  -            Date modified = new Date(new Long(hits.doc(i).get("modified")).
  -                                     longValue());
  -            java.text.DateFormat formatter = new java.text.SimpleDateFormat();
  -            String strModified = formatter.format(modified);
  -
  -            Element listItem = doc.createElement("li");
  -            listNode.appendChild(listItem);
  -            Element strongNode = doc.createElement("strong");
  -            listItem.appendChild(strongNode);
  -            Element linkNode = doc.createElement("link");
  -            linkNode.setAttribute("href", path);
  -            linkNode.appendChild(doc.createTextNode(title));
  -            strongNode.appendChild(linkNode);
  -
  -            String scoreText = " [" + score + "]";
  -            listItem.appendChild(doc.createTextNode(scoreText));
  -            listItem.appendChild(doc.createElement("br"));
  -
  -            if (summary != null && summary.length() > 0) {
  -              listItem.appendChild(doc.createTextNode(summary));
  -              listItem.appendChild(doc.createElement("br"));
  -            }
  -            Element lastLine = doc.createElement("em");
  -            listItem.appendChild(lastLine);
  -            lastLine.appendChild(doc.createTextNode("url: " + path));
  -            if (authors != null && authors.length() > 0) {
  -              lastLine.appendChild(doc.createTextNode(" - author: " + authors));
  -            }
  -            lastLine.appendChild(doc.createTextNode(" - last modified: " +
  -                strModified));
  -            listItem.appendChild(doc.createElement("br"));
  -            listItem.appendChild(doc.createElement("br"));
  -
  -          } catch (DOMException ex2) {
  -            System.err.println("DOM Error building results document (" +
  -                               ex2.getMessage() + ")");
  -          } catch (IOException ex2) {
  -            System.err.println("IO Error building results document (" +
  -                               ex2.getMessage() + ")");
  -          } catch (NumberFormatException ex2) {
  -            System.err.println("NUMBERFORMAT Error building results document (" +
  -                               ex2.getMessage() + ")");
  -          }
  -        } // for
  -      } // if (count==0) ...
  -      } // if queryString not null
  -    return doc;
  -  } // search
  -
  -  /*
  -   * Utility method to contruct a DOM element with no attributes and
  -   * ine text child
  -   */
  -  private Element makeElement(Document doc, String name, String text) {
  -    Element e = doc.createElement(name);
  -    e.appendChild(doc.createTextNode(text));
  -    return e;
  -  }
  -} // ForrestSearcher
  +/*
  + * The Apache Software License, Version 1.1
  + *
  + *
  + * Copyright (c) 2001, 2002 The Apache Software Foundation.  All rights
  + * reserved.
  + *
  + * Redistribution and use in source and binary forms, with or without
  + * modification, are permitted provided that the following conditions
  + * are met:
  + *
  + * 1. Redistributions of source code must retain the above copyright
  + *    notice, this list of conditions and the following disclaimer.
  + *
  + * 2. Redistributions in binary form must reproduce the above copyright
  + *    notice, this list of conditions and the following disclaimer in
  + *    the documentation and/or other materials provided with the
  + *    distribution.
  + *
  + * 3. The end-user documentation included with the redistribution,
  + *    if any, must include the following acknowledgment:
  + *       "This product includes software developed by the
  + *        Apache Software Foundation (http://www.apache.org/)."
  + *    Alternately, this acknowledgment may appear in the software itself,
  + *    if and wherever such third-party acknowledgments normally appear.
  + *
  + * 4. The names "Apache Forrest" and "Apache Software Foundation" must
  + *    not be used to endorse or promote products derived from this
  + *    software without prior written permission. For written
  + *    permission, please contact apache@apache.org.
  + *
  + * 5. Products derived from this software may not be called "Apache",
  + *    nor may "Apache" appear in their name, without prior written
  + *    permission of the Apache Software Foundation.
  + *
  + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  + * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  + * SUCH DAMAGE.
  + * ====================================================================
  + *
  + * This software consists of voluntary contributions made by many
  + * individuals on behalf of the Apache Software Foundation and was
  + * originally based on software copyright (c) 1999, International
  + * Business Machines, Inc., http://www.apache.org.  For more
  + * information on the Apache Software Foundation, please see
  + * <http://www.apache.org/>.
  + */
  +package org.apache.forrest.search;
  +
  +import java.io.IOException;
  +
  +import java.text.DateFormat;
  +import java.text.SimpleDateFormat;
  +
  +import java.util.Date;
  +
  +import org.apache.lucene.analysis.Analyzer;
  +
  +import org.apache.lucene.analysis.standard.StandardAnalyzer;
  +
  +import org.apache.lucene.queryParser.ParseException;
  +import org.apache.lucene.queryParser.QueryParser;
  +
  +import org.apache.lucene.search.Hits;
  +import org.apache.lucene.search.IndexSearcher;
  +import org.apache.lucene.search.Query;
  +
  +import org.apache.xerces.dom.DOMImplementationImpl;
  +
  +import org.w3c.dom.Document;
  +import org.w3c.dom.DocumentType;
  +import org.w3c.dom.DOMException;
  +import org.w3c.dom.DOMImplementation;
  +import org.w3c.dom.Element;
  +import org.w3c.dom.Node;
  +import org.w3c.dom.Text;
  +
  +/**
  + * <p>Searches the index for a given query string.</p>
  + * @author Ramon Prades [RPR]
  + * @version $Id$
  + */
  +public class ForrestSearcher {
  +  public ForrestSearcher() {
  +  }
  +
  +  /**
  +   * Searches "queryString" in "indexDir" and returns a Forrest Document (v1.2)
  +   * with the list of matches.
  +   * @param indexDir Directory with the Lucene index
  +   * @param queryString String to search
  +   * @return Forrest document
  +   */
  +  public Document search(String indexDir, String queryString) {
  +    // Create a Forrest document with the results
  +    DOMImplementation domImpl = new org.apache.xerces.dom.DOMImplementationImpl();
  +    DocumentType docType =
  +        domImpl.createDocumentType("document", "-//APACHE//DTD Documentation V1.1//EN", "document-v12.dtd");
  +    Document doc = domImpl.createDocument("", "document", docType);
  +    Element rootNode = doc.getDocumentElement();
  +    Element headerNode = doc.createElement("header");
  +    headerNode.appendChild(this.makeElement(doc, "title", "Search Results"));
  +    rootNode.appendChild(headerNode);
  +    Element bodyNode = doc.createElement("body");
  +    rootNode.appendChild(bodyNode);
  +
  +    // Element sectionNode = doc.createElement("section");
  +    // bodyNode.appendChild(sectionNode);
  +    // sectionNode.appendChild(makeElement(doc, "title", "List of Matches"));
  +
  +    IndexSearcher searcher = null;
  +    try {
  +      searcher = new IndexSearcher(indexDir);
  +    } catch (IOException ex) {
  +      System.err.println("Error: Index dir not found!");
  +      ex.printStackTrace();
  +    }
  +    Hits hits = null;
  +    int count = 0;
  +    if (queryString==null || queryString.length()==0) {
  +      Element pNode = doc.createElement("p");
  +      String txt = "Please enter a valid query";
  +      pNode.appendChild(doc.createTextNode(txt));
  +      bodyNode.appendChild(pNode);
  +    } else {
  +      Query query = null;
  +      try {
  +        query = QueryParser.parse(queryString, "contents", new StandardAnalyzer());
  +      } catch (ParseException ex3) {
  +        System.out.println("QueryParser error!");
  +        ex3.printStackTrace();
  +      }
  +      try {
  +        hits = searcher.search(query);
  +      } catch (IOException ex1) {
  +        System.err.println("Error in search");
  +        ex1.printStackTrace();
  +      }
  +
  +      // Build the section with the list of matches
  +      count = hits.length();
  +      Element pNode = doc.createElement("p");
  +      String txt = "";
  +      if (count == 0) {
  +        txt = "No documents found matching: ";
  +        pNode.appendChild(doc.createTextNode(txt));
  +        Element emNode = doc.createElement("em");
  +        pNode.appendChild(emNode);
  +        emNode.appendChild(doc.createTextNode(queryString));
  +        bodyNode.appendChild(pNode);
  +      } else {
  +        if (count == 1) {
  +          txt = count + " document found matching: ";
  +        } else {
  +          txt = count + " documents found matching: ";
  +        }
  +        pNode.appendChild(doc.createTextNode(txt));
  +        Element emNode = doc.createElement("em");
  +        pNode.appendChild(emNode);
  +        emNode.appendChild(doc.createTextNode(queryString));
  +        //pNode.appendChild(doc.createElement("em").appendChild(doc.createTextNode(queryString)));
  +        bodyNode.appendChild(pNode);
  +        Element listNode = doc.createElement("ul");
  +        // sectionNode.appendChild(listNode);
  +        bodyNode.appendChild(listNode);
  +
  +        for (int i = 0; i < count; i++) {
  +          try {
  +            String title = hits.doc(i).get("title");
  +            String summary = hits.doc(i).get("summary");
  +            String authors = hits.doc(i).get("author");
  +            String path = hits.doc(i).get("path").replaceAll(".xml", ".html");
  +            float score = hits.score(i);
  +            Date modified = new Date(new Long(hits.doc(i).get("modified")).
  +                                     longValue());
  +            java.text.DateFormat formatter = new java.text.SimpleDateFormat();
  +            String strModified = formatter.format(modified);
  +
  +            Element listItem = doc.createElement("li");
  +            listNode.appendChild(listItem);
  +            Element strongNode = doc.createElement("strong");
  +            listItem.appendChild(strongNode);
  +            Element linkNode = doc.createElement("link");
  +            linkNode.setAttribute("href", path);
  +            linkNode.appendChild(doc.createTextNode(title));
  +            strongNode.appendChild(linkNode);
  +
  +            String scoreText = " [" + score + "]";
  +            listItem.appendChild(doc.createTextNode(scoreText));
  +            listItem.appendChild(doc.createElement("br"));
  +
  +            if (summary != null && summary.length() > 0) {
  +              listItem.appendChild(doc.createTextNode(summary));
  +              listItem.appendChild(doc.createElement("br"));
  +            }
  +            Element lastLine = doc.createElement("em");
  +            listItem.appendChild(lastLine);
  +            lastLine.appendChild(doc.createTextNode("url: " + path));
  +            if (authors != null && authors.length() > 0) {
  +              lastLine.appendChild(doc.createTextNode(" - author: " + authors));
  +            }
  +            lastLine.appendChild(doc.createTextNode(" - last modified: " +
  +                strModified));
  +            listItem.appendChild(doc.createElement("br"));
  +            listItem.appendChild(doc.createElement("br"));
  +
  +          } catch (DOMException ex2) {
  +            System.err.println("DOM Error building results document (" +
  +                               ex2.getMessage() + ")");
  +          } catch (IOException ex2) {
  +            System.err.println("IO Error building results document (" +
  +                               ex2.getMessage() + ")");
  +          } catch (NumberFormatException ex2) {
  +            System.err.println("NUMBERFORMAT Error building results document (" +
  +                               ex2.getMessage() + ")");
  +          }
  +        } // for
  +      } // if (count==0) ...
  +      } // if queryString not null
  +    return doc;
  +  } // search
  +
  +  /*
  +   * Utility method to construct a DOM element with no attributes and
  +   * one text child
  +   */
  +  private Element makeElement(Document doc, String name, String text) {
  +    Element e = doc.createElement(name);
  +    e.appendChild(doc.createTextNode(text));
  +    return e;
  +  }
  +} // ForrestSearcher
  
  
  

Mime
View raw message