lucene-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kelv...@apache.org
Subject cvs commit: jakarta-lucene-sandbox/contributions/indyo/lib jakarta-oro-2.0.6.jar jdom.jar log4j-1.2.6.jar lucene-1.2.jar tartool.jar xmlParserAPIs-xerces-2.0.2.jar
Date Fri, 30 Aug 2002 18:03:53 GMT
kelvint     2002/08/30 11:03:53

  Added:       contributions/indyo/src/java/com/relevanz/indyo
                        AbstractDataSource.java DocumentHandler.java
                        FSDataSource.java
                        IllegalConfigurationException.java
                        IndexDataSource.java IndyoIndexer.java
                        SearchConfiguration.java
               contributions/indyo/src/java/com/relevanz/indyo/util
                        DataUnformatFilter.java IOUtils.java
                        StringUtils.java XMLFilterBase.java
               contributions/indyo/src/java/com/relevanz/indyo/contenthandler
                        FileContentHandler.java
                        FileContentHandlerAdapter.java
                        FileContentHandlerFactory.java GZipHandler.java
                        NestedFileContentHandlerAdapter.java
                        NullHandler.java TARHandler.java TextHandler.java
                        ZIPHandler.java
               contributions/indyo/lib jakarta-oro-2.0.6.jar jdom.jar
                        log4j-1.2.6.jar lucene-1.2.jar tartool.jar
                        xmlParserAPIs-xerces-2.0.2.jar
  Log:
  Initial import of source and libs.
  
  Revision  Changes    Path
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/AbstractDataSource.java
  
  Index: AbstractDataSource.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.util.Map;
  import java.util.Set;
  
  /**
   * Skeletal base class for index datasources. It only keeps track of the
   * names of the fields to index; {@link IndexDataSource#getData()} is left
   * for subclasses to implement.
   * 
   * @version $Id: AbstractDataSource.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public abstract class AbstractDataSource implements IndexDataSource
  {
      /**
       * Names of the fields to index. Stays <code>null</code> until
       * {@link #loadFields(Map)} has been invoked.
       */
      protected String[] fields;

      /**
       * Creates a datasource without loading any fields.
       */
      protected AbstractDataSource()
      {
      }

      /**
       * Creates a datasource whose fields are the keys of the given map.
       *
       * @param map map whose keys are the names of the fields to index.
       */
      protected AbstractDataSource(Map map)
      {
          loadFields(map);
      }

      /**
       * Convenience method which copies the keys of a Map into
       * {@link #fields}.
       *
       * @param map map whose keys are the names of the fields to index.
       */
      protected void loadFields(Map map)
      {
          final Set keys = map.keySet();
          // Assign first, then fill, so the array is sized from the key set.
          this.fields = new String[keys.size()];
          keys.toArray(this.fields);
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/DocumentHandler.java
  
  Index: DocumentHandler.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import com.relevanz.indyo.util.StringUtils;
  import org.apache.log4j.Logger;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  import org.apache.lucene.index.IndexWriter;
  
  import java.io.IOException;
  import java.io.Reader;
  import java.util.*;
  
  /**
   * <p>
   * A document is the atomic unit used for indexing purposes. It is built
   * from a map of metadata (field name to value). Values may be Strings or
   * Readers (e.g. file contents).
   * </p>
   * <p>
   * DocumentHandler creates the {@link org.apache.lucene.document.Document},
   * adds the metadata fields to it, processes any nested datasources found
   * under {@link IndexDataSource#NESTED_DATASOURCE} and finally either adds
   * the resulting document(s) to the
   * {@link org.apache.lucene.index.IndexWriter} or, when no writer was
   * supplied, collects them for the enclosing handler.
   * </p>
   * 
   * @version $Id: DocumentHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public class DocumentHandler
  {
      /**
       * Field to retrieve all documents. Every processed document gets a
       * field with this name and this value.
       */
      public static final String ALL_DOCUMENTS_FIELD = "AllDocuments";

      private static final Logger log = Logger.getLogger(DocumentHandler.class);

      /**
       * Should parent documents include data of its children?
       */
      private static boolean parentEncapsulation = false;

      /**
       * Document object this DocumentHandler is handling.
       */
      private Document doc;

      /**
       * Map of metadata for this document. Contains the field:value pair
       * to be added to the document.
       */
      private Map metadata;

      /**
       * Map of fields. Contains field:type_of_field pair. May be
       * <code>null</code>, in which case every String field is added with
       * the default (Text) type.
       */
      private Map customFields;

      /**
       * IndexWriter. <code>null</code> for handlers of nested documents.
       */
      private IndexWriter writer;

      /**
       * A collection of documents to be added to the writer. Holds the
       * documents produced by nested datasources and, when there is no
       * writer, this handler's own document as well.
       */
      private List documents = new ArrayList();

      /**
       * Ctor.
       *
       * @param metadata Map of metadata for this document.
       * @param customFields Map of field name to field type, may be
       *        <code>null</code>.
       * @param writer Writer to add the documents to, may be
       *        <code>null</code>.
       */
      public DocumentHandler(Map metadata,
                             Map customFields,
                             IndexWriter writer)
      {
          this.metadata = metadata;
          this.customFields = customFields;
          this.writer = writer;
      }

      /**
       * Handles the actual processing of the document. Nothing happens when
       * the metadata has no {@link IndexDataSource#OBJECT_IDENTIFIER} entry.
       *
       * @throws IOException if adding a document to the writer fails.
       * @throws Exception if a nested datasource fails to deliver its data.
       */
      public void process() throws IOException, Exception
      {
          String objectid = (String) metadata.get(IndexDataSource.OBJECT_IDENTIFIER);
          if (objectid == null)
              return;
          doc = createDocument();
          addMapToDoc(metadata);
          addNestedDataSource(metadata);
          doc.add(Field.Text(ALL_DOCUMENTS_FIELD, ALL_DOCUMENTS_FIELD));
          if (writer != null)
          {
              addToWriter();
          }
          else
          {
              // No writer: keep the document so that the parent handler can
              // pick it up through getDocuments().
              documents.add(doc);
          }
      }

      private List getDocuments()
      {
          return documents;
      }

      private Document createDocument()
      {
          return new Document();
      }

      /**
       * Add the contents of a Map to a document. Only String and Reader
       * values are added; any other value is ignored here.
       *
       * @param map Map to add.
       */
      private void addMapToDoc(Map map)
      {
          // Walk the entries directly instead of looking every key up again.
          for (Iterator it = map.entrySet().iterator(); it.hasNext();)
          {
              Map.Entry entry = (Map.Entry) it.next();
              String field = (String) entry.getKey();
              Object value = entry.getValue();
              if (value instanceof String)
              {
                  String type = null;
                  if (customFields != null)
                  {
                      type = (String) customFields.get(field);
                  }
                  addFieldToDoc(type, field, (String) value);
              }
              else if (value instanceof Reader)
              {
                  addFieldToDoc(field, (Reader) value);
              }
          }
      }

      /**
       * Add nested datasources. The value stored under
       * {@link IndexDataSource#NESTED_DATASOURCE} may be a single
       * IndexDataSource, a List of them or an array of them; anything else
       * is logged and skipped.
       *
       * @param map Map which contains the nested datasources.
       * @throws Exception if a nested datasource fails to deliver its data.
       */
      private void addNestedDataSource(Map map) throws Exception
      {
          Object o = map.get(IndexDataSource.NESTED_DATASOURCE);
          if (o == null)
              return;
          if (o instanceof IndexDataSource)
          {
              addDataSource((IndexDataSource) o);
          }
          else if (o instanceof List)
          {
              List nestedDataSource = (List) o;
              for (int i = 0, n = nestedDataSource.size(); i < n; i++)
              {
                  addDataSource((IndexDataSource) nestedDataSource.get(i));
              }
          }
          else if (o instanceof IndexDataSource[])
          {
              IndexDataSource[] nestedDataSource = (IndexDataSource[]) o;
              for (int i = 0, n = nestedDataSource.length; i < n; i++)
              {
                  addDataSource(nestedDataSource[i]);
              }
          }
          else
          {
              log.warn("Unknown object found as nested datasource:" + o);
          }
      }

      /**
       * Datasources are basically a collection of data maps to be indexed.
       * A map carrying its own {@link IndexDataSource#OBJECT_IDENTIFIER}
       * becomes a separate document; any other map is merged into the
       * current document through addMapToDoc.
       *
       * @param ds Datasource to add; <code>null</code> is ignored.
       * @throws Exception if the datasource fails to deliver its data.
       */
      private void addDataSource(IndexDataSource ds) throws Exception
      {
          if (ds == null)
              return;
          Map[] data = ds.getData();
          // A datasource with nothing to contribute must not abort indexing.
          if (data == null)
              return;
          for (int i = 0; i < data.length; i++)
          {
              Map map = data[i];
              if (map == null)
                  continue;
              if (map.containsKey(IndexDataSource.OBJECT_IDENTIFIER))
              {
                  /**
                   * Create a new document because child datasources may need
                   * to be retrieved independently of parent doc.
                   */
                  // NOTE(review): the child handler gets no customFields, so
                  // all of its String fields take the default (Text) branch
                  // of addFieldToDoc - confirm this is intended.
                  DocumentHandler docHandler = new DocumentHandler(map, null, null);
                  docHandler.process();
                  documents.addAll(docHandler.getDocuments());
              }
              else
              {
                  addMapToDoc(map);
                  /**
                   * Add nested datasources of this datasource's data
                   */
                  addNestedDataSource(map);
              }
          }
      }

      /**
       * Adds a String-based field to a document. A <code>null</code> value
       * is replaced by the empty string. An unrecognised or missing type
       * results in a Text field.
       *
       * @param type Type of field.
       * @param field Name of field.
       * @param value Value of field.
       */
      private void addFieldToDoc(String type, String field, String value)
      {
          if (value == null)
              value = StringUtils.EMPTY_STRING;
          if (SearchConfiguration.KEYWORD_FIELD_TYPE.equalsIgnoreCase(type))
              doc.add(Field.Keyword(field, value));
          else if (SearchConfiguration.UNINDEXED_FIELD_TYPE.equalsIgnoreCase(type))
              doc.add(Field.UnIndexed(field, value));
          else if (SearchConfiguration.UNSTORED_FIELD_TYPE.equalsIgnoreCase(type))
              doc.add(Field.UnStored(field, value));
          else
              doc.add(Field.Text(field, value));
      }

      /**
       * Adds a Reader-based field to a document.
       *
       * @param field Name of field.
       * @param reader Reader supplying the field's text.
       */
      private void addFieldToDoc(String field, Reader reader)
      {
          doc.add(Field.Text(field, reader));
      }

      /**
       * Adds documents to the IndexWriter: first this handler's own
       * document, then every collected nested document.
       *
       * @throws IOException if the writer fails to add a document.
       */
      private void addToWriter() throws IOException
      {
          if (parentEncapsulation)
          {
              // Copy the children's fields (except their identity fields)
              // into the parent document.
              // NOTE(review): Reader-based fields would then be shared by
              // parent and child documents - presumably the Reader can only
              // be consumed once; verify before enabling this flag.
              for (int i = 0, n = documents.size(); i < n; i++)
              {
                  Document d = (Document) documents.get(i);
                  for (Enumeration e = d.fields(); e.hasMoreElements();)
                  {
                      Field f = (Field) e.nextElement();
                      String fieldName = f.name();
                      if (!fieldName.equals(IndexDataSource.CONTAINER_IDENTIFIER)
                              && !fieldName.equals(IndexDataSource.OBJECT_CLASS)
                              && !fieldName.equals(IndexDataSource.OBJECT_IDENTIFIER))
                      {
                          doc.add(f);
                      }
                  }
              }
          }
          writer.addDocument(doc);

          for (int i = 0, n = documents.size(); i < n; i++)
          {
              writer.addDocument((Document) documents.get(i));
          }
      }
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/FSDataSource.java
  
  Index: FSDataSource.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.document.DateField;
  import com.relevanz.indyo.contenthandler.FileContentHandler;
  import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
  import com.relevanz.indyo.util.IOUtils;
  
  import java.io.File;
  import java.io.Reader;
  import java.util.ArrayList;
  import java.util.HashMap;
  import java.util.List;
  import java.util.Map;
  
  /**
   * A filesystem-based datasource. Every regular file found at or below the
   * target file/directory yields one data map holding the file's path,
   * name, size, extension and last-modified date, plus its contents and
   * nested data where a content handler provides them.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: FSDataSource.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public class FSDataSource extends AbstractDataSource
  {
      /** Key of the file's path, as returned by {@link File#getPath()}. */
      public static final String FILE_PATH_FIELD = "filePath";
      /** Key of the file's name, as returned by {@link File#getName()}. */
      public static final String FILE_NAME_FIELD = "fileName";
      /** Key of the file's length in bytes, stored as a String. */
      public static final String FILE_SIZE_FIELD = "fileSize";
      /** Key of the file's extension. */
      public static final String FILE_FORMAT_FIELD = "fileFormat";
      /** Key of the Reader over the file's contents, when available. */
      public static final String FILE_CONTENTS_FIELD = "fileContents";
      /** Key of the file's last-modified time in DateField string form. */
      public static final String FILE_LAST_MODIFIED_DATE_FIELD = "fileLastModifiedDate";

      /**
       * File or directory this datasource reads from.
       */
      private File targetFileOrDir;

      /**
       * Creates a datasource for the given path.
       *
       * @param targetFileOrDirStr path of the file or directory to index.
       */
      public FSDataSource(String targetFileOrDirStr)
      {
          this(new File(targetFileOrDirStr));
      }

      /**
       * Creates a datasource for the given file or directory.
       *
       * @param targetFileOrDir file or directory to index.
       */
      public FSDataSource(File targetFileOrDir)
      {
          setTargetDirectory(targetFileOrDir);
      }

      /**
       * Walks the target file/directory and returns one map per file found.
       *
       * @return the data maps; an empty array when no file was found.
       */
      public Map[] getData()
      {
          List collected = new ArrayList();
          loadDataFromFiles(targetFileOrDir, collected);
          return (Map[]) collected.toArray(new Map[collected.size()]);
      }

      /**
       * Sets the file or directory this datasource reads from.
       *
       * @param targetFileOrDir file or directory to index.
       */
      public void setTargetDirectory(File targetFileOrDir)
      {
          this.targetFileOrDir = targetFileOrDir;
      }

      /**
       * Recursively collects the data maps of a file or directory tree.
       *
       * @param f file or directory to process.
       * @param list list the data maps are appended to.
       */
      private void loadDataFromFiles(File f, List list)
      {
          if (f.isDirectory())
          {
              File[] directoryTree = f.listFiles();
              // listFiles() returns null when the directory cannot be read
              // (I/O error, missing permission); skip it instead of failing
              // the whole traversal with a NullPointerException.
              if (directoryTree == null)
              {
                  return;
              }
              for (int i = 0; i < directoryTree.length; i++)
              {
                  loadDataFromFiles(directoryTree[i], list);
              }
          }
          else
          {
              Map dataMap = new HashMap();
              dataMap.put(FILE_PATH_FIELD, f.getPath());
              dataMap.put(FILE_NAME_FIELD, f.getName());
              dataMap.put(FILE_LAST_MODIFIED_DATE_FIELD,
                          DateField.timeToString(f.lastModified()));
              dataMap.put(FILE_SIZE_FIELD, String.valueOf(f.length()));
              dataMap.put(FILE_FORMAT_FIELD,
                          IOUtils.getFileExtension(f));
              addFileContents(f, dataMap);
              list.add(dataMap);
          }
      }

      /**
       * Adds the file's contents and/or nested datasource to the data map,
       * as far as the file's content handler supplies them. When no content
       * handler is found the map is left untouched.
       *
       * @param targetFile file whose contents are to be added.
       * @param dataMap data map of the file.
       */
      private void addFileContents(File targetFile, Map dataMap)
      {
          FileContentHandler cHandler =
                  FileContentHandlerFactory.getContentHandler(targetFile);
          if (cHandler == null)
          {
              // No handler for this kind of file: index its metadata only.
              return;
          }
          if (cHandler.fileContentIsReadable())
          {
              Reader r = cHandler.getReader();
              if (r != null)
              {
                  dataMap.put(FILE_CONTENTS_FIELD, r);
              }
          }
          if (cHandler.containsNestedData())
          {
              dataMap.put(NESTED_DATASOURCE, cHandler.getNestedDataSource());
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/IllegalConfigurationException.java
  
  Index: IllegalConfigurationException.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /**
   * Checked exception signalling a problem while loading the
   * SearchConfiguration.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: IllegalConfigurationException.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public class IllegalConfigurationException extends Exception
  {
      /**
       * Creates the exception with a detail message.
       *
       * @param msg description of the configuration problem.
       */
      public IllegalConfigurationException(final String msg)
      {
          super(msg);
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/IndexDataSource.java
  
  Index: IndexDataSource.java
  ===================================================================
  package com.relevanz.indyo;
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.util.Map;
  
  /**
   * A datasource is any source of data (filesystem, database, URL, etc)
   * whose contents are to be indexed. The data is exposed as an array of
   * maps, see {@link #getData()}; the constants below are well-known keys
   * of those maps.
   * 
   * @version $Id: IndexDataSource.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public interface IndexDataSource
  {
      /**
       * Map key (in the maps returned by getData) for the class name of
       * the object being indexed.
       */
      String OBJECT_CLASS = "objectClass";

      /**
       * Map key (in the maps returned by getData) for the uuid of the
       * object being indexed.
       */
      String OBJECT_IDENTIFIER = "objectId";

      /**
       * Map key (in the maps returned by getData) for nested datasources.
       */
      String NESTED_DATASOURCE = "nestedDataSource";

      /**
       * Map key (in the maps returned by getData) for the id of the
       * datasource's container. Applies to nested datasources.
       */
      String CONTAINER_IDENTIFIER = "containerId";

      /**
       * Map key for the class name of the Search Result object for this
       * datasource (if any).
       */
      String SEARCH_RESULT_CLASSNAME = "resultClassname";

      /**
       * Retrieves an array of Maps. Each map represents a document to be
       * indexed; the key:value pairs of the map are the metadata of that
       * document.
       *
       * @return the data maps of this datasource.
       * @throws Exception if the data cannot be retrieved.
       */
      Map[] getData() throws Exception;
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/IndyoIndexer.java
  
  Index: IndyoIndexer.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
  import org.apache.log4j.Logger;
  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.IndexWriter;
  
  import java.io.IOException;
  import java.util.Collections;
  import java.util.Map;
  
  /**
   * Entry point for search engine indexing.
   * <p>
   * IndyoIndexer is responsible for creating the
   * {@link org.apache.lucene.index.IndexWriter} and passing it to a
   * {@link DocumentHandler} for each individual document supplied by an
   * {@link IndexDataSource}.
   * </p>
   * <p>
   * An instance is single-use: {@link #index(IndexDataSource)} closes the
   * underlying writer when it returns, whether or not indexing succeeded.
   * </p>
   *
   * @version $Id: IndyoIndexer.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public class IndyoIndexer
  {
      private static final Logger log = Logger.getLogger(IndyoIndexer.class);
      private IndexWriter fsWriter;
      private SearchConfiguration config;

      /**
       * Creates an indexer writing to the given index directory.
       *
       * @param indexDirectory Directory handed to the IndexWriter.
       * @param configFile Name of the xml configuration file.
       * @throws IOException if the IndexWriter cannot be created.
       * @throws IllegalConfigurationException if the configuration is invalid.
       */
      public IndyoIndexer(String indexDirectory, String configFile)
              throws IOException, IllegalConfigurationException
      {
          // Load the configuration before opening the writer: if the
          // configuration is rejected there is then no open IndexWriter
          // left behind that nobody holds a reference to.
          loadConfig(configFile);
          Analyzer a = new StandardAnalyzer();
          fsWriter = new IndexWriter(indexDirectory, a, true);
          fsWriter.maxFieldLength = 1000000;
      }

      /**
       * Indexes every document of the datasource, optimizes the index and
       * closes the writer. The writer is closed even when indexing fails.
       *
       * @param ds Datasource providing the documents to index.
       * @throws IOException if optimizing or closing the index fails.
       * @throws Exception if the datasource or a document handler fails.
       */
      public synchronized void index(IndexDataSource ds) throws IOException, Exception
      {
          log.debug("Initiating search engine indexing...");
          long start = System.currentTimeMillis();
          boolean succeeded = false;
          try
          {
              // temporarily use an empty map whilst custom fields get implemented
              indexDataSource(ds, Collections.EMPTY_MAP);
              fsWriter.optimize();
              succeeded = true;
          }
          finally
          {
              closeWriter(succeeded);
          }
          long stop = System.currentTimeMillis();
          log.debug("Indexing took " + (stop - start) + " milliseconds");
      }

      /**
       * Closes the index writer.
       *
       * @param propagateFailure when true a failure to close is rethrown;
       *        when false it is only logged, so that it cannot replace the
       *        exception that made indexing fail in the first place.
       */
      private void closeWriter(boolean propagateFailure) throws IOException
      {
          try
          {
              fsWriter.close();
          }
          catch (IOException e)
          {
              if (propagateFailure)
              {
                  throw e;
              }
              log.error("Error closing index writer after failed indexing: "
                        + e.getMessage(), e);
          }
      }

      /**
       * Reads the configuration file and registers its content handlers
       * with the FileContentHandlerFactory.
       */
      private void loadConfig(String configFile) throws IllegalConfigurationException
      {
          config = new SearchConfiguration(configFile);
          FileContentHandlerFactory.setHandlerRegistry(config.getContentHandlers());
      }

      /**
       * Runs a DocumentHandler over every map returned by the datasource.
       */
      private void indexDataSource(IndexDataSource source, Map customFields)
              throws Exception
      {
          Map[] data = source.getData();
          // here's a good place to spawn a couple of threads for indexing
          for (int i = 0; i < data.length; i++)
          {
              DocumentHandler docHandler =
                      new DocumentHandler(data[i], customFields, fsWriter);
              docHandler.process();
          }
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/SearchConfiguration.java
  
  Index: SearchConfiguration.java
  ===================================================================
  package com.relevanz.indyo;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import com.relevanz.indyo.contenthandler.FileContentHandlerFactory;
  import com.relevanz.indyo.util.DataUnformatFilter;
  import org.apache.log4j.Category;
  import org.apache.log4j.Logger;
  import org.jdom.Document;
  import org.jdom.Element;
  import org.jdom.input.SAXBuilder;
  
  import java.util.HashMap;
  import java.util.List;
  import java.util.Map;
  import java.util.StringTokenizer;
  
  /**
   * Configures the indexing process using an XML file.
   * <p>
   * The file is parsed once in the constructor; the content handlers
   * declared under <code>Search.ContentHandlers</code> and the custom
   * fields declared under <code>Search.Fields</code> are then available
   * through {@link #getContentHandlers()} and {@link #getCustomFields()}.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: SearchConfiguration.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $
   */
  public class SearchConfiguration
  {
      public static final String TEXT_FIELD_TYPE = "text";
      public static final String KEYWORD_FIELD_TYPE = "keyword";
      public static final String UNINDEXED_FIELD_TYPE = "unindexed";
      public static final String UNSTORED_FIELD_TYPE = "unstored";

      /** Log4j category.
       */
      static Logger log = Logger.getLogger(SearchConfiguration.class.getName());

      /**
       * Key in the config file to declare content handlers.
       */
      private static final String CONTENT_HANDLER_KEY = "Search.ContentHandlers";

      /**
       * Key in the config file to declare custom fields.
       */
      private static final String FIELD_KEY = "Search.Fields";

      /**
       * Map of content handlers, keyed by the "extension" attribute.
       */
      private Map contentHandlers = new HashMap();

      /**
       * Map of (non-standard) custom fields to index: field name to
       * field type.
       */
      private Map customFields = new HashMap();

      /**
       * Document object which represents the xml configuration file.
       */
      private Document doc;

      /**
       * Creates a new SearchConfiguration.
       *
       * @param configFile Name of the xml configuration file.
       * @throws IllegalConfigurationException if the file cannot be read
       *         or parsed, or if its content is invalid.
       */
      public SearchConfiguration(String configFile) throws IllegalConfigurationException
      {
          try
          {
              SAXBuilder builder = new SAXBuilder();
              DataUnformatFilter format = new DataUnformatFilter();
              builder.setXMLFilter(format);
              doc = builder.build(configFile);
          }
          catch (Exception e)
          {
              log.error("Error creating XML parser:" + e.getMessage(), e);
              // Without a parsed document the load methods below would
              // dereference a null 'doc'; report the failure to the caller
              // instead.
              throw new IllegalConfigurationException(
                      "Unable to read configuration file '" + configFile
                      + "': " + e.getMessage());
          }
          loadContentHandlers();
          loadCustomFields();
      }

      /**
       * @return the live map of content handlers.
       */
      public Map getContentHandlers()
      {
          return this.contentHandlers;
      }

      /**
       * @return the live map of custom field names to field types.
       */
      public Map getCustomFields()
      {
          return this.customFields;
      }

      /**
       * Loads the content handlers.
       * <p>
       * Each child of the content handler node contributes an entry
       * mapping its "extension" attribute to a new instance of the class
       * named by its "handler" attribute. Children whose "default"
       * attribute is "true" are additionally registered under
       * {@link FileContentHandlerFactory#DEFAULT_HANDLER_KEY}.
       * </p>
       *
       * @throws IllegalConfigurationException if a child declares no
       *         handler class.
       */
      protected void loadContentHandlers() throws IllegalConfigurationException
      {
          String[] extensions = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "extension");
          String[] handlers = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "handler");
          if (extensions.length != handlers.length)
              throw new IllegalConfigurationException(
                      "Illegal configuration of Search Content Handlers!");
          for (int i = 0; i < extensions.length; i++)
          {
              // A child without a "handler" attribute yields null here;
              // report it as a configuration error rather than letting
              // Class.forName(null) fail with a NullPointerException.
              if (handlers[i] == null)
                  throw new IllegalConfigurationException(
                          "Illegal configuration of Search Content Handlers!");
              // NOTE: generateObject returns null when the class cannot be
              // instantiated, so a null handler may be stored here.
              contentHandlers.put(extensions[i], generateObject(handlers[i]));
          }
          String[] defaultExtension = getChildPropertyAttributeValues(CONTENT_HANDLER_KEY, "default");
          for (int i = 0; i < defaultExtension.length; i++)
          {
              if (defaultExtension[i] != null && defaultExtension[i].equals("true"))
              {
                  contentHandlers.put(FileContentHandlerFactory.DEFAULT_HANDLER_KEY
                                      , generateObject(handlers[i]));
              }
          }
      }

      /**
       * Loads the custom fields to index, mapping each child's "name"
       * attribute to its "type" attribute.
       */
      protected void loadCustomFields() throws IllegalConfigurationException
      {
          String[] fields = getChildPropertyAttributeValues(FIELD_KEY, "name");
          String[] fieldtypes = getChildPropertyAttributeValues(FIELD_KEY, "type");
          if (fields.length != fieldtypes.length)
              throw new IllegalConfigurationException(
                      "Illegal configuration of custom search fields!");
          for (int i = 0; i < fields.length; i++)
          {
              customFields.put(fields[i], fieldtypes[i]);
          }
      }

      /**
       * Return attribute values for all child nodes.
       *
       * @param parent Dotted path ("x.y.z") from the root element to the
       *        node whose children are inspected.
       * @param attributeName Attribute to read from each child.
       * @return one entry per child, null where the child lacks the
       *         attribute; an empty array if the path does not exist.
       */
      private String[] getChildPropertyAttributeValues(String parent,
                                                       String attributeName)
      {
          String[] nodeName = parseNodeName(parent);
          Element element = doc.getRootElement();
          for (int i = 0; i < nodeName.length; i++)
          {
              element = element.getChild(nodeName[i]);
              if (element == null)
              {
                  return new String[]{};
              }
          }
          List children = element.getChildren();
          int childCount = children.size();
          String[] childrenAttributeValue = new String[childCount];
          for (int i = 0; i < childCount; i++)
          {
              childrenAttributeValue[i] =
                      ((Element) children.get(i)).getAttributeValue(attributeName);
          }
          return childrenAttributeValue;
      }

      /**
       * Node names are in the form "x.y.z". Returns a String array
       * representation of the node elements.
       */
      private String[] parseNodeName(String nodeName)
      {
          StringTokenizer st = new StringTokenizer(nodeName, ".");
          String[] nodeElements = new String[st.countTokens()];
          int i = 0;
          while (st.hasMoreTokens())
          {
              nodeElements[i] = st.nextToken();
              ++i;
          }
          return nodeElements;
      }

      /**
       * Utility method to return an object based on its class name.
       * The object needs to have a constructor which accepts no parameters.
       * Failures are logged and reported by returning null.
       *
       * @param className  Class name of object to be generated
       * @return the new instance, or null if it could not be created
       */
      private static Object generateObject(String className)
      {
          Object o = null;
          try
          {
              Class c = Class.forName(className);
              o = c.newInstance();
          }
          catch (ClassNotFoundException cnfe)
          {
              log.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
          }
          catch (InstantiationException ie)
          {
              log.error(ie.getMessage() + " Class named '" + className + "' could not be  instantiated.", ie);
          }
          catch (IllegalAccessException iae)
          {
              log.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
          }
          return o;
      }

  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/util/DataUnformatFilter.java
  
  Index: DataUnformatFilter.java
  ===================================================================
  /*--
  
   Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
   All rights reserved.
  
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
  
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions, and the following disclaimer.
  
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions, and the disclaimer that follows
      these conditions in the documentation and/or other materials
      provided with the distribution.
  
   3. The name "JDOM" must not be used to endorse or promote products
      derived from this software without prior written permission.  For
      written permission, please contact license@jdom.org.
  
   4. Products derived from this software may not be called "JDOM", nor
      may "JDOM" appear in their name, without prior written permission
      from the JDOM Project Management (pm@jdom.org).
  
   In addition, we request (but do not require) that you include in the
   end-user documentation provided with the redistribution and/or in the
   software itself an acknowledgement equivalent to the following:
       "This product includes software developed by the
        JDOM Project (http://www.jdom.org/)."
   Alternatively, the acknowledgment may be graphical using the logos
   available at http://www.jdom.org/images/logos.
  
   THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED.  IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   SUCH DAMAGE.
  
   This software consists of voluntary contributions made by many
   individuals on behalf of the JDOM Project and was originally
   created by Brett McLaughlin <brett@jdom.org> and
   Jason Hunter <jhunter@jdom.org>.  For more information on the
   JDOM Project, please see <http://www.jdom.org/>.
  
   */
  package com.relevanz.indyo.util;
  
  import java.util.Stack;
  
  import org.xml.sax.Attributes;
  import org.xml.sax.SAXException;
  import org.xml.sax.XMLReader;
  
  
  /**
   * Filter for removing formatting from data- or field-oriented XML.
   *
   * <i>Code and comments adapted from DataWriter-0.2, written
   * by David Megginson and released into the public domain,
   * without warranty.</i>
   *
   * <p>Whitespace-only character events are held back rather than
   * forwarded. The held whitespace is thrown away when an element starts,
   * or when an element ends right after one of its child elements ended;
   * otherwise it is forwarded. The filter is meant for field-oriented XML
   * without mixed content and will likely not yield appropriate results
   * for document-oriented XML like XHTML pages, which mix character data
   * and elements together.</p>
   */
  public class DataUnformatFilter extends XMLFilterBase
  {

      ////////////////////////////////////////////////////////////////////
      // Constants: identity markers for the parser position.
      ////////////////////////////////////////////////////////////////////

      private static final Object SEEN_NOTHING = new Object();
      private static final Object SEEN_ELEMENT = new Object();
      private static final Object SEEN_DATA = new Object();

      ////////////////////////////////////////////////////////////////////
      // Internal state.
      ////////////////////////////////////////////////////////////////////

      /** What has been seen so far inside the current element. */
      private Object state = SEEN_NOTHING;

      /** Markers pushed by startElement and popped by endElement. */
      private Stack stateStack = new Stack();

      /** Whitespace held back until it is known whether to forward it. */
      private StringBuffer whitespace = new StringBuffer();

      ////////////////////////////////////////////////////////////////////
      // Constructors.
      ////////////////////////////////////////////////////////////////////

      /**
       * Create a new filter without a parent.
       */
      public DataUnformatFilter()
      {
      }

      /**
       * Create a new filter that takes its events from the given reader.
       *
       * @param xmlreader The parent in the filter chain.
       */
      public DataUnformatFilter(XMLReader xmlreader)
      {
          super(xmlreader);
      }

      ////////////////////////////////////////////////////////////////////
      // Public methods.
      ////////////////////////////////////////////////////////////////////

      /**
       * Return the filter to its initial state so that it can be reused,
       * for instance after a previous run ended with an exception.
       */
      public void reset ()
      {
          whitespace = new StringBuffer();
          stateStack = new Stack();
          state = SEEN_NOTHING;
      }

      /**
       * Filter a start document event: reset, then forward the event.
       *
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startDocument
       */
      public void startDocument ()
      throws SAXException
      {
          reset();
          super.startDocument();
      }

      /**
       * Filter a start element event. Held whitespace is discarded and a
       * marker is pushed so that the enclosing element knows, once this
       * one has ended, that it contained a child element.
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @param qName The element's qualified (prefixed) name.
       * @param atts The element's attribute list.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       */
      public void startElement (String uri, String localName,
                                String qName, Attributes atts)
      throws SAXException
      {
          clearWhitespace();
          stateStack.push(SEEN_ELEMENT);
          state = SEEN_NOTHING;
          super.startElement(uri, localName, qName, atts);
      }

      /**
       * Filter an end element event. Held whitespace is forwarded unless
       * the last thing seen was a child element, in which case it is
       * discarded. The enclosing element's state is then restored.
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @param qName The element's qualified (prefixed) name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void endElement (String uri, String localName, String qName)
      throws SAXException
      {
          if (state != SEEN_ELEMENT) {
              emitWhitespace();
          } else {
              clearWhitespace();
          }
          state = stateStack.pop();
          super.endElement(uri, localName, qName);
      }

      /**
       * Filter a character data event.
       *
       * @param ch The characters to write.
       * @param start The starting position in the array.
       * @param length The number of characters to use.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#characters
       */
      public void characters (char ch[], int start, int length)
      throws SAXException
      {
          if (state != SEEN_DATA) {
              if (isAllXMLWhitespace(ch, start, length)) {
                  /* Nothing but whitespace: hold it back for later. */
                  saveWhitespace(ch, start, length);
              } else {
                  /*
                   * First real data of this field: note it and forward
                   * whatever whitespace was held back before it.
                   */
                  state = SEEN_DATA;
                  emitWhitespace();
              }
          }

          /* Inside a data field everything is passed on untouched. */
          if (state == SEEN_DATA) {
              super.characters(ch, start, length);
          }
      }

      /**
       * Filter an ignorable whitespace event. Held whitespace is
       * forwarded; the ignorable whitespace itself is dropped.
       *
       * @param ch The array of characters to write.
       * @param start The starting position in the array.
       * @param length The number of characters to write.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#ignorableWhitespace
       */
      public void ignorableWhitespace (char ch[], int start, int length)
      throws SAXException
      {
          emitWhitespace();
          // the event itself is not forwarded
      }

      /**
       * Filter a processing instruction event. Held whitespace is
       * forwarded ahead of the instruction.
       *
       * @param target The PI target.
       * @param data The PI data.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#processingInstruction
       */
      public void processingInstruction (String target, String data)
      throws SAXException
      {
          emitWhitespace();
          super.processingInstruction(target, data);
      }

      ////////////////////////////////////////////////////////////////////
      // Internal methods.
      ////////////////////////////////////////////////////////////////////

      /**
       * Appends the given characters to the held whitespace.
       */
      protected void saveWhitespace (char[] ch, int start, int length) {
          whitespace.append(ch, start, length);
      }

      /**
       * Forwards the held whitespace, if any, down the filter chain and
       * empties the hold buffer.
       */
      protected void emitWhitespace ()
      throws SAXException
      {
          int pending = whitespace.length();
          if (pending == 0) {
              return;
          }
          char[] data = new char[pending];
          whitespace.getChars(0, pending, data, 0);
          whitespace.setLength(0);
          super.characters(data, 0, pending);
      }

      /**
       * Throws away the held whitespace.
       */
      protected void clearWhitespace () {
          whitespace.setLength(0);
      }

      /**
       * Returns <var>true</var> if every character of the given range is
       * XML whitespace (an empty range counts as whitespace). The range
       * is examined from its last character towards its first.
       */
      private boolean isAllXMLWhitespace (char[] ch, int start, int length)
      {
          for (int i = start + length - 1; i >= start; i--) {
              if (!isXMLWhitespace(ch[i])) {
                  return false;
              }
          }
          return true;
      }

      /**
       * Returns <var>true</var> if character is XML whitespace.
       */
      private boolean isXMLWhitespace (char c)
      {
          switch (c) {
              case ' ':
              case '\t':
              case '\r':
              case '\n':
                  return true;
              default:
                  return false;
          }
      }
  }
  
  // end of DataUnformatFilter.java
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/util/IOUtils.java
  
  Index: IOUtils.java
  ===================================================================
  package com.relevanz.indyo.util;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import com.ice.tar.TarArchive;
  import org.apache.log4j.Category;
  
  import java.io.*;
  import java.util.zip.GZIPInputStream;
  import java.util.zip.ZipEntry;
  import java.util.zip.ZipOutputStream;
  
  /**
   * Utility IO-related methods.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: IOUtils.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public final class IOUtils
  {
      /**
       * Log4j category.
       */
      private static Category cat = Category.getInstance(IOUtils.class.getName());
  
      /**
       * Copies everything that can be read from the inputstream to the
       * outputstream. Neither stream is closed or flushed.
       *
       * @param in InputStream to read from.
       * @param out OutputStream to write to.
       * @throws IOException I/O error.
       */
      public static void transferData(InputStream in, OutputStream out)
              throws IOException
      {
          final byte[] chunk = new byte[10000];
          // read() reports end of stream with -1
          for (int count = in.read(chunk); count != -1; count = in.read(chunk))
          {
              out.write(chunk, 0, count);
          }
      }
  
      /**
       * Recursively deletes a directory and everything beneath it.
       * <p>
       * Deletion is best-effort: the result of each
       * <code>File.delete()</code> call is ignored. If
       * <code>directory</code> cannot be listed (for example because it
       * does not exist or is not a directory) no children are visited and
       * only <code>directory</code> itself is deleted.
       * </p>
       *
       * @param directory Directory to delete.
       */
      public static void deleteDirectory(File directory)
      {
          File[] fArray = directory.listFiles();
          // listFiles() returns null, not an empty array, when the path is
          // not a directory or cannot be read.
          if (fArray != null)
          {
              for (int i = 0; i < fArray.length; i++)
              {
                  if (fArray[i].isDirectory())
                  {
                      deleteDirectory(fArray[i]);
                  }
                  fArray[i].delete();
              }
          }
          directory.delete();
      }
  
      /**
       * Writes an input stream to a temporary file which is set
       * to delete when the VM exits.
       * <p>
       * If <code>tempfile</code> names a directory (it already is one, or
       * the name ends in a path separator) the directory is created and
       * nothing is read from the stream. Otherwise any missing parent
       * directories are created and the stream is copied into the file.
       * The input stream is not closed.
       * </p>
       *
       * @param in Inputstream to read data from
       * @param tempfile Temporary file to write to
       * @throws IOException if the file cannot be opened or written.
       */
      public static void writeToTempFile(InputStream in, String tempfile)
              throws IOException
      {
          OutputStream out = null;
          try
          {
              File f = new File(tempfile);
              f.deleteOnExit();
              // make no assumptions that java.io.File detects directories
              // in a cross-platform manner; endsWith is also safe for an
              // empty name, unlike inspecting the last character
              boolean namesDirectory = f.isDirectory()
                      || tempfile.endsWith("\\") || tempfile.endsWith("/");
              if (namesDirectory)
              {
                  f.mkdirs();
              }
              else
              {
                  // ensure that all necessary directories are created;
                  // a name without a directory component has no parent
                  File parent = f.getParentFile();
                  if (parent != null)
                  {
                      parent.deleteOnExit();
                      parent.mkdirs();
                  }
                  out = new FileOutputStream(tempfile);
                  transferData(in, out);
              }
          }
          finally
          {
              if (out != null)
                  out.close();
          }
      }
  
      /**
       * Adds the content of a file to a ZipOutputStream as a new entry.
       * The entry is left open and the ZipOutputStream is not closed;
       * only the file being read is closed.
       *
       * @param file File to read data from
       * @param zipPath Path of the ZipEntry
       * @param out ZipOutputStream to write to
       * @throws FileNotFoundException if the file cannot be opened.
       * @throws IOException I/O error.
       */
      public static void addToZipOutputStream(String file,
                                              String zipPath,
                                              ZipOutputStream out)
              throws FileNotFoundException, IOException
      {
          FileInputStream source = new FileInputStream(new File(file));
          try
          {
              // start the entry, then stream the file into it chunk by chunk
              out.putNextEntry(new ZipEntry(zipPath));
              byte[] chunk = new byte[8192];
              int count = source.read(chunk);
              while (count != -1)
              {
                  out.write(chunk, 0, count);
                  count = source.read(chunk);
              }
          }
          finally
          {
              source.close();
          }
      }
  
      /**
       * Extracts the contents of a tar file into a directory.
       *
       * @param tarFile Tar file to read data from.
       * @param destDir Directory to extract into.
       * @throws IOException If the tar file cannot be opened or extraction fails.
       */
      public static void extractTar(File tarFile, File destDir)
              throws IOException
      {
          FileInputStream tarStream = new FileInputStream(tarFile);
          try
          {
              TarArchive archive = new TarArchive(tarStream);
              archive.extractContents(destDir);
              archive.closeArchive();
          }
          finally
          {
              // always release the file stream, even if extraction failed
              tarStream.close();
          }
      }
  
      /**
       * Decompresses a GZip file into a destination file.
       * <p>
       * The source is opened and its GZip header read before the
       * destination is created, so a missing or invalid source does not
       * leave an empty destination file behind.
       * </p>
       *
       * @param f GZip file to read data from.
       * @param destFile File to write the decompressed data to.
       * @throws IOException If the source cannot be read as GZip data or
       *         the destination cannot be written.
       */
      public static void extractGZip(File f, File destFile) throws IOException
      {
          FileInputStream fis = new FileInputStream(f);
          try
          {
              GZIPInputStream gzin = new GZIPInputStream(fis);
              try
              {
                  FileOutputStream out = new FileOutputStream(destFile);
                  try
                  {
                      byte[] data = new byte[10000];
                      int len;
                      while ((len = gzin.read(data)) != -1)
                      {
                          out.write(data, 0, len);
                      }
                      out.flush();
                  }
                  finally
                  {
                      out.close();
                  }
              }
              finally
              {
                  gzin.close();
              }
          }
          finally
          {
              // Nested finally blocks guarantee this runs even when closing
              // one of the inner streams throws; a repeated close is harmless.
              fis.close();
          }
      }
  
      /**
       * Reads all remaining bytes from the given stream.
       * <p>
       * The stream is read until it reports end of stream; it is not
       * closed by this method.
       * </p>
       *
       * @param is the stream to read from
       * @return the bytes read, in order; an empty array if the stream
       *         was already at end of stream
       * @throws IOException if reading from the stream fails
       */
      public static final byte[] loadBytes(InputStream is) throws IOException
      {
          // Accumulate into a growable buffer rather than re-allocating and
          // re-copying everything read so far for each chunk.
          java.io.ByteArrayOutputStream collected =
                  new java.io.ByteArrayOutputStream();
          byte[] chunk = new byte[4096];
          int count;
          while ((count = is.read(chunk)) >= 0)
          {
              collected.write(chunk, 0, count);
          }
          return collected.toByteArray();
      }
  
      /** Returns the file extension of a file name.
       * <p>
       * The extension is the text after the last "." of the final path
       * component. A name without a "." in its final component has no
       * extension, and the empty string is returned.
       * </p>
       * @param filename Filename to obtain the file extension.
       * @return File extension (without the "."), or the empty string
       *         if the name has no extension.
       */
      public static String getFileExtension(String filename)
      {
          int lastDot = filename.lastIndexOf('.');
          int lastSeparator = Math.max(filename.lastIndexOf('/'),
                                       filename.lastIndexOf('\\'));
          // No dot at all (lastDot == -1), or the only dot belongs to a
          // directory component: there is no extension to report.
          if (lastDot <= lastSeparator)
          {
              return "";
          }
          return filename.substring(lastDot + 1); // + 1 to remove the "."
      }
  
      /** Returns the file extension of a file.
       * <p>
       * Only the last name in the file's path (<code>f.getName()</code>)
       * is examined; the result is whatever
       * <code>getFileExtension(String)</code> returns for that name.
       * </p>
       * @param f File object to obtain the file extension.
       * @return File extension (without the ".").
       */
      public static String getFileExtension(File f)
      {
          String name = f.getName();
          return getFileExtension(name);
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/util/StringUtils.java
  
  Index: StringUtils.java
  ===================================================================
  package com.relevanz.indyo.util;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.oro.text.perl.Perl5Util;
  
  /**
   * Utility String-related methods.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: StringUtils.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public final class StringUtils
  {
      /** Shared constant for the empty string. */
      public static final String EMPTY_STRING = "";

      // Character forms of the five predefined XML entities.
      // None of the methods in this class currently reference them.
      private static final char[] QUOTE_ENCODE = "&quot;".toCharArray();
      private static final char[] AMP_ENCODE = "&amp;".toCharArray();
      private static final char[] LT_ENCODE = "&lt;".toCharArray();
      private static final char[] GT_ENCODE = "&gt;".toCharArray();
      private static final char[] APOS_ENCODE = "&apos;".toCharArray();

      // Regular expression engine shared by every method of this class
      private static Perl5Util perl5Util = new Perl5Util();

      /**
       * Strips characters that are not useful as index text.
       * <p>
       * When the input contains at least one non-word character, every run
       * of characters other than letters, digits, "_" and "@" is replaced
       * by a single space, and then every word of one or two word
       * characters is removed. The result is finally passed through
       * {@link #trimWhitespace(String)}.
       * </p>
       *
       * @param s The text to clean.
       * @return The cleaned text.
       */
      public static final String removeUnreadableCharacters(String s)
      {
          String cleaned = s;
          boolean hasNonWordCharacters = perl5Util.match("/\\W+/", cleaned);
          if (hasNonWordCharacters)
          {
              // replace unreadable characters with a space
              cleaned = perl5Util.substitute("s#[^a-zA-Z0-9_@]+# #gm", cleaned);
              // remove any single/double word characters
              cleaned = perl5Util.substitute("s#\\b[a-zA-Z0-9_]{1,2}\\b##gm", cleaned);
          }
          return trimWhitespace(cleaned);
      }

      /**
       * Collapses a run of three or more whitespace characters into a
       * single space.
       * <p>
       * NOTE(review): the expression has no "g" modifier, so presumably
       * only the first such run is replaced - confirm against the
       * Perl5Util documentation.
       * </p>
       *
       * @param s The text to process.
       * @return The text after the substitution.
       */
      public static final String trimWhitespace(String s)
      {
          return perl5Util.substitute("s#[\\s]{3,}# #m", s);
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/util/XMLFilterBase.java
  
  Index: XMLFilterBase.java
  ===================================================================
  /*--
  
   Copyright (C) 2000 Brett McLaughlin & Jason Hunter.
   All rights reserved.
  
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
  
   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions, and the following disclaimer.
  
   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions, and the disclaimer that follows
      these conditions in the documentation and/or other materials
      provided with the distribution.
  
   3. The name "JDOM" must not be used to endorse or promote products
      derived from this software without prior written permission.  For
      written permission, please contact license@jdom.org.
  
   4. Products derived from this software may not be called "JDOM", nor
      may "JDOM" appear in their name, without prior written permission
      from the JDOM Project Management (pm@jdom.org).
  
   In addition, we request (but do not require) that you include in the
   end-user documentation provided with the redistribution and/or in the
   software itself an acknowledgement equivalent to the following:
       "This product includes software developed by the
        JDOM Project (http://www.jdom.org/)."
   Alternatively, the acknowledgment may be graphical using the logos
   available at http://www.jdom.org/images/logos.
  
   THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   DISCLAIMED.  IN NO EVENT SHALL THE JDOM AUTHORS OR THE PROJECT
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   SUCH DAMAGE.
  
   This software consists of voluntary contributions made by many
   individuals on behalf of the JDOM Project and was originally
   created by Brett McLaughlin <brett@jdom.org> and
   Jason Hunter <jhunter@jdom.org>.  For more information on the
   JDOM Project, please see <http://www.jdom.org/>.
  
   */
  package com.relevanz.indyo.util;
  
  import java.io.IOException;
  
  import org.xml.sax.Attributes;
  import org.xml.sax.InputSource;
  import org.xml.sax.SAXException;
  import org.xml.sax.SAXNotRecognizedException;
  import org.xml.sax.SAXNotSupportedException;
  import org.xml.sax.XMLReader;
  import org.xml.sax.ext.LexicalHandler;
  import org.xml.sax.helpers.AttributesImpl;
  import org.xml.sax.helpers.XMLFilterImpl;
  
  /**
   * Adds convenience methods to base SAX2 Filter implementation.
   *
   * <i>Code and comments adapted from XMLWriter-0.2, written
   * by David Megginson and released into the public domain,
   * without warranty.</i>
   *
   * <p>The convenience methods are provided so that clients do not have to
   * create empty attribute lists or provide empty strings as parameters;
   * for example, the method invocation</p>
   *
   * <pre>
   * w.startElement("foo");
   * </pre>
   *
   * <p>is equivalent to the regular SAX2 ContentHandler method</p>
   *
   * <pre>
   * w.startElement("", "foo", "", new AttributesImpl());
   * </pre>
   *
   * <p>Except that it is more efficient because it does not allocate
   * a new empty attribute list each time.</p>
   *
   * <p>In fact, there is an even simpler convenience method,
   * <var>dataElement</var>, designed for writing elements that
   * contain only character data.</p>
   *
   * <pre>
   * w.dataElement("greeting", "Hello, world!");
   * </pre>
   *
   * <p>is equivalent to</p>
   *
   * <pre>
   * w.startElement("greeting");
   * w.characters("Hello, world!");
   * w.endElement("greeting");
   * </pre>
   *
   * @see org.xml.sax.helpers.XMLFilterImpl
   */
  class XMLFilterBase extends XMLFilterImpl
  {
  
      ////////////////////////////////////////////////////////////////////
      // Constructors.
      ////////////////////////////////////////////////////////////////////
  
      /**
       * Construct an XML filter with no parent.
       *
       * <p>This filter will have no parent: you must assign a parent
       * before you start a parse or do any configuration with
       * setFeature or setProperty.</p>
       *
       * @see org.xml.sax.XMLReader#setFeature
       * @see org.xml.sax.XMLReader#setProperty
       */
      public XMLFilterBase()
      {
      }
  
      /**
       * Create an XML filter with the specified parent.
       *
       * <p>Use the XMLReader provided as the source of events.</p>
       *
       * @param parent The parent in the filter chain.
       */
      public XMLFilterBase(XMLReader parent)
      {
          super(parent);
      }
  
      ////////////////////////////////////////////////////////////////////
      // Convenience methods.
      //
      // Each method below delegates to another (overridable) method of
      // this filter, so a subclass that overrides the full-argument
      // version also affects the shorter convenience forms.
      ////////////////////////////////////////////////////////////////////
  
      /**
       * Start a new element without a qname or attributes.
       *
       * <p>This method will provide a default empty attribute
       * list and an empty string for the qualified name.
       * It invokes {@link
       * #startElement(String, String, String, Attributes)}
       * directly.</p>
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       */
      public void startElement (String uri, String localName) throws SAXException
      {
          startElement(uri, localName, "", EMPTY_ATTS);
      }
  
      /**
       * Start a new element without a qname, attributes or a Namespace URI.
       *
       * <p>This method will provide an empty string for the
       * Namespace URI, an empty string for the qualified name,
       * and a default empty attribute list. It invokes
       * {@link #startElement(String, String, String, Attributes)}
       * directly.</p>
       *
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       */
      public void startElement (String localName) throws SAXException
      {
          startElement("", localName, "", EMPTY_ATTS);
      }
  
      /**
       * End an element without a qname.
       *
       * <p>This method will supply an empty string for the qName.
       * It invokes {@link #endElement(String, String, String)}
       * directly.</p>
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void endElement (String uri, String localName) throws SAXException
      {
          endElement(uri, localName, "");
      }
  
      /**
       * End an element without a Namespace URI or qname.
       *
       * <p>This method will supply an empty string for the qName
       * and an empty string for the Namespace URI.
       * It invokes {@link #endElement(String, String, String)}
       * directly.</p>
       *
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void endElement (String localName) throws SAXException
      {
          endElement("", localName, "");
      }
  
      /**
       * Add an empty element.
       *
       * Both a {@link #startElement startElement} and an
       * {@link #endElement endElement} event will be passed on down
       * the filter chain.
       *
       * @param uri The element's Namespace URI, or the empty string
       *        if the element has no Namespace or if Namespace
       *        processing is not being performed.
       * @param localName The element's local name (without prefix).  This
       *        parameter must be provided.
       * @param qName The element's qualified name (with prefix), or
       *        the empty string if none is available.  This parameter
       *        is strictly advisory: the writer may or may not use
       *        the prefix attached.
       * @param atts The element's attribute list.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void emptyElement (String uri, String localName, String qName,
              Attributes atts) throws SAXException
      {
          startElement(uri, localName, qName, atts);
          endElement(uri, localName, qName);
      }
  
      /**
       * Add an empty element without a qname or attributes.
       *
       * <p>This method will supply an empty string for the qname
       * and an empty attribute list.  It invokes
       * {@link #emptyElement(String, String, String, Attributes)}
       * directly.</p>
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see #emptyElement(String, String, String, Attributes)
       */
      public void emptyElement (String uri, String localName) throws SAXException
      {
          emptyElement(uri, localName, "", EMPTY_ATTS);
      }
  
      /**
       * Add an empty element without a Namespace URI, qname or attributes.
       *
       * <p>This method will supply an empty string for the qname,
       * an empty string for the Namespace URI, and an empty
       * attribute list.  It invokes
       * {@link #emptyElement(String, String, String, Attributes)}
       * directly.</p>
       *
       * @param localName The element's local name.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see #emptyElement(String, String, String, Attributes)
       */
      public void emptyElement (String localName) throws SAXException
      {
          emptyElement("", localName, "", EMPTY_ATTS);
      }
  
      /**
       * Add an element with character data content.
       *
       * <p>This is a convenience method to add a complete element
       * with character data content, including the start tag
       * and end tag.</p>
       *
       * <p>This method invokes
       * {@link org.xml.sax.ContentHandler#startElement},
       * followed by
       * {@link #characters(String)}, followed by
       * {@link org.xml.sax.ContentHandler#endElement}.</p>
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @param qName The element's default qualified name.
       * @param atts The element's attributes.
       * @param content The character data content.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       * @see #characters(String)
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void dataElement (String uri, String localName, String qName,
              Attributes atts, String content) throws SAXException
      {
          startElement(uri, localName, qName, atts);
          characters(content);
          endElement(uri, localName, qName);
      }
  
      /**
       * Add an element with character data content but no attributes.
       *
       * <p>This is a convenience method to add a complete element
       * with character data content, including the start tag
       * and end tag.  This method provides an empty string
       * for the qname and an empty attribute list.  It invokes
       * {@link #dataElement(String, String, String, Attributes, String)}
       * directly.</p>
       *
       * @param uri The element's Namespace URI.
       * @param localName The element's local name.
       * @param content The character data content.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       * @see #characters(String)
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void dataElement (String uri, String localName, String content)
              throws SAXException
      {
          dataElement(uri, localName, "", EMPTY_ATTS, content);
      }
  
      /**
       * Add an element with character data content but no attributes or
       * Namespace URI.
       *
       * <p>This is a convenience method to add a complete element
       * with character data content, including the start tag
       * and end tag.  The method provides an empty string for the
       * Namespace URI, an empty string for the qualified name,
       * and an empty attribute list.  It invokes
       * {@link #dataElement(String, String, String, Attributes, String)}
       * directly.</p>
       *
       * @param localName The element's local name.
       * @param content The character data content.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#startElement
       * @see #characters(String)
       * @see org.xml.sax.ContentHandler#endElement
       */
      public void dataElement (String localName, String content)
              throws SAXException
      {
          dataElement("", localName, "", EMPTY_ATTS, content);
      }
  
      /**
       * Add a string of character data.
       *
       * <p>This is a convenience method that takes a
       * String, converts it to a character array, then invokes
       * {@link org.xml.sax.ContentHandler#characters} with the
       * whole array.  The characters are forwarded unchanged; this
       * method itself performs no escaping.</p>
       *
       * @param data The character data.
       * @exception org.xml.sax.SAXException If a filter
       *            further down the chain raises an exception.
       * @see org.xml.sax.ContentHandler#characters
       */
      public void characters (String data) throws SAXException
      {
          char ch[] = data.toCharArray();
          characters(ch, 0, ch.length);
      }
  
      ////////////////////////////////////////////////////////////////////
      // Constants.
      ////////////////////////////////////////////////////////////////////

      /**
       * Shared empty attribute list handed to the convenience methods
       * above, so that they do not allocate a new list on every call.
       */
      protected static final Attributes EMPTY_ATTS = new AttributesImpl();
  }
  
  // end of XMLFilterBase.java
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/FileContentHandler.java
  
  Index: FileContentHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.Reader;
  import java.util.List;
  
  /**
   * A content handler determines how to index a file's contents.
   *
   * @version $Id: FileContentHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public interface FileContentHandler
  {
      /**
       * Tells whether the contents of this file carry any meaning,
       * i.e. whether they should be indexed.
       *
       * @return <code>true</code> if the file's contents should be indexed.
       */
      boolean fileContentIsReadable();
  
      /**
       * Provides access to this file's contents.
       *
       * @return A reader for the contents of the file.
       */
      Reader getReader();
  
      /**
       * Tells whether this file holds nested data within it.
       *
       * @return <code>true</code> if the file has nested data.
       */
      boolean containsNestedData();
  
      /**
       * Returns the datasources contained within the parent file.
       * These can be URLs contained within a HTML file, files
       * within a ZIP file, basically anything represented by a
       * DataSource.
       *
       * @return The list of nested datasources.
       */
      List getNestedDataSource();
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/FileContentHandlerAdapter.java
  
  Index: FileContentHandlerAdapter.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import java.io.File;
  import java.io.Reader;
  import java.util.List;
  
  /**
   * A no-op implementation to make FileContentHandler creation easier.
   * <p>
   * Classes which need to implement the FileContentHandler interface should
   * extend this class or {@link NestedFileContentHandlerAdapter}.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: FileContentHandlerAdapter.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public abstract class FileContentHandlerAdapter implements FileContentHandler
  {
      /** The file this handler was created for. */
      protected File file;
  
      /**
       * Stores the file for use by subclasses.
       *
       * @param file The file to be handled.
       */
      protected FileContentHandlerAdapter(File file)
      {
          this.file = file;
      }
  
      /**
       * No-op default: this adapter supplies no reader.
       *
       * @return Always <code>null</code>.
       */
      public Reader getReader()
      {
          return null;
      }
  
      /**
       * No-op default: this adapter supplies no nested datasources.
       *
       * @return Always <code>null</code>.
       */
      public List getNestedDataSource()
      {
          return null;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/FileContentHandlerFactory.java
  
  Index: FileContentHandlerFactory.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.log4j.Category;
  
  import java.util.Map;
  import java.io.File;
  import java.lang.reflect.InvocationTargetException;
  import java.lang.reflect.Constructor;
  
  import com.relevanz.indyo.util.IOUtils;
  
  /**
   * Factory responsible for obtaining ContentHandlers.
   * <p>
   * Handlers are looked up in a registry that maps a file extension (or
   * {@link #DEFAULT_HANDLER_KEY}) to the class name of a
   * {@link FileContentHandler} implementation. The registry is supplied
   * through {@link #setHandlerRegistry(Map)}.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: FileContentHandlerFactory.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public abstract class FileContentHandlerFactory
  {
      /** Registry key of the handler used when no extension-specific one exists. */
      public static final String DEFAULT_HANDLER_KEY = "DEFAULT";
      static Category cat = Category.getInstance(FileContentHandlerFactory.class.getName());
      private static Map handlerRegistry;

      /**
       * Returns a content handler for the given file.
       * <p>
       * The handler registered for the file's extension is preferred, then the
       * one registered under {@link #DEFAULT_HANDLER_KEY}; if neither exists
       * (or no registry has been set) the shared {@link NullHandler} is used.
       * </p>
       *
       * @param f file to obtain a handler for
       * @return the handler, or <code>null</code> if a registered handler
       *         class could not be instantiated (the failure is logged)
       */
      public static FileContentHandler getContentHandler(File f)
      {
          // Without a registry nothing can be looked up; behave as if it were empty
          // instead of failing with a NullPointerException.
          Map registry = handlerRegistry;
          if (registry == null)
          {
              return NullHandler.getInstance();
          }

          String extension = IOUtils.getFileExtension(f);
          if (registry.containsKey(extension))
          {
              String handlerClassname = (String) registry.get(extension);
              return (FileContentHandler) generateObject(handlerClassname,
                                                         new Class[]{File.class},
                                                         new Object[]{f});
          }
          else if (registry.containsKey(DEFAULT_HANDLER_KEY))
          {
              String handlerClassname = (String) registry.get(DEFAULT_HANDLER_KEY);
              // The default handler has to know which file it handles, so use its
              // (File) constructor when there is one. The no-argument constructor
              // is kept as a fallback for handlers written against the old behaviour.
              if (hasFileConstructor(handlerClassname))
              {
                  return (FileContentHandler) generateObject(handlerClassname,
                                                             new Class[]{File.class},
                                                             new Object[]{f});
              }
              return (FileContentHandler) generateObject(handlerClassname);
          }
          else
          {
              return NullHandler.getInstance();
          }
      }

      /**
       * Installs the registry used by {@link #getContentHandler(File)}.
       *
       * @param handlerRegistry map of file extension (or
       *        {@link #DEFAULT_HANDLER_KEY}) to handler class name
       */
      public static void setHandlerRegistry(Map handlerRegistry)
      {
          FileContentHandlerFactory.handlerRegistry = handlerRegistry;
      }

      /**
       * Tells whether the named class exposes a public constructor taking a
       * single {@link File}.
       *
       * @param className  Class name to inspect
       * @return <code>true</code> if such a constructor exists
       */
      private static boolean hasFileConstructor(String className)
      {
          try
          {
              Class.forName(className).getConstructor(new Class[]{File.class});
              return true;
          }
          catch (ClassNotFoundException cnfe)
          {
              // Not logged here: the subsequent generateObject call reports it.
              return false;
          }
          catch (NoSuchMethodException nsme)
          {
              // Expected for handlers that only offer a no-argument constructor.
              return false;
          }
      }

      /**
       * Utility method to return an object based on its class name.
       * The object needs to have a constructor which accepts no parameters.
       *
       * @param className  Class name of object to be generated
       * @return the new object, or <code>null</code> if it could not be
       *         created (the reason is logged)
       */
      private static Object generateObject(String className)
      {
          Object o = null;
          try
          {
              Class c = Class.forName(className);
              o = c.newInstance();
          }
          catch (ClassNotFoundException cnfe)
          {
              cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
          }
          catch (InstantiationException ie)
          {
              cat.error(ie.getMessage() + " Class named '" + className + "' could not be  instantiated.", ie);
          }
          catch (IllegalAccessException iae)
          {
              cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
          }
          return o;
      }

      /**
       * Utility method to return an object based on its class name, using the
       * public constructor that matches the given parameter types.
       *
       * @param className  Class name of object to be generated
       * @param clazz Class array of parameters.
       * @param args Object array of arguments.
       * @return the new object, or <code>null</code> if it could not be
       *         created (the reason is logged)
       */
      private static Object generateObject(String className,
                                          Class[] clazz,
                                          Object[] args)
      {
          Object o = null;
          try
          {
              Class c = Class.forName(className);
              // getConstructor never returns null; a missing constructor is
              // signalled by NoSuchMethodException, handled below.
              Constructor con = c.getConstructor(clazz);
              o = con.newInstance(args);
          }
          catch (ClassNotFoundException cnfe)
          {
              cat.error(cnfe.getMessage() + " No class named '" + className + "' was found.", cnfe);
          }
          catch (InstantiationException ie)
          {
              cat.error(ie.getMessage() + " Class named '" + className + "' could not be  instantiated.", ie);
          }
          catch (IllegalAccessException iae)
          {
              cat.error(iae.getMessage() + " No access to class named '" + className + "'.", iae);
          }
          catch (NoSuchMethodException nsme)
          {
              cat.error(nsme.getMessage() + " No method in class named '" + className + "'.", nsme);
          }
          catch (InvocationTargetException ite)
          {
              // The wrapper itself normally carries no message; the useful
              // information is in the exception thrown by the constructor.
              Throwable cause = ite.getTargetException();
              if (cause == null)
              {
                  cause = ite;
              }
              cat.error(cause.getMessage() + " in class named '" + className + "'.", cause);
          }
          return o;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/GZipHandler.java
  
  Index: GZipHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.log4j.Category;
  import com.relevanz.indyo.IndexDataSource;
  import com.relevanz.indyo.FSDataSource;
  import com.relevanz.indyo.util.IOUtils;
  
  import java.io.File;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.List;
  
  /**
   * Handles GZip content.
   * <p>
   * The compressed file is expanded into this handler's temporary folder and
   * every file found there is exposed as a nested {@link FSDataSource}.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: GZipHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public class GZipHandler extends NestedFileContentHandlerAdapter
  {
      private static Category cat = Category.getInstance(GZipHandler.class.getName());

      /**
       * @param file the gzip-compressed file to handle
       */
      public GZipHandler(File file)
      {
          super(file);
      }

      /**
       * A gzip file has no directly readable content.
       *
       * @return always <code>null</code>
       */
      public Reader getReader()
      {
          return null;
      }

      /**
       * Decompresses the file and returns the data sources for its content.
       *
       * @return list of {@link IndexDataSource}s (empty if decompression
       *         failed, which is logged), or <code>null</code> if the file
       *         does not exist
       */
      public List getNestedDataSource()
      {
          if (!file.exists())
              return null;
          // The adapter leaves the list unset, so create it before collecting.
          if (nestedDataSource == null)
          {
              nestedDataSource = new java.util.ArrayList();
          }
          try
          {
              File tempDir = new File(TEMP_FOLDER);
              tempDir.mkdirs();
              tempDir.deleteOnExit();
              String filename = file.getName();
              // Strip the last extension (foo.txt.gz -> foo.txt). Names without
              // an extension, or consisting only of one, are used unchanged so
              // the target never ends up empty.
              int dot = filename.lastIndexOf(".");
              String targetName = dot > 0 ? filename.substring(0, dot) : filename;
              File tempFile = new File(tempDir, targetName);
              tempFile.deleteOnExit();
              IOUtils.extractGZip(file, tempFile);
              indexGZipDirectory(tempDir);
          }
          catch (IOException ioe)
          {
              cat.error("IOException ungzipping " + file.toString(), ioe);
          }
          return nestedDataSource;
      }

      /**
       * @return always <code>false</code>
       */
      public boolean fileContentIsReadable()
      {
          return false;
      }

      // only one file, but let's just treat it like a directory anyway
      private void indexGZipDirectory(File dir)
      {
          if (dir.isDirectory())
          {
              File[] dirContents = dir.listFiles();
              // listFiles() yields null when the directory cannot be read.
              if (dirContents == null)
              {
                  return;
              }
              for (int i = 0; i < dirContents.length; i++)
              {
                  indexGZipDirectory(dirContents[i]);
              }
          }
          else if (dir.isFile())
          {
              // Register the extracted file itself, not the list.
              IndexDataSource ds = new FSDataSource(dir);
              nestedDataSource.add(ds);
          }
      }
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/NestedFileContentHandlerAdapter.java
  
  Index: NestedFileContentHandlerAdapter.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.lucene.document.Document;
  
  import java.io.File;
  import java.util.ArrayList;
  import java.util.List;
  
  /**
   * A no-op implementation to make FileContentHandler creation easier.
   * <p>
   * Classes which need to implement the FileContentHandler interface
   * and need to handle nested content (example: zip, tar, rar, etc) should
   * extend this class.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: NestedFileContentHandlerAdapter.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public abstract class NestedFileContentHandlerAdapter
          extends FileContentHandlerAdapter
  {
      /**
       * Path of a per-instance scratch folder. It lives below the JVM's
       * temporary directory (system property <code>java.io.tmpdir</code>)
       * rather than a fixed location, so it works on any platform; the random
       * component keeps the folders of different handler instances apart.
       */
      protected final String TEMP_FOLDER = System.getProperty("java.io.tmpdir")
              + '/' + Math.random() + '/';

      /**
       * Holder for the nested data sources. It is not initialised here, so a
       * subclass has to create the list before adding to it.
       */
      protected List nestedDataSource;

      /**
       * @param file the file containing nested content
       */
      public NestedFileContentHandlerAdapter(File file)
      {
          super(file);
      }

      /**
       * @return always <code>true</code>
       */
      public boolean containsNestedData()
      {
          return true;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/NullHandler.java
  
  Index: NullHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  import java.io.File;
  import java.io.Reader;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  /**
   * Content handler that handles nothing: it has no file, no readable
   * content and no nested data. A single shared instance is handed out by
   * {@link #getInstance()}.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: NullHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public class NullHandler extends FileContentHandlerAdapter
  {
      /** The one and only instance, created without a file. */
      private static final NullHandler INSTANCE = new NullHandler(null);

      /** Private: instances are obtained through {@link #getInstance()}. */
      private NullHandler(File file)
      {
          super(file);
      }

      /**
       * @return the shared do-nothing handler
       */
      public static FileContentHandler getInstance()
      {
          return INSTANCE;
      }

      /**
       * @return always <code>false</code>
       */
      public boolean containsNestedData()
      {
          return false;
      }

      /**
       * @return always <code>false</code>
       */
      public boolean fileContentIsReadable()
      {
          return false;
      }

      /**
       * @return always <code>null</code>
       */
      public Reader getReader()
      {
          return null;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/TARHandler.java
  
  Index: TARHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.log4j.Category;
  import com.relevanz.indyo.IndexDataSource;
  import com.relevanz.indyo.FSDataSource;
  import com.relevanz.indyo.util.IOUtils;
  
  import java.io.File;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.ArrayList;
  import java.util.List;
  
  /**
   * Handles Tar files.
   * <p>
   * The archive is unpacked into this handler's temporary folder and every
   * file found there is exposed as a nested {@link FSDataSource}.
   * </p>
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: TARHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public class TARHandler extends NestedFileContentHandlerAdapter
  {
      static Category cat = Category.getInstance(TARHandler.class.getName());

      /**
       * @param file the tar archive to handle
       */
      public TARHandler(File file)
      {
          super(file);
      }

      /**
       * A tar archive has no directly readable content.
       *
       * @return always <code>null</code>
       */
      public Reader getReader()
      {
          return null;
      }

      /**
       * @return always <code>false</code>
       */
      public boolean fileContentIsReadable()
      {
          return false;
      }

      /**
       * Unpacks the archive and returns the data sources for its entries.
       *
       * @return list of {@link IndexDataSource}s (extraction failures are
       *         logged and leave the list as it was), or <code>null</code>
       *         if the archive does not exist
       */
      public List getNestedDataSource()
      {
          if (!file.exists())
              return null;
          if (nestedDataSource == null)
          {
              nestedDataSource = new ArrayList();
          }
          try
          {
              File tempDir = new File(TEMP_FOLDER);
              tempDir.deleteOnExit();
              IOUtils.extractTar(file, tempDir);
              indexTarDirectory(tempDir);
          }
          catch (IOException ioe)
          {
              cat.error(ioe.getMessage(), ioe);
          }
          return nestedDataSource;
      }

      /**
       * Recursively walks the extracted tree, registering each regular file.
       *
       * @param dir directory (or file) to process
       */
      private void indexTarDirectory(File dir)
      {
          if (dir.isDirectory())
          {
              File[] dirContents = dir.listFiles();
              // listFiles() yields null when the directory cannot be read.
              if (dirContents == null)
              {
                  return;
              }
              for (int i = 0; i < dirContents.length; i++)
              {
                  indexTarDirectory(dirContents[i]);
              }
          }
          else if (dir.isFile())
          {
              // here create new DataMap for the tarred file
              // (register the data source itself, not the list)
              IndexDataSource ds = new FSDataSource(dir);
              nestedDataSource.add(ds);
          }
      }
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/TextHandler.java
  
  Index: TextHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.log4j.Category;
  import org.apache.lucene.document.Document;
  import org.apache.lucene.document.Field;
  
  import java.io.*;
  
  import com.relevanz.indyo.util.StringUtils;
  
  /**
   * Handles text-based content.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: TextHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public class TextHandler extends FileContentHandlerAdapter
  {
      static Category cat = Category.getInstance(TextHandler.class.getName());
  
      public TextHandler(File file)
      {
          super(file);
      }
  
      public Reader getReader()
      {
          if (!file.exists())
          {
              cat.error(file.toString() + " doesn't exist! Failing silently...");
              return null;
          }
          return getReader(file);
      }
  
      public boolean containsNestedData()
      {
          return false;
      }
  
      public boolean fileContentIsReadable()
      {
          return true;
      }
  
      private Reader getReader(File f)
      {
          Reader reader = null;
          try
          {
              reader = new FileReader(f);
          }
          catch (FileNotFoundException nfe)
          {
              cat.error("File Not Found Exception:" + f.toString(), nfe);
          }
          catch (IOException ioe)
          {
              cat.error(ioe.getMessage(), ioe);
          }
          return reader;
      }
  }
  
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/src/java/com/relevanz/indyo/contenthandler/ZIPHandler.java
  
  Index: ZIPHandler.java
  ===================================================================
  package com.relevanz.indyo.contenthandler;
  
  /* ====================================================================
   * The Apache Software License, Version 1.1
   *
   * Copyright (c) 2001 The Apache Software Foundation.  All rights
   * reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer.
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. The end-user documentation included with the redistribution,
   *    if any, must include the following acknowledgment:
   *       "This product includes software developed by the
   *        Apache Software Foundation (http://www.apache.org/)."
   *    Alternately, this acknowledgment may appear in the software itself,
   *    if and wherever such third-party acknowledgments normally appear.
   *
   * 4. The names "Apache" and "Apache Software Foundation" and
   *    "Apache Lucene" must not be used to endorse or promote products
   *    derived from this software without prior written permission. For
   *    written permission, please contact apache@apache.org.
   *
   * 5. Products derived from this software may not be called "Apache",
   *    "Apache Lucene", nor may "Apache" appear in their name, without
   *    prior written permission of the Apache Software Foundation.
   *
   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
   * DISCLAIMED.  IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
   * SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Software Foundation.  For more
   * information on the Apache Software Foundation, please see
   * <http://www.apache.org/>.
   */
  
  import org.apache.log4j.Category;
  import com.relevanz.indyo.IndexDataSource;
  import com.relevanz.indyo.FSDataSource;
  import com.relevanz.indyo.util.IOUtils;
  
  import java.io.File;
  import java.io.IOException;
  import java.io.Reader;
  import java.util.ArrayList;
  import java.util.Enumeration;
  import java.util.List;
  import java.util.zip.ZipEntry;
  import java.util.zip.ZipException;
  import java.util.zip.ZipFile;
  
  /**
   * Handles Zip files.
   * <p>
   * The archive itself is reported as not directly readable
   * ({@link #fileContentIsReadable()} is <code>false</code> and
   * {@link #getReader()} returns <code>null</code>); its content is exposed
   * through {@link #getNestedDataSource()}, which extracts every file entry
   * below <code>TEMP_FOLDER</code> and wraps it in an
   * <code>FSDataSource</code>.
   *
   * @author <a href="mailto:kelvint@apache.org">Kelvin Tan</a>
   * @version $Id: ZIPHandler.java,v 1.1 2002/08/30 18:03:52 kelvint Exp $ 
   */
  public class ZIPHandler extends NestedFileContentHandlerAdapter
  {
      private static Category cat = Category.getInstance(ZIPHandler.class);
  
      /**
       * Creates a handler for the given zip file.
       *
       * @param file the zip archive to handle
       */
      public ZIPHandler(File file)
      {
          super(file);
      }
  
      /**
       * @return always <code>false</code>
       */
      public boolean fileContentIsReadable()
      {
          return false;
      }
  
      /**
       * @return always <code>null</code>
       */
      public Reader getReader()
      {
          return null;
      }
  
      /**
       * Extracts the file entries of the archive and returns one
       * <code>FSDataSource</code> per extracted entry.
       * <p>
       * Directory entries are skipped, and entries whose name contains a
       * <code>..</code> path segment are skipped with a warning so that a
       * crafted archive cannot write outside <code>TEMP_FOLDER</code>.
       * Zip and I/O errors are logged; whatever was collected before the
       * error is still returned.
       * <p>
       * NOTE(review): the backing list is reused across calls, so calling
       * this method twice appends the entries a second time - confirm that
       * callers invoke it only once per handler.
       *
       * @return the list of nested data sources, or <code>null</code> if the
       *         archive file does not exist
       */
      public List getNestedDataSource()
      {
          if (!file.exists())
              return null;
          if (nestedDataSource == null)
          {
              nestedDataSource = new ArrayList();
          }
          ZipFile zFile = null;
          try
          {
              zFile = new ZipFile(file);
              for (Enumeration e = zFile.entries(); e.hasMoreElements();)
              {
                  ZipEntry entry = (ZipEntry) e.nextElement();
                  if (entry.isDirectory())
                  {
                      // nothing to extract or index for a directory entry
                      continue;
                  }
                  String entryName = entry.getName();
                  if (hasParentPathSegment(entryName))
                  {
                      // entry names come from the (untrusted) archive; a ".."
                      // segment would place the file outside TEMP_FOLDER
                      cat.warn("Skipping zip entry with unsafe path:" + entryName);
                      continue;
                  }
                  String tempPath = TEMP_FOLDER + entryName;
                  IOUtils.writeToTempFile(zFile.getInputStream(entry), tempPath);
                  // create a new data source for each extracted zip entry
                  IndexDataSource ds = new FSDataSource(tempPath);
                  nestedDataSource.add(ds);
              }
          }
          catch (ZipException ze)
          {
              cat.error("ZipException parsing zip:" + ze.getMessage(), ze);
          }
          catch (IOException ioe)
          {
              cat.error("IOException parsing zip:" + ioe.getMessage(), ioe);
          }
          finally
          {
              // always release the archive, including on the error paths;
              // closing the ZipFile also closes the entry streams it handed out
              closeZipFile(zFile);
          }
          return nestedDataSource;
      }
  
      /**
       * Tells whether a zip entry name contains a <code>..</code> path
       * segment, using either <code>/</code> or <code>\</code> as separator.
       *
       * @param entryName name of the zip entry
       * @return <code>true</code> if the name contains a parent-directory
       *         segment
       */
      private static boolean hasParentPathSegment(String entryName)
      {
          String normalized = entryName.replace('\\', '/');
          return normalized.equals("..")
              || normalized.startsWith("../")
              || normalized.endsWith("/..")
              || normalized.indexOf("/../") >= 0;
      }
  
      /**
       * Closes the given zip file, logging instead of propagating a failure.
       *
       * @param zFile the zip file to close; may be <code>null</code>
       */
      private static void closeZipFile(ZipFile zFile)
      {
          if (zFile == null)
          {
              return;
          }
          try
          {
              zFile.close();
          }
          catch (IOException ioe)
          {
              cat.error("IOException closing zip:" + ioe.getMessage(), ioe);
          }
      }
  }
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/jakarta-oro-2.0.6.jar
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/jdom.jar
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/log4j-1.2.6.jar
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/lucene-1.2.jar
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/tartool.jar
  
  	<<Binary file>>
  
  
  1.1                  jakarta-lucene-sandbox/contributions/indyo/lib/xmlParserAPIs-xerces-2.0.2.jar
  
  	<<Binary file>>
  
  

--
To unsubscribe, e-mail:   <mailto:lucene-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:lucene-dev-help@jakarta.apache.org>


Mime
View raw message