lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From "Malcolm Clark" <malcycl...@btinternet.com>
Subject Lucene and SAX
Date Tue, 25 Oct 2005 17:21:08 GMT
Hi again,
I am desperately asking for aid!!

I have used the sandbox demo to parse the INEX collection.The problem being 
it points to a volume file which references 50 other xml articles.Lucene 
only treats this as one document.Is there any method of which I'm 
overlooking that halts after each reference?
Could somebody please help and I wont post again until I submit something 
useful.

The code is:
public class XMLDocumentHandlerSAX
extends HandlerBase
{
    /** A buffer for each XML element */
    private StringBuffer elementBuffer = new StringBuffer();

    private Document mDocument;

    // constructor
    public XMLDocumentHandlerSAX(File xmlFile)
 throws ParserConfigurationException, SAXException, IOException
    {
 SAXParserFactory spf = SAXParserFactory.newInstance();

 SAXParser parser = spf.newSAXParser();
 parser.parse(xmlFile, this);
    }

    // call at document start
    public void startDocument()
    {
     mDocument = new Document();
 //mDocument = new Document();
 elementBuffer.setLength(0);
    }

    // call at element start
    public void startElement(String localName, AttributeList atts)
 throws SAXException
    {

     if (localName.equals("article")) {
      elementBuffer.setLength(0);
     }

    }
    // call when cdata found
    public void characters(char[] text, int start, int length)
    {

      elementBuffer.append(text, start, length);

    }

    // call at element end
    public void endElement(String localName)
 throws SAXException
    {

     if (localName.equals("article")) {
      System.out.println("Article: "+elementBuffer.length());
      elementBuffer.setLength(0);
     }

      mDocument.add(Field.Text(localName,elementBuffer.toString()));
      System.out.println("EB: "+elementBuffer);
      elementBuffer.setLength(0);

    }


    public Document getDocument()
    {

 return mDocument;
    }

    public static void main(String[] args)
 throws Exception
    {
 try
 {
     Date start = new Date();
     String indexDir = "C:\\LuceneDemo\\index";
     IndexWriter writer = new IndexWriter(indexDir, new StandardAnalyzer(), 
true);
     indexDocs(writer, new File("C:\\1995\\volume.xml"));


     writer.optimize();
     writer.close();

     Date end = new Date();

    }
 catch (Exception e)
 {
     System.out.println(" caught a " + e.getClass() + "\n with message: " + 
e.getMessage());
     throw e;
 }
    }

    public static void indexDocs(IndexWriter writer, File file)
 throws Exception
    {

 if (file.isDirectory())

 {
     String[] files = file.list();
     for (int i = 0; i < files.length; i++)
     indexDocs(writer, new File(file, files[i]));

 }
 else
 {
     System.out.println("adding " + file);

     XMLDocumentHandlerSAX hdlr = new XMLDocumentHandlerSAX(file);
     StandardAnalyzer anal = new StandardAnalyzer();
     writer.addDocument(hdlr.getDocument(),anal);
     System.out.println("Documents added to Index: "+writer.docCount());



 }
    }
}
Thanks very much again.
MC 


---------------------------------------------------------------------
To unsubscribe, e-mail: java-user-unsubscribe@lucene.apache.org
For additional commands, e-mail: java-user-help@lucene.apache.org


Mime
View raw message