lucene-java-user mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Mario Ivankovits <mario.ivankov...@ops.co.at>
Subject Re: Bridge with OpenOffice
Date Mon, 19 Apr 2004 20:01:33 GMT
Stephane James Vaucher wrote:

> Anyone try what Joerg suggested here?
> http://nagoya.apache.org/eyebrowse/ReadMsg?listName=lucene-user@jakarta.apache.org&msgNo=6231

>
>  
>
I don't know what you would like to do, but if you simply want to
extract the text, you could try this snippet:

---snip---
       // Pulls the plain text out of the "content.xml" entry of the archive
       // `file` (an OpenOffice document is a zip/jar container) and prints it
       // to System.err. `file` must be defined by the surrounding code.
       //
       // Throws IOException if the archive cannot be read or has no
       // "content.xml" entry, and SAXException if the XML cannot be parsed or
       // the Crimson XMLReader class cannot be loaded.

       // Collects the extracted text. Declared before the try block so it is
       // still in scope for whatever code follows this snippet.
       final StringBuffer sbText = new StringBuffer(10240);

       JarFile jar = new JarFile(file, false);
       try
       {
           ZipEntry entry = jar.getEntry("content.xml");
           if (entry == null)
           {
               throw new IOException("content.xml missing in file: " + file);
           }
           InputStream is = jar.getInputStream(entry);

           // Explicitly requests the Crimson parser, so Crimson has to be on
           // the classpath.
           XMLReader xr = XMLReaderFactory.createXMLReader("org.apache.crimson.parser.XMLReaderImpl");

           xr.setEntityResolver(new EntityResolver()
           {
               // Substitutes a blank document for every referenced *.dtd so
               // the parser reads that instead of the real DTD. Returning
               // null leaves all other entities to the parser's default
               // resolution.
               public InputSource resolveEntity(String publicId, String systemId) throws SAXException, IOException
               {
                   // Defensive null check: without it a missing system id
                   // would fail with a NullPointerException.
                   if (systemId != null && systemId.toLowerCase().endsWith(".dtd"))
                   {
                       return new InputSource(new StringReader(" "));
                   }
                   return null;
               }
           });

           // Only startElement() and characters() do any work; every other
           // callback is an intentional no-op required by the interface.
           xr.setContentHandler(new ContentHandler()
           {
               public void skippedEntity(String name) throws SAXException
               {
               }

               public void setDocumentLocator(Locator locator)
               {
               }

               public void ignorableWhitespace(char ch[], int start, int length) throws SAXException
               {
               }

               public void processingInstruction(String target, String data) throws SAXException
               {
               }

               public void startDocument() throws SAXException
               {
               }

               // Starts each <text:p> paragraph on a fresh line, unless the
               // buffer is empty or already ends with a line break.
               public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException
               {
                   if (qName.equals("text:p"))
                   {
                       int len = sbText.length();
                       if (len > 0 && sbText.charAt(len - 1) != '\n')
                       {
                           sbText.append('\n');
                       }
                   }
               }

               public void endPrefixMapping(String prefix) throws SAXException
               {
               }

               // Appends all character data, regardless of the enclosing
               // element.
               public void characters(char ch[], int start, int length) throws SAXException
               {
                   sbText.append(ch, start, length);
               }

               public void endElement(String namespaceURI, String localName, String qName) throws SAXException
               {
               }

               public void endDocument() throws SAXException
               {
               }

               public void startPrefixMapping(String prefix, String uri) throws SAXException
               {
               }
           });

           InputSource source = new InputSource(is);
           // NOTE(review): empty ids are set explicitly, presumably so the
           // parser never sees a null public/system id -- confirm.
           source.setPublicId("");
           source.setSystemId("");
           xr.parse(source);
       }
       finally
       {
           // Always release the archive. Closing the JarFile also closes the
           // entry stream obtained from it.
           jar.close();
       }

       System.err.println("TXT: " + sbText.toString());
---snip---

Ciao,
Mario

Mime
View raw message