jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From mreut...@apache.org
Subject svn commit: r374708 - in /incubator/jackrabbit/trunk/contrib/textfilters: ./ src/java/org/apache/jackrabbit/core/query/ src/test/org/apache/jackrabbit/core/query/test/
Date Fri, 03 Feb 2006 16:48:06 GMT
Author: mreutegg
Date: Fri Feb  3 08:47:55 2006
New Revision: 374708

URL: http://svn.apache.org/viewcvs?rev=374708&view=rev
Log:
JCR-315: Support for OpenOffice text extraction
- open office text filter contributed by Nicolas Jouanin

Added:
    incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java
  (with props)
    incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java
  (with props)
    incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java
  (with props)
Modified:
    incubator/jackrabbit/trunk/contrib/textfilters/project.xml

Modified: incubator/jackrabbit/trunk/contrib/textfilters/project.xml
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/contrib/textfilters/project.xml?rev=374708&r1=374707&r2=374708&view=diff
==============================================================================
--- incubator/jackrabbit/trunk/contrib/textfilters/project.xml (original)
+++ incubator/jackrabbit/trunk/contrib/textfilters/project.xml Fri Feb  3 08:47:55 2006
@@ -185,7 +185,7 @@
     <dependency>
       <groupId>poi</groupId>
       <artifactId>poi</artifactId>
-      <version>2.0-final-20040126</version>
+      <version>2.5.1-final-20040804</version>
       <type>jar</type>
     </dependency>
     <dependency>

Added: incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java?rev=374708&view=auto
==============================================================================
--- incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java (added)
+++ incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java Fri Feb  3 08:47:55 2006
@@ -0,0 +1,61 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX content handler that collects the character data found inside
+ * <code>text:</code> elements of a parsed <code>content.xml</code> stream.
+ * <p>
+ * Elements are matched on their raw (qualified) name, so only elements whose
+ * name literally starts with the <code>text:</code> prefix are recognised.
+ */
+public class OOoContentHandler extends DefaultHandler {
+
+    /** Prefix of the qualified element names whose text is extracted. */
+    private static final String TEXT_PREFIX = "text:";
+
+    /** Accumulates the extracted text. */
+    private final StringBuffer content;
+
+    /**
+     * Number of <code>text:</code> elements that are currently open.
+     * A counter is used instead of a flag so that closing a nested element
+     * (for example a <code>text:span</code> inside a <code>text:p</code>)
+     * does not stop extraction of the remaining text of the outer element.
+     */
+    private int textDepth;
+
+    /**
+     * Creates a handler with empty content.
+     */
+    public OOoContentHandler() {
+        content = new StringBuffer();
+        textDepth = 0;
+    }
+
+    /**
+     * Returns the text content extracted from parsed content.xml
+     *
+     * @return the collected text; every reported chunk of character data is
+     *         followed by a single space.
+     */
+    public String getContent() {
+        return content.toString();
+    }
+
+    /**
+     * Registers the start of a <code>text:</code> element. Other elements
+     * leave the extraction state untouched.
+     */
+    public void startElement(String namespaceURI, String localName,
+                             String rawName, Attributes atts)
+            throws SAXException {
+        if (rawName.startsWith(TEXT_PREFIX)) {
+            textDepth++;
+        }
+    }
+
+    /**
+     * Appends the reported characters, followed by a space, while at least
+     * one <code>text:</code> element is open.
+     */
+    public void characters(char[] ch, int start, int length)
+            throws SAXException {
+        if (textDepth > 0) {
+            content.append(ch, start, length).append(" ");
+        }
+    }
+
+    /**
+     * Registers the end of a <code>text:</code> element. The end of any
+     * other element does not change the extraction state.
+     */
+    public void endElement(java.lang.String namespaceURI,
+                           java.lang.String localName,
+                           java.lang.String qName)
+            throws SAXException {
+        // guard against going negative on unbalanced callbacks
+        if (textDepth > 0 && qName.startsWith(TEXT_PREFIX)) {
+            textDepth--;
+        }
+    }
+}

Propchange: incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OOoContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java?rev=374708&view=auto
==============================================================================
--- incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java (added)
+++ incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java Fri Feb  3 08:47:55 2006
@@ -0,0 +1,108 @@
+/*
+ * Copyright 2004-2005 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query;
+
+import javax.jcr.RepositoryException;
+
+import org.apache.jackrabbit.core.query.lucene.FieldNames;
+import org.apache.jackrabbit.core.state.PropertyState;
+import org.apache.jackrabbit.core.value.BLOBFileValue;
+import org.apache.jackrabbit.core.value.InternalValue;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.zip.ZipInputStream;
+import java.util.zip.ZipEntry;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+
+import org.xml.sax.InputSource;
+import org.xml.sax.XMLReader;
+
+/**
+ * Extracts texts from OpenOffice document data.
+ * <p>
+ * The document is read as a zip archive; its <code>content.xml</code> entry
+ * is parsed with a SAX parser and the text is collected by an
+ * {@link OOoContentHandler}.
+ */
+public class OpenOfficeTextFilter implements TextFilter {
+
+    /** Name of the zip entry that holds the document content. */
+    private static final String CONTENT_ENTRY = "content.xml";
+
+    /** Lazily created XML reader, reused by subsequent calls. */
+    private XMLReader xmlReader;
+
+    /**
+     * Returns <code>true</code> if the mime type is one of the OpenDocument
+     * types handled by this filter. The comparison ignores case.
+     *
+     * @param mimeType the mime type to check.
+     */
+    public boolean canFilter(String mimeType) {
+        return "application/vnd.oasis.opendocument.database".equalsIgnoreCase(mimeType)
+                || "application/vnd.oasis.opendocument.formula".equalsIgnoreCase(mimeType)
+                || "application/vnd.oasis.opendocument.graphics".equalsIgnoreCase(mimeType)
+                || "application/vnd.oasis.opendocument.presentation".equalsIgnoreCase(mimeType)
+                || "application/vnd.oasis.opendocument.spreadsheet".equalsIgnoreCase(mimeType)
+                || "application/vnd.oasis.opendocument.text".equalsIgnoreCase(mimeType);
+    }
+
+    /**
+     * Extracts the text of the first value of the binary property.
+     *
+     * @param data     the property that holds the document.
+     * @param encoding not used by this filter.
+     * @return a map with a single {@link FieldNames#FULLTEXT} entry whose
+     *         value is a reader on the extracted text.
+     * @throws RepositoryException if the property has no value, if the
+     *                             archive contains no <code>content.xml</code>
+     *                             entry, or if reading or parsing fails.
+     */
+    public Map doFilter(PropertyState data, String encoding)
+            throws RepositoryException {
+        if (xmlReader == null) {
+            initParser();
+        }
+
+        InternalValue[] values = data.getValues();
+        if (values.length == 0) {
+            // NOTE(review): this branch is only reached when the property has
+            // no value at all, so the message does not describe the actual
+            // condition. Message and condition are kept as they were; confirm
+            // whether a check for exactly one value was intended.
+            throw new RepositoryException("Multi-valued binary properties not supported.");
+        }
+
+        BLOBFileValue blob = (BLOBFileValue) values[0].internalValue();
+        ZipInputStream zis = null;
+        try {
+            zis = new ZipInputStream(blob.getStream());
+
+            // getNextEntry() returns null at the end of the archive, so the
+            // search has to stop there instead of dereferencing the entry.
+            ZipEntry ze = zis.getNextEntry();
+            while (ze != null && !CONTENT_ENTRY.equals(ze.getName())) {
+                ze = zis.getNextEntry();
+            }
+            if (ze == null) {
+                throw new RepositoryException(
+                        "No " + CONTENT_ENTRY + " entry found in OpenOffice document.");
+            }
+
+            OOoContentHandler contentHandler = new OOoContentHandler();
+            xmlReader.setContentHandler(contentHandler);
+            xmlReader.parse(new InputSource(zis));
+
+            Map result = new HashMap();
+            result.put(FieldNames.FULLTEXT, new StringReader(contentHandler.getContent()));
+            return result;
+        } catch (RepositoryException re) {
+            // already carries a meaningful message, do not wrap it again
+            throw re;
+        } catch (Exception ex) {
+            throw new RepositoryException(ex);
+        } finally {
+            // single place where the stream is closed, on success and failure
+            if (zis != null) {
+                try {
+                    zis.close();
+                } catch (IOException ioe) {
+                    ioe.printStackTrace();
+                }
+            }
+        }
+    }
+
+    /**
+     * Creates and configures the non-validating XML reader.
+     * <p>
+     * The field is assigned only after the reader is fully configured, so a
+     * failure here cannot leave a partially configured reader behind for
+     * later calls.
+     *
+     * @throws RepositoryException if the parser cannot be created or does
+     *                             not accept one of the features.
+     */
+    private void initParser() throws RepositoryException {
+        try {
+            SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
+            saxParserFactory.setValidating(false);
+            SAXParser saxParser = saxParserFactory.newSAXParser();
+            XMLReader reader = saxParser.getXMLReader();
+            reader.setFeature("http://xml.org/sax/features/validation", false);
+            reader.setFeature(
+                    "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
+            // documents come from repository content; never resolve external
+            // entities while extracting text
+            reader.setFeature(
+                    "http://xml.org/sax/features/external-general-entities", false);
+            reader.setFeature(
+                    "http://xml.org/sax/features/external-parameter-entities", false);
+            xmlReader = reader;
+        } catch (Exception e) {
+            throw new RepositoryException(e);
+        }
+    }
+
+}

Propchange: incubator/jackrabbit/trunk/contrib/textfilters/src/java/org/apache/jackrabbit/core/query/OpenOfficeTextFilter.java
------------------------------------------------------------------------------
    svn:eol-style = native

Added: incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java
URL: http://svn.apache.org/viewcvs/incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java?rev=374708&view=auto
==============================================================================
--- incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java (added)
+++ incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java Fri Feb  3 08:47:55 2006
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2004-2006 The Apache Software Foundation or its licensors,
+ *                     as applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.jackrabbit.core.query.test;
+
+import java.io.File;
+
+import org.apache.jackrabbit.core.query.OpenOfficeTextFilter;
+
+
+/**
+ * Command line driver that runs an {@link OpenOfficeTextFilter} on a single
+ * file.
+ */
+public class OpenOfficeTest extends AbstractTextFilterTest {
+
+    /**
+     * Passes the file named by the first argument, together with a new
+     * {@link OpenOfficeTextFilter}, to <code>showResult</code>.
+     *
+     * @param args the path of the document to filter as first element;
+     *             additional elements are ignored.
+     * @throws IllegalArgumentException if no path is given.
+     * @throws Exception                whatever <code>showResult</code> throws.
+     */
+    public static void main(String[] args) throws Exception {
+        // fail with a usage hint instead of an ArrayIndexOutOfBoundsException
+        if (args.length < 1) {
+            throw new IllegalArgumentException(
+                    "Usage: OpenOfficeTest <path to OpenOffice document>");
+        }
+        OpenOfficeTest test = new OpenOfficeTest();
+        File file = new File(args[0]);
+        test.showResult(file, new OpenOfficeTextFilter());
+    }
+}

Propchange: incubator/jackrabbit/trunk/contrib/textfilters/src/test/org/apache/jackrabbit/core/query/test/OpenOfficeTest.java
------------------------------------------------------------------------------
    svn:eol-style = native



Mime
View raw message