jackrabbit-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ju...@apache.org
Subject svn commit: r779501 - in /jackrabbit/branches/1.x: ./ jackrabbit-text-extractors/ jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/ jackrabbit-text-extractors/src/main/resources/org/
Date Thu, 28 May 2009 09:21:11 GMT
Author: jukka
Date: Thu May 28 09:21:11 2009
New Revision: 779501

URL: http://svn.apache.org/viewvc?rev=779501&view=rev
Log:
1.x: Reverted JCR-1878 and JCR-1997 changes, targeting those features for 2.0 instead.

Removed:
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/resources/org/
Modified:
    jackrabbit/branches/1.x/   (props changed)
    jackrabbit/branches/1.x/jackrabbit-text-extractors/pom.xml
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
    jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java

Propchange: jackrabbit/branches/1.x/
------------------------------------------------------------------------------
--- svn:mergeinfo (original)
+++ svn:mergeinfo Thu May 28 09:21:11 2009
@@ -1 +1 @@
-/jackrabbit/trunk:770143-773197,773483,773525-773554,773584,773588,773828,773835-775756,775833,775836,775840,775868,775981,775986,776036,776256,776310,776313,776321-776322,776332,776356-776357,776362,776373,776650-776693,776737,776757,776776-776777,777024,777029,777034,777478,777509,777541,777548,777936,778645
+/jackrabbit/trunk:770143-773197,773483,773525-773554,773584,773588,773828,773835-775756,775833,775836,775840,775868,775981,775986,776036,776256,776310,776313,776321-776322,776332,776356-776357,776362,776373,776650-776693,776737,776757,776776-776777,777024,777029,777034,777478,777509,777541,777548,777936,778445,778613,778616,778621,778645

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/pom.xml
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/pom.xml?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/pom.xml (original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/pom.xml Thu May 28 09:21:11 2009
@@ -62,10 +62,19 @@
 
   <dependencies>
     <dependency>
-      <groupId>org.apache.tika</groupId>
-      <artifactId>tika</artifactId>
-      <version>0.3</version>
-      <classifier>jdk14</classifier>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi</artifactId>
+      <version>3.5-beta5</version>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.poi</groupId>
+      <artifactId>poi-scratchpad</artifactId>
+      <version>3.5-beta5</version>
+    </dependency>
+    <dependency>
+      <groupId>pdfbox</groupId>
+      <artifactId>pdfbox</artifactId>
+      <version>0.7.3</version>
       <exclusions>
         <exclusion>
           <groupId>bouncycastle</groupId>
@@ -78,9 +87,9 @@
       </exclusions>
     </dependency>
     <dependency>
-      <groupId>xml-apis</groupId>
-      <artifactId>xml-apis</artifactId>
-      <version>1.3.04</version>
+      <groupId>net.sourceforge.nekohtml</groupId>
+      <artifactId>nekohtml</artifactId>
+      <version>1.9.7</version>
     </dependency>
     <dependency>
       <groupId>org.slf4j</groupId>

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
(original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/DefaultTextExtractor.java
Thu May 28 09:21:11 2009
@@ -16,75 +16,19 @@
  */
 package org.apache.jackrabbit.extractor;
 
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.Reader;
-import java.util.Set;
-
-import org.apache.tika.config.TikaConfig;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.ParsingReader;
-
 /**
- * Default text extractor based on Apache Tika.
+ * Composite text extractor that by default contains the standard
+ * text extractors found in this package.
  */
-public class DefaultTextExtractor implements TextExtractor {
-
-    /**
-     * Auto-detecting parser.
-     */
-    private static final Parser PARSER;
+public class DefaultTextExtractor extends CompositeTextExtractor {
 
     /**
-     * Supported content types.
+     * Creates the default text extractor by adding instances of the standard
+     * text extractors as components.
      */
-    private static final String[] TYPES;
-
-    static {
-        // The default Tika configuration refers to Apache POI libraries that
-        // are compiled for Java 5, and can thus not be loaded in Java 1.4.
-        // This makes it impossible to load the default Tika configuration
-        // (see TIKA-217 for background), and so we need to use the following
-        // workaround to instantiate the Tika AutoDetectParser without the
-        // POI classes (and thus support for MS Office formats) when running
-        // on Java 1.4.
-        AutoDetectParser parser;
-        if ("1.4".equals(System.getProperty("java.specification.version"))) {
-            InputStream stream =
-                DefaultTextExtractor.class.getResourceAsStream("tika-config-jdk14.xml");
-            try {
-                try {
-                    parser = new AutoDetectParser(new TikaConfig(stream));
-                } finally {
-                    stream.close();
-                }
-            } catch (Exception e) {
-                throw new RuntimeException(
-                        "Unable to load Tika configuration", e);
-            }
-        } else {
-            parser = new AutoDetectParser();
-        }
-        PARSER = parser;
-
-        Set types = parser.getParsers().keySet();
-        TYPES = (String[]) types.toArray(new String[types.size()]);
-    }
-
-    public String[] getContentTypes() {
-        return TYPES;
-    }
-
-    public Reader extractText(InputStream stream, String type, String encoding)
-            throws IOException {
-        Metadata metadata = new Metadata();
-        if (type != null && type.trim().length() > 0) {
-            metadata.set(Metadata.CONTENT_TYPE, type.trim());
-        }
-        // TODO: This creates a background thread. Is that a problem?
-        return new ParsingReader(PARSER, stream, metadata);
+    public DefaultTextExtractor() {
+        addTextExtractor(new PlainTextExtractor());
+        addTextExtractor(new XMLTextExtractor());
     }
 
 }

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
(original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsExcelTextExtractor.java
Thu May 28 09:21:11 2009
@@ -16,19 +16,61 @@
  */
 package org.apache.jackrabbit.extractor;
 
+import org.apache.poi.poifs.filesystem.POIFSFileSystem;
+import org.apache.poi.hssf.extractor.ExcelExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
 /**
  * Text extractor for Microsoft Excel sheets.
  */
-public class MsExcelTextExtractor extends DefaultTextExtractor {
+public class MsExcelTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(MsExcelTextExtractor.class);
 
-    private static String[] TYPES = new String[] {
-        "application/vnd.ms-excel",
-        "application/msexcel",
-        "application/excel"
-    };
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        POIFSFileSystem.class.getName();
+    }
 
-    public String[] getContentTypes() {
-        return TYPES;
+    /**
+     * Creates a new <code>MsExcelTextExtractor</code> instance.
+     */
+    public MsExcelTextExtractor() {
+        super(new String[] {
+                "application/vnd.ms-excel",
+                "application/msexcel",
+                "application/excel"
+        });
     }
 
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * {@inheritDoc}
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            POIFSFileSystem fs = new POIFSFileSystem(stream);
+            return new StringReader(new ExcelExtractor(fs).getText());
+        } catch (RuntimeException e) {
+            logger.warn("Failed to extract Excel text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
+    }
 }

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
(original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsOutlookTextExtractor.java
Thu May 28 09:21:11 2009
@@ -16,17 +16,64 @@
  */
 package org.apache.jackrabbit.extractor;
 
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Reader;
+import java.io.StringReader;
+
+import org.apache.poi.hsmf.MAPIMessage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
 /**
  * Text extractor for Microsoft Outlook messages.
  */
-public class MsOutlookTextExtractor extends DefaultTextExtractor {
+public class MsOutlookTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(MsOutlookTextExtractor.class);
+
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        MAPIMessage.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsOutlookTextExtractor</code> instance.
+     */
+    public MsOutlookTextExtractor() {
+        super(new String[]{"application/vnd.ms-outlook"});
+    }
 
-    private static String[] TYPES = new String[] {
-        "application/vnd.ms-outlook"
-    };
+    //-------------------------------------------------------< TextExtractor >
 
-    public String[] getContentTypes() {
-        return TYPES;
+    /**
+     * {@inheritDoc}
+     * Returns an empty reader if an error occured extracting text from
+     * the outlook message.
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+        	MAPIMessage message = new MAPIMessage(stream);
+        	StringBuffer buffer = new StringBuffer();
+        	buffer.append(message.getDisplayFrom()).append('\n');
+        	buffer.append(message.getDisplayTo()).append('\n');
+        	buffer.append(message.getSubject()).append('\n');
+        	buffer.append(message.getTextBody());
+            return new StringReader(buffer.toString());
+        } catch (Exception e) {
+            logger.warn("Failed to extract Message content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
     }
 
 }

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
(original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsPowerPointTextExtractor.java
Thu May 28 09:21:11 2009
@@ -16,19 +16,64 @@
  */
 package org.apache.jackrabbit.extractor;
 
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
 /**
  * Text extractor for Microsoft PowerPoint presentations.
  */
-public class MsPowerPointTextExtractor extends DefaultTextExtractor {
+public class MsPowerPointTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(MsPowerPointTextExtractor.class);
 
-    private static String[] TYPES = new String[] {
-        "application/vnd.ms-powerpoint",
-        "application/mspowerpoint",
-        "application/powerpoint"
-    };
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        POIFSReader.class.getName();
+    }
 
-    public String[] getContentTypes() {
-        return TYPES;
+    /**
+     * Creates a new <code>MsPowerPointTextExtractor</code> instance.
+     */
+    public MsPowerPointTextExtractor() {
+        super(new String[]{
+                "application/vnd.ms-powerpoint",
+                "application/mspowerpoint",
+                "application/powerpoint"
+        });
     }
 
+    //-------------------------------------------------------< TextExtractor >
+
+    /**
+     * {@inheritDoc}
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            PowerPointExtractor extractor = new PowerPointExtractor(stream);
+            return new StringReader(extractor.getText(true, true));
+        } catch (RuntimeException e) {
+            logger.warn("Failed to extract PowerPoint text content", e);
+            return new StringReader("");
+        } finally {
+            try {
+                stream.close();
+            } catch (IOException ignored) {
+            }
+        }
+    }
 }

Modified: jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
URL: http://svn.apache.org/viewvc/jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java?rev=779501&r1=779500&r2=779501&view=diff
==============================================================================
--- jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
(original)
+++ jackrabbit/branches/1.x/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsWordTextExtractor.java
Thu May 28 09:21:11 2009
@@ -16,18 +16,58 @@
  */
 package org.apache.jackrabbit.extractor;
 
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.poi.hwpf.extractor.WordExtractor;
+
+import java.io.Reader;
+import java.io.InputStream;
+import java.io.IOException;
+import java.io.StringReader;
+
 /**
  * Text extractor for Microsoft Word documents.
  */
-public class MsWordTextExtractor extends DefaultTextExtractor {
+public class MsWordTextExtractor extends AbstractTextExtractor {
+
+    /**
+     * Logger instance.
+     */
+    private static final Logger logger =
+        LoggerFactory.getLogger(MsWordTextExtractor.class);
+
+    /**
+     * Force loading of dependent class.
+     */
+    static {
+        WordExtractor.class.getName();
+    }
+
+    /**
+     * Creates a new <code>MsWordTextExtractor</code> instance.
+     */
+    public MsWordTextExtractor() {
+        super(new String[]{"application/vnd.ms-word", "application/msword"});
+    }
 
-    private static String[] TYPES = new String[] {
-        "application/vnd.ms-word",
-        "application/msword"
-    };
+    //-------------------------------------------------------< TextExtractor >
 
-    public String[] getContentTypes() {
-        return TYPES;
+    /**
+     * {@inheritDoc}
+     * Returns an empty reader if an error occured extracting text from
+     * the word document.
+     */
+    public Reader extractText(InputStream stream,
+                              String type,
+                              String encoding) throws IOException {
+        try {
+            return new StringReader(new WordExtractor(stream).getText());
+        } catch (Exception e) {
+            logger.warn("Failed to extract Word text content", e);
+            return new StringReader("");
+        } finally {
+            stream.close();
+        }
     }
 
 }



Mime
View raw message