Return-Path: Delivered-To: apmail-jackrabbit-commits-archive@www.apache.org Received: (qmail 44082 invoked from network); 8 Apr 2009 04:11:33 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.3) by minotaur.apache.org with SMTP; 8 Apr 2009 04:11:33 -0000 Received: (qmail 3888 invoked by uid 500); 8 Apr 2009 04:11:33 -0000 Delivered-To: apmail-jackrabbit-commits-archive@jackrabbit.apache.org Received: (qmail 3837 invoked by uid 500); 8 Apr 2009 04:11:33 -0000 Mailing-List: contact commits-help@jackrabbit.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@jackrabbit.apache.org Delivered-To: mailing list commits@jackrabbit.apache.org Received: (qmail 3828 invoked by uid 99); 8 Apr 2009 04:11:33 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Apr 2009 04:11:33 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Apr 2009 04:11:31 +0000 Received: by eris.apache.org (Postfix, from userid 1221) id 2133B2388B9A; Wed, 8 Apr 2009 04:11:10 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r762808 - /jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Date: Wed, 08 Apr 2009 04:11:10 -0000 To: commits@jackrabbit.apache.org From: jukka@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20090408041110.2133B2388B9A@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: jukka Date: Tue Apr 7 15:05:15 2009 New Revision: 762808 URL: http://svn.apache.org/viewvc?rev=762808&view=rev Log: JCR-1887: msoffice text extractor for office 2007 files Replace the implementation with an Apache Tika-based one from TIKA-1878. 
This way we won't get compile errors due to the Java 5 POI libraries. Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Modified: jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java URL: http://svn.apache.org/viewvc/jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java?rev=762808&r1=762807&r2=762808&view=diff ============================================================================== --- jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java (original) +++ jackrabbit/trunk/jackrabbit-text-extractors/src/main/java/org/apache/jackrabbit/extractor/MsTextExtractor.java Tue Apr 7 15:05:15 2009 @@ -16,67 +16,25 @@ */ package org.apache.jackrabbit.extractor; -import org.apache.poi.extractor.ExtractorFactory; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.Reader; -import java.io.InputStream; -import java.io.IOException; -import java.io.StringReader; /** * Text extractor for Microsoft Word documents. */ -public class MsTextExtractor extends AbstractTextExtractor { - - /** - * Logger instance. - */ - private static final Logger logger = - LoggerFactory.getLogger(MsTextExtractor.class); - - /** - * Force loading of dependent class. - */ - static { - ExtractorFactory.class.getName(); - } - - /** - * Creates a new MsWordTextExtractor instance. 
- */ - public MsTextExtractor() { - super(new String[]{"application/vnd.ms-word", - "application/msword", - "application/vnd.ms-powerpoint", - "application/mspowerpoint", - "application/vnd.ms-excel", - "application/vnd.openxmlformats-officedocument.wordprocessingml.document", - "application/vnd.openxmlformats-officedocument.presentationml.presentation", - "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"}); - } +public class MsTextExtractor extends TikaTextExtractor { - //-------------------------------------------------------< TextExtractor > + private static String[] TYPES = new String[] { + "application/vnd.ms-word", + "application/msword", + "application/vnd.ms-powerpoint", + "application/mspowerpoint", + "application/vnd.ms-excel", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + }; - /** - * {@inheritDoc} - * Returns an empty reader if an error occured extracting text from - * the word document. - */ - public Reader extractText(InputStream stream, - String type, - String encoding) throws IOException { - try { - String text = ExtractorFactory.createExtractor(stream).getText(); - return new StringReader(text); - } catch (Exception e) { - logger.warn("Failed to extract Microsoft Document text content", e); - return new StringReader(""); - } finally { - stream.close(); - } + public String[] getContentTypes() { + return TYPES; } }