manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From shinich...@apache.org
Subject svn commit: r1700924 - in /manifoldcf/trunk: ./ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/...
Date Thu, 03 Sep 2015 01:41:22 GMT
Author: shinichiro
Date: Thu Sep  3 01:41:21 2015
New Revision: 1700924

URL: http://svn.apache.org/r1700924
Log:
Fix for CONNECTORS-1230

Added:
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
    manifoldcf/trunk/connectors/tika/connector/src/test/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/
    manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
    manifoldcf/trunk/connectors/tika/connector/src/test/resources/
    manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/
    manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
  (with props)
    manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
    manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
  (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Thu Sep  3 01:41:21 2015
@@ -3,6 +3,8 @@ $Id$
 
 ======================= 2.3-dev =====================
 
+CONNECTORS-1230: Add writeLimit option on Tika extractor.
+(Shinichiro Abe)
 
 ======================= Release 2.2 =====================
 

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
Thu Sep  3 01:41:21 2015
@@ -30,6 +30,8 @@ public class TikaConfig {
   public static final String NODE_FIELDMAP = "fieldmap";
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
   public static final String NODE_LOWERNAMES = "lowerNames";
+  public static final String NODE_WRITELIMIT = "writeLimit";
+  public static final int WRITELIMIT_DEFAULT = -1;
   public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
   public static final String NODE_BOILERPLATEPROCESSOR = "boilerplateprocessor";
   public static final String ATTRIBUTE_SOURCE = "source";

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
Thu Sep  3 01:41:21 2015
@@ -27,10 +27,6 @@ import java.util.*;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.parser.AutoDetectParser;
-import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
 
@@ -222,14 +218,12 @@ public class TikaExtractor extends org.a
           try
           {
             // Use tika to parse stuff
-            Parser parser = new AutoDetectParser();
-            ContentHandler handler = new BodyContentHandler(w);
+            ContentHandler handler = TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
             if (extractorClassInstance != null)
               handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
-            ParseContext pc = new ParseContext();
             try
             {
-              parser.parse(document.getBinaryStream(), handler, metadata, pc);
+              TikaParser.parse(document.getBinaryStream(), metadata, handler);
             }
             catch (TikaException e)
             {
@@ -458,7 +452,8 @@ public class TikaExtractor extends org.a
         SpecificationNode node = os.getChild(i);
         if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
           || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
-          || node.getType().equals(TikaConfig.NODE_LOWERNAMES))
+          || node.getType().equals(TikaConfig.NODE_LOWERNAMES)
+          || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
           os.removeChild(i);
         else
           i++;
@@ -523,6 +518,18 @@ public class TikaExtractor extends org.a
         node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
       }
       os.addChild(os.getChildCount(), node2);
+      
+      SpecificationNode node3 = new SpecificationNode(TikaConfig.NODE_WRITELIMIT);
+      String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
+      if (writeLimit != null)
+      {
+        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
+      }
+      else
+      {
+        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+      }
+      os.addChild(os.getChildCount(), node3);
     }
     
     if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
@@ -602,6 +609,7 @@ public class TikaExtractor extends org.a
     List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
     String keepAllMetadataValue = "true";
     String lowernamesValue = "false";
+    String writeLimitValue = "";
     for (int i = 0; i < os.getChildCount(); i++)
     {
       SpecificationNode sn = os.getChild(i);
@@ -630,10 +638,15 @@ public class TikaExtractor extends org.a
       {
         lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
+      else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
+      {
+        writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
     }
     paramMap.put("FIELDMAPPINGS",fieldMappings);
     paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
     paramMap.put("LOWERNAMES",lowernamesValue);
+    paramMap.put("WRITELIMIT",writeLimitValue);
   }
 
   protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
@@ -832,12 +845,14 @@ public class TikaExtractor extends org.a
     private final Map<String,String> sourceTargets = new HashMap<String,String>();
     private final boolean keepAllMetadata;
     private final boolean lowerNames;
+    private final int writeLimit;
     private final boolean ignoreTikaException;
     private final String extractorClassName;
     
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
       boolean lowerNames = false;
+      int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
       boolean ignoreTikaException = true;
       String extractorClassName = null;
       for (int i = 0; i < os.getChildCount(); i++) {
@@ -849,6 +864,13 @@ public class TikaExtractor extends org.a
         } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           lowerNames = Boolean.parseBoolean(value);
+        } else if(sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          if (value.length() == 0) {
+            writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
+          } else {
+            writeLimit = Integer.parseInt(value);
+          }
         } else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
           String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
           String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
@@ -866,6 +888,7 @@ public class TikaExtractor extends org.a
       }
       this.keepAllMetadata = keepAllMetadata;
       this.lowerNames = lowerNames;
+      this.writeLimit = writeLimit;
       this.ignoreTikaException = ignoreTikaException;
       this.extractorClassName = extractorClassName;
     }
@@ -903,6 +926,13 @@ public class TikaExtractor extends org.a
           sb.append('+');
         else
           sb.append('-');
+
+      if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
+      {
+        sb.append('+');
+        sb.append(writeLimit);
+      }
+
       if (ignoreTikaException)
         sb.append('+');
       else
@@ -931,6 +961,10 @@ public class TikaExtractor extends org.a
       return lowerNames;
     }
     
+    public int writeLimit() {
+      return writeLimit;
+    }
+    
     public boolean ignoreTikaException() {
       return ignoreTikaException;
     }

Added: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java?rev=1700924&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
(added)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaParser.java
Thu Sep  3 01:41:21 2015
@@ -0,0 +1,51 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.agents.transformation.tika;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Writer;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.WriteOutContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class TikaParser {
+
+  private static Parser parser = new AutoDetectParser();
+
+  private TikaParser() { }
+
+  public static ContentHandler newWriteOutBodyContentHandler(Writer w, int writeLimit) {
+    ContentHandler writeOutContentHandler = new WriteOutContentHandler(w, writeLimit);
+    return new BodyContentHandler(writeOutContentHandler);
+  }
+
+  public static void parse(InputStream stream, Metadata metadata, ContentHandler handler)
+    throws IOException, SAXException, TikaException {
+    ParseContext context = new ParseContext();
+    context.set(Parser.class, parser);
+    parser.parse(stream, handler, metadata, context);
+  }
+
+}

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
Thu Sep  3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=Final field
 TikaExtractor.NoFieldMappingSpecified=No field mapping specified
 TikaExtractor.KeepAllMetadata=Keep all metadata:
 TikaExtractor.LowerNames=Lower names:
+TikaExtractor.WriteLimit=Write limit:
 TikaExtractor.Add=Add
 TikaExtractor.AddFieldMapping=Add field mapping
 TikaExtractor.Delete=Delete

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
Thu Sep  3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=最後
 TikaExtractor.NoFieldMappingSpecified=フィールドマッピングを指定してください
 TikaExtractor.KeepAllMetadata=全メタデータを保存:
 TikaExtractor.LowerNames=小文字名:
+TikaExtractor.WriteLimit=最大文字長:
 TikaExtractor.Add=追加
 TikaExtractor.AddFieldMapping=フィールドマッピングを追加
 TikaExtractor.Delete=削除

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
Thu Sep  3 01:41:21 2015
@@ -30,6 +30,7 @@ TikaExtractor.FinalFieldName=最终
 TikaExtractor.NoFieldMappingSpecified=未指定字段映射
 TikaExtractor.KeepAllMetadata=保存所有元数据:
 TikaExtractor.LowerNames=小写:
+TikaExtractor.WriteLimit=最大字符长度:
 TikaExtractor.Add=添加
 TikaExtractor.AddFieldMapping=添加字段映射
 TikaExtractor.Delete=删除

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_FieldMapping.html
Thu Sep  3 01:41:21 2015
@@ -102,6 +102,13 @@
   #end
     </td>
   </tr>
+
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.WriteLimit'))</nobr></td>
+    <td class="value"><input name="s${SEQNUM}_writelimit" type="text"
+      value="$Encoder.attributeEscape($WRITELIMIT)" size="16" />
+    </td>
+  </tr>
 </table>
       
 #else
@@ -115,5 +122,6 @@
 <input type="hidden" name="s${SEQNUM}_fieldmapping_count" value="$fieldcounter"/>
 <input type="hidden" name="s${SEQNUM}_keepallmetadata" value="$Encoder.bodyEscape($KEEPALLMETADATA)"/>
 <input type="hidden" name="s${SEQNUM}_lowernames" value="$Encoder.bodyEscape($LOWERNAMES)"/>
+<input type="hidden" name="s${SEQNUM}_writelimit" value="$Encoder.attributeEscape($WRITELIMIT)" />
 
 #end
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1700924&r1=1700923&r2=1700924&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
(original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
Thu Sep  3 01:41:21 2015
@@ -58,6 +58,11 @@
   </tr>
   <tr><td class="separator" colspan="2"><hr/></td></tr>
   <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.WriteLimit'))</nobr></td>
+    <td class="value"><nobr>$Encoder.bodyEscape($WRITELIMIT)</nobr></td>
+  </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
     <td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
   </tr>

Added: manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java?rev=1700924&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
(added)
+++ manifoldcf/trunk/connectors/tika/connector/src/test/java/org/apache/manifoldcf/agents/transformation/tika/tests/TikaParserTest.java
Thu Sep  3 01:41:21 2015
@@ -0,0 +1,99 @@
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.agents.transformation.tika.tests;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.manifoldcf.agents.transformation.tika.TikaParser;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
+import org.junit.Test;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+import static org.junit.Assert.*;
+import static org.hamcrest.CoreMatchers.*;
+
+public class TikaParserTest {
+
+  private static List<String> docs = new ArrayList<>();
+  static {
+    docs.add("/test-documents/testEXCEL.xlsx");
+    docs.add("/test-documents/testHTML.html");
+    docs.add("/test-documents/testPDF.pdf");
+  }
+
+  @Test
+  public void testSimple() throws IOException, SAXException, TikaException {
+    for (String doc : docs) {
+      String path = doc;
+      InputStream stream = getClass().getResourceAsStream(path);
+      Metadata metadata = new Metadata();
+      metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(getClass().getResource(path).getFile()).getName());
+      ContentHandler unlimitedHandler
+        = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), -1);
+      TikaParser.parse(stream, metadata, unlimitedHandler);
+ 
+      assertThat(unlimitedHandler.toString().length(), not(0));
+      assertThat(metadata.get("Content-Type"), notNullValue());
+      assertThat(metadata.get("resourceName"), notNullValue());
+    }
+  }
+
+  @Test
+  public void testExtractWithWriteLimit() throws IOException, SAXException, TikaException {
+    for (String doc : docs) {
+      String path = doc;
+      InputStream stream = getClass().getResourceAsStream(path);
+      Metadata metadata = new Metadata();
+      metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(getClass().getResource(path).getFile()).getName());
+      ContentHandler limitedHandler
+        = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), 100 * 1000);
+      TikaParser.parse(stream, metadata, limitedHandler);
+
+      assertThat(limitedHandler.toString().length(), not(0));
+      assertThat(metadata.get("Content-Type"), notNullValue());
+      assertThat(metadata.get("resourceName"), notNullValue());
+    }
+  }
+
+  @Test
+  public void testExtractWithTooShortWriteLimit() {
+    for (String doc : docs) {
+      String path = doc;
+      InputStream stream = getClass().getResourceAsStream(path);
+      Metadata metadata = new Metadata();
+      metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, new File(getClass().getResource(path).getFile()).getName());
+      ContentHandler limitedHandler
+        = TikaParser.newWriteOutBodyContentHandler(new StringWriter(), 10);
+      try {
+        TikaParser.parse(stream, metadata, limitedHandler);
+        fail("Should not get here");
+      } catch (Exception e) {
+        assert e instanceof SAXException;
+        assertThat(e.toString().indexOf("org.apache.tika.sax.WriteOutContentHandler$WriteLimitReachedException"), not(-1));
+      }
+    }
+  }
+
+}

Added: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx?rev=1700924&view=auto
==============================================================================
Binary file - no diff available.

Propchange: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
------------------------------------------------------------------------------
    svn:executable = *

Propchange: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testEXCEL.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html?rev=1700924&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
(added)
+++ manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testHTML.html
Thu Sep  3 01:41:21 2015
@@ -0,0 +1,17 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<title>Welcome to the Apache ManifoldCF&trade; project!</title>
+</head>
+
+<body>
+<div id="content">
+<h1>Welcome to the Apache ManifoldCF&trade; project!</h1>
+
+<p>Please click the appropriate tab above to see this site in the language of your choice.</p>
+
+</div>
+</body>
+</html>

Added: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf?rev=1700924&view=auto
==============================================================================
Binary file - no diff available.

Propchange: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
------------------------------------------------------------------------------
    svn:executable = *

Propchange: manifoldcf/trunk/connectors/tika/connector/src/test/resources/test-documents/testPDF.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message