manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1612814 - in /manifoldcf/trunk: ./ connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connectors/tika/...
Date Wed, 23 Jul 2014 11:51:00 GMT
Author: kwright
Date: Wed Jul 23 11:50:59 2014
New Revision: 1612814

URL: http://svn.apache.org/r1612814
Log:
Fix for CONNECTORS-984.

Added:
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
  (with props)
Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Wed Jul 23 11:50:59 2014
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 1.7-dev =====================
 
+CONNECTORS-984: Add Tika extraction metadata, and also add
+ability to ignore tika exceptions.
+(Shinichiro Abe, Karl Wright)
+
 CONNECTORS-989: Introduce document sub-components, which
 is a way of having multiple indexed documents corresponding to a
 single repository document.

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Wed Jul 23 11:50:59 2014
@@ -19,7 +19,7 @@
 
 package org.apache.manifoldcf.agents.transformation.tika;
 
-/** Parameters for AmazonCloudSearch output connector.
+/** Parameters for Tika transformation connector.
  */
 public class TikaConfig {
 
@@ -29,6 +29,7 @@ public class TikaConfig {
   // Specification nodes and values
   public static final String NODE_FIELDMAP = "fieldmap";
   public static final String NODE_KEEPMETADATA = "keepAllMetadata";
+  public static final String NODE_IGNORETIKAEXCEPTION = "ignoreException";
   public static final String ATTRIBUTE_SOURCE = "source";
   public static final String ATTRIBUTE_TARGET = "target";
   public static final String ATTRIBUTE_VALUE = "value";

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Wed Jul 23 11:50:59 2014
@@ -31,6 +31,7 @@ import org.apache.tika.parser.ParseConte
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.metadata.TikaMetadataKeys;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -43,6 +44,7 @@ public class TikaExtractor extends org.a
 
   private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
   private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
+  private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
   private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
 
   protected static final String ACTIVITY_EXTRACT = "extract";
@@ -188,6 +190,15 @@ public class TikaExtractor extends org.a
     try
     {
       Metadata metadata = new Metadata();
+      if (document.getFileName() != null)
+      {
+        metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, document.getFileName());
+        metadata.add("stream_name", document.getFileName());
+      }
+      if (document.getMimeType() != null)
+        metadata.add("Content-Type", document.getMimeType());
+      metadata.add("stream_size", new Long(document.getBinaryLength()).toString());
+
       // We only log the extraction
       long startTime = System.currentTimeMillis();
       String resultCode = "OK";
@@ -211,15 +222,29 @@ public class TikaExtractor extends org.a
             }
             catch (TikaException e)
             {
-              resultCode = "TIKAEXCEPTION";
-              description = e.getMessage();
-              return handleTikaException(e);
+              if (sp.ignoreTikaException())
+              {
+                resultCode = "TIKAEXCEPTION";
+                description = e.getMessage();
+              }
+              else
+              {
+                resultCode = "TIKAREJECTION";
+                description = e.getMessage();
+                int rval = handleTikaException(e);
+                if (rval == DOCUMENTSTATUS_REJECTED)
+                  activities.noDocument();
+                return rval;
+              }
             }
             catch (SAXException e)
             {
               resultCode = "SAXEXCEPTION";
               description = e.getMessage();
-              return handleSaxException(e);
+              int rval = handleSaxException(e);
+              if (rval == DOCUMENTSTATUS_REJECTED)
+                activities.noDocument();
+              return rval;
             }
             catch (IOException e)
             {
@@ -284,10 +309,7 @@ public class TikaExtractor extends org.a
         }
 
         // Send new document downstream
-        int rval = activities.sendDocument(documentURI,docCopy);
-        length =  new Long(newBinaryLength);
-        resultCode = (rval == DOCUMENTSTATUS_ACCEPTED)?"ACCEPTED":"REJECTED";
-        return rval;
+        return activities.sendDocument(documentURI,docCopy);
       }
       finally
       {
@@ -339,10 +361,12 @@ public class TikaExtractor extends org.a
     paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
 
     tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
+    tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
 
     // Fill in the specification header map, using data from all tabs.
     fillInFieldMappingSpecificationMap(paramMap, os);
-
+    fillInExceptionsSpecificationMap(paramMap, os);
+    
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
   }
   
@@ -371,7 +395,10 @@ public class TikaExtractor extends org.a
 
     // Fill in the field mapping tab data
     fillInFieldMappingSpecificationMap(paramMap, os);
+    fillInExceptionsSpecificationMap(paramMap, os);
+
     Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
+    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
   }
 
   /** Process a specification post.
@@ -455,6 +482,27 @@ public class TikaExtractor extends org.a
       os.addChild(os.getChildCount(), node);
     }
     
+    if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
+    {
+      int i = 0;
+      while (i < os.getChildCount())
+      {
+        SpecificationNode node = os.getChild(i);
+        if (node.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
+          os.removeChild(i);
+        else
+          i++;
+      }
+
+      String value = variableContext.getParameter(seqPrefix+"ignoretikaexceptions");
+      if (value == null)
+        value = "false";
+
+      SpecificationNode node = new SpecificationNode(TikaConfig.NODE_IGNORETIKAEXCEPTION);
+      node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, value);
+      os.addChild(os.getChildCount(), node);
+    }
+    
     return null;
   }
   
@@ -477,6 +525,7 @@ public class TikaExtractor extends org.a
 
     // Fill in the map with data from all tabs
     fillInFieldMappingSpecificationMap(paramMap, os);
+    fillInExceptionsSpecificationMap(paramMap, os);
 
     Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
     
@@ -516,6 +565,20 @@ public class TikaExtractor extends org.a
     paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
   }
 
+  protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
+  {
+    String ignoreTikaExceptions = "true";
+    for (int i = 0; i < os.getChildCount(); i++)
+    {
+      SpecificationNode sn = os.getChild(i);
+      if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
+      {
+        ignoreTikaExceptions = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
+    }
+    paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
+  }
+
   protected static int handleTikaException(TikaException e)
     throws IOException, ManifoldCFException, ServiceInterruption
   {
@@ -683,9 +746,11 @@ public class TikaExtractor extends org.a
     
     private final Map<String,String> sourceTargets = new HashMap<String,String>();
     private final boolean keepAllMetadata;
+    private final boolean ignoreTikaException;
     
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
+      boolean ignoreTikaException = true;
       for (int i = 0; i < os.getChildCount(); i++) {
         SpecificationNode sn = os.getChild(i);
         
@@ -700,9 +765,13 @@ public class TikaExtractor extends org.a
             target = "";
           }
           sourceTargets.put(source, target);
+        } else if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          ignoreTikaException = Boolean.parseBoolean(value);
         }
       }
       this.keepAllMetadata = keepAllMetadata;
+      this.ignoreTikaException = ignoreTikaException;
     }
     
     public SpecPacker(String packedString) {
@@ -723,6 +792,12 @@ public class TikaExtractor extends org.a
         keepAllMetadata = (packedString.charAt(index++) == '+');
       else
         keepAllMetadata = true;
+
+      // Ignore tika exception
+      if (packedString.length() > index)
+        ignoreTikaException = (packedString.charAt(index++) == '+');
+      else
+        ignoreTikaException = true;
       
     }
     
@@ -756,6 +831,11 @@ public class TikaExtractor extends org.a
       else
         sb.append('-');
       
+      if (ignoreTikaException)
+        sb.append('+');
+      else
+        sb.append('-');
+
       return sb.toString();
     }
     
@@ -766,6 +846,10 @@ public class TikaExtractor extends org.a
     public boolean keepAllMetadata() {
       return keepAllMetadata;
     }
+    
+    public boolean ignoreTikaException() {
+      return ignoreTikaException;
+    }
   }
 
 }

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Wed Jul 23 11:50:59 2014
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 TikaExtractor.FieldMappingTabName=Field mapping
+TikaExtractor.ExceptionsTabName=Exceptions
 TikaExtractor.FieldMappings=Field mappings:
 TikaExtractor.MetadataFieldName=Metadata field name
 TikaExtractor.FinalFieldName=Final field name
@@ -24,3 +25,4 @@ TikaExtractor.AddFieldMapping=Add field 
 TikaExtractor.Delete=Delete
 TikaExtractor.DeleteFieldMapping=Delete field mapping
 TikaExtractor.NoFieldNameSpecified=Please specify a field name
+TikaExtractor.IgnoreTikaExceptions=Ignore Tika exceptions
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Wed Jul 23 11:50:59 2014
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 TikaExtractor.FieldMappingTabName=Field mapping
+TikaExtractor.ExceptionsTabName=Exceptions
 TikaExtractor.FieldMappings=Field mappings:
 TikaExtractor.MetadataFieldName=Metadata field name
 TikaExtractor.FinalFieldName=Final field name
@@ -24,3 +25,4 @@ TikaExtractor.AddFieldMapping=Add field 
 TikaExtractor.Delete=Delete
 TikaExtractor.DeleteFieldMapping=Delete field mapping
 TikaExtractor.NoFieldNameSpecified=Please specify a field name
+TikaExtractor.IgnoreTikaExceptions=Ignore Tika exceptions
\ No newline at end of file

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Wed Jul 23 11:50:59 2014
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 TikaExtractor.FieldMappingTabName=Field mapping
+TikaExtractor.ExceptionsTabName=Exceptions
 TikaExtractor.FieldMappings=Field mappings:
 TikaExtractor.MetadataFieldName=Metadata field name
 TikaExtractor.FinalFieldName=Final field name
@@ -24,3 +25,4 @@ TikaExtractor.AddFieldMapping=Add field 
 TikaExtractor.Delete=Delete
 TikaExtractor.DeleteFieldMapping=Delete field mapping
 TikaExtractor.NoFieldNameSpecified=Please specify a field name
+TikaExtractor.IgnoreTikaExceptions=Ignore Tika exceptions
\ No newline at end of file

Added: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html?rev=1612814&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html (added)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html Wed Jul 23 11:50:59 2014
@@ -0,0 +1,40 @@
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+#if($TABNAME == $ResourceBundle.getString('TikaExtractor.ExceptionsTabName') && ${SEQNUM} == ${SELECTEDNUM})
+
+<table class="displaytable">
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
+    <td class="value">
+        <input type="hidden" name="s${SEQNUM}_ignoretikaexceptions_present" value="true"/>
+  #if($IGNORETIKAEXCEPTIONS == 'true')
+       <input type="checkbox" checked="true" name="s${SEQNUM}_ignoretikaexceptions" value="true"/>
+  #else
+       <input type="checkbox" name="s${SEQNUM}_ignoretikaexceptions" value="true"/>
+  #end
+    </td>
+  </tr>
+</table>
+      
+#else
+
+<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions_present" value="true"/>
+<input type="hidden" name="s${SEQNUM}_ignoretikaexceptions" value="$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)"/>
+
+#end
\ No newline at end of file

Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_Exceptions.html
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html?rev=1612814&r1=1612813&r2=1612814&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html (original)
+++ manifoldcf/trunk/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html Wed Jul 23 11:50:59 2014
@@ -51,5 +51,10 @@
     <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.KeepAllMetadata'))</nobr></td>
     <td class="value"><nobr>$Encoder.bodyEscape($KEEPALLMETADATA)</nobr></td>
   </tr>
+  <tr><td class="separator" colspan="2"><hr/></td></tr>
+  <tr>
+    <td class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('TikaExtractor.IgnoreTikaExceptions'))</nobr></td>
+    <td class="value"><nobr>$Encoder.bodyEscape($IGNORETIKAEXCEPTIONS)</nobr></td>
+  </tr>
 
 </table>



Mime
View raw message