manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1794722 [1/2] - in /manifoldcf/branches/CONNECTORS-1425/connectors/tika: ./ connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/ connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/ connect...
Date Wed, 10 May 2017 13:27:34 GMT
Author: kwright
Date: Wed May 10 13:27:33 2017
New Revision: 1794722

URL: http://svn.apache.org/viewvc?rev=1794722&view=rev
Log:
Add Tika external access functionality

Added:
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification_TikaType.html
Modified:
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/editSpecification.js
    manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/resources/org/apache/manifoldcf/agents/transformation/tika/viewSpecification.html

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/.gitignore Wed May 10 13:27:33 2017
@@ -1,3 +1,4 @@
+/target/
 /.classpath
-/.project
 /.settings/
+/.project

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaConfig.java Wed May 10 13:27:33 2017
@@ -37,5 +37,12 @@ public class TikaConfig {
   public static final String ATTRIBUTE_SOURCE = "source";
   public static final String ATTRIBUTE_TARGET = "target";
   public static final String ATTRIBUTE_VALUE = "value";
+  public static final String TIKAHOSTNAME_DEFAULT = "localhost";
+  public static final int TIKAPORT_DEFAULT = 9998;
+  public static final String NODE_TIKAHOSTNAME = "tikaHostname";
+  public static final String NODE_TIKAPORT = "tikaPort";
+  public static final String NODE_TIKASERVER = "tikaServer";
+  public static final long TIKARETRY_DEFAULT = 10000;
+  public static final String NODE_TIKARETRY = "tikaRetry";
   
 }

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/java/org/apache/manifoldcf/agents/transformation/tika/TikaExtractor.java Wed May 10 13:27:33 2017
@@ -19,30 +19,48 @@
 package org.apache.manifoldcf.agents.transformation.tika;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.TeeInputStream;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpHost;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.client.methods.HttpPut;
+import org.apache.http.entity.InputStreamEntity;
+import org.apache.http.entity.mime.Header;
+import org.apache.http.impl.client.HttpClientBuilder;
 import org.apache.manifoldcf.agents.interfaces.*;
 import org.apache.manifoldcf.agents.system.Logging;
 
 import java.io.*;
+import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.*;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.parser.html.BoilerpipeContentHandler;
+import org.json.simple.JSONObject;
+import org.json.simple.parser.JSONParser;
+import org.json.simple.parser.ParseException;
 
 import de.l3s.boilerpipe.BoilerpipeExtractor;
 
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
-/** This connector works as a transformation connector, but does nothing other than logging.
-*
-*/
-public class TikaExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector
-{
+/**
+ * This connector works as a transformation connector, but does nothing other
+ * than logging.
+ *
+ */
+public class TikaExtractor extends org.apache.manifoldcf.agents.transformation.BaseTransformationConnector {
   public static final String _rcsid = "@(#)$Id$";
 
   private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
+  private static final String EDIT_SPECIFICATION_TIKATYPE_HTML = "editSpecification_TikaType.html";
   private static final String EDIT_SPECIFICATION_FIELDMAPPING_HTML = "editSpecification_FieldMapping.html";
   private static final String EDIT_SPECIFICATION_EXCEPTIONS_HTML = "editSpecification_Exceptions.html";
   private static final String EDIT_SPECIFICATION_BOILERPLATE_HTML = "editSpecification_Boilerplate.html";
@@ -50,159 +68,221 @@ public class TikaExtractor extends org.a
 
   protected static final String ACTIVITY_EXTRACT = "extract";
 
-  protected static final String[] activitiesList = new String[]{ACTIVITY_EXTRACT};
-  
+  protected static final String[] activitiesList = new String[] { ACTIVITY_EXTRACT };
+
   /** We handle up to 64K in memory; after that we go to disk. */
   protected static final long inMemoryMaximumFile = 65536;
-  
-  /** Return a list of activities that this connector generates.
-  * The connector does NOT need to be connected before this method is called.
-  *@return the set of activities.
-  */
+
+  /**
+   * Return a list of activities that this connector generates. The connector
+   * does NOT need to be connected before this method is called.
+   * 
+   * @return the set of activities.
+   */
   @Override
-  public String[] getActivitiesList()
-  {
+  public String[] getActivitiesList() {
     return activitiesList;
   }
 
-  /** Get an output version string, given an output specification.  The output version string is used to uniquely describe the pertinent details of
-  * the output specification and the configuration, to allow the Connector Framework to determine whether a document will need to be output again.
-  * Note that the contents of the document cannot be considered by this method, and that a different version string (defined in IRepositoryConnector)
-  * is used to describe the version of the actual document.
-  *
-  * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
-  * necessary.
-  *@param os is the current output specification for the job that is doing the crawling.
-  *@return a string, of unlimited length, which uniquely describes output configuration and specification in such a way that if two such strings are equal,
-  * the document will not need to be sent again to the output data store.
-  */
-  @Override
-  public VersionContext getPipelineDescription(Specification os)
-    throws ManifoldCFException, ServiceInterruption
-  {
+  /**
+   * Get an output version string, given an output specification. The output
+   * version string is used to uniquely describe the pertinent details of the
+   * output specification and the configuration, to allow the Connector
+   * Framework to determine whether a document will need to be output again.
+   * Note that the contents of the document cannot be considered by this method,
+   * and that a different version string (defined in IRepositoryConnector) is
+   * used to describe the version of the actual document.
+   *
+   * This method presumes that the connector object has been configured, and it
+   * is thus able to communicate with the output data store should that be
+   * necessary.
+   * 
+   * @param os
+   *          is the current output specification for the job that is doing the
+   *          crawling.
+   * @return a string, of unlimited length, which uniquely describes output
+   *         configuration and specification in such a way that if two such
+   *         strings are equal, the document will not need to be sent again to
+   *         the output data store.
+   */
+  @Override
+  public VersionContext getPipelineDescription(Specification os) throws ManifoldCFException, ServiceInterruption {
     SpecPacker sp = new SpecPacker(os);
-    return new VersionContext(sp.toPackedString(),params,os);
+    return new VersionContext(sp.toPackedString(), params, os);
   }
 
-  // We intercept checks pertaining to the document format and send modified checks further down
-  
-  /** Detect if a mime type is acceptable or not.  This method is used to determine whether it makes sense to fetch a document
-  * in the first place.
-  *@param pipelineDescription is the document's pipeline version string, for this connection.
-  *@param mimeType is the mime type of the document.
-  *@param checkActivity is an object including the activities that can be performed by this method.
-  *@return true if the mime type can be accepted by this connector.
-  */
-  @Override
-  public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
-    throws ManifoldCFException, ServiceInterruption
-  {
+  // We intercept checks pertaining to the document format and send modified
+  // checks further down
+
+  /**
+   * Detect if a mime type is acceptable or not. This method is used to
+   * determine whether it makes sense to fetch a document in the first place.
+   * 
+   * @param pipelineDescription
+   *          is the document's pipeline version string, for this connection.
+   * @param mimeType
+   *          is the mime type of the document.
+   * @param checkActivity
+   *          is an object including the activities that can be performed by
+   *          this method.
+   * @return true if the mime type can be accepted by this connector.
+   */
+  @Override
+  public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType,
+      IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
     // We should see what Tika will transform
     // MHL
     // Do a downstream check
     return checkActivity.checkMimeTypeIndexable("text/plain;charset=utf-8");
   }
 
-  /** Pre-determine whether a document (passed here as a File object) is acceptable or not.  This method is
-  * used to determine whether a document needs to be actually transferred.  This hook is provided mainly to support
-  * search engines that only handle a small set of accepted file types.
-  *@param pipelineDescription is the document's pipeline version string, for this connection.
-  *@param localFile is the local file to check.
-  *@param checkActivity is an object including the activities that can be done by this method.
-  *@return true if the file is acceptable, false if not.
-  */
-  @Override
-  public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile, IOutputCheckActivity checkActivity)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    // Document contents are not germane anymore, unless it looks like Tika won't accept them.
+  /**
+   * Pre-determine whether a document (passed here as a File object) is
+   * acceptable or not. This method is used to determine whether a document
+   * needs to be actually transferred. This hook is provided mainly to support
+   * search engines that only handle a small set of accepted file types.
+   * 
+   * @param pipelineDescription
+   *          is the document's pipeline version string, for this connection.
+   * @param localFile
+   *          is the local file to check.
+   * @param checkActivity
+   *          is an object including the activities that can be done by this
+   *          method.
+   * @return true if the file is acceptable, false if not.
+   */
+  @Override
+  public boolean checkDocumentIndexable(VersionContext pipelineDescription, File localFile,
+      IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
+    // Document contents are not germane anymore, unless it looks like Tika
+    // won't accept them.
     // Not sure how to check that...
     return true;
   }
 
-  /** Pre-determine whether a document's length is acceptable.  This method is used
-  * to determine whether to fetch a document in the first place.
-  *@param pipelineDescription is the document's pipeline version string, for this connection.
-  *@param length is the length of the document.
-  *@param checkActivity is an object including the activities that can be done by this method.
-  *@return true if the file is acceptable, false if not.
-  */
-  @Override
-  public boolean checkLengthIndexable(VersionContext pipelineDescription, long length, IOutputCheckActivity checkActivity)
-    throws ManifoldCFException, ServiceInterruption
-  {
+  /**
+   * Pre-determine whether a document's length is acceptable. This method is
+   * used to determine whether to fetch a document in the first place.
+   * 
+   * @param pipelineDescription
+   *          is the document's pipeline version string, for this connection.
+   * @param length
+   *          is the length of the document.
+   * @param checkActivity
+   *          is an object including the activities that can be done by this
+   *          method.
+   * @return true if the file is acceptable, false if not.
+   */
+  @Override
+  public boolean checkLengthIndexable(VersionContext pipelineDescription, long length,
+      IOutputCheckActivity checkActivity) throws ManifoldCFException, ServiceInterruption {
     // Always true
     return true;
   }
 
-  /** Add (or replace) a document in the output data store using the connector.
-  * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be
-  * necessary.
-  * The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the
-  * output description, since that was what was partly used to determine if output should be taking place.  So it may be necessary for this method to decode
-  * an output description string in order to determine what should be done.
-  *@param documentURI is the URI of the document.  The URI is presumed to be the unique identifier which the output data store will use to process
-  * and serve the document.  This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors.
-  *@param outputDescription is the description string that was constructed for this document by the getOutputDescription() method.
-  *@param document is the document data to be processed (handed to the output data store).
-  *@param authorityNameString is the name of the authority responsible for authorizing any access tokens passed in with the repository document.  May be null.
-  *@param activities is the handle to an object that the implementer of a pipeline connector may use to perform operations, such as logging processing activity,
-  * or sending a modified document to the next stage in the pipeline.
-  *@return the document status (accepted or permanently rejected).
-  *@throws IOException only if there's a stream error reading the document data.
-  */
-  @Override
-  public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
-    throws ManifoldCFException, ServiceInterruption, IOException
-  {
-    // First, make sure downstream pipeline will now accept text/plain;charset=utf-8
-    if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8"))
-    {
+  /**
+   * Add (or replace) a document in the output data store using the connector.
+   * This method presumes that the connector object has been configured, and it
+   * is thus able to communicate with the output data store should that be
+   * necessary. The OutputSpecification is *not* provided to this method,
+   * because the goal is consistency, and if output is done it must be
+   * consistent with the output description, since that was what was partly used
+   * to determine if output should be taking place. So it may be necessary for
+   * this method to decode an output description string in order to determine
+   * what should be done.
+   * 
+   * @param documentURI
+   *          is the URI of the document. The URI is presumed to be the unique
+   *          identifier which the output data store will use to process and
+   *          serve the document. This URI is constructed by the repository
+   *          connector which fetches the document, and is thus universal across
+   *          all output connectors.
+   * @param outputDescription
+   *          is the description string that was constructed for this document
+   *          by the getOutputDescription() method.
+   * @param document
+   *          is the document data to be processed (handed to the output data
+   *          store).
+   * @param authorityNameString
+   *          is the name of the authority responsible for authorizing any
+   *          access tokens passed in with the repository document. May be null.
+   * @param activities
+   *          is the handle to an object that the implementer of a pipeline
+   *          connector may use to perform operations, such as logging
+   *          processing activity, or sending a modified document to the next
+   *          stage in the pipeline.
+   * @return the document status (accepted or permanently rejected).
+   * @throws IOException
+   *           only if there's a stream error reading the document data.
+   */
+  @Override
+  public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription,
+      RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
+      throws ManifoldCFException, ServiceInterruption, IOException {
+    // First, make sure downstream pipeline will now accept
+    // text/plain;charset=utf-8
+    if (!activities.checkMimeTypeIndexable("text/plain;charset=utf-8")) {
       activities.noDocument();
-      activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI,
-        activities.EXCLUDED_MIMETYPE, "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
+      activities.recordActivity(null, ACTIVITY_EXTRACT, null, documentURI, activities.EXCLUDED_MIMETYPE,
+          "Downstream pipeline rejected mime type 'text/plain;charset=utf-8'");
       return DOCUMENTSTATUS_REJECTED;
     }
 
     SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
 
+    // Tika server variables
+    String mime = "";
+    InputStream tikaServerIs = null;
+    int retry = 0;
+    HttpResponse response = null;
+    IOException tikaServerDownException = null;
+
     BoilerpipeExtractor extractorClassInstance = sp.getExtractorClassInstance();
-    
+
     // Tika's API reads from an input stream and writes to an output Writer.
-    // Since a RepositoryDocument includes readers and inputstreams exclusively, AND all downstream
-    // processing needs to occur in a ManifoldCF thread, we have some constraints on the architecture we need to get this done:
-    // (1) The principle worker thread must call the downstream pipeline send() method.
-    // (2) The callee of the send() method must call a reader in the Repository Document.
-    // (3) The Reader, if its databuffer is empty, must pull more data from the original input stream and hand it to Tika, which populates the Reader's databuffer.
-    // So all this can be done in one thread, with some work, and the creation of a special InputStream or Reader implementation.  Where it fails, though, is the
-    // requirement that tika-extracted metadata be included in the RepositoryDocument right from the beginning.  Effectively this means that the entire document
-    // must be parsed before it is handed downstream -- so basically a temporary file (or in-memory buffer if small enough) must be created.
+    // Since a RepositoryDocument includes readers and inputstreams exclusively,
+    // AND all downstream
+    // processing needs to occur in a ManifoldCF thread, we have some
+    // constraints on the architecture we need to get this done:
+    // (1) The principal worker thread must call the downstream pipeline send()
+    // method.
+    // (2) The callee of the send() method must call a reader in the Repository
+    // Document.
+    // (3) The Reader, if its databuffer is empty, must pull more data from the
+    // original input stream and hand it to Tika, which populates the Reader's
+    // databuffer.
+    // So all this can be done in one thread, with some work, and the creation
+    // of a special InputStream or Reader implementation. Where it fails,
+    // though, is the
+    // requirement that tika-extracted metadata be included in the
+    // RepositoryDocument right from the beginning. Effectively this means that
+    // the entire document
+    // must be parsed before it is handed downstream -- so basically a temporary
+    // file (or in-memory buffer if small enough) must be created.
     // Instead of the elegant flow above, we have the following:
     // (1) Create a temporary file (or in-memory buffer if file is small enough)
     // (2) Run Tika to completion, streaming content output to temporary file
-    // (3) Modify RepositoryDocument to read from temporary file, and include Tika-extracted metadata
+    // (3) Modify RepositoryDocument to read from temporary file, and include
+    // Tika-extracted metadata
     // (4) Call downstream document processing
-      
+
     DestinationStorage ds;
-      
-    if (document.getBinaryLength() <= inMemoryMaximumFile)
-    {
-      ds = new MemoryDestinationStorage((int)document.getBinaryLength());
-    }
-    else
-    {
+
+    if (document.getBinaryLength() <= inMemoryMaximumFile) {
+      ds = new MemoryDestinationStorage((int) document.getBinaryLength());
+    } else {
       ds = new FileDestinationStorage();
     }
-    try
-    {
+    try {
       Metadata metadata = new Metadata();
-      if (document.getFileName() != null)
-      {
+      if (document.getFileName() != null) {
         metadata.add(TikaMetadataKeys.RESOURCE_NAME_KEY, document.getFileName());
         metadata.add("stream_name", document.getFileName());
       }
-      if (document.getMimeType() != null)
-        metadata.add("Content-Type", document.getMimeType());
+      if (document.getMimeType() != null) {
+        mime = document.getMimeType();
+        metadata.add("Content-Type", mime);
+      }
       metadata.add("stream_size", new Long(document.getBinaryLength()).toString());
 
       // We only log the extraction
@@ -210,334 +290,470 @@ public class TikaExtractor extends org.a
       String resultCode = "OK";
       String description = null;
       Long length = null;
-      try
-      {
-        OutputStream os = ds.getOutputStream();
-        try
-        {
-          Writer w = new OutputStreamWriter(os,"utf-8");
-          try
-          {
-            // Use tika to parse stuff
-            ContentHandler handler = TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
-            if (extractorClassInstance != null)
-              handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
-            try
-            {
-              TikaParser.parse(document.getBinaryStream(), metadata, handler);
-            }
-            catch (TikaException e)
-            {
-              if (sp.ignoreTikaException())
-              {
-                resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
-                description = e.getMessage();
+
+      try {
+        if (sp.tikaServer) {
+          try {
+            final HttpClient client = HttpClientBuilder.create().build();
+            final HttpHost tikaHost = new HttpHost(sp.tikaHostname, sp.tikaPort);
+
+            // Make a copy of the original stream as it needs to be sent to
+            // Tika twice:
+            // once for the metadata and once for the content
+            IOUtils.copy(document.getBinaryStream(), ds.getOutputStream());
+            HttpPut httpPut;
+            HttpEntity entity;
+
+            // Metadata
+            httpPut = new HttpPut(sp.metaURI);
+            if (!mime.isEmpty()) {
+              httpPut.addHeader("Content-Type", mime);
+            }
+            httpPut.addHeader("Accept", "application/json");
+            entity = new InputStreamEntity(ds.getInputStream());
+            httpPut.setEntity(entity);
+            while (retry < 3 && response == null) {
+              try {
+                response = client.execute(tikaHost, httpPut);
+                tikaServerDownException = null;
+              } catch (IOException e) {
+                tikaServerDownException = e;
+                retry++;
+                if (retry < 3) {
+                  try {
+                    Thread.sleep(sp.tikaRetry);
+                  } catch (InterruptedException e1) {
+                    // Should not happen
+                  }
+                }
               }
-              else
-              {
-                resultCode = "TIKAREJECTION";
+            }
+            if (tikaServerDownException != null) {
+              throw tikaServerDownException;
+            }
+            int responseCode = response.getStatusLine().getStatusCode();
+            if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
+              tikaServerIs = response.getEntity().getContent();
+              try {
+                final BufferedReader br = new BufferedReader(new InputStreamReader(tikaServerIs));
+                final JSONParser parser = new JSONParser();
+                JSONObject metaJson;
+                final StringBuilder sb = new StringBuilder();
+                String output;
+                while ((output = br.readLine()) != null) {
+                  sb.append(output);
+                }
+                metaJson = (JSONObject) parser.parse(sb.toString());
+                for (Object key : metaJson.keySet()) {
+                  metadata.add(key.toString(), metaJson.get(key).toString());
+                }
+              } finally {
+                tikaServerIs.close();
+              }
+            } else {
+              activities.noDocument();
+              if (responseCode == 422) {
+                resultCode = "TIKASERVERREJECTS";
+                description = "Tika Server rejected document with the following reason: "
+                    + response.getStatusLine().getReasonPhrase();
+                handleTikaServerRejects(description);
+              } else {
+                resultCode = "TIKASERVERERROR";
+                description = "Tika Server failed to parse document with the following error: "
+                    + response.getStatusLine().getReasonPhrase();
+                handleTikaServerError(description);
+              }
+              return DOCUMENTSTATUS_REJECTED;
+            }
+
+            // Content
+            httpPut = new HttpPut(sp.contentURI);
+            if (!mime.isEmpty()) {
+              httpPut.addHeader("Content-Type", mime);
+            }
+            httpPut.addHeader("Accept", "text/plain");
+            entity = new InputStreamEntity(ds.getInputStream());
+            httpPut.setEntity(entity);
+
+            // Retry mechanism
+            retry = 0;
+            response = null;
+            while (retry < 3 && response == null) {
+              try {
+                response = client.execute(tikaHost, httpPut);
+                tikaServerDownException = null;
+              } catch (IOException e) {
+                tikaServerDownException = e;
+                retry++;
+                if (retry < 3) {
+                  try {
+                    Thread.sleep(sp.tikaRetry);
+                  } catch (InterruptedException e1) {
+                    // Should not happen
+                  }
+                }
+              }
+            }
+            if (tikaServerDownException != null) {
+              throw tikaServerDownException;
+            }
+
+            responseCode = response.getStatusLine().getStatusCode();
+            if (response.getStatusLine().getStatusCode() == 200 || response.getStatusLine().getStatusCode() == 204) {
+              tikaServerIs = response.getEntity().getContent();
+              try {
+                ds.close();
+                ds = new FileDestinationStorage();
+                IOUtils.copyLarge(tikaServerIs, ds.getOutputStream(), 0L, sp.writeLimit);
+                length = new Long(ds.getBinaryLength());
+              } finally {
+                tikaServerIs.close();
+              }
+            } else {
+              activities.noDocument();
+              if (responseCode == 422) {
+                resultCode = "TIKASERVERREJECTS";
+                description = "Tika Server rejected document with the following reason: "
+                    + response.getStatusLine().getReasonPhrase();
+                handleTikaServerRejects(description);
+              } else {
+                resultCode = "TIKASERVERERROR";
+                description = "Tika Server failed to parse document with the following error: "
+                    + response.getStatusLine().getReasonPhrase();
+                handleTikaServerError(description);
+              }
+              return DOCUMENTSTATUS_REJECTED;
+            }
+
+          } catch (IOException | ParseException e) {
+            resultCode = "TIKASERVERRESPONSEISSUE";
+            description = e.getMessage();
+            int rval;
+            if (e instanceof IOException) {
+              rval = handleTikaServerException((IOException) e);
+            } else {
+              rval = handleTikaServerException((ParseException) e);
+            }
+            if (rval == DOCUMENTSTATUS_REJECTED) {
+              activities.noDocument();
+            }
+            return rval;
+          }
+        } else {
+
+          OutputStream os = ds.getOutputStream();
+          try {
+            Writer w = new OutputStreamWriter(os, "utf-8");
+            try {
+              // Use tika to parse stuff
+              ContentHandler handler = TikaParser.newWriteOutBodyContentHandler(w, sp.writeLimit());
+              if (extractorClassInstance != null)
+                handler = new BoilerpipeContentHandler(handler, extractorClassInstance);
+              try {
+                TikaParser.parse(document.getBinaryStream(), metadata, handler);
+              } catch (TikaException e) {
+                if (sp.ignoreTikaException()) {
+                  resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+                  description = e.getMessage();
+                } else {
+                  resultCode = "TIKAREJECTION";
+                  description = e.getMessage();
+                  int rval = handleTikaException(e);
+                  if (rval == DOCUMENTSTATUS_REJECTED)
+                    activities.noDocument();
+                  return rval;
+                }
+              } catch (SAXException e) {
+                resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
                 description = e.getMessage();
-                int rval = handleTikaException(e);
+                int rval = handleSaxException(e);
                 if (rval == DOCUMENTSTATUS_REJECTED)
                   activities.noDocument();
                 return rval;
+              } catch (IOException e) {
+                resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+                description = e.getMessage();
+                throw e;
               }
+            } finally {
+              w.flush();
             }
-            catch (SAXException e)
-            {
-              resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
-              description = e.getMessage();
-              int rval = handleSaxException(e);
-              if (rval == DOCUMENTSTATUS_REJECTED)
-                activities.noDocument();
-              return rval;
-            }
-            catch (IOException e)
-            {
-              resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
-              description = e.getMessage();
-              throw e;
-            }
-          }
-          finally
-          {
-            w.flush();
+          } finally {
+            os.close();
+            length = new Long(ds.getBinaryLength());
           }
         }
-        finally
-        {
-          os.close();
-          length = new Long(ds.getBinaryLength());
-        }
-        
-        // Check to be sure downstream pipeline will accept document of specified length
-        if (!activities.checkLengthIndexable(ds.getBinaryLength()))
-        {
+
+        if (!activities.checkLengthIndexable(ds.getBinaryLength())) {
           activities.noDocument();
           resultCode = activities.EXCLUDED_LENGTH;
-          description = "Downstream pipeline rejected document with length "+ds.getBinaryLength();
+          description = "Downstream pipeline rejected document with length " + ds.getBinaryLength();
           return DOCUMENTSTATUS_REJECTED;
         }
 
-      }
-      finally
-      {
+      } finally {
         // Log the extraction processing
-        activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI,
-          resultCode, description);
+        activities.recordActivity(new Long(startTime), ACTIVITY_EXTRACT, length, documentURI, resultCode, description);
       }
-      
+
       // Parsing complete!
       // Create a copy of Repository Document
       RepositoryDocument docCopy = document.duplicate();
-        
+
       // Get new stream length
       long newBinaryLength = ds.getBinaryLength();
       // Open new input stream
       InputStream is = ds.getInputStream();
-      try
-      {
-        docCopy.setBinary(is,newBinaryLength);
 
-        // Set up all metadata from Tika.  We may want to run this through a mapper eventually...
+      try {
+        docCopy.setBinary(is, newBinaryLength);
+
+        // Set up all metadata from Tika. We may want to run this through a
+        // mapper eventually...
         String[] metaNames = metadata.names();
-        for(String mName : metaNames){
+        for (String mName : metaNames) {
           String value = metadata.get(mName);
-          if (sp.lowerNames())
-          {
+          if (sp.lowerNames()) {
             StringBuilder sb = new StringBuilder();
-            for (int i=0; i<mName.length(); i++) {
+            for (int i = 0; i < mName.length(); i++) {
               char ch = mName.charAt(i);
-              if (!Character.isLetterOrDigit(ch)) ch='_';
-              else ch=Character.toLowerCase(ch);
+              if (!Character.isLetterOrDigit(ch))
+                ch = '_';
+              else
+                ch = Character.toLowerCase(ch);
               sb.append(ch);
             }
             mName = sb.toString();
           }
           String target = sp.getMapping(mName);
-          if(target!=null)
-          {
+          if (target != null) {
             docCopy.addField(target, value);
-          }
-          else
-          {
-            if(sp.keepAllMetadata())
-            {
-             docCopy.addField(mName, value);
+          } else {
+            if (sp.keepAllMetadata()) {
+              docCopy.addField(mName, value);
             }
           }
         }
 
         // Send new document downstream
-        return activities.sendDocument(documentURI,docCopy);
-      }
-      finally
-      {
+        return activities.sendDocument(documentURI, docCopy);
+      } finally {
         is.close();
       }
-    }
-    finally
-    {
+    } finally {
       ds.close();
     }
 
   }
 
-  /** Obtain the name of the form check javascript method to call.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@return the name of the form check javascript method.
-  */
-  @Override
-  public String getFormCheckJavascriptMethodName(int connectionSequenceNumber)
-  {
-    return "s"+connectionSequenceNumber+"_checkSpecification";
-  }
-
-  /** Obtain the name of the form presave check javascript method to call.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@return the name of the form presave check javascript method.
-  */
-  @Override
-  public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber)
-  {
-    return "s"+connectionSequenceNumber+"_checkSpecificationForSave";
-  }
-
-  /** Output the specification header section.
-  * This method is called in the head section of a job page which has selected a pipeline connection of the current type.  Its purpose is to add the required tabs
-  * to the list, and to output any javascript methods that might be needed by the job editing HTML.
-  *@param out is the output to which any HTML should be sent.
-  *@param locale is the preferred local of the output.
-  *@param os is the current pipeline specification for this connection.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@param tabsArray is an array of tab names.  Add to this array any tab names that are specific to the connector.
-  */
-  @Override
-  public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os,
-    int connectionSequenceNumber, List<String> tabsArray)
-    throws ManifoldCFException, IOException
-  {
+  /**
+   * Obtain the name of the form check javascript method to call.
+   * 
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @return the name of the form check javascript method.
+   */
+  @Override
+  public String getFormCheckJavascriptMethodName(int connectionSequenceNumber) {
+    return "s" + connectionSequenceNumber + "_checkSpecification";
+  }
+
+  /**
+   * Obtain the name of the form presave check javascript method to call.
+   * 
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @return the name of the form presave check javascript method.
+   */
+  @Override
+  public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber) {
+    return "s" + connectionSequenceNumber + "_checkSpecificationForSave";
+  }
+
+  /**
+   * Output the specification header section. This method is called in the head
+   * section of a job page which has selected a pipeline connection of the
+   * current type. Its purpose is to add the required tabs to the list, and to
+   * output any javascript methods that might be needed by the job editing HTML.
+   * 
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param os
+   *          is the current pipeline specification for this connection.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @param tabsArray
+   *          is an array of tab names. Add to this array any tab names that are
+   *          specific to the connector.
+   */
+  @Override
+  public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber,
+      List<String> tabsArray) throws ManifoldCFException, IOException {
     Map<String, Object> paramMap = new HashMap<String, Object>();
-    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
+    paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
 
+    tabsArray.add(Messages.getString(locale, "TikaExtractor.TikaTypeTabName"));
     tabsArray.add(Messages.getString(locale, "TikaExtractor.FieldMappingTabName"));
     tabsArray.add(Messages.getString(locale, "TikaExtractor.ExceptionsTabName"));
     tabsArray.add(Messages.getString(locale, "TikaExtractor.BoilerplateTabName"));
 
     // Fill in the specification header map, using data from all tabs.
+    fillInTikaTypeSpecificationMap(paramMap, os);
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
     fillInBoilerplateSpecificationMap(paramMap, os);
-    
-    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_JS,paramMap);
+
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
   }
-  
-  /** Output the specification body section.
-  * This method is called in the body section of a job page which has selected a pipeline connection of the current type.  Its purpose is to present the required form elements for editing.
-  * The coder can presume that the HTML that is output from this configuration will be within appropriate <html>, <body>, and <form> tags.  The name of the
-  * form is "editjob".
-  *@param out is the output to which any HTML should be sent.
-  *@param locale is the preferred local of the output.
-  *@param os is the current pipeline specification for this job.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@param actualSequenceNumber is the connection within the job that has currently been selected.
-  *@param tabName is the current tab name.
-  */
-  @Override
-  public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os,
-    int connectionSequenceNumber, int actualSequenceNumber, String tabName)
-    throws ManifoldCFException, IOException
-  {
+
+  /**
+   * Output the specification body section. This method is called in the body
+   * section of a job page which has selected a pipeline connection of the
+   * current type. Its purpose is to present the required form elements for
+   * editing. The coder can presume that the HTML that is output from this
+   * configuration will be within appropriate <html>, <body>, and <form> tags.
+   * The name of the form is "editjob".
+   * 
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param os
+   *          is the current pipeline specification for this job.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @param actualSequenceNumber
+   *          is the connection within the job that has currently been selected.
+   * @param tabName
+   *          is the current tab name.
+   */
+  @Override
+  public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber,
+      int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException {
     Map<String, Object> paramMap = new HashMap<String, Object>();
 
     // Set the tab name
     paramMap.put("TABNAME", tabName);
-    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
-    paramMap.put("SELECTEDNUM",Integer.toString(actualSequenceNumber));
+    paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
+    paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
 
     // Fill in the field mapping tab data
+    fillInTikaTypeSpecificationMap(paramMap, os);
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
     fillInBoilerplateSpecificationMap(paramMap, os);
-    
-    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_FIELDMAPPING_HTML,paramMap);
-    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_EXCEPTIONS_HTML,paramMap);
-    Messages.outputResourceWithVelocity(out,locale,EDIT_SPECIFICATION_BOILERPLATE_HTML,paramMap);
-  }
-
-  /** Process a specification post.
-  * This method is called at the start of job's edit or view page, whenever there is a possibility that form data for a connection has been
-  * posted.  Its purpose is to gather form information and modify the transformation specification accordingly.
-  * The name of the posted form is "editjob".
-  *@param variableContext contains the post data, including binary file-upload information.
-  *@param locale is the preferred local of the output.
-  *@param os is the current pipeline specification for this job.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@return null if all is well, or a string error message if there is an error that should prevent saving of the job (and cause a redirection to an error page).
-  */
+
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_TIKATYPE_HTML, paramMap);
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_FIELDMAPPING_HTML, paramMap);
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_EXCEPTIONS_HTML, paramMap);
+    Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_BOILERPLATE_HTML, paramMap);
+  }
+
+  /**
+   * Process a specification post. This method is called at the start of job's
+   * edit or view page, whenever there is a possibility that form data for a
+   * connection has been posted. Its purpose is to gather form information and
+   * modify the transformation specification accordingly. The name of the posted
+   * form is "editjob".
+   * 
+   * @param variableContext
+   *          contains the post data, including binary file-upload information.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param os
+   *          is the current pipeline specification for this job.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @return null if all is well, or a string error message if there is an error
+   *         that should prevent saving of the job (and cause a redirection to
+   *         an error page).
+   */
   @Override
   public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
-    int connectionSequenceNumber)
-    throws ManifoldCFException {
-    String seqPrefix = "s"+connectionSequenceNumber+"_";
+      int connectionSequenceNumber) throws ManifoldCFException {
+    String seqPrefix = "s" + connectionSequenceNumber + "_";
 
     String x;
-        
-    x = variableContext.getParameter(seqPrefix+"fieldmapping_count");
-    if (x != null && x.length() > 0)
-    {
+
+    x = variableContext.getParameter(seqPrefix + "fieldmapping_count");
+    if (x != null && x.length() > 0) {
       // About to gather the fieldmapping nodes, so get rid of the old ones.
       int i = 0;
-      while (i < os.getChildCount())
-      {
+      while (i < os.getChildCount()) {
         SpecificationNode node = os.getChild(i);
-        if (node.getType().equals(TikaConfig.NODE_FIELDMAP)
-          || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
-          || node.getType().equals(TikaConfig.NODE_LOWERNAMES)
-          || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
+        if (node.getType().equals(TikaConfig.NODE_FIELDMAP) || node.getType().equals(TikaConfig.NODE_KEEPMETADATA)
+            || node.getType().equals(TikaConfig.NODE_LOWERNAMES) || node.getType().equals(TikaConfig.NODE_WRITELIMIT))
           os.removeChild(i);
         else
           i++;
       }
       int count = Integer.parseInt(x);
       i = 0;
-      while (i < count)
-      {
-        String prefix = seqPrefix+"fieldmapping_";
-        String suffix = "_"+Integer.toString(i);
-        String op = variableContext.getParameter(prefix+"op"+suffix);
-        if (op == null || !op.equals("Delete"))
-        {
+      while (i < count) {
+        String prefix = seqPrefix + "fieldmapping_";
+        String suffix = "_" + Integer.toString(i);
+        String op = variableContext.getParameter(prefix + "op" + suffix);
+        if (op == null || !op.equals("Delete")) {
           // Gather the fieldmap etc.
-          String source = variableContext.getParameter(prefix+"source"+suffix);
-          String target = variableContext.getParameter(prefix+"target"+suffix);
+          String source = variableContext.getParameter(prefix + "source" + suffix);
+          String target = variableContext.getParameter(prefix + "target" + suffix);
           if (target == null)
             target = "";
           SpecificationNode node = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
-          node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
-          node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
-          os.addChild(os.getChildCount(),node);
+          node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
+          node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
+          os.addChild(os.getChildCount(), node);
         }
         i++;
       }
-      
-      String addop = variableContext.getParameter(seqPrefix+"fieldmapping_op");
-      if (addop != null && addop.equals("Add"))
-      {
-        String source = variableContext.getParameter(seqPrefix+"fieldmapping_source");
-        String target = variableContext.getParameter(seqPrefix+"fieldmapping_target");
+
+      String addop = variableContext.getParameter(seqPrefix + "fieldmapping_op");
+      if (addop != null && addop.equals("Add")) {
+        String source = variableContext.getParameter(seqPrefix + "fieldmapping_source");
+        String target = variableContext.getParameter(seqPrefix + "fieldmapping_target");
         if (target == null)
           target = "";
         SpecificationNode node = new SpecificationNode(TikaConfig.NODE_FIELDMAP);
-        node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE,source);
-        node.setAttribute(TikaConfig.ATTRIBUTE_TARGET,target);
-        os.addChild(os.getChildCount(),node);
+        node.setAttribute(TikaConfig.ATTRIBUTE_SOURCE, source);
+        node.setAttribute(TikaConfig.ATTRIBUTE_TARGET, target);
+        os.addChild(os.getChildCount(), node);
       }
-      
+
       // Gather the keep all metadata parameter to be the last one
       SpecificationNode node = new SpecificationNode(TikaConfig.NODE_KEEPMETADATA);
-      String keepAll = variableContext.getParameter(seqPrefix+"keepallmetadata");
-      if (keepAll != null)
-      {
+      String keepAll = variableContext.getParameter(seqPrefix + "keepallmetadata");
+      if (keepAll != null) {
         node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, keepAll);
-      }
-      else
-      {
+      } else {
         node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
       }
-      // Add the new keepallmetadata config parameter 
+      // Add the new keepallmetadata config parameter
       os.addChild(os.getChildCount(), node);
-      
+
       SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_LOWERNAMES);
-      String lower = variableContext.getParameter(seqPrefix+"lowernames");
-      if (lower != null)
-      {
+      String lower = variableContext.getParameter(seqPrefix + "lowernames");
+      if (lower != null) {
         node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, lower);
-      }
-      else
-      {
+      } else {
         node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
       }
       os.addChild(os.getChildCount(), node2);
-      
+
       SpecificationNode node3 = new SpecificationNode(TikaConfig.NODE_WRITELIMIT);
-      String writeLimit = variableContext.getParameter(seqPrefix+"writelimit");
-      if (writeLimit != null)
-      {
+      String writeLimit = variableContext.getParameter(seqPrefix + "writelimit");
+      if (writeLimit != null) {
         node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, writeLimit);
-      }
-      else
-      {
+      } else {
         node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
       }
       os.addChild(os.getChildCount(), node3);
     }
-    
-    if (variableContext.getParameter(seqPrefix+"ignoretikaexceptions_present") != null)
-    {
+
+    if (variableContext.getParameter(seqPrefix + "ignoretikaexceptions_present") != null) {
       int i = 0;
-      while (i < os.getChildCount())
-      {
+      while (i < os.getChildCount()) {
         SpecificationNode node = os.getChild(i);
         if (node.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
           os.removeChild(i);
@@ -545,7 +761,7 @@ public class TikaExtractor extends org.a
           i++;
       }
 
-      String value = variableContext.getParameter(seqPrefix+"ignoretikaexceptions");
+      String value = variableContext.getParameter(seqPrefix + "ignoretikaexceptions");
       if (value == null)
         value = "false";
 
@@ -553,13 +769,11 @@ public class TikaExtractor extends org.a
       node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, value);
       os.addChild(os.getChildCount(), node);
     }
-    
-    x = variableContext.getParameter(seqPrefix+"boilerplateclassname");
-    if (x != null)
-    {
+
+    x = variableContext.getParameter(seqPrefix + "boilerplateclassname");
+    if (x != null) {
       int i = 0;
-      while (i < os.getChildCount())
-      {
+      while (i < os.getChildCount()) {
         SpecificationNode node = os.getChild(i);
         if (node.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
           os.removeChild(i);
@@ -567,183 +781,265 @@ public class TikaExtractor extends org.a
           i++;
       }
 
-      if (x.length() > 0)
-      {
+      if (x.length() > 0) {
         SpecificationNode node = new SpecificationNode(TikaConfig.NODE_BOILERPLATEPROCESSOR);
         node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, x);
         os.addChild(os.getChildCount(), node);
       }
     }
-    
+
+    x = variableContext.getParameter(seqPrefix + "tikaserver");
+    if (x != null) {
+      int i = 0;
+      while (i < os.getChildCount()) {
+        SpecificationNode node = os.getChild(i);
+        if (node.getType().equals(TikaConfig.NODE_TIKASERVER) || node.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)
+            || node.getType().equals(TikaConfig.NODE_TIKAPORT) || node.getType().equals(TikaConfig.NODE_TIKARETRY))
+          os.removeChild(i);
+        else
+          i++;
+      }
+
+      SpecificationNode node = new SpecificationNode(TikaConfig.NODE_TIKASERVER);
+      String tikaServer = variableContext.getParameter(seqPrefix + "tikaserver");
+      if (tikaServer != null) {
+        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaServer);
+      } else {
+        node.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "false");
+      }
+      // Add the new tikaserver config parameter
+      os.addChild(os.getChildCount(), node);
+
+      SpecificationNode node2 = new SpecificationNode(TikaConfig.NODE_TIKAHOSTNAME);
+      String tikaHostname = variableContext.getParameter(seqPrefix + "tikahostname");
+      if (tikaHostname != null) {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaHostname);
+      } else {
+        node2.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+      }
+      // Add the new tikahostname config parameter
+      os.addChild(os.getChildCount(), node2);
+
+      SpecificationNode node3 = new SpecificationNode(TikaConfig.NODE_TIKAPORT);
+      String tikaPort = variableContext.getParameter(seqPrefix + "tikaport");
+      if (tikaPort != null) {
+        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaPort);
+      } else {
+        node3.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+      }
+      // Add the new tikaport config parameter
+      os.addChild(os.getChildCount(), node3);
+
+      SpecificationNode node4 = new SpecificationNode(TikaConfig.NODE_TIKARETRY);
+      String tikaRetry = variableContext.getParameter(seqPrefix + "tikaretry");
+      if (tikaRetry != null) {
+        node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, tikaRetry);
+      } else {
+        node4.setAttribute(TikaConfig.ATTRIBUTE_VALUE, "");
+      }
+      // Add the new tikaretry config parameter
+      os.addChild(os.getChildCount(), node4);
+    }
+
     return null;
   }
-  
 
-  /** View specification.
-  * This method is called in the body section of a job's view page.  Its purpose is to present the pipeline specification information to the user.
-  * The coder can presume that the HTML that is output from this configuration will be within appropriate <html> and <body> tags.
-  *@param out is the output to which any HTML should be sent.
-  *@param locale is the preferred local of the output.
-  *@param connectionSequenceNumber is the unique number of this connection within the job.
-  *@param os is the current pipeline specification for this job.
-  */
-  @Override
-  public void viewSpecification(IHTTPOutput out, Locale locale, Specification os,
-    int connectionSequenceNumber)
-    throws ManifoldCFException, IOException
-  {
+  /**
+   * View specification. This method is called in the body section of a job's
+   * view page. Its purpose is to present the pipeline specification information
+   * to the user. The coder can presume that the HTML that is output from this
+   * configuration will be within appropriate <html> and <body> tags.
+   * 
+   * @param out
+   *          is the output to which any HTML should be sent.
+   * @param locale
+   *          is the preferred locale of the output.
+   * @param connectionSequenceNumber
+   *          is the unique number of this connection within the job.
+   * @param os
+   *          is the current pipeline specification for this job.
+   */
+  @Override
+  public void viewSpecification(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber)
+      throws ManifoldCFException, IOException {
     Map<String, Object> paramMap = new HashMap<String, Object>();
-    paramMap.put("SEQNUM",Integer.toString(connectionSequenceNumber));
+    paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
 
     // Fill in the map with data from all tabs
+    fillInTikaTypeSpecificationMap(paramMap, os);
     fillInFieldMappingSpecificationMap(paramMap, os);
     fillInExceptionsSpecificationMap(paramMap, os);
     fillInBoilerplateSpecificationMap(paramMap, os);
 
-    Messages.outputResourceWithVelocity(out,locale,VIEW_SPECIFICATION_HTML,paramMap);
-    
+    Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
+
+  }
+
+  protected static void fillInTikaTypeSpecificationMap(Map<String, Object> paramMap, Specification os) {
+    String tikaServer = "false";
+    String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+    String tikaPort = String.valueOf(TikaConfig.TIKAPORT_DEFAULT);
+    String tikaRetry = String.valueOf(TikaConfig.TIKARETRY_DEFAULT);
+    for (int i = 0; i < os.getChildCount(); i++) {
+      SpecificationNode sn = os.getChild(i);
+      if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
+        tikaServer = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
+        tikaHostname = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
+        tikaPort = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
+        tikaRetry = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+      }
+    }
+    paramMap.put("TIKASERVER", tikaServer);
+    paramMap.put("TIKAHOSTNAME", tikaHostname);
+    paramMap.put("TIKAPORT", tikaPort);
+    paramMap.put("TIKARETRY", tikaRetry);
   }
 
-  protected static void fillInFieldMappingSpecificationMap(Map<String,Object> paramMap, Specification os)
-  {
+  protected static void fillInFieldMappingSpecificationMap(Map<String, Object> paramMap, Specification os) {
     // Prep for field mappings
-    List<Map<String,String>> fieldMappings = new ArrayList<Map<String,String>>();
+    List<Map<String, String>> fieldMappings = new ArrayList<Map<String, String>>();
     String keepAllMetadataValue = "true";
     String lowernamesValue = "false";
     String writeLimitValue = "";
-    for (int i = 0; i < os.getChildCount(); i++)
-    {
+    for (int i = 0; i < os.getChildCount(); i++) {
       SpecificationNode sn = os.getChild(i);
       if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
         String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
         String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
         String targetDisplay;
-        if (target == null)
-        {
+        if (target == null) {
           target = "";
           targetDisplay = "(remove)";
-        }
-        else
+        } else
           targetDisplay = target;
-        Map<String,String> fieldMapping = new HashMap<String,String>();
-        fieldMapping.put("SOURCE",source);
-        fieldMapping.put("TARGET",target);
-        fieldMapping.put("TARGETDISPLAY",targetDisplay);
+        Map<String, String> fieldMapping = new HashMap<String, String>();
+        fieldMapping.put("SOURCE", source);
+        fieldMapping.put("TARGET", target);
+        fieldMapping.put("TARGETDISPLAY", targetDisplay);
         fieldMappings.add(fieldMapping);
-      }
-      else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA))
-      {
+      } else if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
         keepAllMetadataValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      }
-      else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES))
-      {
+      } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
         lowernamesValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
-      }
-      else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT))
-      {
+      } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
         writeLimitValue = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
     }
-    paramMap.put("FIELDMAPPINGS",fieldMappings);
-    paramMap.put("KEEPALLMETADATA",keepAllMetadataValue);
-    paramMap.put("LOWERNAMES",lowernamesValue);
-    paramMap.put("WRITELIMIT",writeLimitValue);
+    paramMap.put("FIELDMAPPINGS", fieldMappings);
+    paramMap.put("KEEPALLMETADATA", keepAllMetadataValue);
+    paramMap.put("LOWERNAMES", lowernamesValue);
+    paramMap.put("WRITELIMIT", writeLimitValue);
   }
 
-  protected static void fillInExceptionsSpecificationMap(Map<String,Object> paramMap, Specification os)
-  {
+  protected static void fillInExceptionsSpecificationMap(Map<String, Object> paramMap, Specification os) {
     String ignoreTikaExceptions = "true";
-    for (int i = 0; i < os.getChildCount(); i++)
-    {
+    for (int i = 0; i < os.getChildCount(); i++) {
       SpecificationNode sn = os.getChild(i);
-      if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION))
-      {
+      if (sn.getType().equals(TikaConfig.NODE_IGNORETIKAEXCEPTION)) {
         ignoreTikaExceptions = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
     }
-    paramMap.put("IGNORETIKAEXCEPTIONS",ignoreTikaExceptions);
+    paramMap.put("IGNORETIKAEXCEPTIONS", ignoreTikaExceptions);
   }
 
-  protected static void fillInBoilerplateSpecificationMap(Map<String,Object> paramMap, Specification os)
-  {
+  protected static void fillInBoilerplateSpecificationMap(Map<String, Object> paramMap, Specification os) {
     String boilerplateClassName = "";
-    for (int i = 0; i < os.getChildCount(); i++)
-    {
+    for (int i = 0; i < os.getChildCount(); i++) {
       SpecificationNode sn = os.getChild(i);
-      if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR))
-      {
+      if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
         boilerplateClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
       }
     }
-    paramMap.put("BOILERPLATECLASSNAME",boilerplateClassName);
+    paramMap.put("BOILERPLATECLASSNAME", boilerplateClassName);
   }
 
   protected static int handleTikaException(TikaException e)
-    throws IOException, ManifoldCFException, ServiceInterruption
-  {
+      throws IOException, ManifoldCFException, ServiceInterruption {
+    // MHL - what does Tika throw if it gets an IOException reading the stream??
+    Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(), e);
+    return DOCUMENTSTATUS_REJECTED;
+  }
+
+  protected static int handleTikaServerRejects(String reason)
+      throws IOException, ManifoldCFException, ServiceInterruption {
+    // MHL - what does Tika throw if it gets an IOException reading the stream??
+    Logging.ingest.warn("Tika Server: Tika Server rejects: " + reason);
+    return DOCUMENTSTATUS_REJECTED;
+  }
+
+  protected static int handleTikaServerError(String description)
+      throws IOException, ManifoldCFException, ServiceInterruption {
+    // MHL - what does Tika throw if it gets an IOException reading the stream??
+    Logging.ingest.warn("Tika Server: Tika Server error: " + description);
+    return DOCUMENTSTATUS_REJECTED;
+  }
+
+  protected static int handleTikaServerException(IOException e)
+      throws IOException, ManifoldCFException, ServiceInterruption {
     // MHL - what does Tika throw if it gets an IOException reading the stream??
-    Logging.ingest.warn("Tika: Tika exception extracting: "+e.getMessage(),e);
+    Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(), e);
     return DOCUMENTSTATUS_REJECTED;
   }
-  
-  protected static int handleSaxException(SAXException e)
-    throws IOException, ManifoldCFException, ServiceInterruption
-  {
+
+  protected static int handleTikaServerException(ParseException e)
+      throws IOException, ManifoldCFException, ServiceInterruption {
+    // MHL - what does Tika throw if it gets an IOException reading the stream??
+    Logging.ingest.warn("Tika: Tika exception extracting: " + e.getMessage(), e);
+    return DOCUMENTSTATUS_REJECTED;
+  }
+
+  protected static int handleSaxException(SAXException e) throws IOException, ManifoldCFException, ServiceInterruption {
     // MHL - what does this mean?
-    Logging.ingest.warn("Tika: SAX exception extracting: "+e.getMessage(),e);
+    Logging.ingest.warn("Tika: SAX exception extracting: " + e.getMessage(), e);
     return DOCUMENTSTATUS_REJECTED;
   }
-  
-  protected static int handleIOException(IOException e)
-    throws ManifoldCFException
-  {
+
+  protected static int handleIOException(IOException e) throws ManifoldCFException {
     // IOException reading from our local storage...
     if (e instanceof InterruptedIOException)
-      throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-    throw new ManifoldCFException(e.getMessage(),e);
+      throw new ManifoldCFException(e.getMessage(), e, ManifoldCFException.INTERRUPTED);
+    throw new ManifoldCFException(e.getMessage(), e);
   }
-  
-  protected static interface DestinationStorage
-  {
-    /** Get the output stream to write to.  Caller should explicitly close this stream when done writing.
-    */
-    public OutputStream getOutputStream()
-      throws ManifoldCFException;
-    
-    /** Get new binary length.
-    */
-    public long getBinaryLength()
-      throws ManifoldCFException;
-
-    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
-    */
-    public InputStream getInputStream()
-      throws ManifoldCFException;
-    
-    /** Close the object and clean up everything.
-    * This should be called when the data is no longer needed.
-    */
-    public void close()
-      throws ManifoldCFException;
-  }
-  
-  protected static class FileDestinationStorage implements DestinationStorage
-  {
+
+  protected static interface DestinationStorage {
+    /**
+     * Get the output stream to write to. Caller should explicitly close this
+     * stream when done writing.
+     */
+    public OutputStream getOutputStream() throws ManifoldCFException;
+
+    /**
+     * Get new binary length.
+     */
+    public long getBinaryLength() throws ManifoldCFException;
+
+    /**
+     * Get the input stream to read from. Caller should explicitly close this
+     * stream when done reading.
+     */
+    public InputStream getInputStream() throws ManifoldCFException;
+
+    /**
+     * Close the object and clean up everything. This should be called when the
+     * data is no longer needed.
+     */
+    public void close() throws ManifoldCFException;
+  }
+
+  protected static class FileDestinationStorage implements DestinationStorage {
     protected final File outputFile;
     protected final OutputStream outputStream;
 
-    public FileDestinationStorage()
-      throws ManifoldCFException
-    {
+    public FileDestinationStorage() throws ManifoldCFException {
       File outputFile;
       OutputStream outputStream;
-      try
-      {
-        outputFile = File.createTempFile("mcftika","tmp");
+      try {
+        outputFile = File.createTempFile("mcftika", "tmp");
         outputStream = new FileOutputStream(outputFile);
-      }
-      catch (IOException e)
-      {
+      } catch (IOException e) {
         handleIOException(e);
         outputFile = null;
         outputStream = null;
@@ -751,121 +1047,127 @@ public class TikaExtractor extends org.a
       this.outputFile = outputFile;
       this.outputStream = outputStream;
     }
-    
+
     @Override
-    public OutputStream getOutputStream()
-      throws ManifoldCFException
-    {
+    public OutputStream getOutputStream() throws ManifoldCFException {
       return outputStream;
     }
-    
-    /** Get new binary length.
-    */
+
+    /**
+     * Get new binary length.
+     */
     @Override
-    public long getBinaryLength()
-      throws ManifoldCFException
-    {
+    public long getBinaryLength() throws ManifoldCFException {
       return outputFile.length();
     }
 
-    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
-    */
+    /**
+     * Get the input stream to read from. Caller should explicitly close this
+     * stream when done reading.
+     */
     @Override
-    public InputStream getInputStream()
-      throws ManifoldCFException
-    {
-      try
-      {
+    public InputStream getInputStream() throws ManifoldCFException {
+      try {
         return new FileInputStream(outputFile);
-      }
-      catch (IOException e)
-      {
+      } catch (IOException e) {
         handleIOException(e);
         return null;
       }
     }
-    
-    /** Close the object and clean up everything.
-    * This should be called when the data is no longer needed.
-    */
+
+    /**
+     * Close the object and clean up everything. This should be called when the
+     * data is no longer needed.
+     */
     @Override
-    public void close()
-      throws ManifoldCFException
-    {
+    public void close() throws ManifoldCFException {
       outputFile.delete();
     }
 
   }
-  
-  protected static class MemoryDestinationStorage implements DestinationStorage
-  {
+
+  protected static class MemoryDestinationStorage implements DestinationStorage {
     protected final ByteArrayOutputStream outputStream;
-    
-    public MemoryDestinationStorage(int sizeHint)
-    {
+
+    public MemoryDestinationStorage(int sizeHint) {
       outputStream = new ByteArrayOutputStream(sizeHint);
     }
-    
+
     @Override
-    public OutputStream getOutputStream()
-      throws ManifoldCFException
-    {
+    public OutputStream getOutputStream() throws ManifoldCFException {
       return outputStream;
     }
 
-    /** Get new binary length.
-    */
+    /**
+     * Get new binary length.
+     */
     @Override
-    public long getBinaryLength()
-      throws ManifoldCFException
-    {
+    public long getBinaryLength() throws ManifoldCFException {
       return outputStream.size();
     }
-    
-    /** Get the input stream to read from.  Caller should explicitly close this stream when done reading.
-    */
+
+    /**
+     * Get the input stream to read from. Caller should explicitly close this
+     * stream when done reading.
+     */
     @Override
-    public InputStream getInputStream()
-      throws ManifoldCFException
-    {
+    public InputStream getInputStream() throws ManifoldCFException {
       return new ByteArrayInputStream(outputStream.toByteArray());
     }
-    
-    /** Close the object and clean up everything.
-    * This should be called when the data is no longer needed.
-    */
-    public void close()
-      throws ManifoldCFException
-    {
+
+    /**
+     * Close the object and clean up everything. This should be called when the
+     * data is no longer needed.
+     */
+    public void close() throws ManifoldCFException {
     }
 
   }
 
   protected static class SpecPacker {
-    
-    private final Map<String,String> sourceTargets = new HashMap<String,String>();
+
+    private final Map<String, String> sourceTargets = new HashMap<String, String>();
     private final boolean keepAllMetadata;
     private final boolean lowerNames;
     private final int writeLimit;
     private final boolean ignoreTikaException;
     private final String extractorClassName;
-    
+    private URI metaURI;
+    private URI contentURI;
+    private final String tikaHostname;
+    private final int tikaPort;
+    private final boolean tikaServer;
+    private final long tikaRetry;
+
     public SpecPacker(Specification os) {
       boolean keepAllMetadata = true;
       boolean lowerNames = false;
       int writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
       boolean ignoreTikaException = true;
       String extractorClassName = null;
+      String tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+      int tikaPort = TikaConfig.TIKAPORT_DEFAULT;
+      boolean tikaServer = false;
+      long tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
+      try {
+        metaURI = new URI("/meta");
+        contentURI = new URI("/tika");
+      } catch (URISyntaxException e) {
+        // Should be impossible
+        metaURI = null;
+        contentURI = null;
+      }
+
       for (int i = 0; i < os.getChildCount(); i++) {
         SpecificationNode sn = os.getChild(i);
-        
-        if(sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
+
+        if (sn.getType().equals(TikaConfig.NODE_KEEPMETADATA)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           keepAllMetadata = Boolean.parseBoolean(value);
-        } else if(sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
+        } else if (sn.getType().equals(TikaConfig.NODE_LOWERNAMES)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           lowerNames = Boolean.parseBoolean(value);
-        } else if(sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
+        } else if (sn.getType().equals(TikaConfig.NODE_WRITELIMIT)) {
           String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
           if (value.length() == 0) {
             writeLimit = TikaConfig.WRITELIMIT_DEFAULT;
@@ -875,7 +1177,7 @@ public class TikaExtractor extends org.a
         } else if (sn.getType().equals(TikaConfig.NODE_FIELDMAP)) {
           String source = sn.getAttributeValue(TikaConfig.ATTRIBUTE_SOURCE);
           String target = sn.getAttributeValue(TikaConfig.ATTRIBUTE_TARGET);
-          
+
           if (target == null) {
             target = "";
           }
@@ -885,6 +1187,34 @@ public class TikaExtractor extends org.a
           ignoreTikaException = Boolean.parseBoolean(value);
         } else if (sn.getType().equals(TikaConfig.NODE_BOILERPLATEPROCESSOR)) {
           extractorClassName = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+        } else if (sn.getType().equals(TikaConfig.NODE_TIKAHOSTNAME)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          if (value.length() == 0) {
+            tikaHostname = TikaConfig.TIKAHOSTNAME_DEFAULT;
+          } else {
+            tikaHostname = value;
+          }
+        } else if (sn.getType().equals(TikaConfig.NODE_TIKAPORT)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          if (value.length() == 0) {
+            tikaPort = TikaConfig.TIKAPORT_DEFAULT;
+          } else {
+            tikaPort = Integer.parseInt(value);
+          }
+        } else if (sn.getType().equals(TikaConfig.NODE_TIKASERVER)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          if (value.length() == 0) {
+            tikaServer = false;
+          } else {
+            tikaServer = Boolean.parseBoolean(value);
+          }
+        } else if (sn.getType().equals(TikaConfig.NODE_TIKARETRY)) {
+          String value = sn.getAttributeValue(TikaConfig.ATTRIBUTE_VALUE);
+          if (value.length() == 0) {
+            tikaRetry = TikaConfig.TIKARETRY_DEFAULT;
+          } else {
+            tikaRetry = Long.parseLong(value);
+          }
         }
       }
       this.keepAllMetadata = keepAllMetadata;
@@ -892,12 +1222,16 @@ public class TikaExtractor extends org.a
       this.writeLimit = writeLimit;
       this.ignoreTikaException = ignoreTikaException;
       this.extractorClassName = extractorClassName;
+      this.tikaHostname = tikaHostname;
+      this.tikaPort = tikaPort;
+      this.tikaServer = tikaServer;
+      this.tikaRetry = tikaRetry;
     }
-    
+
     public String toPackedString() {
       StringBuilder sb = new StringBuilder();
       int i;
-      
+
       // Mappings
       final String[] sortArray = new String[sourceTargets.size()];
       i = 0;
@@ -905,7 +1239,7 @@ public class TikaExtractor extends org.a
         sortArray[i++] = source;
       }
       java.util.Arrays.sort(sortArray);
-      
+
       List<String> packedMappings = new ArrayList<String>();
       String[] fixedList = new String[2];
       for (String source : sortArray) {
@@ -913,10 +1247,10 @@ public class TikaExtractor extends org.a
         StringBuilder localBuffer = new StringBuilder();
         fixedList[0] = source;
         fixedList[1] = target;
-        packFixedList(localBuffer,fixedList,':');
+        packFixedList(localBuffer, fixedList, ':');
         packedMappings.add(localBuffer.toString());
       }
-      packList(sb,packedMappings,'+');
+      packList(sb, packedMappings, '+');
 
       // Keep all metadata
       if (keepAllMetadata)
@@ -924,12 +1258,11 @@ public class TikaExtractor extends org.a
       else
         sb.append('-');
       if (lowerNames)
-          sb.append('+');
-        else
-          sb.append('-');
+        sb.append('+');
+      else
+        sb.append('-');
 
-      if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT)
-      {
+      if (writeLimit != TikaConfig.WRITELIMIT_DEFAULT) {
         sb.append('+');
         sb.append(writeLimit);
       }
@@ -939,55 +1272,60 @@ public class TikaExtractor extends org.a
       else
         sb.append('-');
 
-      if (extractorClassName != null)
-      {
+      if (extractorClassName != null) {
         sb.append('+');
         sb.append(extractorClassName);
-      }
-      else
+      } else
         sb.append('-');
-      
+
       return sb.toString();
     }
-    
+
+    public URI metaURI() {
+      return metaURI;
+    }
+
+    public URI contentURI() {
+      return contentURI;
+    }
+
     public String getMapping(String source) {
       return sourceTargets.get(source);
     }
-    
+
     public boolean keepAllMetadata() {
       return keepAllMetadata;
     }
-    
+
     public boolean lowerNames() {
       return lowerNames;
     }
-    
+
     public int writeLimit() {
       return writeLimit;
     }
-    
+
     public boolean ignoreTikaException() {
       return ignoreTikaException;
     }
-    
-    public BoilerpipeExtractor getExtractorClassInstance()
-      throws ManifoldCFException {
+
+    public BoilerpipeExtractor getExtractorClassInstance() throws ManifoldCFException {
       if (extractorClassName == null)
         return null;
       try {
         ClassLoader loader = BoilerpipeExtractor.class.getClassLoader();
         Class extractorClass = loader.loadClass(extractorClassName);
         java.lang.reflect.Field f = extractorClass.getField("INSTANCE");
-        return (BoilerpipeExtractor)f.get(null);
+        return (BoilerpipeExtractor) f.get(null);
       } catch (ClassNotFoundException e) {
-        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' not found: "+e.getMessage(),e);
+        throw new ManifoldCFException(
+            "Boilerpipe extractor class '" + extractorClassName + "' not found: " + e.getMessage(), e);
       } catch (Exception e) {
-        throw new ManifoldCFException("Boilerpipe extractor class '"+extractorClassName+"' exception on instantiation: "+e.getMessage(),e);
+        throw new ManifoldCFException(
+            "Boilerpipe extractor class '" + extractorClassName + "' exception on instantiation: " + e.getMessage(), e);
       }
     }
 
   }
 
 }
-
-

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_en_US.properties Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TikaExtractor.TikaHostname=Tika hostname:
+TikaExtractor.TikaPort=Tika port:
+TikaExtractor.TikaRetry=Retry interval (ms):
+TikaExtractor.TikaParsersSelected=Tika Parsers
+TikaExtractor.TikaServerSelected=Tika Server
+TikaExtractor.TikaTypeTabName=Tika type
+TikaExtractor.TikaType=Tika type:
 TikaExtractor.FieldMappingTabName=Field mapping
 TikaExtractor.ExceptionsTabName=Exceptions
 TikaExtractor.BoilerplateTabName=Boilerplate

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_es_ES.properties Wed May 10 13:27:33 2017
@@ -36,4 +36,11 @@ TikaExtractor.AddFieldMapping=Añadir
 TikaExtractor.Delete=Borrar
 TikaExtractor.DeleteFieldMapping=Eliminar asignación de campos
 TikaExtractor.NoFieldNameSpecified=Por favor, especifique un nombre de campo
-TikaExtractor.IgnoreTikaExceptions=No haga caso de excepciones Tika:
\ No newline at end of file
+TikaExtractor.IgnoreTikaExceptions=No haga caso de excepciones Tika:
+TikaExtractor.TikaHostname=Tika hostname:
+TikaExtractor.TikaPort=Tika port:
+TikaExtractor.TikaRetry=Intervalo de reintento (ms):
+TikaExtractor.TikaParsersSelected=Tika Parsers
+TikaExtractor.TikaServerSelected=Tika Server
+TikaExtractor.TikaTypeTabName=Tika type
+TikaExtractor.TikaType=Tika type:

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_ja_JP.properties Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TikaExtractor.TikaHostname=Tika hostname:
+TikaExtractor.TikaPort=Tika port:
+TikaExtractor.TikaRetry=Retry interval (ms):
+TikaExtractor.TikaParsersSelected=Tika Parsers
+TikaExtractor.TikaServerSelected=Tika Server
+TikaExtractor.TikaTypeTabName=Tika type
+TikaExtractor.TikaType=Tika type:
 TikaExtractor.FieldMappingTabName=フィールドマッピング
 TikaExtractor.ExceptionsTabName=例外
 TikaExtractor.BoilerplateTabName=Boilerplate

Modified: manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties?rev=1794722&r1=1794721&r2=1794722&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties (original)
+++ manifoldcf/branches/CONNECTORS-1425/connectors/tika/connector/src/main/native2ascii/org/apache/manifoldcf/agents/transformation/tika/common_zh_CN.properties Wed May 10 13:27:33 2017
@@ -13,6 +13,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+TikaExtractor.TikaHostname=Tika hostname:
+TikaExtractor.TikaPort=Tika port:
+TikaExtractor.TikaRetry=Retry interval (ms):
+TikaExtractor.TikaParsersSelected=Tika Parsers
+TikaExtractor.TikaServerSelected=Tika Server
+TikaExtractor.TikaTypeTabName=Tika type
+TikaExtractor.TikaType=Tika type:
 TikaExtractor.FieldMappingTabName=字段映射
 TikaExtractor.ExceptionsTabName=异常
 TikaExtractor.BoilerplateTabName=Boilerplate



Mime
View raw message