incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r933029 - in /incubator/lcf/trunk/modules: connectors/gts/connector/org/apache/lcf/agents/output/gts/ connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/ connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawl...
Date Sun, 11 Apr 2010 23:51:12 GMT
Author: kwright
Date: Sun Apr 11 23:51:11 2010
New Revision: 933029

URL: http://svn.apache.org/viewvc?rev=933029&view=rev
Log:
Add output connector support for identifying the mime types that are interesting to ingest.

Modified:
    incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
    incubator/lcf/trunk/modules/connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/RSSConnector.java
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IVersionActivity.java
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java

Modified: incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
(original)
+++ incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
Sun Apr 11 23:51:11 2010
@@ -143,6 +143,74 @@ public class GTSConnector extends org.ap
     }
   }
 
+  protected static final String[] ingestableMimeTypeArray = new String[]
+  {
+    "application/excel",
+      "application/powerpoint",
+      "application/ppt",
+      "application/rtf",
+      "application/xls",
+      "text/html",
+      "text/rtf",
+      "text/pdf",
+      "application/x-excel",
+      "application/x-msexcel",
+      "application/x-mspowerpoint",
+      "application/x-msword-doc",
+      "application/x-msword",
+      "application/x-word",
+      "Application/pdf",
+      "text/xml",
+      "no-type",
+      "text/plain",
+      "application/pdf",
+      "application/x-rtf",
+      "application/vnd.ms-excel",
+      "application/vnd.ms-pps",
+      "application/vnd.ms-powerpoint",
+      "application/vnd.ms-word",
+      "application/msword",
+      "application/msexcel",
+      "application/mspowerpoint",
+      "application/ms-powerpoint",
+      "application/ms-word",
+      "application/ms-excel",
+      "Adobe",
+      "application/Vnd.Ms-Excel",
+      "vnd.ms-powerpoint",
+      "application/x-pdf",
+      "winword",
+      "text/richtext",
+      "Text",
+      "Text/html",
+      "application/MSWORD",
+      "application/PDF",
+      "application/MSEXCEL",
+      "application/MSPOWERPOINT"
+  };
+
+  protected static final Map ingestableMimeTypeMap = new HashMap();
+  static
+  {
+    int i = 0;
+    while (i < ingestableMimeTypeArray.length)
+    {
+      String type = ingestableMimeTypeArray[i++];
+      ingestableMimeTypeMap.put(type,type);
+    }
+  }
+
+  /** Detect if a mime type is indexable or not.  This method is used by participating repository connectors to pre-filter the number of
+  * unusable documents that will be passed to this output connector.
+  *@param mimeType is the mime type of the document.
+  *@return true if the mime type is indexable by this connector.
+  */
+  public boolean checkMimeTypeIndexable(String mimeType)
+    throws LCFException, ServiceInterruption
+  {
+    return (ingestableMimeTypeMap.get(mimeType) != null);
+  }
+
   /** Pre-determine whether a document (passed here as a File object) is indexable by this
connector.  This method is used by participating
   * repository connectors to help reduce the number of unmanageable documents that are passed
to this output connector in advance of an
   * actual transfer.  This hook is provided mainly to support search engines that only handle
a small set of accepted file types.

Modified: incubator/lcf/trunk/modules/connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/RSSConnector.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/RSSConnector.java
(original)
+++ incubator/lcf/trunk/modules/connectors/rss/connector/org/apache/lcf/crawler/connectors/rss/RSSConnector.java
Sun Apr 11 23:51:11 2010
@@ -140,65 +140,6 @@ public class RSSConnector extends org.ap
 
   protected static DataCache cache = new DataCache();
 
-  protected static final String[] interestingMimeTypeArray = new String[]
-  {
-    "application/excel",
-      "application/powerpoint",
-      "application/ppt",
-      "application/rtf",
-      "application/xls",
-      "text/html",
-      "text/rtf",
-      "text/pdf",
-      "application/x-excel",
-      "application/x-msexcel",
-      "application/x-mspowerpoint",
-      "application/x-msword-doc",
-      "application/x-msword",
-      "application/x-word",
-      "Application/pdf",
-      "text/xml",
-      "no-type",
-      "text/plain",
-      "application/pdf",
-      "application/x-rtf",
-      "application/vnd.ms-excel",
-      "application/vnd.ms-pps",
-      "application/vnd.ms-powerpoint",
-      "application/vnd.ms-word",
-      "application/msword",
-      "application/msexcel",
-      "application/mspowerpoint",
-      "application/ms-powerpoint",
-      "application/ms-word",
-      "application/ms-excel",
-      "Adobe",
-      "application/Vnd.Ms-Excel",
-      "vnd.ms-powerpoint",
-      "application/x-pdf",
-      "winword",
-      "text/richtext",
-      "Text",
-      "Text/html",
-      "application/MSWORD",
-      "application/PDF",
-      "application/MSEXCEL",
-      "application/MSPOWERPOINT",
-      "application/rss+xml",
-      "application/xml"
-  };
-
-  protected static final Map interestingMimeTypeMap = new HashMap();
-  static
-  {
-    int i = 0;
-    while (i < interestingMimeTypeArray.length)
-    {
-      String type = interestingMimeTypeArray[i++];
-      interestingMimeTypeMap.put(type,type);
-    }
-  }
-
 
   protected static final Map understoodProtocols = new HashMap();
   static
@@ -1098,7 +1039,7 @@ public class RSSConnector extends org.ap
                         // Decide whether to exclude this document based on what we see here.
                         // Basically, we want to get rid of everything that we don't know
what
                         // to do with in the ingestion system.
-                        if (!isContentInteresting(contentType))
+                        if (!isContentInteresting(activities,contentType))
                         {
                           if (Logging.connectors.isDebugEnabled())
                             Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because
it had the wrong content type: "+((contentType==null)?"null":"'"+contentType+"'"));
@@ -3284,8 +3225,8 @@ public class RSSConnector extends org.ap
 
   /** Code to check if data is interesting, based on response code and content type.
   */
-  protected boolean isContentInteresting(String contentType)
-    throws LCFException
+  protected boolean isContentInteresting(IFingerprintActivity activities, String contentType)
+    throws ServiceInterruption, LCFException
   {
     // Look at the content type and decide if it's a kind we want.  This is defined
     // as something we think we can either ingest, or extract links from.
@@ -3299,8 +3240,8 @@ public class RSSConnector extends org.ap
     if (pos != -1)
       contentType = contentType.substring(0,pos);
     contentType = contentType.trim();
-
-    return interestingMimeTypeMap.get(contentType) != null;
+    
+    return activities.checkMimeTypeIndexable(contentType);
   }
 
   /** Stuffer for packing a single string with an end delimiter */

Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Sun Apr 11 23:51:11 2010
@@ -78,110 +78,34 @@ public class WebcrawlerConnector extends
   protected static final int RESULTSTATUS_TRUE = 1;
   protected static final int RESULTSTATUS_NOTYETDETERMINED = 2;
 
-  protected static final String[] ingestableMimeTypeArray = new String[]
-  {
-    "application/excel",
-      "application/powerpoint",
-      "application/ppt",
-      "application/rtf",
-      "application/xls",
-      "text/html",
-      "text/rtf",
-      "text/pdf",
-      "application/x-excel",
-      "application/x-msexcel",
-      "application/x-mspowerpoint",
-      "application/x-msword-doc",
-      "application/x-msword",
-      "application/x-word",
-      "Application/pdf",
-      "text/xml",
-      "no-type",
-      "text/plain",
-      "application/pdf",
-      "application/x-rtf",
-      "application/vnd.ms-excel",
-      "application/vnd.ms-pps",
-      "application/vnd.ms-powerpoint",
-      "application/vnd.ms-word",
-      "application/msword",
-      "application/msexcel",
-      "application/mspowerpoint",
-      "application/ms-powerpoint",
-      "application/ms-word",
-      "application/ms-excel",
-      "Adobe",
-      "application/Vnd.Ms-Excel",
-      "vnd.ms-powerpoint",
-      "application/x-pdf",
-      "winword",
-      "text/richtext",
-      "Text",
-      "Text/html",
-      "application/MSWORD",
-      "application/PDF",
-      "application/MSEXCEL",
-      "application/MSPOWERPOINT"
-  };
-
-
+  /** This represents a list of the mime types that this connector knows how to extract links from.
+  * Documents that are indexable are described by the output connector. */
   protected static final String[] interestingMimeTypeArray = new String[]
   {
-    "application/excel",
-      "application/powerpoint",
-      "application/ppt",
-      "application/rtf",
-      "application/xls",
-      "text/html",
-      "text/rtf",
-      "text/pdf",
-      "application/x-excel",
-      "application/x-msexcel",
-      "application/x-mspowerpoint",
-      "application/x-msword-doc",
-      "application/x-msword",
-      "application/x-word",
-      "Application/pdf",
-      "text/xml",
-      "no-type",
-      "text/plain",
-      "application/pdf",
-      "application/x-rtf",
-      "application/vnd.ms-excel",
-      "application/vnd.ms-pps",
-      "application/vnd.ms-powerpoint",
-      "application/vnd.ms-word",
-      "application/msword",
-      "application/msexcel",
-      "application/mspowerpoint",
-      "application/ms-powerpoint",
-      "application/ms-word",
-      "application/ms-excel",
-      "Adobe",
-      "application/Vnd.Ms-Excel",
-      "vnd.ms-powerpoint",
-      "application/x-pdf",
-      "winword",
-      "text/richtext",
-      "Text",
-      "Text/html",
-      "application/MSWORD",
-      "application/PDF",
-      "application/MSEXCEL",
-      "application/MSPOWERPOINT"
+    "application/rtf",
+    "application/xls",
+    "text/html",
+    "text/rtf",
+    "application/x-excel",
+    "application/x-msexcel",
+    "application/x-mspowerpoint",
+    "application/x-msword-doc",
+    "application/x-msword",
+    "application/x-word",
+    "text/xml",
+    "no-type",
+    "text/plain",
+    "application/x-rtf",
+    "application/x-pdf",
+    "text/richtext",
+    "Text",
+    "Text/html"
   };
 
-  protected static final Map ingestableMimeTypeMap = new HashMap();
   protected static final Map interestingMimeTypeMap = new HashMap();
   static
   {
     int i = 0;
-    while (i < ingestableMimeTypeArray.length)
-    {
-      String type = ingestableMimeTypeArray[i++];
-      ingestableMimeTypeMap.put(type,type);
-    }
-    i = 0;
     while (i < interestingMimeTypeArray.length)
     {
       String type = interestingMimeTypeArray[i++];
@@ -774,7 +698,7 @@ public class WebcrawlerConnector extends
                             contentType = null;
                         }
 
-                        if (isContentInteresting(currentURI,response,contentType))
+                        if (isContentInteresting(activities,currentURI,response,contentType))
                         {
                           // Treat it as real, and cache it.
                           checkSum = cache.addData(activities,currentURI,connection);
@@ -1218,7 +1142,7 @@ public class WebcrawlerConnector extends
         // We can exclude it if it does not seem to be a kind of document that the ingestion
system knows
         // about.
 
-        if (isDataIngestable(documentIdentifier))
+        if (isDataIngestable(activities,documentIdentifier))
         {
           // Ingest the document
           if (Logging.connectors.isDebugEnabled())
@@ -1966,8 +1890,8 @@ public class WebcrawlerConnector extends
 
   /** Code to check if data is interesting, based on response code and content type.
   */
-  protected boolean isContentInteresting(String documentIdentifier, int response, String contentType)
-    throws LCFException
+  protected boolean isContentInteresting(IFingerprintActivity activities, String documentIdentifier, int response, String contentType)
+    throws ServiceInterruption, LCFException
   {
     // Additional filtering only done if it's a 200 response
     if (response != 200)
@@ -1987,13 +1911,17 @@ public class WebcrawlerConnector extends
       contentType = contentType.substring(0,pos);
     contentType = contentType.trim();
 
-    return interestingMimeTypeMap.get(contentType) != null;
+    // There are presumably mime types we can extract links from that we can't index?
+    if (interestingMimeTypeMap.get(contentType) != null)
+      return true;
+    
+    return activities.checkMimeTypeIndexable(contentType);
   }
 
   /** Code to check if an already-fetched document should be ingested.
   */
-  protected boolean isDataIngestable(String documentIdentifier)
-    throws LCFException
+  protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier)
+    throws ServiceInterruption, LCFException
   {
     if (cache.getResponseCode(documentIdentifier) != 200)
       return false;
@@ -2019,13 +1947,7 @@ public class WebcrawlerConnector extends
       contentType = contentType.substring(0,pos);
     contentType = contentType.trim();
 
-    if (ingestableMimeTypeMap.get(contentType) == null)
-      return false;
-
-    // Now, it looks good, but let's be certain by doing fingerprinting.
-    // MHL
-
-    return true;
+    return activities.checkMimeTypeIndexable(contentType);
   }
 
   /** Find a redirection URI, if it exists */

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
(original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
Sun Apr 11 23:51:11 2010
@@ -191,6 +191,29 @@ public class IncrementalIngester extends
     performDelete("",null,null);
   }
 
+  /** Check if a mime type is indexable.
+  *@param outputConnectionName is the name of the output connection associated with this
action.
+  *@param mimeType is the mime type to check.
+  *@return true if the mimeType is indexable.
+  */
+  public boolean checkMimeTypeIndexable(String outputConnectionName, String mimeType)
+    throws LCFException, ServiceInterruption
+  {
+    IOutputConnection connection = connectionManager.load(outputConnectionName);
+    IOutputConnector connector = OutputConnectorFactory.grab(threadContext,connection.getClassName(),connection.getConfigParams(),connection.getMaxConnections());
+    if (connector == null)
+      // The connector is not installed; treat this as a service interruption.
+      throw new ServiceInterruption("Output connector not installed",300000L);
+    try
+    {
+      return connector.checkMimeTypeIndexable(mimeType);
+    }
+    finally
+    {
+      OutputConnectorFactory.release(connector);
+    }
+  }
+
   /** Check if a file is indexable.
   *@param outputConnectionName is the name of the output connection associated with this
action.
   *@param localFile is the local file to check.

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
(original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
Sun Apr 11 23:51:11 2010
@@ -69,6 +69,14 @@ public interface IIncrementalIngester
   public void clearAll()
     throws LCFException;
 
+  /** Check if a mime type is indexable.
+  *@param outputConnectionName is the name of the output connection associated with this
action.
+  *@param mimeType is the mime type to check.
+  *@return true if the mimeType is indexable.
+  */
+  public boolean checkMimeTypeIndexable(String outputConnectionName, String mimeType)
+    throws LCFException, ServiceInterruption;
+
   /** Check if a file is indexable.
   *@param outputConnectionName is the name of the output connection associated with this
action.
   *@param localFile is the local file to check.

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
(original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
Sun Apr 11 23:51:11 2010
@@ -128,6 +128,14 @@ public interface IOutputConnector
   */
   public ConfigParams getConfiguration();
 
+  /** Detect if a mime type is indexable or not.  This method is used by participating repository
connectors to pre-filter the number of
+  * unusable documents that will be passed to this output connector.
+  *@param mimeType is the mime type of the document.
+  *@return true if the mime type is indexable by this connector.
+  */
+  public boolean checkMimeTypeIndexable(String mimeType)
+    throws LCFException, ServiceInterruption;
+
   /** Pre-determine whether a document (passed here as a File object) is indexable by this
connector.  This method is used by participating
   * repository connectors to help reduce the number of unmanageable documents that are passed
to this output connector in advance of an
   * actual transfer.  This hook is provided mainly to support search engines that only handle
a small set of accepted file types.

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
(original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
Sun Apr 11 23:51:11 2010
@@ -143,6 +143,17 @@ public abstract class BaseOutputConnecto
     return params;
   }
 
+  /** Detect if a mime type is indexable or not.  This method is used by participating repository
connectors to pre-filter the number of
+  * unusable documents that will be passed to this output connector.
+  *@param mimeType is the mime type of the document.
+  *@return true if the mime type is indexable by this connector.
+  */
+  public boolean checkMimeTypeIndexable(String mimeType)
+    throws LCFException, ServiceInterruption
+  {
+    return true;
+  }
+
   /** Pre-determine whether a document (passed here as a File object) is indexable by this
connector.  This method is used by participating
   * repository connectors to help reduce the number of unmanageable documents that are passed
to this output connector in advance of an
   * actual transfer.  This hook is provided mainly to support search engines that only handle
a small set of accepted file types.

Modified: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
(original)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
Sun Apr 11 23:51:11 2010
@@ -23,12 +23,20 @@ import org.apache.lcf.agents.interfaces.
 import java.util.*;
 import java.io.*;
 
-/** This interface abstracts from the activities that handle document fingerprinting.
+/** This interface abstracts from the activities that handle document fingerprinting and mime type acceptance.
 */
 public interface IFingerprintActivity
 {
   public static final String _rcsid = "@(#)$Id$";
 
+  /** Detect if a mime type is indexable or not.  This method is used by participating repository connectors to pre-filter the number of
+  * unusable documents that will be passed to the currently specified output connector.
+  *@param mimeType is the mime type of the document.
+  *@return true if the mime type is indexable by the currently specified output connector.
+  */
+  public boolean checkMimeTypeIndexable(String mimeType)
+    throws LCFException, ServiceInterruption;
+
   /** Check whether a document is indexable by the currently specified output connector.
   *@param localFile is the local copy of the file to check.
   *@return true if the document is indexable.

Modified: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IVersionActivity.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IVersionActivity.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IVersionActivity.java
(original)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IVersionActivity.java
Sun Apr 11 23:51:11 2010
@@ -25,7 +25,7 @@ import java.util.*;
 /** This interface abstracts from the activities that a versioning operation can do.
 * See IProcessActivity for a description of the event model.
 */
-public interface IVersionActivity extends IHistoryActivity, IEventActivity, IAbortActivity
+public interface IVersionActivity extends IHistoryActivity, IEventActivity, IAbortActivity, IFingerprintActivity
 {
   public static final String _rcsid = "@(#)$Id$";
 

Modified: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java?rev=933029&r1=933028&r2=933029&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java
(original)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java
Sun Apr 11 23:51:11 2010
@@ -328,7 +328,7 @@ public class WorkerThread extends Thread
 
                       HashMap abortSet = new HashMap();
                       ProcessActivity activity;
-                      VersionActivity versionActivity = new VersionActivity(connectionName,connMgr,jobManager,job.getID(),abortSet);
+                      VersionActivity versionActivity = new VersionActivity(connectionName,connMgr,jobManager,job,ingester,abortSet);
 
                       String aclAuthority = connection.getACLAuthority();
                       boolean isDefaultAuthority = (aclAuthority == null || aclAuthority.length()
== 0);
@@ -1137,20 +1137,33 @@ public class WorkerThread extends Thread
     protected IRepositoryConnectionManager connMgr;
     protected IJobManager jobManager;
     protected Long jobID;
+    protected IJobDescription job;
+    protected IIncrementalIngester ingester;
     protected HashMap abortSet;
 
     /** Constructor.
     */
     public VersionActivity(String connectionName, IRepositoryConnectionManager connMgr,
-      IJobManager jobManager, Long jobID, HashMap abortSet)
+      IJobManager jobManager, IJobDescription job, IIncrementalIngester ingester, HashMap abortSet)
     {
       this.connectionName = connectionName;
       this.connMgr = connMgr;
       this.jobManager = jobManager;
-      this.jobID = jobID;
+      this.job = job;
+      this.ingester = ingester;
       this.abortSet = abortSet;
     }
 
+    /** Check whether a mime type is indexable by the currently specified output connector.
+    *@param mimeType is the mime type to check, not including any character set specification.
+    *@return true if the mime type is indexable.
+    */
+    public boolean checkMimeTypeIndexable(String mimeType)
+      throws LCFException, ServiceInterruption
+    {
+      return ingester.checkMimeTypeIndexable(job.getOutputConnectionName(),mimeType);
+    }
+
     /** Record time-stamped information about the activity of the connector.
     *@param startTime is either null or the time since the start of epoch in milliseconds
(Jan 1, 1970).  Every
     *       activity has an associated time; the startTime field records when the activity
began.  A null value
@@ -1183,7 +1196,7 @@ public class WorkerThread extends Thread
     public String[] retrieveParentData(String localIdentifier, String dataName)
       throws LCFException
     {
-      return jobManager.retrieveParentData(jobID,LCF.hash(localIdentifier),dataName);
+      return jobManager.retrieveParentData(job.getID(),LCF.hash(localIdentifier),dataName);
     }
 
     /** Retrieve data passed from parents to a specified child document.
@@ -1194,7 +1207,7 @@ public class WorkerThread extends Thread
     public CharacterInput[] retrieveParentDataAsFiles(String localIdentifier, String dataName)
       throws LCFException
     {
-      return jobManager.retrieveParentDataAsFiles(jobID,LCF.hash(localIdentifier),dataName);
+      return jobManager.retrieveParentDataAsFiles(job.getID(),LCF.hash(localIdentifier),dataName);
     }
 
     /** Check whether current job is still active.
@@ -1205,7 +1218,7 @@ public class WorkerThread extends Thread
     public void checkJobStillActive()
       throws LCFException, ServiceInterruption
     {
-      if (jobManager.checkJobActive(jobID) == false)
+      if (jobManager.checkJobActive(job.getID()) == false)
         throw new ServiceInterruption("Job no longer active",System.currentTimeMillis());
     }
 
@@ -1893,6 +1906,16 @@ public class WorkerThread extends Thread
       abortSet.put(localIdentifier,localIdentifier);
     }
 
+    /** Check whether a mime type is indexable by the currently specified output connector.
+    *@param mimeType is the mime type to check, not including any character set specification.
+    *@return true if the mime type is indexable.
+    */
+    public boolean checkMimeTypeIndexable(String mimeType)
+      throws LCFException, ServiceInterruption
+    {
+      return ingester.checkMimeTypeIndexable(job.getOutputConnectionName(),mimeType);
+    }
+
     /** Check whether a document is indexable by the currently specified output connector.
     *@param localFile is the local copy of the file to check.
     *@return true if the document is indexable.



Mime
View raw message