manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r924788 - in /incubator/lcf/trunk/modules: ./ connectors/gts/connector/org/apache/lcf/agents/output/gts/ connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/ framework/agents/org/apache/lcf/agents/incrementalingest/ fram...
Date Thu, 18 Mar 2010 14:08:41 GMT
Author: kwright
Date: Thu Mar 18 14:08:41 2010
New Revision: 924788

URL: http://svn.apache.org/viewvc?rev=924788&view=rev
Log:
Address CONNECTORS-16.  Move fingerprinting functionality under the output connector umbrella,
which then permits the Solr connector to behave differently from the GTS connector.

Added:
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
  (with props)
Modified:
    incubator/lcf/trunk/modules/build.xml
    incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
    incubator/lcf/trunk/modules/connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
    incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IProcessActivity.java
    incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java

Modified: incubator/lcf/trunk/modules/build.xml
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/build.xml?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/build.xml (original)
+++ incubator/lcf/trunk/modules/build.xml Thu Mar 18 14:08:41 2010
@@ -129,6 +129,9 @@
         <mkdir dir="connectors/gts/war"/>
         <copy todir="connectors/gts/lib">
             <fileset dir="framework/lib"/>
+	    <fileset dir="lib">
+	      <include name="poi*.jar"/>
+	    </fileset>
         </copy>
         <copy todir="connectors/gts/lib">
             <fileset dir="framework/build/jar"/>
@@ -143,14 +146,7 @@
         <mkdir dir="connectors/jcifs/lib"/>
         <mkdir dir="connectors/jcifs/war"/>
         <copy todir="connectors/jcifs/lib">
-            <fileset dir="lib">
-                <include name="poi*.jar"/>
-            </fileset>
-        </copy>
-        <copy todir="connectors/jcifs/lib">
             <fileset dir="framework/lib"/>
-        </copy>
-        <copy todir="connectors/jcifs/lib">
             <fileset dir="framework/build/jar"/>
         </copy>
         <copy todir="connectors/jcifs/war">

Modified: incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java (original)
+++ incubator/lcf/trunk/modules/connectors/gts/connector/org/apache/lcf/agents/output/gts/GTSConnector.java Thu Mar 18 14:08:41 2010
@@ -20,8 +20,22 @@ package org.apache.lcf.agents.output.gts
 
 import org.apache.lcf.core.interfaces.*;
 import org.apache.lcf.agents.interfaces.*;
+import org.apache.lcf.agents.system.Logging;
+
+// POIFS stuff
+import org.apache.poi.poifs.eventfilesystem.POIFSReader;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
+import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
+import org.apache.poi.poifs.filesystem.POIFSDocumentPath;
+import org.apache.poi.hpsf.SummaryInformation;
+import org.apache.poi.hpsf.PropertySetFactory;
+import org.apache.poi.hpsf.PropertySet;
+import org.apache.poi.hpsf.NoPropertySetStreamException;
+import org.apache.poi.hpsf.MarkUnsupportedException;
+import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
 
 import java.util.*;
+import java.io.*;
 
 /** This is the output connector for the MetaCarta appliance.  It establishes a notion of
 * collection(s) a document is ingested into, as well as the idea of a document template for the
@@ -38,6 +52,17 @@ public class GTSConnector extends org.ap
   /** Document removal activity */
   public final static String REMOVE_ACTIVITY = "document deletion";
 
+  // These are the document types the fingerprinter understands
+  protected static final int DT_UNKNOWN = -1;
+  protected static final int DT_COMPOUND_DOC = 0;
+  protected static final int DT_MSWORD = 1;
+  protected static final int DT_MSEXCEL = 2;
+  protected static final int DT_MSPOWERPOINT = 3;
+  protected static final int DT_MSOUTLOOK = 4;
+  protected static final int DT_TEXT = 5;
+  protected static final int DT_ZERO = 6;
+  protected static final int DT_PDF = 7;
+
   /** Local data */
   protected HttpPoster poster = null;
 
@@ -118,6 +143,25 @@ public class GTSConnector extends org.ap
     }
   }
 
+  /** Pre-determine whether a document (passed here as a File object) is indexable by this connector.  This method is used by participating
+  * repository connectors to help reduce the number of unmanageable documents that are passed to this output connector in advance of an
+  * actual transfer.  This hook is provided mainly to support search engines that only handle a small set of accepted file types.
+  *@param localFile is the local file to check.
+  *@return true if the file is indexable.
+  */
+  public boolean checkDocumentIndexable(File localFile)
+    throws LCFException, ServiceInterruption
+  {
+    if (!super.checkDocumentIndexable(localFile))
+      return false;
+    int docType = fingerprint(localFile);
+    return (docType == DT_TEXT ||
+      docType == DT_MSWORD ||
+      docType == DT_MSEXCEL ||
+      docType == DT_PDF ||
+      docType == DT_MSPOWERPOINT);
+  }
+
   /** Get an output version string, given an output specification.  The output version string is used to uniquely describe the pertinent details of
   * the output specification and the configuration, to allow the Connector Framework to determine whether a document will need to be output again.
   * Note that the contents of the document cannot be considered by this method, and that a different version string (defined in IRepositoryConnector)
@@ -342,4 +386,306 @@ public class GTSConnector extends org.ap
     return startPosition;
   }
 
+  /** Fingerprint a file!
+  * Pass in the name of the (local) temporary file that we should be looking at.
+  * This method will read it as needed until the file has been identified (or found
+  * to remain "unknown").
+  * The code here has been lifted algorithmically from products/ShareCrawler/Fingerprinter.pas.
+  */
+  protected static int fingerprint(File file)
+    throws LCFException
+  {
+    try
+    {
+      // Look at the first 4K
+      byte[] byteBuffer = new byte[4096];
+      int amt;
+
+      // Open file for reading.
+      InputStream is = new FileInputStream(file);
+      try
+      {
+        amt = 0;
+        while (amt < byteBuffer.length)
+        {
+          int incr = is.read(byteBuffer,amt,byteBuffer.length-amt);
+          if (incr == -1)
+            break;
+          amt += incr;
+        }
+      }
+      finally
+      {
+        is.close();
+      }
+
+      if (amt == 0)
+        return DT_ZERO;
+
+      if (isText(byteBuffer,amt))
+      {
+        // Treat as ASCII text
+        // We don't need to distinguish between the various flavors (e.g. HTML,
+        // XML, RTF, or plain TEXT, because GTS will eat them all regardless.
+        // Since it's a bit dicey to figure out the encoding, we'll just presume
+        // it's something that GTS will understand.
+        return DT_TEXT;
+      }
+
+      // Treat it as binary
+
+      // Is it PDF?  Does it begin with "%PDF-"?
+      if (byteBuffer[0] == (byte)0x25 && byteBuffer[1] == (byte)0x50 && byteBuffer[2] == (byte)0x44 && byteBuffer[3] == (byte)0x46)
+        return DT_PDF;
+
+      // Is it a compound document? Does it begin with 0xD0CF11E0A1B11AE1?
+      if (Logging.ingest.isDebugEnabled())
+        Logging.ingest.debug("GTS: Document begins with: "+hexprint(byteBuffer[0])+hexprint(byteBuffer[1])+
+        hexprint(byteBuffer[2])+hexprint(byteBuffer[3])+hexprint(byteBuffer[4])+hexprint(byteBuffer[5])+
+        hexprint(byteBuffer[6])+hexprint(byteBuffer[7]));
+      if (byteBuffer[0] == (byte)0xd0 && byteBuffer[1] == (byte)0xcf && byteBuffer[2] == (byte)0x11 && byteBuffer[3] == (byte)0xe0 &&
+        byteBuffer[4] == (byte)0xa1 && byteBuffer[5] == (byte)0xb1 && byteBuffer[6] == (byte)0x1a && byteBuffer[7] == (byte)0xe1)
+      {
+        Logging.ingest.debug("GTS: Compound document signature detected");
+        // Figure out what kind of compound document it is.
+        String appName = getAppName(file);
+        if (appName == null)
+          return DT_UNKNOWN;
+        else
+        {
+          if (Logging.ingest.isDebugEnabled())
+            Logging.ingest.debug("GTS: Appname is '"+appName+"'");
+        }
+        return recognizeApp(appName);
+      }
+
+      return DT_UNKNOWN;
+    }
+    catch (java.net.SocketTimeoutException e)
+    {
+      return DT_UNKNOWN;
+    }
+    catch (InterruptedIOException e)
+    {
+      throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
+    }
+    catch (IOException e)
+    {
+      // An I/O error indicates that the type is unknown.
+      return DT_UNKNOWN;
+    }
+    catch (IllegalArgumentException e)
+    {
+      // Another POI error, means unknown document type
+      return DT_UNKNOWN;
+    }
+    catch (IllegalStateException e)
+    {
+      // Another POI error, means unknown document type
+      return DT_UNKNOWN;
+    }
+    catch (ArrayIndexOutOfBoundsException e)
+    {
+      // This means that poi couldn't find the bytes it was expecting, so just treat it as unknown
+      return DT_UNKNOWN;
+    }
+    catch (ClassCastException e)
+    {
+      // This means that poi had an internal error
+      return DT_UNKNOWN;
+    }
+    catch (OutOfMemoryError e)
+    {
+      // POI seems to throw this for some kinds of corrupt documents.
+      // I'm not sure this is the right thing to do but it's the best I
+      // can at the moment, until I get some documents from Norway that
+      // demonstrate the problem.
+      return DT_UNKNOWN;
+    }
+  }
+
+  /** Get a binary document's APPNAME field, or return null if the document
+  * does not seem to be an OLE compound document.
+  */
+  protected static String getAppName(File documentPath)
+    throws LCFException
+  {
+    try
+    {
+      InputStream is = new FileInputStream(documentPath);
+      try
+      {
+        // Use POIFS to traverse the file
+        POIFSReader reader = new POIFSReader();
+        ReaderListener listener = new ReaderListener();
+        reader.registerListener(listener,"\u0005SummaryInformation");
+        reader.read(is);
+        if (Logging.ingest.isDebugEnabled())
+          Logging.ingest.debug("GTS: Done finding appname for '"+documentPath.toString()+"'");
+        return listener.getAppName();
+      }
+      finally
+      {
+        is.close();
+      }
+    }
+    catch (java.net.SocketTimeoutException e)
+    {
+      return null;
+    }
+    catch (InterruptedIOException e)
+    {
+      throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
+    }
+    catch (Throwable e)
+    {
+      // We should eat all errors.  Also, even though our policy is to stop the crawler on out-of-memory errors, in this case we will
+      // not do that, because there's no "collateral damage" that can result from a fingerprinting failure.  No locks can be dropped, and
+      // we cannot screw up the database driver.
+      // Any collateral damage that we *do* need to stop for should manifest itself in another thread.
+
+      // The exception effectively means that we cannot identify the document.
+      return null;
+    }
+  }
+
+  /** Translate a string application name to one of the kinds of documents
+  * we care about.
+  */
+  protected static int recognizeApp(String appName)
+  {
+    appName = appName.toUpperCase();
+    if (appName.indexOf("MICROSOFT WORD") != -1)
+      return DT_MSWORD;
+    if (appName.indexOf("MICROSOFT OFFICE WORD") != -1)
+      return DT_MSWORD;
+    if (appName.indexOf("MICROSOFT EXCEL") != -1)
+      return DT_MSEXCEL;
+    if (appName.indexOf("MICROSOFT POWERPOINT") != -1)
+      return DT_MSPOWERPOINT;
+    if (appName.indexOf("MICROSOFT OFFICE POWERPOINT") != -1)
+      return DT_MSPOWERPOINT;
+    if (appName.indexOf("MICROSOFT OUTLOOK") != -1)
+      return DT_MSOUTLOOK;
+    return DT_COMPOUND_DOC;
+  }
+
+  /** Test to see if a document is text or not.  The first n bytes are passed
+  * in, and this code returns "true" if it thinks they represent text.  The code
+  * has been lifted algorithmically from products/Sharecrawler/Fingerprinter.pas,
+  * which was based on "perldoc -f -T".
+  */
+  protected static boolean isText(byte[] beginChunk, int chunkLength)
+  {
+    if (chunkLength == 0)
+      return true;
+    int i = 0;
+    int count = 0;
+    while (i < chunkLength)
+    {
+      byte x = beginChunk[i++];
+      if (x == 0)
+        return false;
+      if (isStrange(x))
+        count++;
+    }
+    return ((double)count)/((double)chunkLength) < 0.30;
+  }
+
+  /** Check if character is not typical ASCII. */
+  protected static boolean isStrange(byte x)
+  {
+    return (x > 127 || x < 32) && (!isWhiteSpace(x));
+  }
+
+  /** Check if a byte is a whitespace character. */
+  protected static boolean isWhiteSpace(byte x)
+  {
+    return (x == 0x09 || x == 0x0a || x == 0x0d || x == 0x20);
+  }
+
+  protected static String hexprint(byte x)
+  {
+    StringBuffer sb = new StringBuffer();
+    sb.append(nibbleprint(0x0f & (((int)x)>>4))).append(nibbleprint(0x0f & ((int)x)));
+    return sb.toString();
+  }
+
+  protected static char nibbleprint(int x)
+  {
+    if (x >= 10)
+      return (char)(x - 10 + 'a');
+    return (char)(x + '0');
+  }
+
+  /** Reader listener object that extracts the app name */
+  protected static class ReaderListener implements POIFSReaderListener
+  {
+    protected String appName = null;
+
+    /** Constructor. */
+    public ReaderListener()
+    {
+    }
+
+    /** Get the app name.
+    */
+    public String getAppName()
+    {
+      return appName;
+    }
+
+    /** Process an "event" from POIFS - which is basically just the fact that we saw what we
+    * said we wanted to see, namely the SummaryInfo stream.
+    */
+    public void processPOIFSReaderEvent(POIFSReaderEvent event)
+    {
+      // Catch exceptions
+      try
+      {
+        InputStream is = event.getStream();
+        try
+        {
+          PropertySet ps = PropertySetFactory.create(is);
+          if (!(ps instanceof SummaryInformation))
+          {
+            appName = null;
+            return;
+          }
+          appName = ((SummaryInformation)ps).getApplicationName();
+        }
+        finally
+        {
+          is.close();
+        }
+
+      }
+      catch (NoPropertySetStreamException e)
+      {
+        // This means we couldn't figure out what the application was
+        appName = null;
+        return;
+      }
+      catch (MarkUnsupportedException e)
+      {
+        // Bad code; need to suport mark operation.
+        Logging.ingest.error("Need to feed a stream that supports mark(): "+e.getMessage(),e);
+        appName = null;
+        return;
+      }
+      catch (java.io.UnsupportedEncodingException e)
+      {
+        // Bad code; need to support encoding properly
+        Logging.ingest.error("Need to support encoding: "+e.getMessage(),e);
+        appName = null;
+        return;
+      }
+      catch (IOException e)
+      {
+        appName = null;
+        return;
+      }
+    }
+  }
+
 }

Modified: incubator/lcf/trunk/modules/connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ incubator/lcf/trunk/modules/connectors/jcifs/connector/org/apache/lcf/crawler/connectors/sharedrive/SharedDriveConnector.java Thu Mar 18 14:08:41 2010
@@ -45,22 +45,11 @@ import org.apache.lcf.core.interfaces.LC
 import org.apache.lcf.crawler.interfaces.DocumentSpecification;
 import org.apache.lcf.crawler.interfaces.IDocumentIdentifierStream;
 import org.apache.lcf.crawler.interfaces.IProcessActivity;
+import org.apache.lcf.crawler.interfaces.IFingerprintActivity;
 import org.apache.lcf.core.interfaces.SpecificationNode;
 import org.apache.lcf.crawler.interfaces.IVersionActivity;
 import org.apache.lcf.crawler.system.Logging;
 
-// POIFS stuff
-import org.apache.poi.poifs.eventfilesystem.POIFSReader;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderListener;
-import org.apache.poi.poifs.eventfilesystem.POIFSReaderEvent;
-import org.apache.poi.poifs.filesystem.POIFSDocumentPath;
-import org.apache.poi.hpsf.SummaryInformation;
-import org.apache.poi.hpsf.PropertySetFactory;
-import org.apache.poi.hpsf.PropertySet;
-import org.apache.poi.hpsf.NoPropertySetStreamException;
-import org.apache.poi.hpsf.MarkUnsupportedException;
-import org.apache.poi.hpsf.UnexpectedPropertySetTypeException;
-
 /** This is the "repository connector" for a smb/cifs shared drive file system.  It's a relative of the share crawler, and should have
 * comparable basic functionality.
 */
@@ -71,17 +60,6 @@ public class SharedDriveConnector extend
   // Activities we log
   public final static String ACTIVITY_ACCESS = "access";
 
-  // These are the document types the fingerprinter understands
-  protected static final int DT_UNKNOWN = -1;
-  protected static final int DT_COMPOUND_DOC = 0;
-  protected static final int DT_MSWORD = 1;
-  protected static final int DT_MSEXCEL = 2;
-  protected static final int DT_MSPOWERPOINT = 3;
-  protected static final int DT_MSOUTLOOK = 4;
-  protected static final int DT_TEXT = 5;
-  protected static final int DT_ZERO = 6;
-  protected static final int DT_PDF = 7;
-
   // These are the share connector nodes and attributes in the document specification
   public static final String NODE_STARTPOINT = "startpoint";
   public static final String NODE_INCLUDE = "include";
@@ -699,7 +677,7 @@ public class SharedDriveConnector extend
                     }
 
 
-                    if (checkIngest(tempFile, newPath, spec))
+                    if (checkIngest(tempFile, newPath, spec, activities))
                     {
                       if (Logging.connectors.isDebugEnabled())
                         Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
@@ -1648,9 +1626,10 @@ public class SharedDriveConnector extend
   *@param localFile is the file.
   *@param fileName is the JCIFS file name.
   *@param documentSpecification is the specification.
+  *@param activities are the activities available to determine indexability.
   *@return true if the file should be ingested.
   */
-  protected boolean checkIngest(File localFile, String fileName, DocumentSpecification documentSpecification)
+  protected boolean checkIngest(File localFile, String fileName, DocumentSpecification documentSpecification, IFingerprintActivity activities)
     throws LCFException, ServiceInterruption
   {
     if (Logging.connectors.isDebugEnabled())
@@ -1734,12 +1713,7 @@ public class SharedDriveConnector extend
                     isIndexable = false;
                   else
                   {
-                    int docType = fingerprint(localFile);
-                    isIndexable = (docType == DT_TEXT ||
-                      docType == DT_MSWORD ||
-                      docType == DT_MSEXCEL ||
-                      docType == DT_PDF ||
-                      docType == DT_MSPOWERPOINT);
+                    isIndexable = activities.checkDocumentIndexable(localFile);
                   }
 
                   isMatch = (indexable.equals("yes") && isIndexable) ||
@@ -1979,125 +1953,6 @@ public class SharedDriveConnector extend
     return getFileCanonicalPath(new SmbFile(uri,pa));
   }
 
-  /** Fingerprint a file!
-  * Pass in the name of the (local) temporary file that we should be looking at.
-  * This method will read it as needed until the file has been identified (or found
-  * to remain "unknown").
-  * The code here has been lifted algorithmically from products/ShareCrawler/Fingerprinter.pas.
-  */
-  protected static int fingerprint(File file)
-    throws LCFException
-  {
-    try
-    {
-      // Look at the first 4K
-      byte[] byteBuffer = new byte[4096];
-      int amt;
-
-      // Open file for reading.
-      InputStream is = new FileInputStream(file);
-      try
-      {
-        amt = 0;
-        while (amt < byteBuffer.length)
-        {
-          int incr = is.read(byteBuffer,amt,byteBuffer.length-amt);
-          if (incr == -1)
-            break;
-          amt += incr;
-        }
-      }
-      finally
-      {
-        is.close();
-      }
-
-      if (amt == 0)
-        return DT_ZERO;
-
-      if (isText(byteBuffer,amt))
-      {
-        // Treat as ASCII text
-        // We don't need to distinguish between the various flavors (e.g. HTML,
-        // XML, RTF, or plain TEXT, because GTS will eat them all regardless.
-        // Since it's a bit dicey to figure out the encoding, we'll just presume
-        // it's something that GTS will understand.
-        return DT_TEXT;
-      }
-
-      // Treat it as binary
-
-      // Is it PDF?  Does it begin with "%PDF-"?
-      if (byteBuffer[0] == (byte)0x25 && byteBuffer[1] == (byte)0x50 && byteBuffer[2] == (byte)0x44 && byteBuffer[3] == (byte)0x46)
-        return DT_PDF;
-
-      // Is it a compound document? Does it begin with 0xD0CF11E0A1B11AE1?
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("JCIFS: Document begins with: "+hexprint(byteBuffer[0])+hexprint(byteBuffer[1])+
-        hexprint(byteBuffer[2])+hexprint(byteBuffer[3])+hexprint(byteBuffer[4])+hexprint(byteBuffer[5])+
-        hexprint(byteBuffer[6])+hexprint(byteBuffer[7]));
-      if (byteBuffer[0] == (byte)0xd0 && byteBuffer[1] == (byte)0xcf && byteBuffer[2] == (byte)0x11 && byteBuffer[3] == (byte)0xe0 &&
-        byteBuffer[4] == (byte)0xa1 && byteBuffer[5] == (byte)0xb1 && byteBuffer[6] == (byte)0x1a && byteBuffer[7] == (byte)0xe1)
-      {
-        Logging.connectors.debug("JCIFS: Compound document signature detected");
-        // Figure out what kind of compound document it is.
-        String appName = getAppName(file);
-        if (appName == null)
-          return DT_UNKNOWN;
-        else
-        {
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("JCIFS: Appname is '"+appName+"'");
-        }
-        return recognizeApp(appName);
-      }
-
-      return DT_UNKNOWN;
-    }
-    catch (java.net.SocketTimeoutException e)
-    {
-      return DT_UNKNOWN;
-    }
-    catch (InterruptedIOException e)
-    {
-      throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
-    }
-    catch (IOException e)
-    {
-      // An I/O error indicates that the type is unknown.
-      return DT_UNKNOWN;
-    }
-    catch (IllegalArgumentException e)
-    {
-      // Another POI error, means unknown document type
-      return DT_UNKNOWN;
-    }
-    catch (IllegalStateException e)
-    {
-      // Another POI error, means unknown document type
-      return DT_UNKNOWN;
-    }
-    catch (ArrayIndexOutOfBoundsException e)
-    {
-      // This means that poi couldn't find the bytes it was expecting, so just treat it as unknown
-      return DT_UNKNOWN;
-    }
-    catch (ClassCastException e)
-    {
-      // This means that poi had an internal error
-      return DT_UNKNOWN;
-    }
-    catch (OutOfMemoryError e)
-    {
-      // POI seems to throw this for some kinds of corrupt documents.
-      // I'm not sure this is the right thing to do but it's the best I
-      // can at the moment, until I get some documents from Norway that
-      // demonstrate the problem.
-      return DT_UNKNOWN;
-    }
-  }
-
-
   /** Stuffer for packing a single string with an end delimiter */
   protected static void pack(StringBuffer output, String value, char delimiter)
   {
@@ -2205,122 +2060,6 @@ public class SharedDriveConnector extend
     return startPosition;
   }
 
-  protected static String hexprint(byte x)
-  {
-    StringBuffer sb = new StringBuffer();
-    sb.append(nibbleprint(0x0f & (((int)x)>>4))).append(nibbleprint(0x0f & ((int)x)));
-    return sb.toString();
-  }
-
-  protected static char nibbleprint(int x)
-  {
-    if (x >= 10)
-      return (char)(x - 10 + 'a');
-    return (char)(x + '0');
-  }
-
-  /** Get a binary document's APPNAME field, or return null if the document
-  * does not seem to be an OLE compound document.
-  */
-  protected static String getAppName(File documentPath)
-    throws LCFException
-  {
-    try
-    {
-      InputStream is = new FileInputStream(documentPath);
-      try
-      {
-        // Use POIFS to traverse the file
-        POIFSReader reader = new POIFSReader();
-        ReaderListener listener = new ReaderListener();
-        reader.registerListener(listener,"\u0005SummaryInformation");
-        reader.read(is);
-        if (Logging.connectors.isDebugEnabled())
-          Logging.connectors.debug("JCIFS: Done finding appname for '"+documentPath.toString()+"'");
-        return listener.getAppName();
-      }
-      finally
-      {
-        is.close();
-      }
-    }
-    catch (java.net.SocketTimeoutException e)
-    {
-      return null;
-    }
-    catch (InterruptedIOException e)
-    {
-      throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
-    }
-    catch (Throwable e)
-    {
-      // We should eat all errors.  Also, even though our policy is to stop the crawler on out-of-memory errors, in this case we will
-      // not do that, because there's no "collateral damage" that can result from a fingerprinting failure.  No locks can be dropped, and
-      // we cannot screw up the database driver.
-      // Any collateral damage that we *do* need to stop for should manifest itself in another thread.
-
-      // The exception effectively means that we cannot identify the document.
-      return null;
-    }
-  }
-
-
-
-  /** Translate a string application name to one of the kinds of documents
-  * we care about.
-  */
-  protected static int recognizeApp(String appName)
-  {
-    appName = appName.toUpperCase();
-    if (appName.indexOf("MICROSOFT WORD") != -1)
-      return DT_MSWORD;
-    if (appName.indexOf("MICROSOFT OFFICE WORD") != -1)
-      return DT_MSWORD;
-    if (appName.indexOf("MICROSOFT EXCEL") != -1)
-      return DT_MSEXCEL;
-    if (appName.indexOf("MICROSOFT POWERPOINT") != -1)
-      return DT_MSPOWERPOINT;
-    if (appName.indexOf("MICROSOFT OFFICE POWERPOINT") != -1)
-      return DT_MSPOWERPOINT;
-    if (appName.indexOf("MICROSOFT OUTLOOK") != -1)
-      return DT_MSOUTLOOK;
-    return DT_COMPOUND_DOC;
-  }
-
-  /** Test to see if a document is text or not.  The first n bytes are passed
-  * in, and this code returns "true" if it thinks they represent text.  The code
-  * has been lifted algorithmically from products/Sharecrawler/Fingerprinter.pas,
-  * which was based on "perldoc -f -T".
-  */
-  protected static boolean isText(byte[] beginChunk, int chunkLength)
-  {
-    if (chunkLength == 0)
-      return true;
-    int i = 0;
-    int count = 0;
-    while (i < chunkLength)
-    {
-      byte x = beginChunk[i++];
-      if (x == 0)
-        return false;
-      if (isStrange(x))
-        count++;
-    }
-    return ((double)count)/((double)chunkLength) < 0.30;
-  }
-
-  /** Check if character is not typical ASCII. */
-  protected static boolean isStrange(byte x)
-  {
-    return (x > 127 || x < 32) && (!isWhiteSpace(x));
-  }
-
-  /** Check if a byte is a whitespace character. */
-  protected static boolean isWhiteSpace(byte x)
-  {
-    return (x == 0x09 || x == 0x0a || x == 0x0d || x == 0x20);
-  }
-
   // These methods allow me to experiment with cluster-mandated error handling on an entirely local level.  They correspond to individual SMBFile methods.
 
   /** Get canonical path */
@@ -2970,76 +2709,6 @@ public class SharedDriveConnector extend
     return directories;
   }
 
-  /** Reader listener object that extracts the app name */
-  protected static class ReaderListener implements POIFSReaderListener
-  {
-    protected String appName = null;
-
-    /** Constructor. */
-    public ReaderListener()
-    {
-    }
-
-    /** Get the app name.
-    */
-    public String getAppName()
-    {
-      return appName;
-    }
-
-    /** Process an "event" from POIFS - which is basically just the fact that we saw what we
-    * said we wanted to see, namely the SummaryInfo stream.
-    */
-    public void processPOIFSReaderEvent(POIFSReaderEvent event)
-    {
-      // Catch exceptions
-      try
-      {
-        InputStream is = event.getStream();
-        try
-        {
-          PropertySet ps = PropertySetFactory.create(is);
-          if (!(ps instanceof SummaryInformation))
-          {
-            appName = null;
-            return;
-          }
-          appName = ((SummaryInformation)ps).getApplicationName();
-        }
-        finally
-        {
-          is.close();
-        }
-
-      }
-      catch (NoPropertySetStreamException e)
-      {
-        // This means we couldn't figure out what the application was
-        appName = null;
-        return;
-      }
-      catch (MarkUnsupportedException e)
-      {
-        // Bad code; need to suport mark operation.
-        Logging.connectors.error("Need to feed a stream that supports mark()",e);
-        appName = null;
-        return;
-      }
-      catch (java.io.UnsupportedEncodingException e)
-      {
-        // Bad code; need to support encoding properly
-        Logging.connectors.error("Need to support encoding",e);
-        appName = null;
-        return;
-      }
-      catch (IOException e)
-      {
-        appName = null;
-        return;
-      }
-    }
-  }
-
   /**
   * inner class which returns only shares. used by listfiles(SmbFileFilter)
   *

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/incrementalingest/IncrementalIngester.java Thu Mar 18 14:08:41 2010
@@ -23,6 +23,7 @@ import org.apache.lcf.agents.interfaces.
 import org.apache.lcf.agents.system.Logging;
 import org.apache.lcf.agents.system.LCF;
 import java.util.*;
+import java.io.*;
 
 /** Incremental ingestion API implementation.
 * This class is responsible for keeping track of what has been sent where, and also the corresponding version of
@@ -190,6 +191,29 @@ public class IncrementalIngester extends
     performDelete("",null,null);
   }
 
+  /** Check if a file is indexable.
+  *@param outputConnectionName is the name of the output connection associated with this action.
+  *@param localFile is the local file to check.
+  *@return true if the local file is indexable.
+  */
+  public boolean checkDocumentIndexable(String outputConnectionName, File localFile)
+    throws LCFException, ServiceInterruption
+  {
+    IOutputConnection connection = connectionManager.load(outputConnectionName);
+    IOutputConnector connector = OutputConnectorFactory.grab(threadContext,connection.getClassName(),connection.getConfigParams(),connection.getMaxConnections());
+    if (connector == null)
+      // The connector is not installed; treat this as a service interruption.
+      throw new ServiceInterruption("Output connector not installed",300000L);
+    try
+    {
+      return connector.checkDocumentIndexable(localFile);
+    }
+    finally
+    {
+      OutputConnectorFactory.release(connector);
+    }
+  }
+  
   /** Record a document version, but don't ingest it.
   * The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
   * ServiceInterruption is thrown if this action must be rescheduled.

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IIncrementalIngester.java Thu Mar 18 14:08:41 2010
@@ -19,6 +19,7 @@
 package org.apache.lcf.agents.interfaces;
 
 import org.apache.lcf.core.interfaces.*;
+import java.io.*;
 
 /** This interface describes the incremental ingestion API.
 * SOME NOTES:
@@ -68,6 +69,14 @@ public interface IIncrementalIngester
   public void clearAll()
     throws LCFException;
 
+  /** Check if a file is indexable.
+  *@param outputConnectionName is the name of the output connection associated with this action.
+  *@param localFile is the local file to check.
+  *@return true if the local file is indexable.
+  */
+  public boolean checkDocumentIndexable(String outputConnectionName, File localFile)
+    throws LCFException, ServiceInterruption;
+
   /** Record a document version, but don't ingest it.
   * The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
   * ServiceInterruption is thrown if this action must be rescheduled.

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/interfaces/IOutputConnector.java Thu Mar 18 14:08:41 2010
@@ -128,6 +128,15 @@ public interface IOutputConnector
   */
   public ConfigParams getConfiguration();
 
+  /** Pre-determine whether a document (passed here as a File object) is indexable by this connector.  This method is used by participating
+  * repository connectors to help reduce the number of unmanageable documents that are passed to this output connector in advance of an
+  * actual transfer.  This hook is provided mainly to support search engines that only handle a small set of accepted file types.
+  *@param localFile is the local file to check.
+  *@return true if the file is indexable.
+  */
+  public boolean checkDocumentIndexable(File localFile)
+    throws LCFException, ServiceInterruption;
+
   /** Get an output version string, given an output specification.  The output version string is used to uniquely describe the pertinent details of
   * the output specification and the configuration, to allow the Connector Framework to determine whether a document will need to be output again.
   * Note that the contents of the document cannot be considered by this method, and that a different version string (defined in IRepositoryConnector)

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/output/BaseOutputConnector.java Thu Mar 18 14:08:41 2010
@@ -143,6 +143,18 @@ public abstract class BaseOutputConnecto
     return params;
   }
 
+  /** Pre-determine whether a document (passed here as a File object) is indexable by this connector.  This method is used by participating
+  * repository connectors to help reduce the number of unmanageable documents that are passed to this output connector in advance of an
+  * actual transfer.  This hook is provided mainly to support search engines that only handle a small set of accepted file types.
+  *@param localFile is the local file to check.
+  *@return true if the file is indexable.
+  */
+  public boolean checkDocumentIndexable(File localFile)
+    throws LCFException, ServiceInterruption
+  {
+    return true;
+  }
+
 }
 
 

Added: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java?rev=924788&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java (added)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java Thu Mar 18 14:08:41 2010
@@ -0,0 +1,39 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.lcf.crawler.interfaces;
+
+import org.apache.lcf.core.interfaces.*;
+import org.apache.lcf.agents.interfaces.*;
+import java.util.*;
+import java.io.*;
+
+/** This interface abstracts from the activities that handle document fingerprinting.
+*/
+public interface IFingerprintActivity
+{
+  public static final String _rcsid = "@(#)$Id$";
+
+  /** Check whether a document is indexable by the currently specified output connector.
+  *@param localFile is the local copy of the file to check.
+  *@return true if the document is indexable.
+  */
+  public boolean checkDocumentIndexable(File localFile)
+    throws LCFException, ServiceInterruption;
+
+}

Propchange: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
------------------------------------------------------------------------------
    svn:executable = *

Propchange: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IFingerprintActivity.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IProcessActivity.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IProcessActivity.java (original)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/interfaces/IProcessActivity.java Thu Mar 18 14:08:41 2010
@@ -24,7 +24,7 @@ import java.util.*;
 
 /** This interface abstracts from the activities that a fetched document processor can do.
 */
-public interface IProcessActivity extends IHistoryActivity, IEventActivity, IAbortActivity
+public interface IProcessActivity extends IHistoryActivity, IEventActivity, IAbortActivity, IFingerprintActivity
 {
   public static final String _rcsid = "@(#)$Id$";
 

Modified: incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java?rev=924788&r1=924787&r2=924788&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java (original)
+++ incubator/lcf/trunk/modules/framework/pull-agent/org/apache/lcf/crawler/system/WorkerThread.java Thu Mar 18 14:08:41 2010
@@ -23,6 +23,7 @@ import org.apache.lcf.agents.interfaces.
 import org.apache.lcf.crawler.interfaces.*;
 import org.apache.lcf.crawler.system.Logging;
 import java.util.*;
+import java.io.*;
 import java.lang.reflect.*;
 
 /** This class represents a worker thread.  Hundreds of these threads are instantiated in order to
@@ -1281,7 +1282,6 @@ public class WorkerThread extends Thread
   protected static class ProcessActivity implements IProcessActivity
   {
     // Member variables
-    // MHL to remove version map and add specified version to this method call
     protected IThreadContext threadContext;
     protected IJobManager jobManager;
     protected IIncrementalIngester ingester;
@@ -1893,6 +1893,16 @@ public class WorkerThread extends Thread
       abortSet.put(localIdentifier,localIdentifier);
     }
 
+    /** Check whether a document is indexable by the currently specified output connector.
+    *@param localFile is the local copy of the file to check.
+    *@return true if the document is indexable.
+    */
+    public boolean checkDocumentIndexable(File localFile)
+      throws LCFException, ServiceInterruption
+    {
+      return ingester.checkDocumentIndexable(job.getOutputConnectionName(),localFile);
+    }
+
     /** Create a global string from a simple string.
     *@param simpleString is the simple string.
     *@return a global string.



Mime
View raw message