incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1140147 - in /incubator/lcf/trunk: ./ connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/ connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/ connectors/jdb...
Date Mon, 27 Jun 2011 13:12:24 GMT
Author: kwright
Date: Mon Jun 27 13:12:23 2011
New Revision: 1140147

URL: http://svn.apache.org/viewvc?rev=1140147&view=rev
Log:
Fix for CONNECTORS-216.  Regularize deleteDocument() method.

Modified:
    incubator/lcf/trunk/CHANGES.txt
    incubator/lcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
    incubator/lcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    incubator/lcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
    incubator/lcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    incubator/lcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Mon Jun 27 13:12:23 2011
@@ -3,10 +3,16 @@ $Id$
 
 ======================= 0.3-dev =========================
 
+CONNECTORS-216: Clean up document removal options in IProcessActivity,
+and modify connectors accordingly to avoid using the deprecated form of
+deleteDocument().
+(Karl Wright)
+
 CONNECTORS-214: Add output connector support for restricting documents based on
 mime type, URL, and document length.  Hook this up to the web and RSS
 connectors, and add mime type and maximum length fields to the Solr
 connector.
+(Erlend Garåsen, Karl Wright)
 
 CONNECTORS-212: Failure during notify should send job back to
 ReadyForNotify state, not ReadyForDelete state.

Modified: incubator/lcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
(original)
+++ incubator/lcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
Mon Jun 27 13:12:23 2011
@@ -1250,7 +1250,7 @@ public class FilenetConnector extends or
                     null,documentIdentifier,"Authorization error",e.getMessage(),null);
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("FileNet: Removing file '"+documentIdentifier+"'
because: "+e.getMessage(),e);
-                  activities.deleteDocument(documentIdentifier);
+                  activities.deleteDocument(documentIdentifier,documentVersion);
                   i++;
                   continue;
                 }
@@ -1361,7 +1361,7 @@ public class FilenetConnector extends or
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("FileNet: Removing version '"+documentIdentifier+"'
because it seems to no longer exist");
 
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,documentVersion);
             i++;
             continue;
           }
@@ -1391,7 +1391,7 @@ public class FilenetConnector extends or
           {
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("FileNet: Removing version '"+documentIdentifier+"'
because: "+e.getMessage(),e);
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,documentVersion);
             i++;
             continue;
           }

Modified: incubator/lcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
(original)
+++ incubator/lcf/trunk/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
Mon Jun 27 13:12:23 2011
@@ -775,7 +775,7 @@ public class SharedDriveConnector extend
                         index = setPathMetadata(rd,version,index);
                         StringBuilder ingestURI = new StringBuilder();
                         index = unpack(ingestURI,version,index,'+');
-                        activities.ingestDocument(documentIdentifier, versions[i], ingestURI.toString(),
rd);
+                        activities.ingestDocument(documentIdentifier, version, ingestURI.toString(),
rd);
                       }
                       finally
                       {
@@ -796,7 +796,7 @@ public class SharedDriveConnector extend
                       // method has no way of signalling this, since it does not do the fingerprinting.
                       if (Logging.connectors.isDebugEnabled())
                         Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
-                      activities.deleteDocument(documentIdentifier);
+                      activities.deleteDocument(documentIdentifier, version);
                       // We should record the access here as well, since this is a non-exception
way through the code path.
                       // (I noticed that this was not being recorded in the history while
fixing 25477.)
                       activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
@@ -859,8 +859,8 @@ public class SharedDriveConnector extend
         Logging.connectors.warn("JCIFS: Authorization exception reading document/directory
"+documentIdentifier+" - skipping");
         activities.recordActivity(null,ACTIVITY_ACCESS,
           null,documentIdentifier,"Skip","Authorization: "+e.getMessage(),null);
-        // We call the delete even if it's a directory; this is harmless and it cleans up
the jobqueue row.
-        activities.deleteDocument(documentIdentifier);
+        // We call the delete even if it's a directory; this is harmless.
+        activities.deleteDocument(documentIdentifier, version);
       }
       catch (SmbException se)
       {
@@ -919,7 +919,7 @@ public class SharedDriveConnector extend
             Logging.connectors.debug("JCIFS: Skipping document/directory "+documentIdentifier+"
because it cannot be found");
           activities.recordActivity(null,ACTIVITY_ACCESS,
             null,documentIdentifier,"Not found",null,null);
-          activities.deleteDocument(documentIdentifier);
+          activities.deleteDocument(documentIdentifier, version);
         }
         else if (se.getMessage().indexOf("is denied") != -1)
         {
@@ -927,7 +927,7 @@ public class SharedDriveConnector extend
           // We call the delete even if it's a directory; this is harmless and it cleans
up the jobqueue row.
           activities.recordActivity(null,ACTIVITY_ACCESS,
             null,documentIdentifier,"Skip","Authorization: "+se.getMessage(),null);
-          activities.deleteDocument(documentIdentifier);
+          activities.deleteDocument(documentIdentifier, version);
         }
         else
         {

Modified: incubator/lcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
(original)
+++ incubator/lcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
Mon Jun 27 13:12:23 2011
@@ -621,7 +621,7 @@ public class JDBCConnector extends org.a
           if (map.get(documentIdentifier) != null)
           {
             // This means we did not see it (or data for it) in the result set.  Delete it!
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,versions[i]);
           }
         }
         i++;

Modified: incubator/lcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
(original)
+++ incubator/lcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
Mon Jun 27 13:12:23 2011
@@ -1170,7 +1170,7 @@ public class MeridioConnector extends or
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("Meridio: Could not retrieve document data for document
id '" +
               new Long(docId).toString() + "' in processDocuments method - deleting document.");
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,docVersion);
             i++;
             continue;
           }
@@ -1182,7 +1182,7 @@ public class MeridioConnector extends or
               Logging.connectors.debug("Meridio: Could not retrieve document owner for document
id '" +
               new Long(docId).toString() + "' in processDocuments method. No information
or incorrect amount " +
               "of information was returned");
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,docVersion);
             i++;
             continue;
           }
@@ -1346,7 +1346,7 @@ public class MeridioConnector extends or
               if (Logging.connectors.isDebugEnabled())
                 Logging.connectors.debug("Meridio: Failed to get content for document '"
+ new Long(docId).toString() + "'");
               // No document.  Delete what's there
-              activities.deleteDocument(documentIdentifier);
+              activities.deleteDocument(documentIdentifier,docVersion);
               i++;
               continue;
             }

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
(original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Mon Jun 27 13:12:23 2011
@@ -1241,7 +1241,7 @@ public class RSSConnector extends org.ap
         // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing
it queued again by
         // somebody else.  We *do* have to signal the document to be removed from the index,
however, or it will
         // stick around until the job is deleted.
-        activities.deleteDocument(urlValue);
+        activities.deleteDocument(urlValue,version);
         continue;
       }
 
@@ -1478,9 +1478,7 @@ public class RSSConnector extends org.ap
         }
         else
         {
-          // This is NOT quite the same as deleteDocument().  The deleteDocument() method
removes the record, and
-          // thus the version string.  So, when that is used, we cannot tell if the document
has changed; we simply have to try again.
-          activities.ingestDocument(urlValue,version,null,null);
+          activities.deleteDocument(urlValue,version);
 
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot
be indexed");

Modified: incubator/lcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
(original)
+++ incubator/lcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
Mon Jun 27 13:12:23 2011
@@ -949,14 +949,14 @@ public class SharePointRepository extend
               // Site/library no longer exists, so delete entry
               if (Logging.connectors.isDebugEnabled())
                 Logging.connectors.debug("SharePoint: No list found for library '"+siteLibPath+"'
- deleting");
-              activities.deleteDocument(documentIdentifier);
+              activities.deleteDocument(documentIdentifier,version);
             }
           }
           else
           {
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("SharePoint: GUID lookup failed for library '"+siteLibPath+"'
- deleting");
-            activities.deleteDocument(documentIdentifier);
+            activities.deleteDocument(documentIdentifier,version);
           }
         }
         else
@@ -1064,7 +1064,7 @@ public class SharePointRepository extend
                           Logging.connectors.debug("SharePoint: Document at '"+encodedServerLocation+encodedDocumentPath+"'
failed to fetch with code "+Integer.toString(returnCode)+", deleting");
                         activities.recordActivity(new Long(startFetchTime),ACTIVITY_FETCH,
                           null,documentIdentifier,"Not found",Integer.toString(returnCode),null);
-                        activities.deleteDocument(documentIdentifier);
+                        activities.deleteDocument(documentIdentifier,version);
                         i++;
                         continue;
                       }
@@ -1245,7 +1245,7 @@ public class SharePointRepository extend
                     {
                       if (Logging.connectors.isDebugEnabled())
                         Logging.connectors.debug("SharePoint: Library '"+decodedLibPath+"'
no longer exists - deleting document '"+documentIdentifier+"'");
-                      activities.deleteDocument( documentIdentifier );
+                      activities.deleteDocument(documentIdentifier,version);
                       i++;
                       continue;
                     }
@@ -1267,7 +1267,7 @@ public class SharePointRepository extend
                       // Document has vanished
                       if (Logging.connectors.isDebugEnabled())
                         Logging.connectors.debug("SharePoint: Document metadata fetch failure
indicated that document is gone: '"+documentIdentifier+"' - removing");
-                      activities.deleteDocument( documentIdentifier );
+                      activities.deleteDocument(documentIdentifier,version);
                       i++;
                       continue;
                     }

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
(original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Mon Jun 27 13:12:23 2011
@@ -1124,7 +1124,7 @@ public class WebcrawlerConnector extends
         // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing
it queued again by
         // somebody else.  We *do* have to signal the document to be removed from the index,
however, or it will
         // stick around until the job is deleted.
-        activities.deleteDocument(documentIdentifier);
+        activities.deleteDocument(documentIdentifier,version);
         continue;
       }
 
@@ -1250,9 +1250,7 @@ public class WebcrawlerConnector extends
           // We do this by using a null url and a null repository document.  If a document
with this identifier was
           // previously indexed, it will be removed.
           
-          // This is NOT quite the same as deleteDocument().  The deleteDocument() method
removes the record, and
-          // thus the version string.  So, when that is used, we cannot tell if the document
has changed; we simply have to try again.
-          activities.ingestDocument(documentIdentifier,version,null,null);
+          activities.deleteDocument(documentIdentifier,version);
           
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"'
because it did not match ingestability criteria");

Modified: incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
(original)
+++ incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
Mon Jun 27 13:12:23 2011
@@ -119,7 +119,19 @@ public interface IProcessActivity extend
   public void ingestDocument(String localIdentifier, String version, String documentURI,
RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the current document from the search engine index.
+  /** Delete the current document from the search engine index, while keeping track of the
version information
+  * for it (to reduce churn).
+  *@param localIdentifier is the document's local identifier.
+  *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
+  *       corresponding repository connector.
+  */
+  public void deleteDocument(String localIdentifier, String version)
+    throws ManifoldCFException, ServiceInterruption;
+
+  /** Delete the current document from the search engine index.  This method does NOT keep
track of version
+  * information for the document and thus can lead to "churn", whereby the same document
is queued, versioned,
+  * and removed on subsequent crawls.  It therefore should be considered to be deprecated,
in favor of
+  * deleteDocument(String localIdentifier, String version).
   *@param localIdentifier is the document's local identifier.
   */
   public void deleteDocument(String localIdentifier)

Modified: incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1140147&r1=1140146&r2=1140147&view=diff
==============================================================================
--- incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
(original)
+++ incubator/lcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Mon Jun 27 13:12:23 2011
@@ -1595,9 +1595,27 @@ public class WorkerThread extends Thread
         ingestLogger);
     }
 
-    /** Delete the current document from the search engine index.
+    /** Delete the current document from the search engine index, while keeping track of
the version information
+    * for it (to reduce churn).
     *@param documentIdentifier is the document's local identifier.
+    *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
+    *       corresponding repository connector.
     */
+    public void deleteDocument(String documentIdentifier, String version)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      if (version.length() == 0)
+        deleteDocument(documentIdentifier);
+      else
+        ingestDocument(documentIdentifier,version,null,null);
+    }
+
+  /** Delete the current document from the search engine index.  This method does NOT keep
track of version
+  * information for the document and thus can lead to "churn", whereby the same document
is queued, versioned,
+  * and removed on subsequent crawls.  It therefore should be considered to be deprecated,
in favor of
+  * deleteDocument(String localIdentifier, String version).
+  *@param documentIdentifier is the document's local identifier.
+  */
     public void deleteDocument(String documentIdentifier)
       throws ManifoldCFException, ServiceInterruption
     {



Mime
View raw message