manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1610713 [1/2] - in /manifoldcf/trunk: ./ connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/ connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/ connectors/h...
Date Tue, 15 Jul 2014 14:25:40 GMT
Author: kwright
Date: Tue Jul 15 14:25:39 2014
New Revision: 1610713

URL: http://svn.apache.org/r1610713
Log:
Fix for CONNECTORS-990.  WARNING: API change!  Also, this change is very extensive, and should be tested thoroughly to be sure ServiceExceptions etc. are fully exercised.

Added:
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
      - copied unchanged from r1610707, manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
Modified:
    manifoldcf/trunk/   (props changed)
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
    manifoldcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
    manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
    manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
    manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
    manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
    manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
    manifoldcf/trunk/connectors/rss/   (props changed)
    manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    manifoldcf/trunk/connectors/sharepoint/   (props changed)
    manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IJobManager.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/Carrydown.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/IntrinsicLink.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/JobManager.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-990:r1610284-1610707

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Jul 15 14:25:39 2014
@@ -3,6 +3,14 @@ $Id$
 
 ======================= 1.7-dev =====================
 
+CONNECTORS-990: Revamp IRepositoryConnector API to no longer separate
+getDocumentVersions() and processDocuments().  This modification basically
+pushes responsibility to determine changes to the repository connector.
+Backwards compatibility is maintained via code in BaseRepositoryConnector,
+and new methods have been added to IProcessActivity.
+WorkerThread has been largely rewritten as a result.
+(Karl Wright)
+
 CONNECTORS-991: Make Jira connector perform pre-emptive basic auth
 since Jira supports guest users.
 (Daniel Aschauer, Karl Wright)

Modified: manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java (original)
+++ manifoldcf/trunk/connectors/documentum/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/DCTM/DCTM.java Tue Jul 15 14:25:39 2014
@@ -1866,7 +1866,7 @@ public class DCTM extends org.apache.man
                 }
                 
                 if (rd == null)
-                  activities.deleteDocument(documentIdentifier,versionString);
+                  activities.noDocument(documentIdentifier,versionString);
                 
                 // Abort the retry loop and go on to the next document
                 break;

Modified: manifoldcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java (original)
+++ manifoldcf/trunk/connectors/filenet/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filenet/FilenetConnector.java Tue Jul 15 14:25:39 2014
@@ -1267,7 +1267,7 @@ public class FilenetConnector extends or
                     null,documentIdentifier,"Authorization error",e.getMessage(),null);
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("FileNet: Removing file '"+documentIdentifier+"' because: "+e.getMessage(),e);
-                  activities.deleteDocument(documentIdentifier,documentVersion);
+                  activities.noDocument(documentIdentifier,documentVersion);
                   i++;
                   continue;
                 }
@@ -1350,7 +1350,7 @@ public class FilenetConnector extends or
                 }
               }
               else
-                activities.deleteDocument(documentIdentifier,documentVersion);
+                activities.noDocument(documentIdentifier,documentVersion);
             }
             finally
             {
@@ -1384,7 +1384,7 @@ public class FilenetConnector extends or
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("FileNet: Removing version '"+documentIdentifier+"' because it seems to no longer exist");
 
-            activities.deleteDocument(documentIdentifier,documentVersion);
+            activities.noDocument(documentIdentifier,documentVersion);
             i++;
             continue;
           }
@@ -1414,7 +1414,7 @@ public class FilenetConnector extends or
           {
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("FileNet: Removing version '"+documentIdentifier+"' because: "+e.getMessage(),e);
-            activities.deleteDocument(documentIdentifier,documentVersion);
+            activities.noDocument(documentIdentifier,documentVersion);
             i++;
             continue;
           }

Modified: manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Tue Jul 15 14:25:39 2014
@@ -409,7 +409,7 @@ public class HDFSRepositoryConnector ext
         
       if (fileStatus == null) {
         // It is no longer there , so delete right away
-        activities.deleteDocument(documentIdentifier,version);
+        activities.deleteDocument(documentIdentifier);
         continue;
       }
         
@@ -421,7 +421,7 @@ public class HDFSRepositoryConnector ext
         FileStatus[] fileStatuses = getChildren(fileStatus.getPath());
         if (fileStatuses == null) {
           // Directory was deleted, so remove
-          activities.deleteDocument(documentIdentifier,version);
+          activities.deleteDocument(documentIdentifier);
           continue;
         }
         for (int j = 0; j < fileStatuses.length; j++) {

Modified: manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/trunk/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Tue Jul 15 14:25:39 2014
@@ -722,7 +722,7 @@ public class JDBCConnector extends org.a
           if (map.get(documentIdentifier) != null)
           {
             // This means we did not see it (or data for it) in the result set.  Delete it!
-            activities.deleteDocument(documentIdentifier,versions[i]);
+            activities.deleteDocument(documentIdentifier);
           }
         }
         i++;

Modified: manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java (original)
+++ manifoldcf/trunk/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java Tue Jul 15 14:25:39 2014
@@ -962,7 +962,7 @@ public class JiraRepositoryConnector ext
             String issueKey = nodeId.substring(2);
             JiraIssue jiraFile = getIssue(issueKey);
             if (jiraFile == null) {
-              activities.deleteDocument(nodeId, version);
+              activities.deleteDocument(nodeId);
               continue;
             }
             

Modified: manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java (original)
+++ manifoldcf/trunk/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java Tue Jul 15 14:25:39 2014
@@ -4440,7 +4440,7 @@ public class LivelinkConnector extends o
                     // Since we logged in, we should fail here if the ingestion user doesn't have access to the
                     // the document, but if we do, don't fail hard.
                     resultCode = "UNAUTHORIZED";
-                    activities.deleteDocument(documentIdentifier,version);
+                    activities.noDocument(documentIdentifier,version);
                     return;
 
                   case HttpStatus.SC_OK:
@@ -4533,7 +4533,7 @@ public class LivelinkConnector extends o
                     else
                     {
                       resultCode = "SESSIONLOGINFAILED";
-                      activities.deleteDocument(documentIdentifier,version);
+                      activities.noDocument(documentIdentifier,version);
                     }
                     break;
                   case HttpStatus.SC_BAD_REQUEST:
@@ -4731,7 +4731,7 @@ public class LivelinkConnector extends o
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("Livelink: Excluding document "+documentIdentifier+" because its length ("+dataSize+") was rejected by output connector");
             resultCode = "DOCUMENTTOOLONG";
-            activities.deleteDocument(documentIdentifier,version);
+            activities.noDocument(documentIdentifier,version);
           }
         }
         else
@@ -4741,7 +4741,7 @@ public class LivelinkConnector extends o
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("Livelink: Excluding document "+documentIdentifier+" because its mime type ("+mimeType+") was rejected by output connector");
           resultCode = "MIMETYPEEXCLUSION";
-          activities.deleteDocument(documentIdentifier,version);
+          activities.noDocument(documentIdentifier,version);
         }
       }
       else
@@ -4751,7 +4751,7 @@ public class LivelinkConnector extends o
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("Livelink: Excluding document "+documentIdentifier+" because its URL ("+viewHttpAddress+") was rejected by output connector");
         resultCode = "URLEXCLUSION";
-        activities.deleteDocument(documentIdentifier,version);
+        activities.noDocument(documentIdentifier,version);
       }
     }
     finally

Modified: manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java (original)
+++ manifoldcf/trunk/connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/meridio/MeridioConnector.java Tue Jul 15 14:25:39 2014
@@ -1164,7 +1164,7 @@ public class MeridioConnector extends or
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("Meridio: Could not retrieve document data for document id '" +
               new Long(docId).toString() + "' in processDocuments method - deleting document.");
-            activities.deleteDocument(documentIdentifier,docVersion);
+            activities.noDocument(documentIdentifier,docVersion);
             i++;
             continue;
           }
@@ -1176,7 +1176,7 @@ public class MeridioConnector extends or
               Logging.connectors.debug("Meridio: Could not retrieve document owner for document id '" +
               new Long(docId).toString() + "' in processDocuments method. No information or incorrect amount " +
               "of information was returned");
-            activities.deleteDocument(documentIdentifier,docVersion);
+            activities.noDocument(documentIdentifier,docVersion);
             i++;
             continue;
           }
@@ -1336,7 +1336,7 @@ public class MeridioConnector extends or
               if (Logging.connectors.isDebugEnabled())
                 Logging.connectors.debug("Meridio: Failed to get content for document '" + new Long(docId).toString() + "'");
               // No document.  Delete what's there
-              activities.deleteDocument(documentIdentifier,docVersion);
+              activities.noDocument(documentIdentifier,docVersion);
               i++;
               continue;
             }
@@ -1370,13 +1370,13 @@ public class MeridioConnector extends or
                   }
                 }
                 else
-                  activities.deleteDocument(documentIdentifier, docVersion);
+                  activities.noDocument(documentIdentifier, docVersion);
               }
               else
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("Meridio: Expected temporary file was not present - skipping document '"+new Long(docId).toString() + "'");
-                activities.deleteDocument(documentIdentifier, docVersion);
+                activities.deleteDocument(documentIdentifier);
               }
             }
             finally

Propchange: manifoldcf/trunk/connectors/rss/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-990/connectors/rss:r1610284-1610707

Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Tue Jul 15 14:25:39 2014
@@ -1239,7 +1239,7 @@ public class RSSConnector extends org.ap
         // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing it queued again by
         // somebody else.  We *do* have to signal the document to be removed from the index, however, or it will
         // stick around until the job is deleted.
-        activities.deleteDocument(urlValue,version);
+        activities.noDocument(urlValue,version);
         continue;
       }
 
@@ -1528,7 +1528,7 @@ public class RSSConnector extends org.ap
         }
         else
         {
-          activities.deleteDocument(urlValue,version);
+          activities.noDocument(urlValue,version);
 
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot be indexed");

Propchange: manifoldcf/trunk/connectors/sharepoint/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-990/connectors/sharepoint:r1610284-1610707

Modified: manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java (original)
+++ manifoldcf/trunk/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java Tue Jul 15 14:25:39 2014
@@ -1322,28 +1322,28 @@ public class SharePointRepository extend
                     // Site/list no longer exists, so delete entry
                     if (Logging.connectors.isDebugEnabled())
                       Logging.connectors.debug("SharePoint: No list found for list '"+siteListPath+"' - deleting");
-                    activities.deleteDocument(documentIdentifier,version);
+                    activities.deleteDocument(documentIdentifier);
                   }
                 }
                 else
                 {
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("SharePoint: Access token lookup failed for list '"+siteListPath+"' - deleting");
-                  activities.deleteDocument(documentIdentifier,version);
+                  activities.noDocument(documentIdentifier,version);
                 }
               }
               else
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("SharePoint: Field list lookup failed for list '"+siteListPath+"' - deleting");
-                activities.deleteDocument(documentIdentifier,version);
+                activities.noDocument(documentIdentifier,version);
               }
             }
             else
             {
               if (Logging.connectors.isDebugEnabled())
                 Logging.connectors.debug("SharePoint: GUID lookup failed for list '"+siteListPath+"' - deleting");
-              activities.deleteDocument(documentIdentifier,version);
+              activities.noDocument(documentIdentifier,version);
             }
           }
           else
@@ -1413,7 +1413,7 @@ public class SharePointRepository extend
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("SharePoint: List '"+decodedListPath+"' no longer exists - deleting item '"+documentIdentifier+"'");
-                activities.deleteDocument(documentIdentifier,version);
+                activities.deleteDocument(documentIdentifier);
                 i++;
                 continue;
               }
@@ -1479,7 +1479,7 @@ public class SharePointRepository extend
                     // Item has vanished
                     if (Logging.connectors.isDebugEnabled())
                       Logging.connectors.debug("SharePoint: Item metadata fetch failure indicated that item is gone: '"+documentIdentifier+"' - removing");
-                    activities.deleteDocument(documentIdentifier,version);
+                    activities.noDocument(documentIdentifier,version);
                     i++;
                     continue;
                   }
@@ -1536,7 +1536,7 @@ public class SharePointRepository extend
                 }
                 else
                   // Document too long (should never happen; length is 0)
-                  activities.deleteDocument( documentIdentifier, version );
+                  activities.noDocument( documentIdentifier, version );
               }
             }
             else
@@ -1584,7 +1584,7 @@ public class SharePointRepository extend
                     accessTokens, denyTokens, createdDate, modifiedDate, null, guid, sDesc))
                   {
                     // Document not indexed for whatever reason
-                    activities.deleteDocument(documentIdentifier,version);
+                    activities.noDocument(documentIdentifier,version);
                     i++;
                     continue;
                   }
@@ -1593,7 +1593,7 @@ public class SharePointRepository extend
                 {
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("SharePoint: Skipping attachment '"+documentIdentifier+"' because no parent guid found");
-                  activities.deleteDocument(documentIdentifier,version);
+                  activities.noDocument(documentIdentifier,version);
                   i++;
                   continue;
                 }
@@ -1664,28 +1664,28 @@ public class SharePointRepository extend
                     // Site/library no longer exists, so delete entry
                     if (Logging.connectors.isDebugEnabled())
                       Logging.connectors.debug("SharePoint: No list found for library '"+siteLibPath+"' - deleting");
-                    activities.deleteDocument(documentIdentifier,version);
+                    activities.deleteDocument(documentIdentifier);
                   }
                 }
                 else
                 {
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("SharePoint: Access token lookup failed for library '"+siteLibPath+"' - deleting");
-                  activities.deleteDocument(documentIdentifier,version);
+                  activities.noDocument(documentIdentifier,version);
                 }
               }
               else
               {
                 if (Logging.connectors.isDebugEnabled())
                   Logging.connectors.debug("SharePoint: Field list lookup failed for library '"+siteLibPath+"' - deleting");
-                activities.deleteDocument(documentIdentifier,version);
+                activities.noDocument(documentIdentifier,version);
               }
             }
             else
             {
               if (Logging.connectors.isDebugEnabled())
                 Logging.connectors.debug("SharePoint: GUID lookup failed for library '"+siteLibPath+"' - deleting");
-              activities.deleteDocument(documentIdentifier,version);
+              activities.noDocument(documentIdentifier,version);
             }
           }
           else
@@ -1751,7 +1751,7 @@ public class SharePointRepository extend
                 {
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("SharePoint: Library '"+decodedLibPath+"' no longer exists - deleting document '"+documentIdentifier+"'");
-                  activities.deleteDocument(documentIdentifier,version);
+                  activities.deleteDocument(documentIdentifier);
                   i++;
                   continue;
                 }
@@ -1763,7 +1763,7 @@ public class SharePointRepository extend
                   // Document has vanished
                   if (Logging.connectors.isDebugEnabled())
                     Logging.connectors.debug("SharePoint: Document metadata fetch failure indicated that document is gone: '"+documentIdentifier+"' - removing");
-                  activities.deleteDocument(documentIdentifier,version);
+                  activities.noDocument(documentIdentifier,version);
                   i++;
                   continue;
                 }
@@ -1774,7 +1774,7 @@ public class SharePointRepository extend
                 acls, denyAcls, createdDate, modifiedDate, metadataValues, guid, sDesc))
               {
                 // Document not indexed for whatever reason
-                activities.deleteDocument(documentIdentifier,version);
+                activities.noDocument(documentIdentifier,version);
                 i++;
                 continue;
               }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue Jul 15 14:25:39 2014
@@ -1331,7 +1331,7 @@ public class WebcrawlerConnector extends
         // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing it queued again by
         // somebody else.  We *do* have to signal the document to be removed from the index, however, or it will
         // stick around until the job is deleted.
-        activities.deleteDocument(documentIdentifier,version);
+        activities.noDocument(documentIdentifier,version);
         continue;
       }
 
@@ -1465,7 +1465,7 @@ public class WebcrawlerConnector extends
           // We do this by using a null url and a null repository document.  If a document with this identifier was
           // previously indexed, it will be removed.
           
-          activities.deleteDocument(documentIdentifier,version);
+          activities.noDocument(documentIdentifier,version);
           
           if (Logging.connectors.isDebugEnabled())
             Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java Tue Jul 15 14:25:39 2014
@@ -511,12 +511,11 @@ public class IncrementalIngester extends
     String newParameterVersion,
     String newAuthorityNameString)
   {
+    if (newAuthorityNameString == null)
+      newAuthorityNameString = "";
     IPipelineSpecification pipelineSpecification = pipelineSpecificationWithVersions.getPipelineSpecification();
     IPipelineSpecificationBasic basicSpecification = pipelineSpecification.getBasicPipelineSpecification();
-    // Empty document version has a special meaning....
-    if (newDocumentVersion.length() == 0)
-      return true;
-    // Otherwise, cycle through the outputs
+    // Cycle through the outputs
     for (int i = 0; i < basicSpecification.getOutputCount(); i++)
     {
       int stage = basicSpecification.getOutputStage(i);
@@ -609,27 +608,35 @@ public class IncrementalIngester extends
   }
 
   /** Record a document version, but don't ingest it.
-  * The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
-  * ServiceInterruption is thrown if this action must be rescheduled.
+  * The purpose of this method is to update document version information without reindexing the document.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
   *@param identifierHash is the hashed document identifier.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since epoch.
-  *@param activities is the object used in case a document needs to be removed from the output index as the result of this operation.
   */
   @Override
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
     String identifierClass, String identifierHash,
-    String documentVersion, long recordTime,
-    IOutputActivity activities)
-    throws ManifoldCFException, ServiceInterruption
+    String documentVersion, long recordTime)
+    throws ManifoldCFException
   {
+    // This method is called when a connector decides that the last indexed version of the document is in fact just fine,
+    // but the document version information should be updated.
+    // The code pathway is therefore similar to that of document indexing, EXCEPT that no indexing will ever
+    // take place.  This has some interesting side effects.  For example:
+    // (1) In the case of a document collision with another job using the same repository connection, the last document
+    //    indexed cannot be changed.  Updating the version string for the document would therefore be misleading.  This
+    //    case should be detected and prevented from occurring, by refusing to perform the update.
+    //    On the other hand, only one thread can be processing the document at a given time, and therefore
+    //    since the connector detected "no change", we are safe to presume we can just update the version info.
+    // (2) In the case of a URL conflict with another job, since nothing changes and no new URL is recorded, no cleanup
+    //    of conflicting records sharing the same URL should be needed.
+    
     String docKey = makeKey(identifierClass,identifierHash);
 
     String[] outputConnectionNames = extractOutputConnectionNames(pipelineSpecificationBasic);
-    IOutputConnection[] outputConnections = connectionManager.loadMultiple(outputConnectionNames);
 
     if (Logging.ingest.isDebugEnabled())
     {
@@ -639,99 +646,10 @@ public class IncrementalIngester extends
     for (int k = 0; k < outputConnectionNames.length; k++)
     {
       String outputConnectionName = outputConnectionNames[k];
-      IOutputConnection connection = outputConnections[k];
-
-      String oldURI = null;
-      String oldURIHash = null;
-      String oldOutputVersion = null;
-
-      // Repeat if needed
-      while (true)
-      {
-        long sleepAmt = 0L;
-        try
-        {
-          // See what uri was used before for this doc, if any
-          ArrayList list = new ArrayList();
-          String query = buildConjunctionClause(list,new ClauseDescription[]{
-            new UnitaryClause(docKeyField,docKey),
-            new UnitaryClause(outputConnNameField,outputConnectionName)});
-            
-          IResultSet set = performQuery("SELECT "+docURIField+","+uriHashField+","+lastOutputVersionField+" FROM "+getTableName()+
-            " WHERE "+query,list,null,null);
-
-          if (set.getRowCount() > 0)
-          {
-            IResultRow row = set.getRow(0);
-            oldURI = (String)row.getValue(docURIField);
-            oldURIHash = (String)row.getValue(uriHashField);
-            oldOutputVersion = (String)row.getValue(lastOutputVersionField);
-          }
-          
-          break;
-        }
-        catch (ManifoldCFException e)
-        {
-          // Look for deadlock and retry if so
-          if (e.getErrorCode() == e.DATABASE_TRANSACTION_ABORT)
-          {
-            if (Logging.perf.isDebugEnabled())
-              Logging.perf.debug("Aborted select looking for status: "+e.getMessage());
-            sleepAmt = getSleepAmt();
-            continue;
-          }
-          throw e;
-        }
-        finally
-        {
-          sleepFor(sleepAmt);
-        }
-      }
-
-      // If uri hashes collide, then we must be sure to eliminate only the *correct* records from the table, or we will leave
-      // dangling documents around.  So, all uri searches and comparisons MUST compare the actual uri as well.
 
-      // But, since we need to insure that any given URI is only worked on by one thread at a time, use critical sections
-      // to block the rare case that multiple threads try to work on the same URI.
-      
-      String[] lockArray = computeLockArray(null,oldURI,outputConnectionName);
-      lockManager.enterLocks(null,null,lockArray);
-      try
-      {
-
-        ArrayList list = new ArrayList();
-        
-        if (oldURI != null)
-        {
-          IOutputConnector connector = outputConnectorPool.grab(connection);
-          if (connector == null)
-            // The connector is not installed; treat this as a service interruption.
-            throw new ServiceInterruption("Output connector not installed",0L);
-          try
-          {
-            connector.removeDocument(oldURI,oldOutputVersion,new OutputRemoveActivitiesWrapper(activities,outputConnectionName));
-          }
-          finally
-          {
-            outputConnectorPool.release(connection,connector);
-          }
-          // Delete all records from the database that match the old URI, except for THIS record.
-          list.clear();
-          String query = buildConjunctionClause(list,new ClauseDescription[]{
-            new UnitaryClause(uriHashField,"=",oldURIHash),
-            new UnitaryClause(outputConnNameField,outputConnectionName)});
-          list.add(docKey);
-          performDelete("WHERE "+query+" AND "+docKeyField+"!=?",list,null);
-        }
-
-        // If we get here, it means we are noting that the document was examined, but that no change was required.  This is signaled
-        // to noteDocumentIngest by having the null documentURI.
-        noteDocumentIngest(outputConnectionName,docKey,documentVersion,null,null,null,null,recordTime,null,null);
-      }
-      finally
-      {
-        lockManager.leaveLocks(null,null,lockArray);
-      }
+      // If we get here, it means we are noting that the document was examined, but that no change was required.  This is signaled
+      // to noteDocumentIngest by having the null documentURI.
+      noteDocumentIngest(outputConnectionName,docKey,documentVersion,null,null,null,null,recordTime,null,null);
     }
   }
 

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java (original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java Tue Jul 15 14:25:39 2014
@@ -150,21 +150,18 @@ public interface IIncrementalIngester
     String newAuthorityNameString);
 
   /** Record a document version, but don't ingest it.
-  * The purpose of this method is to keep track of the frequency at which ingestion "attempts" take place.
-  * ServiceInterruption is thrown if this action must be rescheduled.
+  * The purpose of this method is to update document version information without reindexing the document.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be interpreted.
   *@param identifierHash is the hashed document identifier.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since epoch.
-  *@param activities is the object used in case a document needs to be removed from the output index as the result of this operation.
   */
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
     String identifierClass, String identifierHash,
-    String documentVersion, long recordTime,
-    IOutputActivity activities)
-    throws ManifoldCFException, ServiceInterruption;
+    String documentVersion, long recordTime)
+    throws ManifoldCFException;
 
   /** Ingest a document.
   * This ingests the document, and notes it.  If this is a repeat ingestion of the document, this

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java Tue Jul 15 14:25:39 2014
@@ -324,6 +324,93 @@ public abstract class BaseRepositoryConn
     return null;
   }
 
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the results either added
+  * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
+  */
+  @Override
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // The backwards-compatible base implementation does the following:
+    // (1) Uses the deprecated methods to obtain a set of version information
+    // (2) Based on the provided version information, determines whether processing is required
+    // (3) Uses deprecated methods to process documents
+    // (4) Releases document versions
+
+    // We need to get the old version strings together in order to use the deprecated methods
+    String[] oldVersions = new String[documentIdentifiers.length];
+    for (int i = 0; i < oldVersions.length; i++)
+    {
+      oldVersions[i] = statuses.getIndexedVersionString(documentIdentifiers[i]);
+    }
+    DocumentVersions dv = new DocumentVersions();
+    getDocumentVersions(dv,documentIdentifiers,oldVersions,activities,spec,jobMode,usesDefaultAuthority);
+    try
+    {
+      // Next, we determine what part of the set of documents were unchanged, and what part we need to refetch.
+      Set<String> fetchDocuments = new HashSet<String>();
+      Set<String> scanDocuments = new HashSet<String>();
+      for (int i = 0; i < documentIdentifiers.length; i++)
+      {
+        String documentIdentifier = documentIdentifiers[i];
+        VersionContext vc = dv.getDocumentVersion(documentIdentifier);
+        if (vc != null)
+        {
+          if (dv.isAlwaysRefetch(documentIdentifier) || activities.checkDocumentNeedsReindexing(documentIdentifier,vc.getVersionString()))
+          {
+            // These documents need reprocessing
+            fetchDocuments.add(documentIdentifier);
+          }
+          else
+          {
+            // These documents have been checked and found NOT to need reprocessing
+            activities.noteUnchangedDocument(documentIdentifier);
+          }
+          scanDocuments.add(documentIdentifier);
+        }
+        else
+        {
+          // These documents must go away permanently
+          // MHL to collect these and do them as a group
+          activities.deleteDocument(documentIdentifier);
+        }
+      }
+
+      // Construct the appropriate data to call processDocuments() with
+      String[] processIDs = new String[scanDocuments.size()];
+      boolean[] scanOnly = new boolean[scanDocuments.size()];
+      int index = 0;
+      for (int i = 0; i < documentIdentifiers.length; i++)
+      {
+        String documentIdentifier = documentIdentifiers[i];
+        if (scanDocuments.contains(documentIdentifier))
+        {
+          processIDs[index] = documentIdentifier;
+          scanOnly[index] = !fetchDocuments.contains(documentIdentifier);
+          index++;
+        }
+      }
+      processDocuments(processIDs,dv,activities,scanOnly,jobMode);
+      
+    }
+    finally
+    {
+      // Release document versions
+      releaseDocumentVersions(documentIdentifiers,dv);
+    }
+  }
+
   /** Get document versions given an array of document identifiers.
   * This method is called for EVERY document that is considered. It is therefore important to perform
   * as little work as possible here.
@@ -340,7 +427,6 @@ public abstract class BaseRepositoryConn
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
   */
-  @Override
   public void getDocumentVersions(
     DocumentVersions documentVersions,
     String[] documentIdentifiers, String[] oldVersions,
@@ -488,7 +574,6 @@ public abstract class BaseRepositoryConn
   *@param documentIdentifiers is the set of document identifiers.
   *@param versions is the corresponding set of version strings (individual identifiers may have no version).
   */
-  @Override
   public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
     throws ManifoldCFException
   {
@@ -543,7 +628,6 @@ public abstract class BaseRepositoryConn
   * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   */
-  @Override
   public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity activities,
     boolean[] scanOnly, int jobMode)
     throws ManifoldCFException, ServiceInterruption

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IJobManager.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IJobManager.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IJobManager.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IJobManager.java Tue Jul 15 14:25:39 2014
@@ -717,6 +717,18 @@ public interface IJobManager
     String[] parentIdentifierHashes, int hopcountMethod)
     throws ManifoldCFException;
 
+  /** Undo the addition of child documents to the queue, for a set of documents.
+  * This method is called at the end of document processing, to back out any incomplete additions to the queue, and restore
+  * the queue state that existed prior to the incomplete additions.  Call this method instead of finishDocuments() if the
+  * addition of documents was not completed.
+  *@param jobID is the job identifier.
+  *@param legalLinkTypes is the set of legal link types that this connector generates.
+  *@param parentIdentifierHashes are the hashes of the document identifiers for whom child link extraction just took place.
+  */
+  public void revertDocuments(Long jobID, String[] legalLinkTypes,
+    String[] parentIdentifierHashes)
+    throws ManifoldCFException;
+
   /** Retrieve specific parent data for a given document.
   *@param jobID is the job identifier.
   *@param docIDHash is the hash of the document identifier.

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java Tue Jul 15 14:25:39 2014
@@ -23,15 +23,43 @@ import java.io.*;
 import org.apache.manifoldcf.core.interfaces.*;
 import org.apache.manifoldcf.agents.interfaces.*;
 
-/** This interface abstracts from the activities that a fetched document processor can do.
+/** This interface abstracts from the activities that a connector's processDocuments() method can do.
+* The processing flow for a document is expected to go something like this:
+* (1) The connector's processDocuments() method is called with a set of documents to be processed.
+* (2) The connector computes a version string for each document in the set as part of determining
+*    whether the document indeed needs to be refetched.
+* (3) For each document processed, there can be one of several dispositions:
+*   (a) There is no such document (anymore): deleteDocument() called for the document.
+*   (b) The document is (re)indexed: ingestDocumentWithException() is called for the document.
+*   (c) The document is determined to be unchanged and no updates are needed: noteUnchangedDocument() is called
+*     for the document.
+*   (d) The document is determined to be unchanged BUT the version string needs to be updated: recordDocument()
+*     is called for the document.
+*   (e) The document is determined to be unindexable BUT it still exists in the repository: noDocument()
+*    is called for the document.
+*   (f) There was a service interruption: ServiceInterruption is thrown.
+*   (g) Nothing is called describing the document's disposition.  In that case, for backwards compatibility,
+*    the framework marks the document as having been processed.
+* (4) In order to determine whether a document needs to be reindexed, the method checkDocumentNeedsReindexing()
+*    is available to return an opinion on that matter.
 */
-public interface IProcessActivity extends IHistoryActivity, IEventActivity, IAbortActivity, IFingerprintActivity,
-    ICarrydownActivity
+public interface IProcessActivity extends IVersionActivity
 {
   public static final String _rcsid = "@(#)$Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z kwright $";
 
+  /** Check if a document needs to be reindexed, based on a computed version string.
+  * Call this method to determine whether reindexing is necessary.  Pass in a newly-computed version
+  * string.  This method will return "true" if the document needs to be re-indexed.
+  *@param documentIdentifier is the document identifier.
+  *@param newVersionString is the newly-computed version string.
+  *@return true if the document needs to be reindexed.
+  */
+  public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+    String newVersionString)
+    throws ManifoldCFException;
+
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -45,12 +73,12 @@ public interface IProcessActivity extend
   *@param originationTime is the time, in ms since epoch, that the document originated.  Pass null if none or unknown.
   *@param prereqEventNames are the names of the prerequisite events which this document requires prior to processing.  Pass null if none.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
     String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -63,13 +91,12 @@ public interface IProcessActivity extend
   *          The type of each object must either be a String, or a CharacterInput.
   *@param originationTime is the time, in ms since epoch, that the document originated.  Pass null if none or unknown.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
     String[] dataNames, Object[][] dataValues, Long originationTime)
     throws ManifoldCFException;
 
-
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -81,12 +108,12 @@ public interface IProcessActivity extend
   *@param dataValues are the values that correspond to the data names in the dataNames parameter.  May be null only if dataNames is null.
   *          The type of each object must either be a String, or a CharacterInput.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType,
     String[] dataNames, Object[][] dataValues)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -94,27 +121,19 @@ public interface IProcessActivity extend
   * reference.  This must be one of the strings returned by the IRepositoryConnector method
   * "getRelationshipTypes()".  May be null.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String relationshipType)
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String relationshipType)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.  This method is equivalent to
   * addDocumentReference(localIdentifier,null,null).
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   */
-  public void addDocumentReference(String localIdentifier)
+  public void addDocumentReference(String documentIdentifier)
     throws ManifoldCFException;
 
-
-  /** Record a document version, but don't ingest it.
-  *@param localIdentifier is the document identifier.
-  *@param version is the document version.
-  */
-  public void recordDocument(String localIdentifier, String version)
-    throws ManifoldCFException, ServiceInterruption;
-
   /** Ingest the current document.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param version is the version of the document, as reported by the getDocumentVersions() method of the
   *       corresponding repository connector.
   *@param documentURI is the URI to use to retrieve this document from the search interface (and is
@@ -122,11 +141,11 @@ public interface IProcessActivity extend
   *@param data is the document data.  The data is closed after ingestion is complete.
   *@throws IOException only when data stream reading fails.
   */
-  public void ingestDocumentWithException(String localIdentifier, String version, String documentURI, RepositoryDocument data)
+  public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption, IOException;
 
   /** Ingest the current document.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param version is the version of the document, as reported by the getDocumentVersions() method of the
   *       corresponding repository connector.
   *@param documentURI is the URI to use to retrieve this document from the search interface (and is
@@ -136,48 +155,78 @@ public interface IProcessActivity extend
   * according to standard best practices.
   */
   @Deprecated
-  public void ingestDocument(String localIdentifier, String version, String documentURI, RepositoryDocument data)
+  public void ingestDocument(String documentIdentifier, String version, String documentURI, RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the current document from the search engine index, while keeping track of the version information
-  * for it (to reduce churn).
-  *@param localIdentifier is the document's local identifier.
-  *@param version is the version of the document, as reported by the getDocumentVersions() method of the
-  *       corresponding repository connector.
+  /** Note the fact that a document exists but is unchanged, and nothing further
+  * needs to be done to it.
+  * Call this method if it is determined that the document in question is identical to
+  * the formerly indexed document, AND when the version string for the document
+  * has not changed either.
   */
-  public void deleteDocument(String localIdentifier, String version)
+  public void noteUnchangedDocument(String documentIdentifier)
+    throws ManifoldCFException;
+
+  /** Remove the specified document from the search engine index, and update the
+  * recorded version information for the document.
+  *@param documentIdentifier is the document's identifier.
+  *@param version is the version string to be recorded for the document.
+  */
+  public void noDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the current document from the search engine index.  This method does NOT keep track of version
-  * information for the document and thus can lead to "churn", whereby the same document is queued, versioned,
-  * and removed on subsequent crawls.  It therefore should be considered to be deprecated, in favor of
-  * deleteDocument(String localIdentifier, String version).
-  *@param localIdentifier is the document's local identifier.
+  /** Delete the specified document permanently from the search engine index, and from the status table.
+  * This method does NOT keep track of any document version information for the document and thus can
+  * lead to "churn", whereby the same document is queued, processed,
+  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+  * in any case where the same decision will need to be made over and over.
+  *@param documentIdentifier is the document's identifier.
+  */
+  public void deleteDocument(String documentIdentifier)
+    throws ManifoldCFException;
+
+  /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
+  * documents with the same URL, however, will still be removed.)  This is
+  * useful if the version string changes but the document contents are known not
+  * to have changed.
+  *@param documentIdentifier is the document identifier.
+  *@param version is the document version.
   */
-  public void deleteDocument(String localIdentifier)
+  public void recordDocument(String documentIdentifier, String version)
+    throws ManifoldCFException;
+
+  /** Delete the current document from the search engine index, while keeping track of the version information
+  * for it (to reduce churn).
+  * Deprecated; use noDocument() above instead.
+  *@param documentIdentifier is the document's identifier.
+  *@param version is the version string to be recorded for the document.
+  */
+  @Deprecated
+  public void deleteDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
+
   /** Override the schedule for the next time a document is crawled.
   * Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound,
   * or a combination of these, on a specific document.  This method is only effective if the job is a continuous one, and if the
   * identifier you pass in is being processed.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param lowerRecrawlBoundTime is the time in ms since epoch that the reschedule time should not fall BELOW, or null if none.
   *@param upperRecrawlBoundTime is the time in ms since epoch that the reschedule time should not rise ABOVE, or null if none.
   *@param lowerExpireBoundTime is the time in ms since epoch that the expire time should not fall BELOW, or null if none.
   *@param upperExpireBoundTime is the time in ms since epoch that the expire time should not rise ABOVE, or null if none.
   */
-  public void setDocumentScheduleBounds(String localIdentifier,
+  public void setDocumentScheduleBounds(String documentIdentifier,
     Long lowerRecrawlBoundTime, Long upperRecrawlBoundTime,
     Long lowerExpireBoundTime, Long upperExpireBoundTime)
     throws ManifoldCFException;
 
   /** Override a document's origination time.
   * Use this method to signal the framework that a document's origination time is something other than the first time it was crawled.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param originationTime is the document's origination time, or null if unknown.
   */
-  public void setDocumentOriginationTime(String localIdentifier,
+  public void setDocumentOriginationTime(String documentIdentifier,
     Long originationTime)
     throws ManifoldCFException;
 

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java Tue Jul 15 14:25:39 2014
@@ -48,16 +48,13 @@ import java.util.*;
 * It therefore establishes a space of document identifiers.  Each connector will only ever be
 * asked to deal with identifiers that have in some way originated from the connector.
 *
-* Documents are fetched by ManifoldCF in three stages.  First, the addSeedDocuments() method is called in the connector
+* Documents are fetched by ManifoldCF in two stages.  First, the addSeedDocuments() method is called in the connector
 * implementation.  This method is meant to add a set of document identifiers to the queue.  When ManifoldCF is ready
-* to process a document, the document identifier is used to obtain a current document version string, using the
-* getDocumentVersions() method (the second stage).  This version string is used to decide whether or not the
-* third stage need be called for the document or not.  The third stage is responsible for sending document content
-* to the output, and for extracting any references to additional documents, and consists of the processDocuments() method.
+* to process a document, the document identifier is used to build a version string for the document and check whether
+* the document needs to be indexed, and index it if needed (the second stage).  The second stage
+* consists of the processDocuments() method.
 *
-* All of these methods interact with ManifoldCF by means of an "activity" interface.  For example, an IVersionActivity object
-* is passed to the getDocumentVersions() method, and that object contains methods that are necessary for getDocumentVersions()
-* to do its job.  A similar architecture is used throughout the connector framework.
+* All of these methods interact with ManifoldCF by means of an "activity" interface.
 */
 public interface IRepositoryConnector extends IConnector
 {
@@ -182,57 +179,23 @@ public interface IRepositoryConnector ex
     String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is therefore important to perform
-  * as little work as possible here.
-  * The connector will be connected before this method can be called.
-  *@param documentVersions is the versions object, to be filled in by this method.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion will occur
-  * when the specification changes.  This is primarily useful for metadata.
-  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
-  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
-  */
-  public void getDocumentVersions(
-    DocumentVersions documentVersions,
-    String[] documentIdentifiers, String[] oldVersions,
-    IVersionActivity activities,
-    Specification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption;
-
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the results either added
   * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
   * The document specification allows this class to filter what is done based on the job.
   * The connector will be connected before this method can be called.
   *@param documentIdentifiers is the set of document identifiers to process.
-  *@param versions are the version strings returned by getDocumentVersions() above.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
   *@param activities is the interface this method should use to queue up new document references
   * and ingest documents.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
   */
-  public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity activities,
-    boolean[] scanOnly, int jobMode)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Free a set of documents.  This method is called for all documents whose versions have been fetched using
-  * the getDocumentVersions() method, including those that returned null versions.  It may be used to free resources
-  * committed during the getDocumentVersions() method.  It is guaranteed to be called AFTER any calls to
-  * processDocuments() for the documents in question.
-  * The connector will be connected before this method can be called.
-  *@param documentIdentifiers is the set of document identifiers.
-  *@param versions is the corresponding set of version strings (individual identifiers may have no version).
-  */
-  public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
-    throws ManifoldCFException;
-
   /** Get the maximum number of documents to amalgamate together into one batch, for this connector.
   * The connector does not need to be connected for this method to be called.
   *@return the maximum number. 0 indicates "unlimited".

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/Carrydown.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/Carrydown.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/Carrydown.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/Carrydown.java Tue Jul 15 14:25:39 2014
@@ -457,57 +457,102 @@ public class Carrydown extends org.apach
       presentMap.put(vr,vr);
     }
   }
+  
+  /** Revert all records belonging to the specified parent documents to their original,
+  * pre-modified, state.
+  */
+  public void revertRecords(Long jobID, String[] parentDocumentIDHashes)
+    throws ManifoldCFException
+  {
+    int maxClause = getMaxInClause();
+    StringBuilder sb = new StringBuilder();
+    List<String> list = new ArrayList<String>();
+    int k = 0;
+    for (String parentDocumentIDHash : parentDocumentIDHashes)
+    {
+      if (k == maxClause)
+      {
+        performRevertRecords(sb.toString(),jobID,list);
+        sb.setLength(0);
+        list.clear();
+        k = 0;
+      }
+      if (k > 0)
+        sb.append(",");
+      sb.append("?");
+      list.add(parentDocumentIDHash);
+      k++;
+    }
+
+    if (k > 0)
+      performRevertRecords(sb.toString(),jobID,list);
+  }
+  
+  protected void performRevertRecords(String query, Long jobID, List<String> list)
+    throws ManifoldCFException
+  {
+    // Delete new records
+    StringBuilder sb = new StringBuilder("WHERE ");
+    ArrayList newList = new ArrayList();
+    
+    sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
+      new UnitaryClause(jobIDField,jobID),
+      new MultiClause(parentIDHashField,list)})).append(" AND ");
+      
+    sb.append(newField).append("=?");
+    newList.add(statusToString(ISNEW_NEW));
+    performDelete(sb.toString(),newList,null);
+
+    // Restore old values
+    sb = new StringBuilder("WHERE ");
+    newList.clear();
+
+    sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
+      new UnitaryClause(jobIDField,jobID),
+      new MultiClause(parentIDHashField,list)})).append(" AND ");
+
+    sb.append(newField).append("=?");
+    newList.add(statusToString(ISNEW_EXISTING));
+    
+    HashMap map = new HashMap();
+    map.put(newField,statusToString(ISNEW_BASE));
+    map.put(processIDField,null);
+    performUpdate(map,sb.toString(),newList,null);
+    
+    noteModifications(0,list.size(),0);
+  }
+
   /** Return all records belonging to the specified parent documents to the base state,
   * and delete the old (eliminated) child records.
   */
   public void restoreRecords(Long jobID, String[] parentDocumentIDHashes)
     throws ManifoldCFException
   {
-    beginTransaction();
-    try
+    int maxClause = getMaxInClause();
+    StringBuilder sb = new StringBuilder();
+    List<String> list = new ArrayList<String>();
+    int k = 0;
+    for (String parentDocumentIDHash : parentDocumentIDHashes)
     {
-      int maxClause = getMaxInClause();
-      StringBuilder sb = new StringBuilder();
-      ArrayList list = new ArrayList();
-      int i = 0;
-      int k = 0;
-      while (i < parentDocumentIDHashes.length)
+      if (k == maxClause)
       {
-        if (k == maxClause)
-        {
-          performRestoreRecords(sb.toString(),jobID,list);
-          sb.setLength(0);
-          list.clear();
-          k = 0;
-        }
-        if (k > 0)
-          sb.append(",");
-        sb.append("?");
-        String parentDocumentIDHash = parentDocumentIDHashes[i++];
-        list.add(parentDocumentIDHash);
-        k++;
+        performRestoreRecords(sb.toString(),jobID,list);
+        sb.setLength(0);
+        list.clear();
+        k = 0;
       }
-
       if (k > 0)
-        performRestoreRecords(sb.toString(),jobID,list);
-    }
-    catch (ManifoldCFException e)
-    {
-      signalRollback();
-      throw e;
-    }
-    catch (Error e)
-    {
-      signalRollback();
-      throw e;
-    }
-    finally
-    {
-      endTransaction();
+        sb.append(",");
+      sb.append("?");
+      list.add(parentDocumentIDHash);
+      k++;
     }
+
+    if (k > 0)
+      performRestoreRecords(sb.toString(),jobID,list);
   }
 
-  protected void performRestoreRecords(String query, Long jobID, ArrayList list)
+  protected void performRestoreRecords(String query, Long jobID, List<String> list)
     throws ManifoldCFException
   {
     // Delete
@@ -547,45 +592,23 @@ public class Carrydown extends org.apach
   public void deleteRecords(Long jobID, String[] documentIDHashes)
     throws ManifoldCFException
   {
-    beginTransaction();
-    try
+    int maxClause = maxClausePerformDeleteRecords(jobID);
+    List<String> list = new ArrayList<String>();
+    int k = 0;
+    for (String documentIDHash : documentIDHashes)
     {
-      int maxClause = maxClausePerformDeleteRecords(jobID);
-      ArrayList list = new ArrayList();
-      int i = 0;
-      int k = 0;
-      while (i < documentIDHashes.length)
+      if (k == maxClause)
       {
-        if (k == maxClause)
-        {
-          performDeleteRecords(jobID,list);
-          list.clear();
-          k = 0;
-        }
-        list.add(documentIDHashes[i++]);
-        k++;
-      }
-
-      if (k > 0)
         performDeleteRecords(jobID,list);
-
-
-    }
-    catch (ManifoldCFException e)
-    {
-      signalRollback();
-      throw e;
-    }
-    catch (Error e)
-    {
-      signalRollback();
-      throw e;
-    }
-    finally
-    {
-      endTransaction();
+        list.clear();
+        k = 0;
+      }
+      list.add(documentIDHash);
+      k++;
     }
 
+    if (k > 0)
+      performDeleteRecords(jobID,list);
   }
 
   protected int maxClausePerformDeleteRecords(Long jobID)
@@ -594,7 +617,7 @@ public class Carrydown extends org.apach
       new UnitaryClause(jobIDField,jobID)});
   }
     
-  protected void performDeleteRecords(Long jobID, ArrayList list)
+  protected void performDeleteRecords(Long jobID, List<String> list)
     throws ManifoldCFException
   {
     StringBuilder sb = new StringBuilder("WHERE ");

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java Tue Jul 15 14:25:39 2014
@@ -370,14 +370,25 @@ public class HopCount extends org.apache
     doFinish(jobID,legalLinkTypes,sourceDocumentHashes,hopcountMethod);
   }
 
+  /** Revert newly-added links, because of a possibly incomplete document processing phase.
+  * All child links marked as "new" will be removed, and all links marked as "existing" will be
+  * reset to be "base".
+  */
+  public void revertParents(Long jobID, String[] sourceDocumentHashes)
+    throws ManifoldCFException
+  {
+    intrinsicLinkManager.revertLinks(jobID,sourceDocumentHashes);
+  }
+  
   /** Do the work of recording source-target references. */
   protected boolean[] doRecord(Long jobID, String[] legalLinkTypes, String sourceDocumentIDHash, String[] targetDocumentIDHashes, String linkType,
     int hopcountMethod, String processID)
     throws ManifoldCFException
   {
-
-    // We have to both add the reference, AND invalidate appropriate cached hopcounts (if it is a NEW
-    // link.)
+    // NOTE: In order for the revertParents() call above to be correct in its current form,
+    // this method would need to be revised to not process any additions until the finishParents() call
+    // is made.  At the moment, revertParents() is not used by any thread.
+    // TBD, MHL
     boolean[] rval = new boolean[targetDocumentIDHashes.length];
     for (int i = 0; i < rval.length; i++)
     {

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/IntrinsicLink.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/IntrinsicLink.java?rev=1610713&r1=1610712&r2=1610713&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/IntrinsicLink.java (original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/IntrinsicLink.java Tue Jul 15 14:25:39 2014
@@ -232,19 +232,17 @@ public class IntrinsicLink extends org.a
     String[] targetDocumentIDHashes, String linkType, String processID)
     throws ManifoldCFException
   {
-    HashMap duplicateRemoval = new HashMap();
+    Set<String> duplicateRemoval = new HashSet<String>();
     int maxClause = maxClausePerformExistsCheck(jobID,linkType,sourceDocumentIDHash);
-    ArrayList list = new ArrayList();
+    List<String> list = new ArrayList<String>();
     int i = 0;
-    int k = 0;
     // Keep track of the document identifiers that have been seen vs. those that were unseen.
-    HashMap presentMap = new HashMap();
-    while (k < targetDocumentIDHashes.length)
+    Set<String> presentMap = new HashSet<String>();
+    for (String targetDocumentIDHash : targetDocumentIDHashes)
     {
-      String targetDocumentIDHash = targetDocumentIDHashes[k++];
-      if (duplicateRemoval.get(targetDocumentIDHash) != null)
+      if (duplicateRemoval.contains(targetDocumentIDHash))
         continue;
-      duplicateRemoval.put(targetDocumentIDHash,targetDocumentIDHash);
+      duplicateRemoval.add(targetDocumentIDHash);
       if (i == maxClause)
       {
         // Do the query and record the results
@@ -262,22 +260,22 @@ public class IntrinsicLink extends org.a
     // an update.
     // We have to count these by hand, in case there are duplicates in the array.
     int count = 0;
-    Iterator iter = duplicateRemoval.keySet().iterator();
+    Iterator<String> iter = duplicateRemoval.iterator();
     while (iter.hasNext())
     {
-      String targetDocumentIDHash = (String)iter.next();
-      if (presentMap.get(targetDocumentIDHash) == null)
+      String targetDocumentIDHash = iter.next();
+      if (!presentMap.contains(targetDocumentIDHash))
         count++;
     }
     String[] newReferences = new String[count];
     int j = 0;
     // Note: May be able to make this more efficient if we update things in batches...
-    iter = duplicateRemoval.keySet().iterator();
+    iter = duplicateRemoval.iterator();
     while (iter.hasNext())
     {
-      String targetDocumentIDHash = (String)iter.next();
+      String targetDocumentIDHash = iter.next();
 
-      if (presentMap.get(targetDocumentIDHash) == null)
+      if (!presentMap.contains(targetDocumentIDHash))
       {
         newReferences[j++] = targetDocumentIDHash;
         HashMap map = new HashMap();
@@ -319,7 +317,7 @@ public class IntrinsicLink extends org.a
   }
     
   /** Do the exists check, in batch. */
-  protected void performExistsCheck(Map presentMap, Long jobID, String linkType, String childIDHash, ArrayList list)
+  protected void performExistsCheck(Set<String> presentMap, Long jobID, String linkType, String childIDHash, List<String> list)
     throws ManifoldCFException
   {
     ArrayList newList = new ArrayList();
@@ -330,12 +328,11 @@ public class IntrinsicLink extends org.a
       new UnitaryClause(childIDHashField,childIDHash)});
 
     IResultSet result = performQuery("SELECT "+parentIDHashField+" FROM "+getTableName()+" WHERE "+query+" FOR UPDATE",newList,null,null);
-    int i = 0;
-    while (i < result.getRowCount())
+    for (int i = 0; i < result.getRowCount(); i++)
     {
-      IResultRow row = result.getRow(i++);
+      IResultRow row = result.getRow(i);
       String parentIDHash = (String)row.getValue(parentIDHashField);
-      presentMap.put(parentIDHash,parentIDHash);
+      presentMap.add(parentIDHash);
     }
   }
 
@@ -375,10 +372,9 @@ public class IntrinsicLink extends org.a
     throws ManifoldCFException
   {
     int maxClause = maxClausePerformRemoveDocumentLinks(jobID);
-    ArrayList list = new ArrayList();
-    int i = 0;
+    List<String> list = new ArrayList<String>();
     int k = 0;
-    while (i < documentIDHashes.length)
+    for (String documentIDHash : documentIDHashes)
     {
       if (k == maxClause)
       {
@@ -386,7 +382,7 @@ public class IntrinsicLink extends org.a
         list.clear();
         k = 0;
       }
-      list.add(documentIDHashes[i++]);
+      list.add(documentIDHash);
       k++;
     }
 
@@ -401,7 +397,7 @@ public class IntrinsicLink extends org.a
       new UnitaryClause(jobIDField,jobID)});
   }
     
-  protected void performRemoveDocumentLinks(ArrayList list, Long jobID)
+  protected void performRemoveDocumentLinks(List<String> list, Long jobID)
     throws ManifoldCFException
   {
     StringBuilder sb = new StringBuilder("WHERE ");
@@ -424,10 +420,9 @@ public class IntrinsicLink extends org.a
     throws ManifoldCFException
   {
     int maxClause = maxClausePerformRemoveLinks(jobID);
-    ArrayList list = new ArrayList();
-    int i = 0;
+    List<String> list = new ArrayList<String>();
     int k = 0;
-    while (i < sourceDocumentIDHashes.length)
+    for (String sourceDocumentIDHash : sourceDocumentIDHashes)
     {
       if (k == maxClause)
       {
@@ -435,7 +430,7 @@ public class IntrinsicLink extends org.a
         list.clear();
         k = 0;
       }
-      list.add(sourceDocumentIDHashes[i++]);
+      list.add(sourceDocumentIDHash);
       k++;
     }
 
@@ -450,7 +445,7 @@ public class IntrinsicLink extends org.a
       new UnitaryClause(jobIDField,jobID)});
   }
     
-  protected void performRemoveLinks(ArrayList list, Long jobID, String commonNewExpression,
+  protected void performRemoveLinks(List<String> list, Long jobID, String commonNewExpression,
     ArrayList commonNewParams)
     throws ManifoldCFException
   {
@@ -474,10 +469,9 @@ public class IntrinsicLink extends org.a
     throws ManifoldCFException
   {
     int maxClause = maxClausesPerformRestoreLinks(jobID);
-    ArrayList list = new ArrayList();
-    int i = 0;
+    List<String> list = new ArrayList<String>();
     int k = 0;
-    while (i < sourceDocumentIDHashes.length)
+    for (String sourceDocumentIDHash : sourceDocumentIDHashes)
     {
       if (k == maxClause)
       {
@@ -485,7 +479,7 @@ public class IntrinsicLink extends org.a
         list.clear();
         k = 0;
       }
-      list.add(sourceDocumentIDHashes[i++]);
+      list.add(sourceDocumentIDHash);
       k++;
     }
 
@@ -500,7 +494,7 @@ public class IntrinsicLink extends org.a
       new UnitaryClause(jobIDField,jobID)});
   }
   
-  protected void performRestoreLinks(Long jobID, ArrayList list)
+  protected void performRestoreLinks(Long jobID, List<String> list)
     throws ManifoldCFException
   {
     HashMap map = new HashMap();
@@ -519,6 +513,67 @@ public class IntrinsicLink extends org.a
     performUpdate(map,sb.toString(),newList,null);
   }
 
+  /** Throw away links added during (aborted) processing.
+  */
+  public void revertLinks(Long jobID, String[] sourceDocumentIDHashes)
+    throws ManifoldCFException
+  {
+    int maxClause = maxClausesPerformRevertLinks(jobID);
+    List<String> list = new ArrayList<String>();
+    int k = 0;
+    for (String sourceDocumentIDHash : sourceDocumentIDHashes)
+    {
+      if (k == maxClause)
+      {
+        performRevertLinks(jobID,list);
+        list.clear();
+        k = 0;
+      }
+      list.add(sourceDocumentIDHash);
+      k++;
+    }
+
+    if (k > 0)
+      performRevertLinks(jobID,list);
+    noteModifications(0,sourceDocumentIDHashes.length,0);
+  }
+
+  protected int maxClausesPerformRevertLinks(Long jobID)
+  {
+    return findConjunctionClauseMax(new ClauseDescription[]{
+      new UnitaryClause(jobIDField,jobID)});
+  }
+  
+  protected void performRevertLinks(Long jobID, List<String> list)
+    throws ManifoldCFException
+  {
+    // First, delete everything marked as "new"
+    StringBuilder sb = new StringBuilder("WHERE ");
+    ArrayList newList = new ArrayList();
+
+    sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
+      new UnitaryClause(jobIDField,jobID),
+      new MultiClause(childIDHashField,list)})).append(" AND ")
+      .append(newField).append("=?");
+    newList.add(statusToString(LINKSTATUS_NEW));
+    performDelete(sb.toString(),newList,null);
+
+    // Now map everything marked as "EXISTING" back to "BASE".
+    HashMap map = new HashMap();
+    map.put(newField,statusToString(LINKSTATUS_BASE));
+    map.put(processIDField,null);
+    
+    sb = new StringBuilder("WHERE ");
+    newList.clear();
+    
+    sb.append(buildConjunctionClause(newList,new ClauseDescription[]{
+      new UnitaryClause(jobIDField,jobID),
+      new MultiClause(childIDHashField,list)})).append(" AND ")
+      .append(newField).append("=?");
+    newList.add(statusToString(LINKSTATUS_EXISTING));
+    performUpdate(map,sb.toString(),newList,null);
+  }
+
   /** Get document's children.
   *@return rows that contain the children.  Column names are 'linktype','childidentifier'.
   */
@@ -547,11 +602,10 @@ public class IntrinsicLink extends org.a
     IResultSet set = performQuery("SELECT DISTINCT "+parentIDHashField+" FROM "+
       getTableName()+" WHERE "+query,list,null,null);
     String[] rval = new String[set.getRowCount()];
-    int i = 0;
-    while (i < rval.length)
+    for (int i = 0; i < rval.length; i++)
     {
       IResultRow row = set.getRow(i);
-      rval[i++] = (String)row.getValue(parentIDHashField);
+      rval[i] = (String)row.getValue(parentIDHashField);
     }
     return rval;
   }



Mime
View raw message