manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1610565 - in /manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler: interfaces/IProcessActivity.java system/WorkerThread.java
Date Tue, 15 Jul 2014 01:19:19 GMT
Author: kwright
Date: Tue Jul 15 01:19:18 2014
New Revision: 1610565

URL: http://svn.apache.org/r1610565
Log:
Get the tests to pass

Modified:
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1610565&r1=1610564&r2=1610565&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
Tue Jul 15 01:19:18 2014
@@ -29,7 +29,7 @@ import org.apache.manifoldcf.agents.inte
 * (2) The connector computes a version string for each document in the set as part of determining
 *    whether the document indeed needs to be refetched.
 * (3) For each document processed, there can be one of several dispositions:
-*   (a) There is no such document (anymore): nothing is called for the document (the framework will delete it).
+*   (a) There is no such document (anymore): deleteDocument() called for the document.
 *   (b) The document is (re)indexed: ingestDocumentWithException() is called for the document.
 *   (c) The document is determined to be unchanged and no updates are needed: noteUnchangedDocument() is called
 *     for the document.
@@ -38,6 +38,8 @@ import org.apache.manifoldcf.agents.inte
 *   (e) The document is determined to be unindexable BUT it still exists in the repository: noDocument()
 *    is called for the document.
 *   (f) There was a service interruption: ServiceInterruption is thrown.
+*   (g) Nothing is called describing the document's disposition.  In that case, for backwards compatibility,
+*    the framework marks the document as having been processed.
 * (4) In order to determine whether a document needs to be reindexed, the method checkDocumentNeedsReindexing()
 *    is available to return an opinion on that matter.
 */
@@ -173,6 +175,16 @@ public interface IProcessActivity extend
   public void noDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
+  /** Delete the specified document permanently from the search engine index, and from the status table.
+  * This method does NOT keep track of any document version information for the document and thus can
+  * lead to "churn", whereby the same document is queued, processed,
+  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+  * in any case where the same decision will need to be made over and over.
+  *@param documentIdentifier is the document's identifier.
+  */
+  public void deleteDocument(String documentIdentifier)
+    throws ManifoldCFException;
+
   /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
   * documents with the same URL, however, will still be removed.)  This is
   * useful if the version string changes but the document contents are known not
@@ -193,16 +205,6 @@ public interface IProcessActivity extend
   public void deleteDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the specified document permanently from the search engine index, and from the status table.
-  * This method does NOT keep track of any document version information for the document and thus can
-  * lead to "churn", whereby the same document is queued, processed,
-  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
-  * in any case where the same decision will need to be made over and over.
-  *@param documentIdentifier is the document's identifier.
-  */
-  @Deprecated
-  public void deleteDocument(String documentIdentifier)
-    throws ManifoldCFException, ServiceInterruption;
 
   /** Override the schedule for the next time a document is crawled.
   * Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper expire bound, lower expire bound,

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1610565&r1=1610564&r2=1610565&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Tue Jul 15 01:19:18 2014
@@ -396,28 +396,19 @@ public class WorkerThread extends Thread
                             // unconditional requeue.
                             finishList.add(qd);
                           }
+                          else if (activity.wasDocumentDeleted(qd.getDocumentDescription().getDocumentIdentifier()))
+                          {
+                            deleteList.add(qd);
+                          }
+                          else if (activity.wasDocumentUnchanged(qd.getDocumentDescription().getDocumentIdentifier()))
+                          {
+                            finishList.add(qd);
+                            ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
+                          }
                           else
                           {
-                            // If the document is not being deleted, add it to the finish set.
-                            if (activity.wasDocumentProcessed(qd.getDocumentDescription().getDocumentIdentifier()))
-                            {
-                              finishList.add(qd);
-                            }
-                            else if (activity.wasDocumentUnchanged(qd.getDocumentDescription().getDocumentIdentifier()))
-                            {
-                              finishList.add(qd);
-                              ingesterCheckList.add(qd.getDocumentDescription().getDocumentIdentifierHash());
-                            }
-                            else
-                            {
-                              // Anything else means that the document was not found and should be deleted, eventually.
-                              // We can't just delete because of connector backwards compatibility.  The case in question
-                              // is handling documents that are not indexed, such as file system directories.  To prevent
-                              // the job from not terminating, we have to add this document to the finish list so that it gets
-                              // marked as being done.
-                              //deleteList.add(qd);
-                              finishList.add(qd);
-                            }
+                            // All documents not specifically called out above are simply finished, since we know they haven't been deleted.
+                            finishList.add(qd);
                           }
                         }
                         
@@ -1147,8 +1138,8 @@ public class WorkerThread extends Thread
     // Whether the document was checked or not
     protected final Set<String> documentCheckedSet = new HashSet<String>();
     
-    // Whether document was processed or not
-    protected final Set<String> documentProcessedSet = new HashSet<String>();
+    // Whether document was deleted
+    protected final Set<String> documentDeletedSet = new HashSet<String>();
     
     /** Constructor.
     *@param jobManager is the job manager
@@ -1212,11 +1203,11 @@ public class WorkerThread extends Thread
       return documentCheckedSet.contains(documentIdentifier);
     }
     
-    /** Check whether a document was processed or not.
+    /** Check whether document was deleted or not.
     */
-    public boolean wasDocumentProcessed(String documentIdentifier)
+    public boolean wasDocumentDeleted(String documentIdentifier)
     {
-      return documentProcessedSet.contains(documentIdentifier);
+      return documentDeletedSet.contains(documentIdentifier);
     }
     
     /** Check whether a document was aborted or not.
@@ -1445,7 +1436,6 @@ public class WorkerThread extends Thread
     public void noteUnchangedDocument(String documentIdentifier)
       throws ManifoldCFException
     {
-      documentProcessedSet.add(documentIdentifier);
       documentCheckedSet.add(documentIdentifier);
     }
 
@@ -1462,7 +1452,6 @@ public class WorkerThread extends Thread
         pipelineSpecification.getBasicPipelineSpecification(),
         connectionName,documentIdentifierHash,
         version,currentTime);
-      documentProcessedSet.add(documentIdentifier);
     }
 
     /** Ingest the current document.
@@ -1538,7 +1527,6 @@ public class WorkerThread extends Thread
         documentURI,
         ingestLogger);
       
-      documentProcessedSet.add(documentIdentifier);
     }
 
     /** Remove the specified document from the search engine index, while keeping track of the version information
@@ -1588,14 +1576,10 @@ public class WorkerThread extends Thread
     *@param documentIdentifier is the document's identifier.
     */
     @Override
-    @Deprecated
     public void deleteDocument(String documentIdentifier)
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
-      ingester.documentDelete(pipelineSpecification.getBasicPipelineSpecification(),
-        connectionName,documentIdentifierHash,
-        ingestLogger);
+      documentDeletedSet.add(documentIdentifier);
     }
 
     /** Override the schedule for the next time a document is crawled.



Mime
View raw message