manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1613423 - in /manifoldcf/trunk/framework: agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Date Fri, 25 Jul 2014 12:55:01 GMT
Author: kwright
Date: Fri Jul 25 12:55:00 2014
New Revision: 1613423

URL: http://svn.apache.org/r1613423
Log:
Put in sanity checks for connector document disposition.

Modified:
    manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
    manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1613423&r1=1613422&r2=1613423&view=diff
==============================================================================
--- manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
(original)
+++ manifoldcf/trunk/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
Fri Jul 25 12:55:00 2014
@@ -3767,6 +3767,8 @@ public class IncrementalIngester extends
     public int sendDocument(String documentURI, RepositoryDocument document)
       throws ManifoldCFException, ServiceInterruption, IOException
     {
+      if (documentProcessed)
+        throw new IllegalStateException("Document cannot have multiple dispositions");
       int rval = activities.sendDocument(documentURI,document);
       documentProcessed = true;
       return rval;
@@ -3779,6 +3781,8 @@ public class IncrementalIngester extends
     public void noDocument()
       throws ManifoldCFException, ServiceInterruption
     {
+      if (documentProcessed)
+        throw new IllegalStateException("Document cannot have multiple dispositions");
       activities.noDocument();
       documentProcessed = true;
     }

Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1613423&r1=1613422&r2=1613423&view=diff
==============================================================================
--- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
(original)
+++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Fri Jul 25 12:55:00 2014
@@ -1159,6 +1159,8 @@ public class WorkerThread extends Thread
     // Whether a component was touched or not, keyed by document identifier.
     // This does not include primary document.  The set is keyed by component id hash.
     protected final Map<String,Set<String>> touchedComponentSet = new HashMap<String,Set<String>>();
+    // This represents primary documents.
+    protected final Set<String> touchedPrimarySet = new HashSet<String>();
     
     /** Constructor.
     *@param jobManager is the job manager
@@ -1501,6 +1503,7 @@ public class WorkerThread extends Thread
     {
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
       String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
+      checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
       ingester.documentRecord(
         pipelineSpecification.getBasicPipelineSpecification(),
         connectionName,documentIdentifierHash,componentIdentifierHash,
@@ -1573,6 +1576,7 @@ public class WorkerThread extends Thread
 
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
       String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
+      checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
 
       if (data != null)
       {
@@ -1634,6 +1638,7 @@ public class WorkerThread extends Thread
       // (by ignoring it and allowing it to be deleted later)
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
       String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
+      checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
 
       ingester.documentNoData(
         computePipelineSpecification(documentIdentifierHash,componentIdentifierHash),
@@ -1657,6 +1662,8 @@ public class WorkerThread extends Thread
     public void removeDocument(String documentIdentifier)
       throws ManifoldCFException, ServiceInterruption
     {
+      checkMultipleDispositions(documentIdentifier,null,null);
+
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
       ingester.documentRemove(
         pipelineSpecification.getBasicPipelineSpecification(),
@@ -1665,6 +1672,7 @@ public class WorkerThread extends Thread
         
       // Note that we touched it, so it won't get checked
       touchedSet.add(documentIdentifier);
+      touchComponentSet(documentIdentifier,null);
     }
 
     /** Retain existing document component.  Use this method to signal that an already-existing
@@ -1678,10 +1686,11 @@ public class WorkerThread extends Thread
       String componentIdentifier)
       throws ManifoldCFException
     {
-      touchComponentSet(documentIdentifier,computeComponentIDHash(componentIdentifier));
+      String componentIdentifierHash = computeComponentIDHash(componentIdentifier);
+      checkMultipleDispositions(documentIdentifier,componentIdentifier,componentIdentifierHash);
+      touchComponentSet(documentIdentifier,componentIdentifierHash);
     }
 
-    
     /** Delete the current document from the search engine index, while keeping track of the version information
     * for it (to reduce churn).
     * Use noDocument() above instead.
@@ -2096,10 +2105,33 @@ public class WorkerThread extends Thread
       return ManifoldCF.createJobSpecificString(jobID,simpleString);
     }
 
+    protected void checkMultipleDispositions(String documentIdentifier, String componentIdentifier, String componentIdentifierHash)
+    {
+      if (abortSet.contains(documentIdentifier))
+        throw new IllegalStateException("Multiple document dispositions not allowed: Abort cannot be combiend with component disposition; document '"+documentIdentifier+"'");
+      if (documentDeletedSet.contains(documentIdentifier))
+        throw new IllegalStateException("Multiple document dispositions not allowed: Document delete cannot be combined with component disposition; document '"+documentIdentifier+"'");
+      if (componentIdentifierHash == null)
+      {
+        // Primary
+        if (touchedPrimarySet.contains(documentIdentifier))
+          throw new IllegalStateException("Multiple document primary component dispositions not allowed: document '"+documentIdentifier+"'");
+      }
+      else
+      {
+        Set<String> components = touchedComponentSet.get(documentIdentifier);
+        if (components.contains(componentIdentifierHash))
+          throw new IllegalStateException("Multiple document component dispositions not allowed: document '"+documentIdentifier+"', component '"+componentIdentifier+"'");
+      }
+    }
+    
     protected void touchComponentSet(String documentIdentifier, String componentIdentifierHash)
     {
       if (componentIdentifierHash == null)
+      {
+        touchedPrimarySet.add(documentIdentifier);
         return;
+      }
       Set<String> components = touchedComponentSet.get(documentIdentifier);
       if (components == null)
       {



Mime
View raw message