manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1610414 - in /manifoldcf/branches/CONNECTORS-990/framework: agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/ agents/src/main/java/org/apache/manifoldcf/agents/interfaces/ pull-agent/src/main/java/org/apache/manifoldcf/c...
Date Mon, 14 Jul 2014 14:10:37 GMT
Author: kwright
Date: Mon Jul 14 14:10:36 2014
New Revision: 1610414

URL: http://svn.apache.org/r1610414
Log:
Revamp API to meet CONNECTORS-990 goal.  Doesn't compile yet

Added:
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
  (with props)
Modified:
    manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
    manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
    manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Modified: manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
Mon Jul 14 14:10:36 2014
@@ -609,27 +609,35 @@ public class IncrementalIngester extends
   }
 
   /** Record a document version, but don't ingest it.
-  * The purpose of this method is to keep track of the frequency at which ingestion "attempts"
take place.
-  * ServiceInterruption is thrown if this action must be rescheduled.
+  * The purpose of this method is to update document version information without reindexing
the document.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since
epoch.
-  *@param activities is the object used in case a document needs to be removed from the output
index as the result of this operation.
   */
   @Override
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
     String identifierClass, String identifierHash,
-    String documentVersion, long recordTime,
-    IOutputActivity activities)
-    throws ManifoldCFException, ServiceInterruption
+    String documentVersion, long recordTime)
+    throws ManifoldCFException
   {
+    // This method is called when a connector decides that the last indexed version of the
document is in fact just fine,
+    // but the document version information should be updated.
+    // The code pathway is therefore similar to that of document indexing, EXCEPT that no
indexing will ever
+    // take place.  This has some interesting side effects.  For example:
+    // (1) In the case of a document collision with another job using the same repository
connection, the last document
+    //    indexed cannot be changed.  Updating the version string for the document would
therefore be misleading.  This
+    //    case should be detected and prevented from occurring, by refusing to perform the
update.
+    //    On the other hand, only one thread can be processing the document at
a given time, and therefore
+    //    since the connector detected "no change", we are safe to presume we can just update
the version info.
+    // (2) In the case of a URL conflict with another job, since nothing changes and no new
URL is recorded, no cleanup
+    //    of conflicting records sharing the same URL should be needed.
+    
     String docKey = makeKey(identifierClass,identifierHash);
 
     String[] outputConnectionNames = extractOutputConnectionNames(pipelineSpecificationBasic);
-    IOutputConnection[] outputConnections = connectionManager.loadMultiple(outputConnectionNames);
 
     if (Logging.ingest.isDebugEnabled())
     {
@@ -639,99 +647,10 @@ public class IncrementalIngester extends
     for (int k = 0; k < outputConnectionNames.length; k++)
     {
       String outputConnectionName = outputConnectionNames[k];
-      IOutputConnection connection = outputConnections[k];
 
-      String oldURI = null;
-      String oldURIHash = null;
-      String oldOutputVersion = null;
-
-      // Repeat if needed
-      while (true)
-      {
-        long sleepAmt = 0L;
-        try
-        {
-          // See what uri was used before for this doc, if any
-          ArrayList list = new ArrayList();
-          String query = buildConjunctionClause(list,new ClauseDescription[]{
-            new UnitaryClause(docKeyField,docKey),
-            new UnitaryClause(outputConnNameField,outputConnectionName)});
-            
-          IResultSet set = performQuery("SELECT "+docURIField+","+uriHashField+","+lastOutputVersionField+"
FROM "+getTableName()+
-            " WHERE "+query,list,null,null);
-
-          if (set.getRowCount() > 0)
-          {
-            IResultRow row = set.getRow(0);
-            oldURI = (String)row.getValue(docURIField);
-            oldURIHash = (String)row.getValue(uriHashField);
-            oldOutputVersion = (String)row.getValue(lastOutputVersionField);
-          }
-          
-          break;
-        }
-        catch (ManifoldCFException e)
-        {
-          // Look for deadlock and retry if so
-          if (e.getErrorCode() == e.DATABASE_TRANSACTION_ABORT)
-          {
-            if (Logging.perf.isDebugEnabled())
-              Logging.perf.debug("Aborted select looking for status: "+e.getMessage());
-            sleepAmt = getSleepAmt();
-            continue;
-          }
-          throw e;
-        }
-        finally
-        {
-          sleepFor(sleepAmt);
-        }
-      }
-
-      // If uri hashes collide, then we must be sure to eliminate only the *correct* records
from the table, or we will leave
-      // dangling documents around.  So, all uri searches and comparisons MUST compare the
actual uri as well.
-
-      // But, since we need to insure that any given URI is only worked on by one thread
at a time, use critical sections
-      // to block the rare case that multiple threads try to work on the same URI.
-      
-      String[] lockArray = computeLockArray(null,oldURI,outputConnectionName);
-      lockManager.enterLocks(null,null,lockArray);
-      try
-      {
-
-        ArrayList list = new ArrayList();
-        
-        if (oldURI != null)
-        {
-          IOutputConnector connector = outputConnectorPool.grab(connection);
-          if (connector == null)
-            // The connector is not installed; treat this as a service interruption.
-            throw new ServiceInterruption("Output connector not installed",0L);
-          try
-          {
-            connector.removeDocument(oldURI,oldOutputVersion,new OutputRemoveActivitiesWrapper(activities,outputConnectionName));
-          }
-          finally
-          {
-            outputConnectorPool.release(connection,connector);
-          }
-          // Delete all records from the database that match the old URI, except for THIS
record.
-          list.clear();
-          String query = buildConjunctionClause(list,new ClauseDescription[]{
-            new UnitaryClause(uriHashField,"=",oldURIHash),
-            new UnitaryClause(outputConnNameField,outputConnectionName)});
-          list.add(docKey);
-          performDelete("WHERE "+query+" AND "+docKeyField+"!=?",list,null);
-        }
-
-        // If we get here, it means we are noting that the document was examined, but that
no change was required.  This is signaled
-        // to noteDocumentIngest by having the null documentURI.
-        noteDocumentIngest(outputConnectionName,docKey,documentVersion,null,null,null,null,recordTime,null,null);
-      }
-      finally
-      {
-        lockManager.leaveLocks(null,null,lockArray);
-      }
+      // If we get here, it means we are noting that the document was examined, but that
no change was required.  This is signaled
+      // to noteDocumentIngest by having the null documentURI.
+      noteDocumentIngest(outputConnectionName,docKey,documentVersion,null,null,null,null,recordTime,null,null);
     }
   }
 

Modified: manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
Mon Jul 14 14:10:36 2014
@@ -150,21 +150,18 @@ public interface IIncrementalIngester
     String newAuthorityNameString);
 
   /** Record a document version, but don't ingest it.
-  * The purpose of this method is to keep track of the frequency at which ingestion "attempts"
take place.
-  * ServiceInterruption is thrown if this action must be rescheduled.
+  * The purpose of this method is to update document version information without reindexing
the document.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since
epoch.
-  *@param activities is the object used in case a document needs to be removed from the output
index as the result of this operation.
   */
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
     String identifierClass, String identifierHash,
-    String documentVersion, long recordTime,
-    IOutputActivity activities)
-    throws ManifoldCFException, ServiceInterruption;
+    String documentVersion, long recordTime)
+    throws ManifoldCFException;
 
   /** Ingest a document.
   * This ingests the document, and notes it.  If this is a repeat ingestion of the document,
this

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/connectors/BaseRepositoryConnector.java
Mon Jul 14 14:10:36 2014
@@ -324,6 +324,78 @@ public abstract class BaseRepositoryConn
     return null;
   }
 
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the
results either added
+  * to the queue of documents for the current job, and/or entered into the incremental ingestion
manager.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set
of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents
is the default one.
+  */
+  @Override
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses,
Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // The backwards-compatible base implementation does the following:
+    // (1) Uses the deprecated methods to obtain a set of version information
+    // (2) Based on the provided version information, determines whether processing is required
+    // (3) Uses deprecated methods to process documents
+    // (4) Releases document versions
+
+    // We need to get the old version strings together in order to use the deprecated methods
+    String[] oldVersions = new String[documentIdentifiers.length];
+    for (int i = 0; i < oldVersions.length; i++)
+    {
+      oldVersions[i] = statuses.getIndexedVersionString(documentIdentifiers[i]);
+    }
+    DocumentVersions dv = new DocumentVersions();
+    getDocumentVersions(dv,documentIdentifiers,oldVersions,activities,spec,jobMode,usesDefaultAuthority);
+    try
+    {
+      // Next, we determine what part of the set of documents were unchanged, and what part
we need to refetch.
+      Set<String> fetchDocuments = new HashSet<String>();
+      Set<String> scanDocuments = new HashSet<String>();
+      for (int i = 0; i < documentIdentifiers.length; i++)
+      {
+        String documentIdentifier = documentIdentifiers[i];
+        VersionContent vc = dv.getDocumentVersion(documentIdentifier);
+        if (vc != null)
+        {
+          if (dv.isAlwaysRefetch(documentIdentifier) || activities.checkDocumentNeedsReindexing(documentIdentifier,vc.getVersionString()))
+            fetchDocuments.add(documentIdentifier);
+          scanDocuments.add(documentIdentifier);
+        }
+      }
+
+      // Construct the appropriate data to call processDocuments() with
+      String[] processIDs = new String[scanDocuments.size()];
+      boolean[] scanOnly = new boolean[scanDocuments.size()];
+      int index = 0;
+      for (int i = 0; i < documentIdentifiers.length; i++)
+      {
+        String documentIdentifier = documentIdentifiers[i];
+        if (scanDocuments.contains(documentIdentifier))
+        {
+          processIDs[index] = documentIdentifier;
+          scanOnly[index] = !fetchDocuments.contains(documentIdentifier);
+          index++;
+        }
+      }
+      processDocuments(processIDs,dv,activities,scanOnly,jobMode);
+    }
+    finally
+    {
+      // Release document versions
+      releaseDocumentVersions(documentIdentifiers,dv);
+    }
+  }
+
   /** Get document versions given an array of document identifiers.
   * This method is called for EVERY document that is considered. It is therefore important
to perform
   * as little work as possible here.
@@ -340,7 +412,6 @@ public abstract class BaseRepositoryConn
   *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
   *@param usesDefaultAuthority will be true only if the authority in use for these documents
is the default one.
   */
-  @Override
   public void getDocumentVersions(
     DocumentVersions documentVersions,
     String[] documentIdentifiers, String[] oldVersions,
@@ -488,7 +559,6 @@ public abstract class BaseRepositoryConn
   *@param documentIdentifiers is the set of document identifiers.
   *@param versions is the corresponding set of version strings (individual identifiers may
have no version).
   */
-  @Override
   public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
     throws ManifoldCFException
   {
@@ -543,7 +613,6 @@ public abstract class BaseRepositoryConn
   * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
   */
-  @Override
   public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity
activities,
     boolean[] scanOnly, int jobMode)
     throws ManifoldCFException, ServiceInterruption

Added: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java?rev=1610414&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
(added)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
Mon Jul 14 14:10:36 2014
@@ -0,0 +1,37 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.interfaces;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This interface describes functionality designed to allow retrieval of existing
+* version information from previous crawls.  It is part of the IRepositoryConnector API.
+*/
+public interface IExistingVersions
+{
+  public static final String _rcsid = "@(#)$Id$";
+
+  /** Retrieve an existing version string given a document identifier.
+  *@param documentIdentifier is the document identifier.
+  *@return the document version string, or null if the document was never previously indexed.
+  */
+  public String getIndexedVersionString(String documentIdentifier);
+
+}

Propchange: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IExistingVersions.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IProcessActivity.java
Mon Jul 14 14:10:36 2014
@@ -23,15 +23,41 @@ import java.io.*;
 import org.apache.manifoldcf.core.interfaces.*;
 import org.apache.manifoldcf.agents.interfaces.*;
 
-/** This interface abstracts from the activities that a fetched document processor can do.
+/** This interface abstracts from the activities that a connector's processDocuments() method
can do.
+* The processing flow for a document is expected to go something like this:
+* (1) The connector's processDocuments() method is called with a set of documents to be processed.
+* (2) The connector computes a version string for each document in the set as part of determining
+*    whether the document indeed needs to be refetched.
+* (3) For each document processed, there can be one of several dispositions:
+*   (a) There is no such document (anymore): nothing is called for the document (the framework
will delete it).
+*   (b) The document is (re)indexed: ingestDocumentWithException() is called for the document.
+*   (c) The document is determined to be unchanged and no updates are needed: noteUnchangedDocument()
is called
+*     for the document.
+*   (d) The document is determined to be unchanged BUT the version string needs to be updated:
recordDocument()
+*     is called for the document.
+*   (e) The document is determined to be unindexable BUT it still exists in the repository:
noDocument()
+*    is called for the document.
+*   (f) There was a service interruption: ServiceInterruption is thrown.
+* (4) In order to determine whether a document needs to be reindexed, the method checkDocumentNeedsReindexing()
+*    is available to return an opinion on that matter.
 */
-public interface IProcessActivity extends IHistoryActivity, IEventActivity, IAbortActivity,
IFingerprintActivity,
-    ICarrydownActivity
+public interface IProcessActivity extends IVersionActivity
 {
   public static final String _rcsid = "@(#)$Id: IProcessActivity.java 988245 2010-08-23 18:39:35Z
kwright $";
 
+  /** Check if a document needs to be reindexed, based on a computed version string.
+  * Call this method to determine whether reindexing is necessary.  Pass in a newly-computed
version
+  * string.  This method will return "true" if the document needs to be re-indexed.
+  *@param documentIdentifier is the document identifier.
+  *@param newVersionString is the newly-computed version string.
+  *@return true if the document needs to be reindexed.
+  */
+  public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+    String newVersionString)
+    throws ManifoldCFException;
+
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -45,12 +71,12 @@ public interface IProcessActivity extend
   *@param originationTime is the time, in ms since epoch, that the document originated. 
Pass null if none or unknown.
   *@param prereqEventNames are the names of the prerequisite events which this document requires
prior to processing.  Pass null if none.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String
relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String
relationshipType,
     String[] dataNames, Object[][] dataValues, Long originationTime, String[] prereqEventNames)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -63,13 +89,12 @@ public interface IProcessActivity extend
   *          The type of each object must either be a String, or a CharacterInput.
   *@param originationTime is the time, in ms since epoch, that the document originated. 
Pass null if none or unknown.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String
relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String
relationshipType,
     String[] dataNames, Object[][] dataValues, Long originationTime)
     throws ManifoldCFException;
 
-
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -81,12 +106,12 @@ public interface IProcessActivity extend
   *@param dataValues are the values that correspond to the data names in the dataNames parameter.
 May be null only if dataNames is null.
   *          The type of each object must either be a String, or a CharacterInput.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String
relationshipType,
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String
relationshipType,
     String[] dataNames, Object[][] dataValues)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   *@param parentIdentifier is the document identifier that is considered to be the "parent"
   * of this identifier.  May be null, if no hopcount filtering desired for this kind of relationship.
@@ -94,27 +119,19 @@ public interface IProcessActivity extend
   * reference.  This must be one of the strings returned by the IRepositoryConnector method
   * "getRelationshipTypes()".  May be null.
   */
-  public void addDocumentReference(String localIdentifier, String parentIdentifier, String
relationshipType)
+  public void addDocumentReference(String documentIdentifier, String parentIdentifier, String
relationshipType)
     throws ManifoldCFException;
 
   /** Add a document description to the current job's queue.  This method is equivalent to
   * addDocumentReference(localIdentifier,null,null).
-  *@param localIdentifier is the local document identifier to add (for the connector that
+  *@param documentIdentifier is the document identifier to add (for the connector that
   * fetched the document).
   */
-  public void addDocumentReference(String localIdentifier)
+  public void addDocumentReference(String documentIdentifier)
     throws ManifoldCFException;
 
-
-  /** Record a document version, but don't ingest it.
-  *@param localIdentifier is the document identifier.
-  *@param version is the document version.
-  */
-  public void recordDocument(String localIdentifier, String version)
-    throws ManifoldCFException, ServiceInterruption;
-
   /** Ingest the current document.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
   *       corresponding repository connector.
   *@param documentURI is the URI to use to retrieve this document from the search interface
(and is
@@ -122,11 +139,11 @@ public interface IProcessActivity extend
   *@param data is the document data.  The data is closed after ingestion is complete.
   *@throws IOException only when data stream reading fails.
   */
-  public void ingestDocumentWithException(String localIdentifier, String version, String
documentURI, RepositoryDocument data)
+  public void ingestDocumentWithException(String documentIdentifier, String version, String
documentURI, RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption, IOException;
 
   /** Ingest the current document.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
   *       corresponding repository connector.
   *@param documentURI is the URI to use to retrieve this document from the search interface
(and is
@@ -136,48 +153,78 @@ public interface IProcessActivity extend
   * according to standard best practices.
   */
   @Deprecated
-  public void ingestDocument(String localIdentifier, String version, String documentURI,
RepositoryDocument data)
+  public void ingestDocument(String documentIdentifier, String version, String documentURI,
RepositoryDocument data)
     throws ManifoldCFException, ServiceInterruption;
 
+  /** Note the fact that a document exists but is unchanged, and nothing further
+  * needs to be done to it.
+  * Call this method if it is determined that the document in question is identical to
+  * the formerly indexed document, AND when the version string for the document
+  * has not changed either.
+  */
+  public void noteUnchangedDocument(String documentIdentifier)
+    throws ManifoldCFException;
+
+  /** Remove the specified document from the search engine index, and update the
+  * recorded version information for the document.
+  *@param documentIdentifier is the document's identifier.
+  *@param version is the version string to be recorded for the document.
+  */
+  public void noDocument(String documentIdentifier, String version)
+    throws ManifoldCFException, ServiceInterruption;
+
+  /** Record a document version, WITHOUT reindexing it, or removing it.  (Other
+  * documents with the same URL, however, will still be removed.)  This is
+  * useful if the version string changes but the document contents are known not
+  * to have changed.
+  *@param documentIdentifier is the document identifier.
+  *@param version is the document version.
+  */
+  public void recordDocument(String documentIdentifier, String version)
+    throws ManifoldCFException;
+
   /** Delete the current document from the search engine index, while keeping track of the
version information
   * for it (to reduce churn).
-  *@param localIdentifier is the document's local identifier.
-  *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
-  *       corresponding repository connector.
+  * Deprecated; use noDocument() above instead.
+  *@param documentIdentifier is the document's identifier.
+  *@param version is the version string to be recorded for the document.
   */
-  public void deleteDocument(String localIdentifier, String version)
+  @Deprecated
+  public void deleteDocument(String documentIdentifier, String version)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Delete the current document from the search engine index.  This method does NOT keep
track of version
-  * information for the document and thus can lead to "churn", whereby the same document
is queued, versioned,
-  * and removed on subsequent crawls.  It therefore should be considered to be deprecated,
in favor of
-  * deleteDocument(String localIdentifier, String version).
-  *@param localIdentifier is the document's local identifier.
+  /** Delete the specified document permanently from the search engine index, and from the
status table.
+  * This method does NOT keep track of any document version information for the document
and thus can
+  * lead to "churn", whereby the same document is queued, processed,
+  * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+  * in any case where the same decision will need to be made over and over.
+  *@param documentIdentifier is the document's identifier.
   */
-  public void deleteDocument(String localIdentifier)
+  @Deprecated
+  public void deleteDocument(String documentIdentifier)
     throws ManifoldCFException, ServiceInterruption;
 
   /** Override the schedule for the next time a document is crawled.
   * Calling this method allows you to set an upper recrawl bound, lower recrawl bound, upper
expire bound, lower expire bound,
   * or a combination of these, on a specific document.  This method is only effective if
the job is a continuous one, and if the
   * identifier you pass in is being processed.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param lowerRecrawlBoundTime is the time in ms since epoch that the reschedule time should
not fall BELOW, or null if none.
   *@param upperRecrawlBoundTime is the time in ms since epoch that the reschedule time should
not rise ABOVE, or null if none.
   *@param lowerExpireBoundTime is the time in ms since epoch that the expire time should
not fall BELOW, or null if none.
   *@param upperExpireBoundTime is the time in ms since epoch that the expire time should
not rise ABOVE, or null if none.
   */
-  public void setDocumentScheduleBounds(String localIdentifier,
+  public void setDocumentScheduleBounds(String documentIdentifier,
     Long lowerRecrawlBoundTime, Long upperRecrawlBoundTime,
     Long lowerExpireBoundTime, Long upperExpireBoundTime)
     throws ManifoldCFException;
 
   /** Override a document's origination time.
   * Use this method to signal the framework that a document's origination time is something
other than the first time it was crawled.
-  *@param localIdentifier is the document's local identifier.
+  *@param documentIdentifier is the document's identifier.
   *@param originationTime is the document's origination time, or null if unknown.
   */
-  public void setDocumentOriginationTime(String localIdentifier,
+  public void setDocumentOriginationTime(String documentIdentifier,
     Long originationTime)
     throws ManifoldCFException;
 

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IRepositoryConnector.java
Mon Jul 14 14:10:36 2014
@@ -48,16 +48,13 @@ import java.util.*;
 * It therefore establishes a space of document identifiers.  Each connector will only ever
be
 * asked to deal with identifiers that have in some way originated from the connector.
 *
-* Documents are fetched by ManifoldCF in three stages.  First, the addSeedDocuments() method
is called in the connector
+* Documents are fetched by ManifoldCF in two stages.  First, the addSeedDocuments() method
is called in the connector
 * implementation.  This method is meant to add a set of document identifiers to the queue.
 When ManifoldCF is ready
-* to process a document, the document identifier is used to obtain a current document version
string, using the
-* getDocumentVersions() method (the second stage).  This version string is used to decide
whether or not the
-* third stage need be called for the document or not.  The third stage is responsible for
sending document content
-* to the output, and for extracting any references to additional documents, and consists
of the processDocuments() method.
+* to process a document, the document identifier is used to build a version string for the
document and check whether
+* the document needs to be indexed, and index it if needed (the second stage).  The second
stage
+* consists of the processDocuments() method.
 *
-* All of these methods interact with ManifoldCF by means of an "activity" interface.  For
example, an IVersionActivity object
-* is passed to the getDocumentVersions() method, and that object contains methods that are
necessary for getDocumentVersions()
-* to do its job.  A similar architecture is used throughout the connector framework.
+* All of these methods interact with ManifoldCF by means of an "activity" interface.
 */
 public interface IRepositoryConnector extends IConnector
 {
@@ -182,57 +179,23 @@ public interface IRepositoryConnector ex
     String lastSeedVersion, long seedTime, int jobMode)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is therefore important
to perform
-  * as little work as possible here.
-  * The connector will be connected before this method can be called.
-  *@param documentVersions is the versions object, to be filled in by this method.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by
this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved
for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates
that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework
actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a
dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion
will occur
-  * when the specification changes.  This is primarily useful for metadata.
-  *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
-  *@param usesDefaultAuthority will be true only if the authority in use for these documents
is the default one.
-  */
-  public void getDocumentVersions(
-    DocumentVersions documentVersions,
-    String[] documentIdentifiers, String[] oldVersions,
-    IVersionActivity activities,
-    Specification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption;
-
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the
results either added
   * to the queue of documents for the current job, and/or entered into the incremental ingestion
manager.
   * The document specification allows this class to filter what is done based on the job.
   * The connector will be connected before this method can be called.
   *@param documentIdentifiers is the set of document identifiers to process.
-  *@param versions are the version strings returned by getDocumentVersions() above.
+  *@param statuses are the currently-stored document versions for each document in the set
of document identifiers
+  * passed in above.
   *@param activities is the interface this method should use to queue up new document references
   * and ingest documents.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true
to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents
is the default one.
   */
-  public void processDocuments(String[] documentIdentifiers, DocumentVersions versions, IProcessActivity
activities,
-    boolean[] scanOnly, int jobMode)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses,
Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption;
 
-  /** Free a set of documents.  This method is called for all documents whose versions have
been fetched using
-  * the getDocumentVersions() method, including those that returned null versions.  It may
be used to free resources
-  * committed during the getDocumentVersions() method.  It is guaranteed to be called AFTER
any calls to
-  * processDocuments() for the documents in question.
-  * The connector will be connected before this method can be called.
-  *@param documentIdentifiers is the set of document identifiers.
-  *@param versions is the corresponding set of version strings (individual identifiers may
have no version).
-  */
-  public void releaseDocumentVersions(String[] documentIdentifiers, DocumentVersions versions)
-    throws ManifoldCFException;
-
   /** Get the maximum number of documents to amalgamate together into one batch, for this
connector.
   * The connector does not need to be connected for this method to be called.
   *@return the maximum number. 0 indicates "unlimited".

Modified: manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1610414&r1=1610413&r2=1610414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
(original)
+++ manifoldcf/branches/CONNECTORS-990/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Mon Jul 14 14:10:36 2014
@@ -1642,6 +1642,22 @@ public class WorkerThread extends Thread
       existingDr.addPrerequisiteEvents(prereqEventNames);
     }
 
+    /** Check if a document needs to be reindexed, based on a computed version string.
+    * Call this method to determine whether reindexing is necessary.  Pass in a newly-computed
version
+    * string.  This method will return "true" if the document needs to be re-indexed.
+    *@param documentIdentifier is the document identifier.
+    *@param newVersionString is the newly-computed version string.
+    *@return true if the document needs to be reindexed.
+    */
+    @Override
+    public boolean checkDocumentNeedsReindexing(String documentIdentifier,
+      String newVersionString)
+      throws ManifoldCFException
+    {
+      IPipelineSpecificationWithVersions spec = fetchPipelineSpecifications.get(documentIdentifier);
+      return ingester.checkFetchDocument(spec,newVersionString,parameterVersion,connection.getACLAuthority());
+    }
+
     /** Add a document description to the current job's queue.
     *@param localIdentifier is the local document identifier to add (for the connector that
     * fetched the document).
@@ -1733,20 +1749,32 @@ public class WorkerThread extends Thread
       return jobManager.retrieveParentDataAsFiles(jobID,ManifoldCF.hash(localIdentifier),dataName);
     }
 
+    /** Note the fact that a document exists but is unchanged, and nothing further
+    * needs to be done to it.
+    * Call this method if it is determined that the document in question is identical to
+    * the formerly indexed document, AND when the version string for the document
+    * has not changed either.
+    */
+    @Override
+    public void noteUnchangedDocument(String documentIdentifier)
+      throws ManifoldCFException
+    {
+      // MHL: not yet implemented; this method is currently a no-op.
+    }
+
     /** Record a document version, but don't ingest it.
-    * ServiceInterruption is thrown if this action must be rescheduled.
     *@param documentIdentifier is the document identifier.
     *@param version is the document version.
     */
     @Override
     public void recordDocument(String documentIdentifier, String version)
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
       ingester.documentRecord(
         pipelineSpecification.getBasicPipelineSpecification(),
         connectionName,documentIdentifierHash,
-        version,currentTime,ingestLogger);
+        version,currentTime);
     }
 
     /** Ingest the current document.
@@ -1823,16 +1851,15 @@ public class WorkerThread extends Thread
         ingestLogger);
     }
 
-    /** Delete the current document from the search engine index, while keeping track of
the version information
+    /** Remove the specified document from the search engine index, while keeping track of
the version information
     * for it (to reduce churn).
     *@param documentIdentifier is the document's local identifier.
-    *@param version is the version of the document, as reported by the getDocumentVersions()
method of the
-    *       corresponding repository connector.
+    *@param version is the version string to be recorded for the document.
     */
-    @Override
-    public void deleteDocument(String documentIdentifier, String version)
+    public void noDocument(String documentIdentifier, String version)
       throws ManifoldCFException, ServiceInterruption
     {
+      // Special interpretation for empty version string
       if (version.length() == 0)
         deleteDocument(documentIdentifier);
       else
@@ -1847,15 +1874,32 @@ public class WorkerThread extends Thread
           throw new IllegalStateException("IngestDocumentWithException threw an illegal IOException:
"+e.getMessage(),e);
         }
       }
+
     }
 
-    /** Delete the current document from the search engine index.  This method does NOT keep
track of version
-    * information for the document and thus can lead to "churn", whereby the same document
is queued, versioned,
-    * and removed on subsequent crawls.  It therefore should be considered to be deprecated,
in favor of
-    * deleteDocument(String localIdentifier, String version).
+    /** Delete the current document from the search engine index, while keeping track of
the version information
+    * for it (to reduce churn).
+    * Deprecated; use noDocument() above instead.
     *@param documentIdentifier is the document's local identifier.
+    *@param version is the version string to be recorded for the document.
+    */
+    @Override
+    @Deprecated
+    public void deleteDocument(String documentIdentifier, String version)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      noDocument(documentIdentifier,version);
+    }
+
+    /** Delete the specified document from the search engine index, and from the status table.
 This
+    *  method does NOT keep track of version
+    * information for the document and thus can lead to "churn", whereby the same document
is queued, processed,
+    * and removed on subsequent crawls.  It is therefore preferable to use noDocument() instead,
+    * in any case where the same decision will need to be made over and over.
+    *@param documentIdentifier is the document's identifier.
     */
     @Override
+    @Deprecated
     public void deleteDocument(String documentIdentifier)
       throws ManifoldCFException, ServiceInterruption
     {



Mime
View raw message