manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1608414 - in /manifoldcf/branches/CONNECTORS-989-2: ./ framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/ framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/ framework/pull-agent/src/main/jav...
Date Mon, 07 Jul 2014 11:35:52 GMT
Author: kwright
Date: Mon Jul  7 11:35:51 2014
New Revision: 1608414

URL: http://svn.apache.org/r1608414
Log:
Merge (modified) changes from CONNECTORS-989 branch.  Still doesn't build; needs work on IngestStatuses class

Modified:
    manifoldcf/branches/CONNECTORS-989-2/   (props changed)
    manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
    manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
    manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecificationWithVersions.java
    manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/QueuedDocument.java
    manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StufferThread.java
    manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Propchange: manifoldcf/branches/CONNECTORS-989-2/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-989:r1607605-1608196

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
Mon Jul  7 11:35:51 2014
@@ -608,12 +608,47 @@ public class IncrementalIngester extends
     sb.append(delim);
   }
 
+  /** Note the start of processing of a set of documents.  This method does whatever is needed to handle the
+  * bookkeeping for the documents' indexing records, and the indexing records for multiple virtual child documents.
+  * The documents must all have the same identifier class.
+  *@param pipelineSpecificationBasic is the basic pipeline specification for the set of documents.
+  *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
+  *@param identifierHashes are the document identifier hashes that are about to be processed.
+  */
+  @Override
+  public void beginDocumentProcessing(
+    IPipelineSpecificationBasic pipelineSpecificationBasic,
+    String identifierClass, String[] identifierHashes)
+    throws ManifoldCFException
+  {
+    // MHL
+  }
+
+  /** Note the end of processing of a set of documents.  This method completes bookkeeping for the documents'
+  * indexing records, and the indexing records for multiple virtual child documents.  This method may cause
+  * documents to be removed from the specified output connections, should that be indicated.
+  *@param pipelineSpecificationBasic is the basic pipeline specification for the set of documents.
+  *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
+  *@param identifierHashes are the document identifier hashes that are about to be processed.
+  *@param activities is the object to use to log the details of any removals.  May be null.
+  */
+  @Override
+  public void endDocumentProcessing(
+    IPipelineSpecificationBasic pipelineSpecificationBasic,
+    String identifierClass, String[] identifierHashes,
+    IOutputRemoveActivity activities)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // MHL
+  }
+
   /** Record a document version, but don't ingest it.
   * The purpose of this method is to keep track of the frequency at which ingestion "attempts"
take place.
   * ServiceInterruption is thrown if this action must be rescheduled.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
+  *@param childIdentifierHash is the hashed virtual child document identifier.  Pass null
if this is a primary record.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since
epoch.
   *@param activities is the object used in case a document needs to be removed from the output
index as the result of this operation.
@@ -621,11 +656,12 @@ public class IncrementalIngester extends
   @Override
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
-    String identifierClass, String identifierHash,
+    String identifierClass, String identifierHash, String childIdentifierHash,
     String documentVersion, long recordTime,
     IOutputActivity activities)
     throws ManifoldCFException, ServiceInterruption
   {
+    // MHL
     String docKey = makeKey(identifierClass,identifierHash);
 
     String[] outputConnectionNames = extractOutputConnectionNames(pipelineSpecificationBasic);
@@ -743,6 +779,7 @@ public class IncrementalIngester extends
   *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched
output versioning information.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
+  *@param childIdentifierHash is the hashed virtual child document identifier.  Pass null
if this is a primary record.
   *@param documentVersion is the document version.
   *@param parameterVersion is the version string for the forced parameters.
   *@param authorityName is the name of the authority associated with the document, if any.
@@ -756,15 +793,16 @@ public class IncrementalIngester extends
   @Override
   public boolean documentIngest(
     IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
-    String identifierClass, String identifierHash,
+    String identifierClass, String identifierHash, String childIdentifierHash,
     String documentVersion,
     String parameterVersion,
     String authorityName,
-    RepositoryDocument document,
+    RepositoryDocument data,
     long ingestTime, String documentURI,
     IOutputActivity activities)
     throws ManifoldCFException, ServiceInterruption, IOException
   {
+    // MHL
     PipelineConnectionsWithVersions pipelineConnectionsWithVersions = new PipelineConnectionsWithVersions(pipelineSpecificationWithVersions);
     
     String docKey = makeKey(identifierClass,identifierHash);
@@ -775,7 +813,7 @@ public class IncrementalIngester extends
     }
 
     // Set indexing date
-    document.setIndexingDate(new Date());
+    data.setIndexingDate(new Date());
     
     // Set up a pipeline
     PipelineObjectWithVersions pipeline = pipelineGrabWithVersions(pipelineConnectionsWithVersions);
@@ -784,7 +822,7 @@ public class IncrementalIngester extends
       throw new ServiceInterruption("Pipeline connector not installed",0L);
     try
     {
-      return pipeline.addOrReplaceDocumentWithException(docKey,documentURI,document,documentVersion,parameterVersion,authorityName,activities,ingestTime) == IPipelineConnector.DOCUMENTSTATUS_ACCEPTED;
+      return pipeline.addOrReplaceDocumentWithException(docKey,documentURI,data,documentVersion,parameterVersion,authorityName,activities,ingestTime) == IPipelineConnector.DOCUMENTSTATUS_ACCEPTED;
     }
     finally
     {

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
Mon Jul  7 11:35:51 2014
@@ -149,19 +149,46 @@ public interface IIncrementalIngester
     String newParameterVersion,
     String newAuthorityNameString);
 
+  /** Note the start of processing of a set of documents.  This method does whatever is needed
to handle the
+  * bookkeeping for the documents' indexing records, and the indexing records for multiple
virtual child documents.
+  * The documents must all have the same identifier class.
+  *@param pipelineSpecificationBasic is the basic pipeline specification for the set of documents.
+  *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
+  *@param identifierHashes are the document identifier hashes that are about to be processed.
+  */
+  public void beginDocumentProcessing(
+    IPipelineSpecificationBasic pipelineSpecificationBasic,
+    String identifierClass, String[] identifierHashes)
+    throws ManifoldCFException;
+
+  /** Note the end of processing of a set of documents.  This method completes bookkeeping
for the documents'
+  * indexing records, and the indexing records for multiple virtual child documents.  This
method may cause
+  * documents to be removed from the specified output connections, should that be indicated.
+  *@param pipelineSpecificationBasic is the basic pipeline specification for the set of documents.
+  *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
+  *@param identifierHashes are the document identifier hashes that are about to be processed.
+  *@param activities is the object to use to log the details of any removals.  May be null.
+  */
+  public void endDocumentProcessing(
+    IPipelineSpecificationBasic pipelineSpecificationBasic,
+    String identifierClass, String[] identifierHashes,
+    IOutputRemoveActivity activities)
+    throws ManifoldCFException, ServiceInterruption;
+
   /** Record a document version, but don't ingest it.
   * The purpose of this method is to keep track of the frequency at which ingestion "attempts"
take place.
   * ServiceInterruption is thrown if this action must be rescheduled.
   *@param pipelineSpecificationBasic is the basic pipeline specification needed.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
+  *@param childIdentifierHash is the hashed virtual child document identifier.  Pass null
if this is a primary record.
   *@param documentVersion is the document version.
   *@param recordTime is the time at which the recording took place, in milliseconds since
epoch.
   *@param activities is the object used in case a document needs to be removed from the output
index as the result of this operation.
   */
   public void documentRecord(
     IPipelineSpecificationBasic pipelineSpecificationBasic,
-    String identifierClass, String identifierHash,
+    String identifierClass, String identifierHash, String childIdentifierHash,
     String documentVersion, long recordTime,
     IOutputActivity activities)
     throws ManifoldCFException, ServiceInterruption;
@@ -174,6 +201,7 @@ public interface IIncrementalIngester
   *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched
output versioning information.
   *@param identifierClass is the name of the space in which the identifier hash should be
interpreted.
   *@param identifierHash is the hashed document identifier.
+  *@param childIdentifierHash is the hashed virtual child document identifier.  Pass null
if this is a primary record.
   *@param documentVersion is the document version.
   *@param parameterVersion is the version string for the forced parameters.
   *@param authorityName is the name of the authority associated with the document, if any.
@@ -186,7 +214,7 @@ public interface IIncrementalIngester
   */
   public boolean documentIngest(
     IPipelineSpecificationWithVersions pipelineSpecificationWithVersions,
-    String identifierClass, String identifierHash,
+    String identifierClass, String identifierHash, String childIdentifierHash,
     String documentVersion,
     String parameterVersion,
     String authorityName,

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecificationWithVersions.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecificationWithVersions.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecificationWithVersions.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/PipelineSpecificationWithVersions.java
Mon Jul  7 11:35:51 2014
@@ -49,7 +49,8 @@ public class PipelineSpecificationWithVe
   protected DocumentIngestStatus getStatus(int index)
   {
     IPipelineSpecificationBasic basic = pipelineSpecification.getBasicPipelineSpecification();
-    return queuedDocument.getLastIngestedStatus(basic.getStageConnectionName(basic.getOutputStage(index)));
+    // MHL
+    return queuedDocument.getLastIngestedStatus(basic.getStageConnectionName(basic.getOutputStage(index)),"");
   }
   
   /** For a given output index, return a document version string.

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/QueuedDocument.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/QueuedDocument.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/QueuedDocument.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/QueuedDocument.java
Mon Jul  7 11:35:51 2014
@@ -38,8 +38,8 @@ public class QueuedDocument
 
   /** The document description. */
   protected final DocumentDescription documentDescription;
-  /** The last ingested status, null meaning "never ingested". */
-  protected final Map<String,DocumentIngestStatus> lastIngestedStatus;
+  /** The last ingested status for all child records, arranged by output connection name. null means "never ingested". */
+  protected final Map<String,Map<String,DocumentIngestStatus>> lastIngestedStatus;
   /** The binnames for the document, according to the connector */
   protected final String[] binNames;
   /** This flag indicates whether the document has been processed or not. */
@@ -47,10 +47,11 @@ public class QueuedDocument
 
   /** Constructor.
   *@param documentDescription is the document description.
-  *@param lastIngestedStatus is the document's last ingested status.
+  *@param lastIngestedStatus is the document's last ingested status.  Map is keyed by output connection name, and child map is
+  *   keyed by child document id hash value, or a blank value for a primary row.
   *@param binNames are the bins associated with the document.
   */
-  public QueuedDocument(DocumentDescription documentDescription, Map<String,DocumentIngestStatus> lastIngestedStatus, String[] binNames)
+  public QueuedDocument(DocumentDescription documentDescription, Map<String,Map<String,DocumentIngestStatus>> lastIngestedStatus, String[] binNames)
   {
     this.documentDescription = documentDescription;
     this.lastIngestedStatus = lastIngestedStatus;
@@ -65,15 +66,34 @@ public class QueuedDocument
     return documentDescription;
   }
 
+  /** Get an iterator over the child document IDs available for this document identifier and a specified
+  * output connection name.
+  *@param outputConnectionName is the name of the output connection.
+  *@return an iterator over child document IDs.  For the primary ID, an empty value will be returned by this iterator.
+  */
+  public Iterator<String> getLastIngestedChildIDs(String outputConnectionName)
+  {
+    if (lastIngestedStatus == null)
+      return new HashSet<String>().iterator();
+    Map<String,DocumentIngestStatus> rval = lastIngestedStatus.get(outputConnectionName);
+    if (rval == null)
+      return new HashSet<String>().iterator();
+    return rval.keySet().iterator();
+  }
+  
   /** Get the last ingested status.
   *@param outputConnectionName is the name of the output connection.
+  *@param childIDHash is the child identifier hash, using an empty value for the primary.
   *@return the last ingested status for that output, or null if not found.
   */
-  public DocumentIngestStatus getLastIngestedStatus(String outputConnectionName)
+  public DocumentIngestStatus getLastIngestedStatus(String outputConnectionName, String childIDHash)
   {
     if (lastIngestedStatus == null)
       return null;
-    return lastIngestedStatus.get(outputConnectionName);
+    Map<String,DocumentIngestStatus> rval = lastIngestedStatus.get(outputConnectionName);
+    if (rval == null)
+      return null;
+    return rval.get(childIDHash);
   }
 
   /** Return true if there are *any* last ingested records.

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StufferThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StufferThread.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StufferThread.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/StufferThread.java
Mon Jul  7 11:35:51 2014
@@ -334,6 +334,7 @@ public class StufferThread extends Threa
               binNames = new String[]{""};
             }
 
+            // ??? 
             QueuedDocument qd = new QueuedDocument(descs[i],(Map<String,DocumentIngestStatus>)versions[i],binNames);
 
             // Grab the arraylist that's there, or create it.

Modified: manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1608414&r1=1608413&r2=1608414&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
(original)
+++ manifoldcf/branches/CONNECTORS-989-2/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java
Mon Jul  7 11:35:51 2014
@@ -305,7 +305,8 @@ public class WorkerThread extends Thread
                       QueuedDocument qd = activeDocuments.get(i);
                       currentDocIDHashArray[i] = qd.getDocumentDescription().getDocumentIdentifierHash();
                       currentDocIDArray[i] = qd.getDocumentDescription().getDocumentIdentifier();
-                      DocumentIngestStatus dis = qd.getLastIngestedStatus(lastIndexedOutputConnectionName);
+                      // MHL
+                      DocumentIngestStatus dis = qd.getLastIngestedStatus(lastIndexedOutputConnectionName,"");
                       if (dis == null)
                         oldVersionStringArray[i] = null;
                       else
@@ -993,6 +994,7 @@ public class WorkerThread extends Thread
     {
       QueuedDocument qd = deleteList.get(i);
       // See if we need to delete from index
+      // MHL
       if (qd.anyLastIngestedRecords())
       {
         // Queue up to issue deletion
@@ -1743,9 +1745,10 @@ public class WorkerThread extends Thread
       throws ManifoldCFException, ServiceInterruption
     {
       String documentIdentifierHash = ManifoldCF.hash(documentIdentifier);
+      // MHL
       ingester.documentRecord(
         pipelineSpecification.getBasicPipelineSpecification(),
-        connectionName,documentIdentifierHash,
+        connectionName,documentIdentifierHash,null,
         version,currentTime,ingestLogger);
     }
 
@@ -1813,9 +1816,10 @@ public class WorkerThread extends Thread
       }
         
       // First, we need to add into the metadata the stuff from the job description.
+      // MHL
       ingester.documentIngest(
         fetchPipelineSpecifications.get(documentIdentifierHash),
-        connectionName,documentIdentifierHash,
+        connectionName,documentIdentifierHash,null,
         version,parameterVersion,
         connection.getACLAuthority(),
         data,currentTime,



Mime
View raw message