Return-Path: X-Original-To: apmail-manifoldcf-commits-archive@www.apache.org Delivered-To: apmail-manifoldcf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 575FB1197D for ; Fri, 18 Jul 2014 17:56:09 +0000 (UTC) Received: (qmail 24653 invoked by uid 500); 18 Jul 2014 17:56:09 -0000 Delivered-To: apmail-manifoldcf-commits-archive@manifoldcf.apache.org Received: (qmail 24589 invoked by uid 500); 18 Jul 2014 17:56:09 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 24570 invoked by uid 99); 18 Jul 2014 17:56:09 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 18 Jul 2014 17:56:09 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 18 Jul 2014 17:56:06 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id F134523889CB; Fri, 18 Jul 2014 17:55:46 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1611735 - in /manifoldcf/branches/CONNECTORS-989/framework: agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/ agents/src/main/java/org/apache/manifoldcf/agents/interfaces/ pull-agent/src/main/java/org/apache/manifoldcf/c... 
Date: Fri, 18 Jul 2014 17:55:46 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20140718175546.F134523889CB@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: kwright Date: Fri Jul 18 17:55:46 2014 New Revision: 1611735 URL: http://svn.apache.org/r1611735 Log: First batch of work changing incremental ingester API Modified: manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Modified: manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java?rev=1611735&r1=1611734&r2=1611735&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java (original) +++ manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java Fri Jul 18 17:55:46 2014 @@ -612,16 +612,18 @@ public class IncrementalIngester extends *@param pipelineSpecificationBasic is the basic pipeline specification needed. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. 
*@param recordTime is the time at which the recording took place, in milliseconds since epoch. */ @Override public void documentRecord( IPipelineSpecificationBasic pipelineSpecificationBasic, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, long recordTime) throws ManifoldCFException { + // MHL // This method is called when a connector decides that the last indexed version of the document is in fact just fine, // but the document version information should be updated. // The code pathway is therefore similar to that of document indexing, EXCEPT that no indexing will ever @@ -661,6 +663,7 @@ public class IncrementalIngester extends *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. *@param parameterVersion is the version string for the forced parameters. *@param authorityName is the name of the authority associated with the document, if any. 
@@ -670,7 +673,7 @@ public class IncrementalIngester extends @Override public void documentNoData( IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, String parameterVersion, String authorityName, @@ -678,6 +681,7 @@ public class IncrementalIngester extends IOutputActivity activities) throws ManifoldCFException, ServiceInterruption { + // MHL PipelineConnectionsWithVersions pipelineConnectionsWithVersions = new PipelineConnectionsWithVersions(pipelineSpecificationWithVersions); String docKey = makeKey(identifierClass,identifierHash); @@ -710,6 +714,7 @@ public class IncrementalIngester extends *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. *@param parameterVersion is the version string for the forced parameters. *@param authorityName is the name of the authority associated with the document, if any. 
@@ -723,15 +728,16 @@ public class IncrementalIngester extends @Override public boolean documentIngest( IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, String parameterVersion, String authorityName, - RepositoryDocument document, + RepositoryDocument data, long ingestTime, String documentURI, IOutputActivity activities) throws ManifoldCFException, ServiceInterruption, IOException { + // MHL PipelineConnectionsWithVersions pipelineConnectionsWithVersions = new PipelineConnectionsWithVersions(pipelineSpecificationWithVersions); String docKey = makeKey(identifierClass,identifierHash); @@ -742,7 +748,7 @@ public class IncrementalIngester extends } // Set indexing date - document.setIndexingDate(new Date()); + data.setIndexingDate(new Date()); // Set up a pipeline PipelineObjectWithVersions pipeline = pipelineGrabWithVersions(pipelineConnectionsWithVersions); @@ -751,7 +757,7 @@ public class IncrementalIngester extends throw new ServiceInterruption("Pipeline connector not installed",0L); try { - return pipeline.addOrReplaceDocumentWithException(docKey,documentURI,document,documentVersion,parameterVersion,authorityName,activities,ingestTime) == IPipelineConnector.DOCUMENTSTATUS_ACCEPTED; + return pipeline.addOrReplaceDocumentWithException(docKey,documentURI,data,documentVersion,parameterVersion,authorityName,activities,ingestTime) == IPipelineConnector.DOCUMENTSTATUS_ACCEPTED; } finally { @@ -759,6 +765,25 @@ public class IncrementalIngester extends } } + /** Remove a document component from the search engine index. + *@param pipelineSpecificationBasic is the basic pipeline specification. + *@param identifierClass is the name of the space in which the identifier hash should be interpreted. + *@param identifierHash is the hash of the id of the document. 
+ *@param componentHash is the hashed component identifier, if any. + *@param activities is the object to use to log the details of the ingestion attempt. May be null. + */ + @Override + public void documentRemove( + IPipelineSpecificationBasic pipelineSpecificationBasic, + String identifierClass, String identifierHash, String componentHash, + IOutputRemoveActivity activities) + throws ManifoldCFException, ServiceInterruption + { + // MHL + documentDelete(pipelineSpecificationBasic, + identifierClass,identifierHash,activities); + } + protected static String[] extractOutputConnectionNames(IPipelineSpecificationBasic pipelineSpecificationBasic) { String[] rval = new String[pipelineSpecificationBasic.getOutputCount()]; Modified: manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java?rev=1611735&r1=1611734&r2=1611735&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java (original) +++ manifoldcf/branches/CONNECTORS-989/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java Fri Jul 18 17:55:46 2014 @@ -154,12 +154,13 @@ public interface IIncrementalIngester *@param pipelineSpecificationBasic is the basic pipeline specification needed. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. *@param recordTime is the time at which the recording took place, in milliseconds since epoch. 
*/ public void documentRecord( IPipelineSpecificationBasic pipelineSpecificationBasic, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, long recordTime) throws ManifoldCFException; @@ -171,6 +172,7 @@ public interface IIncrementalIngester *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. *@param parameterVersion is the version string for the forced parameters. *@param authorityName is the name of the authority associated with the document, if any. @@ -179,7 +181,7 @@ public interface IIncrementalIngester */ public void documentNoData( IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, String parameterVersion, String authorityName, @@ -195,6 +197,7 @@ public interface IIncrementalIngester *@param pipelineSpecificationWithVersions is the pipeline specification with already-fetched output versioning information. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hashed document identifier. + *@param componentHash is the hashed component identifier, if any. *@param documentVersion is the document version. *@param parameterVersion is the version string for the forced parameters. *@param authorityName is the name of the authority associated with the document, if any. 
@@ -207,7 +210,7 @@ public interface IIncrementalIngester */ public boolean documentIngest( IPipelineSpecificationWithVersions pipelineSpecificationWithVersions, - String identifierClass, String identifierHash, + String identifierClass, String identifierHash, String componentHash, String documentVersion, String parameterVersion, String authorityName, @@ -216,6 +219,19 @@ public interface IIncrementalIngester IOutputActivity activities) throws ManifoldCFException, ServiceInterruption, IOException; + /** Remove a document component from the search engine index. + *@param pipelineSpecificationBasic is the basic pipeline specification. + *@param identifierClass is the name of the space in which the identifier hash should be interpreted. + *@param identifierHash is the hash of the id of the document. + *@param componentHash is the hashed component identifier, if any. + *@param activities is the object to use to log the details of the ingestion attempt. May be null. + */ + public void documentRemove( + IPipelineSpecificationBasic pipelineSpecificationBasic, + String identifierClass, String identifierHash, String componentHash, + IOutputRemoveActivity activities) + throws ManifoldCFException, ServiceInterruption; + /** Note the fact that we checked a document (and found that it did not need to be ingested, because the * versions agreed). *@param pipelineSpecificationBasic is a pipeline specification. @@ -242,7 +258,7 @@ public interface IIncrementalIngester long checkTime) throws ManifoldCFException; - /** Delete multiple documents from the search engine index. + /** Delete multiple documents, and their components, from the search engine index. *@param pipelineSpecificationBasics are the pipeline specifications associated with the documents. *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. *@param identifierHashes is the array of document identifier hashes of the documents. 
@@ -254,7 +270,7 @@ public interface IIncrementalIngester IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption; - /** Delete multiple documents from the search engine index. + /** Delete multiple documents, and their components, from the search engine index. *@param pipelineSpecificationBasic is the basic pipeline specification. *@param identifierClasses are the names of the spaces in which the identifier hashes should be interpreted. *@param identifierHashes is the array of document identifier hashes of the documents. @@ -266,7 +282,7 @@ public interface IIncrementalIngester IOutputRemoveActivity activities) throws ManifoldCFException, ServiceInterruption; - /** Delete a document from the search engine index. + /** Delete a document, and all its components, from the search engine index. *@param pipelineSpecificationBasic is the basic pipeline specification. *@param identifierClass is the name of the space in which the identifier hash should be interpreted. *@param identifierHash is the hash of the id of the document. 
Modified: manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java?rev=1611735&r1=1611734&r2=1611735&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java (original) +++ manifoldcf/branches/CONNECTORS-989/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java Fri Jul 18 17:55:46 2014 @@ -1433,12 +1433,7 @@ public class WorkerThread extends Thread public void recordDocument(String documentIdentifier, String version) throws ManifoldCFException { - String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); - ingester.documentRecord( - pipelineSpecification.getBasicPipelineSpecification(), - connectionName,documentIdentifierHash, - version,currentTime); - touchedSet.add(documentIdentifier); + recordDocument(documentIdentifier,null,version); } /** Record a document version, WITHOUT reindexing it, or removing it. (Other @@ -1455,7 +1450,17 @@ public class WorkerThread extends Thread String version) throws ManifoldCFException { - // MHL + String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); + String componentIdentifierHash; + if (componentIdentifier != null) + componentIdentifierHash = ManifoldCF.hash(componentIdentifier); + else + componentIdentifierHash = null; + ingester.documentRecord( + pipelineSpecification.getBasicPipelineSpecification(), + connectionName,documentIdentifierHash,componentIdentifierHash, + version,currentTime); + touchedSet.add(documentIdentifier); } /** Ingest the current document. 
@@ -1497,11 +1502,34 @@ public class WorkerThread extends Thread public void ingestDocumentWithException(String documentIdentifier, String version, String documentURI, RepositoryDocument data) throws ManifoldCFException, ServiceInterruption, IOException { + } + + /** Ingest the current document. + *@param documentIdentifier is the document's identifier. + *@param componentIdentifier is the component document identifier, if any. + *@param version is the version of the document, as reported by the getDocumentVersions() method of the + * corresponding repository connector. + *@param documentURI is the URI to use to retrieve this document from the search interface (and is + * also the unique key in the index). + *@param data is the document data. The data is closed after ingestion is complete. + *@throws IOException only when data stream reading fails. + */ + @Override + public void ingestDocumentWithException(String documentIdentifier, + String componentIdentifier, + String version, String documentURI, RepositoryDocument data) + throws ManifoldCFException, ServiceInterruption, IOException + { // We should not get called here if versions agree, unless the repository // connector cannot distinguish between versions - in which case it must // always ingest (essentially) String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); + String componentIdentifierHash; + if (componentIdentifier != null) + componentIdentifierHash = ManifoldCF.hash(componentIdentifier); + else + componentIdentifierHash = null; if (data != null) { @@ -1524,7 +1552,7 @@ public class WorkerThread extends Thread // First, we need to add into the metadata the stuff from the job description. 
ingester.documentIngest( fetchPipelineSpecifications.get(documentIdentifierHash), - connectionName,documentIdentifierHash, + connectionName,documentIdentifierHash,componentIdentifierHash, version,parameterVersion, connection.getACLAuthority(), data,currentTime, @@ -1534,45 +1562,16 @@ public class WorkerThread extends Thread touchedSet.add(documentIdentifier); } - /** Ingest the current document. - *@param documentIdentifier is the document's identifier. - *@param componentIdentifier is the component document identifier, if any. - *@param version is the version of the document, as reported by the getDocumentVersions() method of the - * corresponding repository connector. - *@param documentURI is the URI to use to retrieve this document from the search interface (and is - * also the unique key in the index). - *@param data is the document data. The data is closed after ingestion is complete. - *@throws IOException only when data stream reading fails. - */ - @Override - public void ingestDocumentWithException(String documentIdentifier, - String componentIdentifier, - String version, String documentURI, RepositoryDocument data) - throws ManifoldCFException, ServiceInterruption, IOException - { - // MHL - } - /** Remove the specified document from the search engine index, while keeping track of the version information * for it (to reduce churn). *@param documentIdentifier is the document's local identifier. *@param version is the version string to be recorded for the document. 
*/ + @Override public void noDocument(String documentIdentifier, String version) throws ManifoldCFException, ServiceInterruption { - // Special interpretation for empty version string; treat as if the document doesn't exist - // (by ignoring it and allowing it to be deleted later) - String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); - ingester.documentNoData( - fetchPipelineSpecifications.get(documentIdentifierHash), - connectionName,documentIdentifierHash, - version,parameterVersion, - connection.getACLAuthority(), - currentTime, - ingestLogger); - - touchedSet.add(documentIdentifier); + noDocument(documentIdentifier,null,version); } /** Remove the specified document from the search engine index, and update the @@ -1587,7 +1586,23 @@ public class WorkerThread extends Thread String version) throws ManifoldCFException, ServiceInterruption { - // MHL + // Special interpretation for empty version string; treat as if the document doesn't exist + // (by ignoring it and allowing it to be deleted later) + String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); + String componentIdentifierHash; + if (componentIdentifier != null) + componentIdentifierHash = ManifoldCF.hash(componentIdentifier); + else + componentIdentifierHash = null; + ingester.documentNoData( + fetchPipelineSpecifications.get(documentIdentifierHash), + connectionName,documentIdentifierHash,componentIdentifierHash, + version,parameterVersion, + connection.getACLAuthority(), + currentTime, + ingestLogger); + + touchedSet.add(documentIdentifier); } /** Remove the specified document component permanently from the search engine index, and from the status table. @@ -1602,15 +1617,7 @@ public class WorkerThread extends Thread public void removeDocument(String documentIdentifier) throws ManifoldCFException, ServiceInterruption { - // Remove from incremental ingester ONLY. 
- String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); - ingester.documentDelete( - pipelineSpecification.getBasicPipelineSpecification(), - connectionName,documentIdentifierHash, - ingestLogger); - - // Note that we touched it, so it won't get checked - touchedSet.add(documentIdentifier); + removeDocument(documentIdentifier,null); } /** Remove the specified document component permanently from the search engine index, and from the status table. @@ -1626,7 +1633,21 @@ public class WorkerThread extends Thread String componentIdentifier) throws ManifoldCFException, ServiceInterruption { - // MHL + // Remove from incremental ingester ONLY. + String documentIdentifierHash = ManifoldCF.hash(documentIdentifier); + String componentIdentifierHash; + if (componentIdentifier != null) + componentIdentifierHash = ManifoldCF.hash(componentIdentifier); + else + componentIdentifierHash = null; + + ingester.documentRemove( + pipelineSpecification.getBasicPipelineSpecification(), + connectionName,documentIdentifierHash,componentIdentifierHash, + ingestLogger); + + // Note that we touched it, so it won't get checked + touchedSet.add(documentIdentifier); }