From commits-return-9908-archive-asf-public=cust-asf.ponee.io@manifoldcf.apache.org Fri Dec 14 06:58:46 2018 Return-Path: X-Original-To: archive-asf-public@cust-asf.ponee.io Delivered-To: archive-asf-public@cust-asf.ponee.io Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by mx-eu-01.ponee.io (Postfix) with SMTP id AEA94180645 for ; Fri, 14 Dec 2018 06:58:45 +0100 (CET) Received: (qmail 37936 invoked by uid 500); 14 Dec 2018 05:58:44 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 37920 invoked by uid 99); 14 Dec 2018 05:58:44 -0000 Received: from Unknown (HELO svn01-us-west.apache.org) (209.188.14.144) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 14 Dec 2018 05:58:44 +0000 Received: from svn01-us-west.apache.org (localhost [127.0.0.1]) by svn01-us-west.apache.org (ASF Mail Server at svn01-us-west.apache.org) with ESMTP id A43593A20B4 for ; Fri, 14 Dec 2018 05:58:43 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1848911 - /manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java Date: Fri, 14 Dec 2018 05:58:43 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20181214055843.A43593A20B4@svn01-us-west.apache.org> Author: kwright Date: Fri Dec 14 05:58:42 2018 New Revision: 1848911 URL: http://svn.apache.org/viewvc?rev=1848911&view=rev Log: Improve hopcount logging and commenting Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java Modified: manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java URL: http://svn.apache.org/viewvc/manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java?rev=1848911&r1=1848910&r2=1848911&view=diff ============================================================================== --- manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java (original) +++ manifoldcf/trunk/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/jobs/HopCount.java Fri Dec 14 05:58:42 2018 @@ -949,7 +949,13 @@ public class HopCount extends org.apache } - /** Method that does the work of "finishing" a set of child references. */ + /** Method that does the work of "finishing" a set of child references. + * The API for hopcount involves doing the following for every document that is recrawled or reassessed, INCLUDING the seeds (in which case the + * document hash is the empty string): + * (1) Record all target references of the source documents, which either adds intrinsic links, or moves them to the "existing" state + * (2) When done adding, call this method, which should (depending on hopcount mode) mark hopcount records in need of reassessment, and + * delete the intrinsic links that have the right source document and were not marked as "new" or "existing", but rather just "base". + **/ protected void doFinish(Long jobID, String[] legalLinkTypes, String[] sourceDocumentHashes, int hopcountMethod) throws ManifoldCFException { @@ -966,7 +972,9 @@ public class HopCount extends org.apache // ... and then, get rid of all hopcount records and their dependencies that are marked for delete. - // Invalidate all links with the given source documents that match the common expression + // Invalidate all links with the given source documents. + // This basically should make sure reassessment of all referenced documents takes place. + // It also deletes the intrinsic links that no longer exist. doDeleteInvalidation(jobID,sourceDocumentHashes); } // Make all new and existing links become just "base" again. @@ -1188,31 +1196,42 @@ public class HopCount extends org.apache // See CONNECTORS-501. } - /** Invalidate links meeting a simple criteria which have a given set of source documents. This also runs a queue - * which is initialized with all the documents that have sources that exist in the hopcount table. The purpose + /** Invalidate targets of links which have a given set of source documents. This also removes intrinsic links + * that were not re-added that point to children of the source documents. + * + * The purpose * of that queue is to re-establish non-infinite values for all nodes that are described in IntrinsicLinks, that are * still connected to the root. */ protected void doDeleteInvalidation(Long jobID, String[] sourceDocumentHashes) throws ManifoldCFException { - ArrayList commonNewList = new ArrayList(); - commonNewList.add(intrinsicLinkManager.statusToString(intrinsicLinkManager.LINKSTATUS_BASE)); - String commonNewExpression = intrinsicLinkManager.newField+"=?"; - - // Clear up hopcount table + // Only do anything if we have something to process!! if (sourceDocumentHashes.length > 0) { + + // Conditionally log what we're doing if (Logging.hopcount.isDebugEnabled()) { - Logging.hopcount.debug("Marking for delete for job "+jobID+" all target document references matching '"+commonNewExpression+"'"+ - " from:"); + final StringBuilder sb = new StringBuilder(); for (int k = 0; k < sourceDocumentHashes.length; k++) { - Logging.hopcount.debug(" "+sourceDocumentHashes[k]); + sb.append(" '").append(sourceDocumentHashes[k]).append("' "); } + Logging.hopcount.debug("Marking for delete for job "+jobID+" all target document references matching '"+ + intrinsicLinkManager.newField+"="+intrinsicLinkManager.statusToString(intrinsicLinkManager.LINKSTATUS_BASE)+ + "' that are targets of ("+sb.toString()+")"); } + // Look for the links that are marked as "base". These are the ones whose target's hopcounts will need to + // be reassessed, and also which will need to be removed from the intrinsiclink table after we do that. + String commonNewExpression = intrinsicLinkManager.newField+"=?"; + ArrayList commonNewList = new ArrayList(); + commonNewList.add(intrinsicLinkManager.statusToString(intrinsicLinkManager.LINKSTATUS_BASE)); + + // Clear up hopcount table FIRST, before we remove the links. + // This sets the "mark for death" field so that we know which records are, in fact, the targets we need to be concerned about. + // The query form I found that seems to work ok with postgresql looks like this: // // UPDATE hopcount SET x=y WHERE id IN (SELECT ownerid FROM hopdeletedeps t0 @@ -1254,7 +1273,10 @@ public class HopCount extends org.apache noteModifications(0,sourceDocumentHashes.length,0); if (Logging.hopcount.isDebugEnabled()) - Logging.hopcount.debug("Done setting hopcount rows for job "+jobID+" to initial distances"); + Logging.hopcount.debug("Done marking for delete for job "+jobID); + + // Clean up the intrinsiclink and hopcountdeletedeps tables. We can use the mark-for-death + // field as a way of figuring out which rows to remove. // Remove the intrinsic links that we said we would - BEFORE we evaluate the queue. intrinsicLinkManager.removeLinks(jobID, @@ -1268,7 +1290,8 @@ public class HopCount extends org.apache new UnitaryClause(markForDeathField,markToString(MARK_DELETING))}); deleteDepsManager.removeMarkedRows(getTableName(),idField,query,queryList); - // Set the hopcount rows back to just "queued". + // Map the hopcount rows we set to "mark for death" back to just "queued". That means they'll + // be re-evaluated when the associated hopcount is requested. HashMap newMap = new HashMap(); newMap.put(markForDeathField,markToString(MARK_QUEUED)); performUpdate(newMap,"WHERE "+query,queryList,null); @@ -1279,7 +1302,7 @@ public class HopCount extends org.apache // will have new hopcount values. if (Logging.hopcount.isDebugEnabled()) - Logging.hopcount.debug("Done queueing for deletion for "+jobID); + Logging.hopcount.debug("Done queueing for re-evaluation for "+jobID); }