Return-Path: X-Original-To: apmail-manifoldcf-commits-archive@www.apache.org Delivered-To: apmail-manifoldcf-commits-archive@www.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 4BDD517FA7 for ; Wed, 8 Oct 2014 22:54:44 +0000 (UTC) Received: (qmail 78287 invoked by uid 500); 8 Oct 2014 22:54:44 -0000 Delivered-To: apmail-manifoldcf-commits-archive@manifoldcf.apache.org Received: (qmail 78237 invoked by uid 500); 8 Oct 2014 22:54:44 -0000 Mailing-List: contact commits-help@manifoldcf.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: dev@manifoldcf.apache.org Delivered-To: mailing list commits@manifoldcf.apache.org Received: (qmail 78228 invoked by uid 99); 8 Oct 2014 22:54:44 -0000 Received: from nike.apache.org (HELO nike.apache.org) (192.87.106.230) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Oct 2014 22:54:44 +0000 X-ASF-Spam-Status: No, hits=-2000.0 required=5.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Wed, 08 Oct 2014 22:54:21 +0000 Received: from eris.apache.org (localhost [127.0.0.1]) by eris.apache.org (Postfix) with ESMTP id 2D73023888FE; Wed, 8 Oct 2014 22:54:19 +0000 (UTC) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r1630247 - /manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java Date: Wed, 08 Oct 2014 22:54:19 -0000 To: commits@manifoldcf.apache.org From: kwright@apache.org X-Mailer: svnmailer-1.0.9 Message-Id: <20141008225419.2D73023888FE@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: kwright Date: Wed Oct 8 22:54:18 2014 New Revision: 1630247 URL: http://svn.apache.org/r1630247 Log: Do hard checks for documents Modified: manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java Modified: manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java?rev=1630247&r1=1630246&r2=1630247&view=diff ============================================================================== --- manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java (original) +++ manifoldcf/branches/CONNECTORS-1068/connectors/documentfilter/connector/src/main/java/org/apache/manifoldcf/agents/transformation/documentfilter/DocumentFilter.java Wed Oct 8 22:54:18 2014 @@ -63,6 +63,29 @@ public class DocumentFilter extends org. return new VersionContext(sp.toPackedString(),params,os); } + /** Detect if a document date is acceptable or not. This method is used to determine whether it makes sense to fetch a document + * in the first place. + *@param outputDescription is the document's output version. + *@param date is the date of the document. + *@param activities is an object including the activities that can be performed by this method. + *@return true if the document with that date can be accepted by this connector. + */ + @Override + public boolean checkDateIndexable(VersionContext outputDescription, Date date, IOutputCheckActivity activities) + throws ManifoldCFException, ServiceInterruption + { + SpecPacker sp = new SpecPacker(outputDescription.getSpecification()); + return checkDateIndexable(sp, outputDescription, date, activities); + } + + protected boolean checkDateIndexable(SpecPacker sp, VersionContext outputDescription, Date date, IOutputCheckActivity activities) + throws ManifoldCFException, ServiceInterruption { + if (sp.checkDate(date)) + return super.checkDateIndexable(outputDescription, date, activities); + else + return false; + } + /** Detect if a mime type is indexable or not. This method is used by participating repository connectors to pre-filter the number of * unusable documents that will be passed to this output connector. *@param outputDescription is the document's output version. @@ -74,6 +97,11 @@ public class DocumentFilter extends org. throws ManifoldCFException, ServiceInterruption { SpecPacker sp = new SpecPacker(outputDescription.getSpecification()); + return checkMimeTypeIndexable(sp, outputDescription, mimeType, activities); + } + + protected boolean checkMimeTypeIndexable(SpecPacker sp, VersionContext outputDescription, String mimeType, IOutputCheckActivity activities) + throws ManifoldCFException, ServiceInterruption { if (sp.checkMimeType(mimeType)) return super.checkMimeTypeIndexable(outputDescription, mimeType, activities); else @@ -84,6 +112,11 @@ public class DocumentFilter extends org. public boolean checkLengthIndexable(VersionContext outputDescription, long length, IOutputCheckActivity activities) throws ManifoldCFException, ServiceInterruption { SpecPacker sp = new SpecPacker(outputDescription.getSpecification()); + return checkLengthIndexable(sp, outputDescription, length, activities); + } + + protected boolean checkLengthIndexable(SpecPacker sp, VersionContext outputDescription, long length, IOutputCheckActivity activities) + throws ManifoldCFException, ServiceInterruption { if (sp.checkLengthIndexable(length)) return super.checkLengthIndexable(outputDescription, length, activities); else @@ -94,6 +127,11 @@ public class DocumentFilter extends org. public boolean checkURLIndexable(VersionContext outputDescription, String url, IOutputCheckActivity activities) throws ManifoldCFException, ServiceInterruption { SpecPacker sp = new SpecPacker(outputDescription.getSpecification()); + return checkURLIndexable(sp, outputDescription, url, activities); + } + + protected boolean checkURLIndexable(SpecPacker sp, VersionContext outputDescription, String url, IOutputCheckActivity activities) + throws ManifoldCFException, ServiceInterruption { if (sp.checkURLIndexable(url)) return super.checkURLIndexable(outputDescription, url, activities); else @@ -103,9 +141,6 @@ public class DocumentFilter extends org. /** Add (or replace) a document in the output data store using the connector. * This method presumes that the connector object has been configured, and it is thus able to communicate with the output data store should that be * necessary. - * The OutputSpecification is *not* provided to this method, because the goal is consistency, and if output is done it must be consistent with the - * output description, since that was what was partly used to determine if output should be taking place. So it may be necessary for this method to decode - * an output description string in order to determine what should be done. *@param documentURI is the URI of the document. The URI is presumed to be the unique identifier which the output data store will use to process * and serve the document. This URI is constructed by the repository connector which fetches the document, and is thus universal across all output connectors. *@param outputDescription is the description string that was constructed for this document by the getOutputDescription() method. @@ -118,6 +153,15 @@ public class DocumentFilter extends org. public int addOrReplaceDocumentWithException(String documentURI, VersionContext outputDescription, RepositoryDocument document, String authorityNameString, IOutputAddActivity activities) throws ManifoldCFException, ServiceInterruption, IOException { + // Hard filtering (in case connectors don't call check methods above) + SpecPacker sp = new SpecPacker(outputDescription.getSpecification()); + if (!checkURLIndexable(sp, outputDescription, documentURI, activities) || + !checkLengthIndexable(sp, outputDescription, document.getBinaryLength(), activities) || + !checkMimeTypeIndexable(sp, outputDescription, document.getMimeType(), activities) || + !checkDateIndexable(sp, outputDescription, document.getModifiedDate(), activities)) { + activities.noDocument(); + return DOCUMENTSTATUS_REJECTED; + } return activities.sendDocument(documentURI, document); } @@ -433,6 +477,11 @@ public class DocumentFilter extends org. return true; } + public boolean checkDate(Date date) { + // MHL + return true; + } + public boolean checkMimeType(String mimeType) { if (mimeType == null) mimeType = "application/unknown";