manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1635106 - in /manifoldcf/branches/dev_1x: ./ connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/ framework/ framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/ framework/pull-agent...
Date Wed, 29 Oct 2014 11:59:38 GMT
Author: kwright
Date: Wed Oct 29 11:59:38 2014
New Revision: 1635106

URL: http://svn.apache.org/r1635106
Log:
Pull up CONNECTORS-1077 fix for GridFS from trunk

Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
    manifoldcf/branches/dev_1x/framework/   (props changed)
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
    manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
  Merged /manifoldcf/trunk:r1634373

Modified: manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Wed Oct 29 11:59:38 2014
@@ -390,13 +390,13 @@ public class GridFSRepositoryConnector e
     public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses,
Specification spec,
       IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
       throws ManifoldCFException, ServiceInterruption {
-        
+
         for (String documentIdentifier : documentIdentifiers) {
-          
+
             String versionString;
             GridFS gfs;
             GridFSDBFile document;
-          
+
             getSession();
             String _id = documentIdentifier;
             gfs = new GridFS(session, bucket);
@@ -410,136 +410,154 @@ public class GridFSRepositoryConnector e
                         ? Integer.toString(metadata.hashCode())
                         : StringUtils.EMPTY;
             }
-            
+
             if (versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
{
                 long startTime = System.currentTimeMillis();
-                String errorCode = "OK";
+                String errorCode = null;
                 String errorDesc = null;
                 String version = versionString;
+                try {
 
-                if (Logging.connectors.isDebugEnabled()) {
-                    Logging.connectors.debug("GridFS: Processing document _id = " + _id);
-                }
-
-                DBObject metadata = document.getMetaData();
-                if (metadata == null) {
-                    Logging.connectors.warn("GridFS: Document " + _id + " has a null metadata
- skipping.");
-                    activities.noDocument(_id,version);
-                    continue;
-                }
+                    if (Logging.connectors.isDebugEnabled()) {
+                        Logging.connectors.debug("GridFS: Processing document _id = " + _id);
+                    }
 
-                String urlValue = document.getMetaData().get(this.url) == null
-                        ? StringUtils.EMPTY
-                        : document.getMetaData().get(this.url).toString();
-                if (!StringUtils.isEmpty(urlValue)) {
-                    boolean validURL;
-                    try {
-                        new java.net.URI(urlValue);
-                        validURL = true;
-                    } catch (java.net.URISyntaxException e) {
-                        validURL = false;
+                    DBObject metadata = document.getMetaData();
+                    if (metadata == null) {
+                        errorCode = "NULLMETADATA";
+                        errorDesc = "Excluded because document had a null Metadata";
+                        Logging.connectors.warn("GridFS: Document " + _id + " has a null
metadata - skipping.");
+                        activities.noDocument(_id, version);
+                        continue;
                     }
-                    if (validURL) {
-                        long fileLenght = document.getLength();
-                        Date createdDate = document.getUploadDate();
-                        String fileName = document.getFilename();
-                        String mimeType = document.getContentType();
-                      
-                        if (!activities.checkURLIndexable(urlValue))
-                        {
-                          Logging.connectors.warn("GridFS: Document " + _id + " has a URL
excluded by the output connector ('" + urlValue + "') - skipping.");
-                          activities.noDocument(_id, version);
-                          continue;
-                        }
-                        
-                        if (!activities.checkLengthIndexable(fileLenght))
-                        {
-                          Logging.connectors.warn("GridFS: Document " + _id + " has a length
excluded by the output connector (" + fileLenght + ") - skipping.");
-                          activities.noDocument(_id, version);
-                          continue;
-                        }
-                        
-                        if (!activities.checkMimeTypeIndexable(mimeType))
-                        {
-                          Logging.connectors.warn("GridFS: Document " + _id + " has a mime
type excluded by the output connector ('" + mimeType + "') - skipping.");
-                          activities.noDocument(_id, version);
-                          continue;
-                        }
-                        
-                        if (!activities.checkDateIndexable(createdDate))
-                        {
-                          Logging.connectors.warn("GridFS: Document " + _id + " has a date
excluded by the output connector (" + createdDate + ") - skipping.");
-                          activities.noDocument(_id, version);
-                          continue;
+
+                    String urlValue = document.getMetaData().get(this.url) == null
+                            ? StringUtils.EMPTY
+                            : document.getMetaData().get(this.url).toString();
+                    if (!StringUtils.isEmpty(urlValue)) {
+                        boolean validURL;
+                        try {
+                            new java.net.URI(urlValue);
+                            validURL = true;
+                        } catch (java.net.URISyntaxException e) {
+                            validURL = false;
                         }
-                        
-                        RepositoryDocument rd = new RepositoryDocument();
-                        rd.setCreatedDate(createdDate);
-                        rd.setModifiedDate(createdDate);
-                        rd.setFileName(fileName);
-                        rd.setMimeType(mimeType);
-                        String[] aclsArray = null;
-                        String[] denyAclsArray = null;
-                        if (acl != null) {
-                            try {
-                                Object aclObject = document.getMetaData().get(acl);
-                                if (aclObject != null) {
-                                    List<String> acls = (List<String>) aclObject;
-                                    aclsArray = (String[]) acls.toArray();
+                        if (validURL) {
+                            long fileLenght = document.getLength();
+                            Date createdDate = document.getUploadDate();
+                            String fileName = document.getFilename();
+                            String mimeType = document.getContentType();
+
+                            if (!activities.checkURLIndexable(urlValue)) {
+                                Logging.connectors.warn("GridFS: Document " + _id + " has
a URL excluded by the output connector ('" + urlValue + "') - skipping.");
+                                errorCode = activities.EXCLUDED_URL;
+                                errorDesc = "Excluded because of URL (" + urlValue + ")";
+                                activities.noDocument(_id, version);
+                                continue;
+                            }
+
+                            if (!activities.checkLengthIndexable(fileLenght)) {
+                                Logging.connectors.warn("GridFS: Document " + _id + " has
a length excluded by the output connector (" + fileLenght + ") - skipping.");
+                                errorCode = activities.EXCLUDED_LENGTH;
+                                errorDesc = "Excluded because of length (" + fileLenght +
")";
+                                activities.noDocument(_id, version);
+                                continue;
+                            }
+
+                            if (!activities.checkMimeTypeIndexable(mimeType)) {
+                                Logging.connectors.warn("GridFS: Document " + _id + " has
a mime type excluded by the output connector ('" + mimeType + "') - skipping.");
+                                errorCode = activities.EXCLUDED_MIMETYPE;
+                                errorDesc = "Excluded because of mime type (" + mimeType
+ ")";
+                                activities.noDocument(_id, version);
+                                continue;
+                            }
+
+                            if (!activities.checkDateIndexable(createdDate)) {
+                                Logging.connectors.warn("GridFS: Document " + _id + " has
a date excluded by the output connector (" + createdDate + ") - skipping.");
+                                errorCode = activities.EXCLUDED_DATE;
+                                errorDesc = "Excluded because of date (" + createdDate +
")";
+                                activities.noDocument(_id, version);
+                                continue;
+                            }
+
+                            RepositoryDocument rd = new RepositoryDocument();
+                            rd.setCreatedDate(createdDate);
+                            rd.setModifiedDate(createdDate);
+                            rd.setFileName(fileName);
+                            rd.setMimeType(mimeType);
+                            String[] aclsArray = null;
+                            String[] denyAclsArray = null;
+                            if (acl != null) {
+                                try {
+                                    Object aclObject = document.getMetaData().get(acl);
+                                    if (aclObject != null) {
+                                        List<String> acls = (List<String>) aclObject;
+                                        aclsArray = (String[]) acls.toArray();
+                                    }
+                                } catch (ClassCastException e) {
+                                    // This is bad because security will fail
+                                    Logging.connectors.warn("GridFS: Document " + _id + "
metadata ACL field doesn't contain List<String> type.");
+                                    errorCode = "ACLTYPE";
+                                    errorDesc = "Allow ACL field doesn't contain List<String>
type.";
+                                    throw new ManifoldCFException("Security decoding error:
" + e.getMessage(), e);
                                 }
-                            } catch (ClassCastException e) {
-                                // This is bad because security will fail
-                                Logging.connectors.warn("GridFS: Document " + _id + " metadata
ACL field doesn't contain List<String> type.");
-                                throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
                             }
-                        }
-                        if (denyAcl != null) {
-                            try {
-                                Object denyAclObject = document.getMetaData().get(denyAcl);
-                                if (denyAclObject != null) {
-                                    List<String> denyAcls = (List<String>) denyAclObject;
-                                    denyAcls.add(GLOBAL_DENY_TOKEN);
-                                    denyAclsArray = (String[]) denyAcls.toArray();
+                            if (denyAcl != null) {
+                                try {
+                                    Object denyAclObject = document.getMetaData().get(denyAcl);
+                                    if (denyAclObject != null) {
+                                        List<String> denyAcls = (List<String>)
denyAclObject;
+                                        denyAcls.add(GLOBAL_DENY_TOKEN);
+                                        denyAclsArray = (String[]) denyAcls.toArray();
+                                    }
+                                } catch (ClassCastException e) {
+                                    // This is bad because security will fail
+                                    Logging.connectors.warn("GridFS: Document " + _id + "
metadata DenyACL field doesn't contain List<String> type.");
+                                    errorCode = "ACLTYPE";
+                                    errorDesc = "Deny ACL field doesn't contain List<String>
type.";
+                                    throw new ManifoldCFException("Security decoding error:
" + e.getMessage(), e);
                                 }
-                            } catch (ClassCastException e) {
-                                // This is bad because security will fail
-                                Logging.connectors.warn("GridFS: Document " + _id + " metadata
DenyACL field doesn't contain List<String> type.");
-                                throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
                             }
-                        }
-                        rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+                            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT, aclsArray,
denyAclsArray);
 
-                        InputStream is = document.getInputStream();
-                        try {
-                            rd.setBinary(is, fileLenght);
-                            try {
-                                activities.ingestDocumentWithException(_id, version, urlValue,
rd);
-                            } catch (IOException e) {
-                                handleIOException(e);
-                            }
-                        } finally {
+                            InputStream is = document.getInputStream();
                             try {
-                                is.close();
-                            } catch (IOException e) {
-                                handleIOException(e);
+                                rd.setBinary(is, fileLenght);
+                                try {
+                                    activities.ingestDocumentWithException(_id, version,
urlValue, rd);
+                                } catch (IOException e) {
+                                    handleIOException(e);
+                                }
+                            } finally {
+                                try {
+                                    is.close();
+                                } catch (IOException e) {
+                                    handleIOException(e);
+                                }
                             }
+                            gfs.getDB().getMongo().getConnector().close();
+                            session = null;
+                            activities.recordActivity(startTime, ACTIVITY_FETCH,
+                                    fileLenght, _id, "OK", null, null);
+                        } else {
+                            Logging.connectors.warn("GridFS: Document " + _id + " has a invalid
URL: " + urlValue + " - skipping.");
+                            errorCode = activities.BAD_URL;
+                            errorDesc = "Excluded because document had illegal URL ('" +
urlValue + "')";
+                            activities.noDocument(_id, version);
                         }
-                        gfs.getDB().getMongo().getConnector().close();
-                        session = null;
-                        activities.recordActivity(startTime, ACTIVITY_FETCH,
-                                fileLenght, _id, errorCode, errorDesc, null);
                     } else {
-                        Logging.connectors.warn("GridFS: Document " + _id + " has a invalid
URL: " + urlValue + " - skipping.");
-                        activities.noDocument(_id,version);
+                        Logging.connectors.warn("GridFS: Document " + _id + " has a null
URL - skipping.");
+                        errorCode = activities.NULL_URL;
+                        errorDesc = "Excluded because document had a null URL.";
+                        activities.noDocument(_id, version);
+                    }
+                } finally {
+                    if (errorCode != null) {
+                        activities.recordActivity(startTime, ACTIVITY_FETCH, document.getLength(),
_id, errorCode, errorDesc, null);
                     }
-                } else {
-                    Logging.connectors.warn("GridFS: Document " + _id + " has a null URL
- skipping.");
-                    activities.noDocument(_id,version);
                 }
-              
             }
         }
-
     }
 
     protected static void handleIOException(IOException e) throws ManifoldCFException, ServiceInterruption
{

Propchange: manifoldcf/branches/dev_1x/framework/
------------------------------------------------------------------------------
  Merged /manifoldcf/trunk/framework:r1634373

Modified: manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputHistoryActivity.java Wed Oct 29 11:59:38 2014
@@ -40,6 +40,8 @@ public interface IOutputHistoryActivity
   public static final String JSON_ERROR = "JSONERROR";
   public static final String INDEX_NOT_FOUND = "INDEXNOTFOUND";
   public static final String XPATH_EXCEPTION = "XPATHEXCEPTION";
+  public static final String BAD_URL = "BADURL";
+  public static final String NULL_URL = "NULLURL";
   /** Record time-stamped information about the activity of the output connector.
   *@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970).  Every
   *       activity has an associated time; the startTime field records when the activity began.  A null value

Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java?rev=1635106&r1=1635105&r2=1635106&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IHistoryActivity.java Wed Oct 29 11:59:38 2014
@@ -32,7 +32,8 @@ public interface IHistoryActivity
   public static final String EXCLUDED_LENGTH = IOutputHistoryActivity.EXCLUDED_LENGTH;
   public static final String EXCLUDED_MIMETYPE = IOutputHistoryActivity.EXCLUDED_MIMETYPE;
   public static final String EXCLUDED_DATE = IOutputHistoryActivity.EXCLUDED_DATE;
-
+  public static final String BAD_URL = IOutputHistoryActivity.BAD_URL;
+  public static final String NULL_URL = IOutputHistoryActivity.NULL_URL;
   /** Record time-stamped information about the activity of the connector.
   *@param startTime is either null or the time since the start of epoch in milliseconds (Jan 1, 1970).  Every
   *       activity has an associated time; the startTime field records when the activity began.  A null value



Mime
View raw message