manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1630190 [1/2] - in /manifoldcf/branches/dev_1x: ./ connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/ connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldc...
Date Wed, 08 Oct 2014 18:06:02 GMT
Author: kwright
Date: Wed Oct  8 18:06:01 2014
New Revision: 1630190

URL: http://svn.apache.org/r1630190
Log:
Pull up fix for CONNECTORS-1067 from trunk.

Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/CHANGES.txt
    manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java
    manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java
    manifoldcf/branches/dev_1x/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java
    manifoldcf/branches/dev_1x/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
    manifoldcf/branches/dev_1x/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
    manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
    manifoldcf/branches/dev_1x/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
    manifoldcf/branches/dev_1x/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
    manifoldcf/branches/dev_1x/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
    manifoldcf/branches/dev_1x/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
    manifoldcf/branches/dev_1x/connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/LivelinkConnector.java
    manifoldcf/branches/dev_1x/connectors/sharepoint/   (props changed)
    manifoldcf/branches/dev_1x/connectors/sharepoint/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharepoint/SharePointRepository.java
    manifoldcf/branches/dev_1x/connectors/wiki/   (props changed)
    manifoldcf/branches/dev_1x/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/incrementalingest/IncrementalIngester.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IIncrementalIngester.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IOutputCheckActivity.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/interfaces/IPipelineConnector.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/output/BaseOutputConnector.java
    manifoldcf/branches/dev_1x/framework/agents/src/main/java/org/apache/manifoldcf/agents/transformation/BaseTransformationConnector.java
    manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/interfaces/IFingerprintActivity.java
    manifoldcf/branches/dev_1x/framework/pull-agent/src/main/java/org/apache/manifoldcf/crawler/system/WorkerThread.java

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-1067:r1630049-1630186
  Merged /manifoldcf/trunk:r1630188

Modified: manifoldcf/branches/dev_1x/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/CHANGES.txt?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/CHANGES.txt (original)
+++ manifoldcf/branches/dev_1x/CHANGES.txt Wed Oct  8 18:06:01 2014
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 1.8-dev =====================
 
+CONNECTORS-1067: Allow document filtering on modification date,
+and also hook this up in all repository connectors where it makes sense.
+(Karl Wright)
+
 CONNECTORS-1057: Implement full internationalization for alfresco-webscript
 connector.
 (Karl Wright)

Modified: manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnector.java Wed Oct  8 18:06:01 2014
@@ -271,11 +271,16 @@ public class AlfrescoConnector extends B
           continue;
         }
         
-        if (mimeType != null && !activities.checkMimeTypeIndexable(mimeType)) {
+        if (!activities.checkMimeTypeIndexable(mimeType)) {
           activities.noDocument(doc, documentVersion);
           continue;
         }
 
+        if (!activities.checkDateIndexable(modifiedDate)) {
+          activities.noDocument(doc, documentVersion);
+          continue;
+        }
+        
         RepositoryDocument rd = new RepositoryDocument();
         rd.addField(FIELD_NODEREF, nodeRef);
         rd.addField(FIELD_TYPE, type);

Modified: manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java (original)
+++ manifoldcf/branches/dev_1x/connectors/alfresco-webscript/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/alfrescowebscript/AlfrescoConnectorTest.java Wed Oct  8 18:06:01 2014
@@ -20,6 +20,7 @@ import static org.mockito.Matchers.any;
 import static org.mockito.Matchers.anyInt;
 import static org.mockito.Matchers.anyLong;
 import static org.mockito.Matchers.anyString;
+import static org.mockito.Matchers.anyObject;
 import static org.mockito.Matchers.eq;
 import static org.mockito.Mockito.mock;
 import static org.mockito.Mockito.never;
@@ -32,6 +33,7 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.Map;
+import java.util.Date;
 
 import org.alfresco.consulting.indexer.client.AlfrescoClient;
 import org.alfresco.consulting.indexer.client.AlfrescoFilters;
@@ -127,6 +129,8 @@ public class AlfrescoConnectorTest {
       .thenReturn(true);
     when(activities.checkMimeTypeIndexable(anyString()))
       .thenReturn(true);
+    when(activities.checkDateIndexable((Date)anyObject()))
+      .thenReturn(true);
     IExistingVersions statuses = mock(IExistingVersions.class);
     
     when(client.fetchNode(anyString()))
@@ -148,6 +152,8 @@ public class AlfrescoConnectorTest {
     verify(activities)
             .checkMimeTypeIndexable(eq("text/plain"));
     verify(activities)
+            .checkDateIndexable(eq(org.apache.manifoldcf.core.common.DateParser.parseISO8601Date((String)testDocument.get("cm:modified"))));
+    verify(activities)
             .ingestDocumentWithException(eq(TestDocument.uuid), anyString(),
                     eq(TestDocument.nodeRef), rd.capture());
     

Modified: manifoldcf/branches/dev_1x/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/CmisRepositoryConnector.java Wed Oct  8 18:06:01 2014
@@ -1167,171 +1167,198 @@ public class CmisRepositoryConnector ext
             activities.addDocumentReference(child.getId(), documentIdentifier,
                 RELATIONSHIP_CHILD);
           }
-      } else if(baseTypeId.equals(CMIS_DOCUMENT_BASE_TYPE)){
-        // content ingestion
+        } else if(baseTypeId.equals(CMIS_DOCUMENT_BASE_TYPE)) {
+          // content ingestion
 
-        Document document = (Document) cmisObject;
-        long fileLength;
-        InputStream is;
-        try {
-          fileLength = document.getContentStreamLength();
-          if (fileLength > 0)
-            is = document.getContentStream().getStream();
-          else
-            is = null;
-        } catch (CmisObjectNotFoundException e) {
-          // Document gone
-          activities.deleteDocument(documentIdentifier);
-          continue;
-        }
+          Document document = (Document) cmisObject;
           
-        try {
-          RepositoryDocument rd = new RepositoryDocument();
           Date createdDate = document.getCreationDate().getTime();
           Date modifiedDate = document.getLastModificationDate().getTime();
-            
-          rd.setFileName(document.getContentStreamFileName());
-          rd.setMimeType(document.getContentStreamMimeType());
+          long fileLength = document.getContentStreamLength();
+          String fileName = document.getContentStreamFileName();
+          String mimeType = document.getContentStreamMimeType();
+          //documentURI
+          String documentURI = CmisRepositoryConnectorUtils.getDocumentURL(document, session);
+          
+          // Do any filtering (which will save us work)
+          if (!activities.checkURLIndexable(documentURI))
+          {
+            activities.noDocument(documentIdentifier,versionString);
+            continue;
+          }
+          
+          if (!activities.checkMimeTypeIndexable(mimeType))
+          {
+            activities.noDocument(documentIdentifier,versionString);
+            continue;
+          }
+
+          if (!activities.checkLengthIndexable(fileLength))
+          {
+            activities.noDocument(documentIdentifier,versionString);
+            continue;
+          }
+          
+          if (!activities.checkDateIndexable(modifiedDate))
+          {
+            activities.noDocument(documentIdentifier,versionString);
+            continue;
+          }
+          
+          RepositoryDocument rd = new RepositoryDocument();
+          rd.setFileName(fileName);
+          rd.setMimeType(mimeType);
           rd.setCreatedDate(createdDate);
           rd.setModifiedDate(modifiedDate);
-            
-          //binary
-          if(is != null) {
-            rd.setBinary(is, fileLength);
-          } else {
-            rd.setBinary(new NullInputStream(0),0);
+              
+          InputStream is;
+          try {
+            if (fileLength > 0)
+              is = document.getContentStream().getStream();
+            else
+              is = null;
+          } catch (CmisObjectNotFoundException e) {
+            // Document gone
+            activities.deleteDocument(documentIdentifier);
+            continue;
           }
+            
+          try {
+            //binary
+            if(is != null) {
+              rd.setBinary(is, fileLength);
+            } else {
+              rd.setBinary(new NullInputStream(0),0);
+            }
 
-          //properties
-          List<Property<?>> properties = document.getProperties();
-          String id = StringUtils.EMPTY;
-          for (Property<?> property : properties) {
-            String propertyId = property.getId();
-              
-            if(CmisRepositoryConnectorUtils.existsInSelectClause(cmisQuery, propertyId)){
+            //properties
+            List<Property<?>> properties = document.getProperties();
+            String id = StringUtils.EMPTY;
+            for (Property<?> property : properties) {
+              String propertyId = property.getId();
                 
-              if (propertyId.endsWith(Constants.PARAM_OBJECT_ID)) {
-                id = (String) property.getValue();
-    
-                if (property.getValue() !=null 
-                    || property.getValues() != null) {
-                  PropertyType propertyType = property.getType();
-      
-                  switch (propertyType) {
-      
-                  case STRING:
-                  case ID:
-                  case URI:
-                  case HTML:
-                    if(property.isMultiValued()){
-                      List<String> htmlPropertyValues = (List<String>) property.getValues();
-                      for (String htmlPropertyValue : htmlPropertyValues) {
-                        rd.addField(propertyId, htmlPropertyValue);
-                      }
-                    } else {
-                      String stringValue = (String) property.getValue();
-                      if(StringUtils.isNotEmpty(stringValue)){
-                        rd.addField(propertyId, stringValue);
-                      }
-                    }
-                    break;
-           
-                  case BOOLEAN:
-                    if(property.isMultiValued()){
-                      List<Boolean> booleanPropertyValues = (List<Boolean>) property.getValues();
-                      for (Boolean booleanPropertyValue : booleanPropertyValues) {
-                        rd.addField(propertyId, booleanPropertyValue.toString());
-                      }
-                    } else {
-                      Boolean booleanValue = (Boolean) property.getValue();
-                      if(booleanValue!=null){
-                        rd.addField(propertyId, booleanValue.toString());
-                      }
-                    }
-                    break;
-      
-                  case INTEGER:
-                    if(property.isMultiValued()){
-                      List<BigInteger> integerPropertyValues = (List<BigInteger>) property.getValues();
-                      for (BigInteger integerPropertyValue : integerPropertyValues) {
-                        rd.addField(propertyId, integerPropertyValue.toString());
-                      }
-                    } else {
-                      BigInteger integerValue = (BigInteger) property.getValue();
-                      if(integerValue!=null){
-                        rd.addField(propertyId, integerValue.toString());
-                      }
-                    }
-                    break;
-      
-                  case DECIMAL:
-                    if(property.isMultiValued()){
-                      List<BigDecimal> decimalPropertyValues = (List<BigDecimal>) property.getValues();
-                      for (BigDecimal decimalPropertyValue : decimalPropertyValues) {
-                        rd.addField(propertyId, decimalPropertyValue.toString());
-                      }
-                    } else {
-                      BigDecimal decimalValue = (BigDecimal) property.getValue();
-                      if(decimalValue!=null){
-                        rd.addField(propertyId, decimalValue.toString());
-                      }
-                    }
-                    break;
+              if(CmisRepositoryConnectorUtils.existsInSelectClause(cmisQuery, propertyId)){
+                  
+                if (propertyId.endsWith(Constants.PARAM_OBJECT_ID)) {
+                  id = (String) property.getValue();
       
-                  case DATETIME:
-                    if(property.isMultiValued()){
-                      List<GregorianCalendar> datePropertyValues = (List<GregorianCalendar>) property.getValues();
-                      for (GregorianCalendar datePropertyValue : datePropertyValues) {
-                        rd.addField(propertyId,
-                            ISO8601_DATE_FORMATTER.format(datePropertyValue.getTime()));
-                      }
-                    } else {
-                      GregorianCalendar dateValue = (GregorianCalendar) property.getValue();
-                      if(dateValue!=null){
-                        rd.addField(propertyId, ISO8601_DATE_FORMATTER.format(dateValue.getTime()));
-                      }
+                  if (property.getValue() !=null 
+                      || property.getValues() != null) {
+                    PropertyType propertyType = property.getType();
+        
+                    switch (propertyType) {
+        
+                    case STRING:
+                    case ID:
+                    case URI:
+                    case HTML:
+                      if(property.isMultiValued()){
+                        List<String> htmlPropertyValues = (List<String>) property.getValues();
+                        for (String htmlPropertyValue : htmlPropertyValues) {
+                          rd.addField(propertyId, htmlPropertyValue);
+                        }
+                      } else {
+                        String stringValue = (String) property.getValue();
+                        if(StringUtils.isNotEmpty(stringValue)){
+                          rd.addField(propertyId, stringValue);
+                        }
+                      }
+                      break;
+             
+                    case BOOLEAN:
+                      if(property.isMultiValued()){
+                        List<Boolean> booleanPropertyValues = (List<Boolean>) property.getValues();
+                        for (Boolean booleanPropertyValue : booleanPropertyValues) {
+                          rd.addField(propertyId, booleanPropertyValue.toString());
+                        }
+                      } else {
+                        Boolean booleanValue = (Boolean) property.getValue();
+                        if(booleanValue!=null){
+                          rd.addField(propertyId, booleanValue.toString());
+                        }
+                      }
+                      break;
+        
+                    case INTEGER:
+                      if(property.isMultiValued()){
+                        List<BigInteger> integerPropertyValues = (List<BigInteger>) property.getValues();
+                        for (BigInteger integerPropertyValue : integerPropertyValues) {
+                          rd.addField(propertyId, integerPropertyValue.toString());
+                        }
+                      } else {
+                        BigInteger integerValue = (BigInteger) property.getValue();
+                        if(integerValue!=null){
+                          rd.addField(propertyId, integerValue.toString());
+                        }
+                      }
+                      break;
+        
+                    case DECIMAL:
+                      if(property.isMultiValued()){
+                        List<BigDecimal> decimalPropertyValues = (List<BigDecimal>) property.getValues();
+                        for (BigDecimal decimalPropertyValue : decimalPropertyValues) {
+                          rd.addField(propertyId, decimalPropertyValue.toString());
+                        }
+                      } else {
+                        BigDecimal decimalValue = (BigDecimal) property.getValue();
+                        if(decimalValue!=null){
+                          rd.addField(propertyId, decimalValue.toString());
+                        }
+                      }
+                      break;
+        
+                    case DATETIME:
+                      if(property.isMultiValued()){
+                        List<GregorianCalendar> datePropertyValues = (List<GregorianCalendar>) property.getValues();
+                        for (GregorianCalendar datePropertyValue : datePropertyValues) {
+                          rd.addField(propertyId,
+                              ISO8601_DATE_FORMATTER.format(datePropertyValue.getTime()));
+                        }
+                      } else {
+                        GregorianCalendar dateValue = (GregorianCalendar) property.getValue();
+                        if(dateValue!=null){
+                          rd.addField(propertyId, ISO8601_DATE_FORMATTER.format(dateValue.getTime()));
+                        }
+                      }
+                      break;
+        
+                    default:
+                      break;
                     }
-                    break;
-      
-                  default:
-                    break;
                   }
+                    
                 }
-                  
+                
               }
-              
             }
-          }
-          
-          //ingestion
             
-          //documentURI
-          String documentURI = CmisRepositoryConnectorUtils.getDocumentURL(document, session);
-            
-          try {
-            activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
-          } catch (IOException e) {
-            errorCode = "IO ERROR";
-            errorDesc = e.getMessage();
-            handleIOException(e, "reading file input stream");
-          }
-        } finally {
-          try {
-            if(is!=null){
-              is.close();
+            //ingestion
+              
+              
+            try {
+              activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
+            } catch (IOException e) {
+              errorCode = "IO ERROR";
+              errorDesc = e.getMessage();
+              handleIOException(e, "reading file input stream");
             }
-          } catch (IOException e) {
-            errorCode = "IO ERROR";
-            errorDesc = e.getMessage();
-            handleIOException(e, "closing file input stream");
           } finally {
-            activities.recordActivity(new Long(startTime), ACTIVITY_READ,
-              fileLength, documentIdentifier, errorCode, errorDesc, null);
+            try {
+              if(is!=null){
+                is.close();
+              }
+            } catch (IOException e) {
+              errorCode = "IO ERROR";
+              errorDesc = e.getMessage();
+              handleIOException(e, "closing file input stream");
+            } finally {
+              activities.recordActivity(new Long(startTime), ACTIVITY_READ,
+                fileLength, documentIdentifier, errorCode, errorDesc, null);
+            }
           }
         }
-      }
-      else
-        activities.deleteDocument(documentIdentifier);
+        else
+          activities.noDocument(documentIdentifier,versionString);
       }
     }
     

Modified: manifoldcf/branches/dev_1x/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java Wed Oct  8 18:06:01 2014
@@ -330,6 +330,45 @@ public class FileConnector extends org.a
             // We still need to check based on file data.
             if (checkIngest(file,spec))
             {
+              String fileName = file.getName();
+              Date modifiedDate = new Date(file.lastModified());
+              String mimeType = mapExtensionToMimeType(fileName);
+              String uri;
+              if (convertPath != null) {
+                // WGET-compatible input; convert back to external URI
+                uri = convertToWGETURI(convertPath);
+              } else {
+                uri = convertToURI(documentIdentifier);
+              }
+
+              if (!activities.checkLengthIndexable(fileLength))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because length was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkURLIndexable(uri))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because URL was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkDateIndexable(modifiedDate))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because date ("+modifiedDate+") was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
+              if (!activities.checkMimeTypeIndexable(mimeType))
+              {
+                Logging.connectors.debug("Skipping file '"+documentIdentifier+"' because mime type ('"+mimeType+"') was excluded by output connector.");
+                activities.noDocument(documentIdentifier,versionString);
+                continue;
+              }
+              
               long startTime = System.currentTimeMillis();
               String errorCode = "OK";
               String errorDesc = null;
@@ -345,17 +384,13 @@ public class FileConnector extends org.a
                   {
                     RepositoryDocument data = new RepositoryDocument();
                     data.setBinary(is,fileLength);
-                    String fileName = file.getName();
                     data.setFileName(fileName);
-                    data.setMimeType(mapExtensionToMimeType(fileName));
-                    data.setModifiedDate(new Date(file.lastModified()));
-                    String uri;
+                    data.setMimeType(mimeType);
+                    data.setModifiedDate(modifiedDate);
                     if (convertPath != null) {
                       // WGET-compatible input; convert back to external URI
-                      uri = convertToWGETURI(convertPath);
                       data.addField("uri",uri);
                     } else {
-                      uri = convertToURI(documentIdentifier);
                       data.addField("uri",file.toString());
                     }
                     // MHL for other metadata

Modified: manifoldcf/branches/dev_1x/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/googledrive/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/googledrive/GoogleDriveRepositoryConnector.java Wed Oct  8 18:06:01 2014
@@ -1114,18 +1114,51 @@ public class GoogleDriveRepositoryConnec
               Logging.connectors.debug("GOOGLEDRIVE: its a file");
             }
 
-            // We always direct to the PDF except for Spreadsheets
-            String documentURI = null;
-            if (!googleFile.getMimeType().equals("application/vnd.google-apps.spreadsheet")) {
-              documentURI = getUrl(googleFile, "application/pdf");
-            } else {
-              documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
-            }
-
             // Get the file length
-            Long fileLength = Objects.firstNonNull(googleFile.getFileSize(), 0L);
-            if (fileLength != null) {
+            Long fileLengthLong = Objects.firstNonNull(googleFile.getFileSize(), 0L);
+            if (fileLengthLong != null) {
 
+              // Now do standard stuff
+              long fileLength = fileLengthLong.longValue();
+              String mimeType = googleFile.getMimeType();
+              DateTime createdDateObject = googleFile.getCreatedDate();
+              DateTime modifiedDateObject = googleFile.getModifiedDate();
+              String extension = googleFile.getFileExtension();
+              String title = googleFile.getTitle();
+              Date createdDate = (createdDateObject==null)?null:new Date(createdDateObject.getValue());
+              Date modifiedDate = (modifiedDateObject==null)?null:new Date(modifiedDateObject.getValue());
+              // We always direct to the PDF except for Spreadsheets
+              String documentURI = null;
+              if (!mimeType.equals("application/vnd.google-apps.spreadsheet")) {
+                documentURI = getUrl(googleFile, "application/pdf");
+              } else {
+                documentURI = getUrl(googleFile, "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+              }
+
+              if (!activities.checkLengthIndexable(fileLength))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkURLIndexable(documentURI))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkMimeTypeIndexable(mimeType))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
+              if (!activities.checkDateIndexable(modifiedDate))
+              {
+                activities.noDocument(nodeId,version);
+                continue;
+              }
+              
               RepositoryDocument rd = new RepositoryDocument();
 
               if (acls != null) {
@@ -1136,19 +1169,12 @@ public class GoogleDriveRepositoryConnec
                 }
               }
               
-              // Now do standard stuff
-              String mimeType = googleFile.getMimeType();
-              DateTime createdDate = googleFile.getCreatedDate();
-              DateTime modifiedDate = googleFile.getModifiedDate();
-              String extension = googleFile.getFileExtension();
-              String title = googleFile.getTitle();
-              
               if (mimeType != null)
                 rd.setMimeType(mimeType);
               if (createdDate != null)
-                rd.setCreatedDate(new Date(createdDate.getValue()));
+                rd.setCreatedDate(createdDate);
               if (modifiedDate != null)
-                rd.setModifiedDate(new Date(modifiedDate.getValue()));
+                rd.setModifiedDate(modifiedDate);
               if (extension != null)
               {
                 if (title == null)

Modified: manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/gridfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/gridfs/GridFSRepositoryConnector.java Wed Oct  8 18:06:01 2014
@@ -417,8 +417,6 @@ public class GridFSRepositoryConnector e
                 String errorDesc = null;
                 String version = versionString;
 
-                RepositoryDocument rd = new RepositoryDocument();
-
                 if (Logging.connectors.isDebugEnabled()) {
                     Logging.connectors.debug("GridFS: Processing document _id = " + _id);
                 }
@@ -443,44 +441,77 @@ public class GridFSRepositoryConnector e
                     }
                     if (validURL) {
                         long fileLenght = document.getLength();
-                        InputStream is = document.getInputStream();
-                        try {
-                            Date indexingDate = new Date();
-                            rd.setBinary(is, fileLenght);
-                            rd.setCreatedDate(document.getUploadDate());
-                            rd.setFileName(document.getFilename());
-                            rd.setIndexingDate(indexingDate);
-                            rd.setMimeType(document.getContentType());
-                            String[] aclsArray = null;
-                            String[] denyAclsArray = null;
-                            if (acl != null) {
-                                try {
-                                    Object aclObject = document.getMetaData().get(acl);
-                                    if (aclObject != null) {
-                                        List<String> acls = (List<String>) aclObject;
-                                        aclsArray = (String[]) acls.toArray();
-                                    }
-                                } catch (ClassCastException e) {
-                                    // This is bad because security will fail
-                                    Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
-                                    throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
+                        Date createdDate = document.getUploadDate();
+                        String fileName = document.getFilename();
+                        String mimeType = document.getContentType();
+                      
+                        if (!activities.checkURLIndexable(urlValue))
+                        {
+                          Logging.connectors.warn("GridFS: Document " + _id + " has a URL excluded by the output connector ('" + urlValue + "') - skipping.");
+                          activities.noDocument(_id, version);
+                          continue;
+                        }
+                        
+                        if (!activities.checkLengthIndexable(fileLenght))
+                        {
+                          Logging.connectors.warn("GridFS: Document " + _id + " has a length excluded by the output connector (" + fileLenght + ") - skipping.");
+                          activities.noDocument(_id, version);
+                          continue;
+                        }
+                        
+                        if (!activities.checkMimeTypeIndexable(mimeType))
+                        {
+                          Logging.connectors.warn("GridFS: Document " + _id + " has a mime type excluded by the output connector ('" + mimeType + "') - skipping.");
+                          activities.noDocument(_id, version);
+                          continue;
+                        }
+                        
+                        if (!activities.checkDateIndexable(createdDate))
+                        {
+                          Logging.connectors.warn("GridFS: Document " + _id + " has a date excluded by the output connector (" + createdDate + ") - skipping.");
+                          activities.noDocument(_id, version);
+                          continue;
+                        }
+                        
+                        RepositoryDocument rd = new RepositoryDocument();
+                        rd.setCreatedDate(createdDate);
+                        rd.setModifiedDate(createdDate);
+                        rd.setFileName(fileName);
+                        rd.setMimeType(mimeType);
+                        String[] aclsArray = null;
+                        String[] denyAclsArray = null;
+                        if (acl != null) {
+                            try {
+                                Object aclObject = document.getMetaData().get(acl);
+                                if (aclObject != null) {
+                                    List<String> acls = (List<String>) aclObject;
+                                    aclsArray = (String[]) acls.toArray();
                                 }
+                            } catch (ClassCastException e) {
+                                // This is bad because security will fail
+                                Logging.connectors.warn("GridFS: Document " + _id + " metadata ACL field doesn't contain List<String> type.");
+                                throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
                             }
-                            if (denyAcl != null) {
-                                try {
-                                    Object denyAclObject = document.getMetaData().get(denyAcl);
-                                    if (denyAclObject != null) {
-                                        List<String> denyAcls = (List<String>) denyAclObject;
-                                        denyAcls.add(GLOBAL_DENY_TOKEN);
-                                        denyAclsArray = (String[]) denyAcls.toArray();
-                                    }
-                                } catch (ClassCastException e) {
-                                    // This is bad because security will fail
-                                    Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
-                                    throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
+                        }
+                        if (denyAcl != null) {
+                            try {
+                                Object denyAclObject = document.getMetaData().get(denyAcl);
+                                if (denyAclObject != null) {
+                                    List<String> denyAcls = (List<String>) denyAclObject;
+                                    denyAcls.add(GLOBAL_DENY_TOKEN);
+                                    denyAclsArray = (String[]) denyAcls.toArray();
                                 }
+                            } catch (ClassCastException e) {
+                                // This is bad because security will fail
+                                Logging.connectors.warn("GridFS: Document " + _id + " metadata DenyACL field doesn't contain List<String> type.");
+                                throw new ManifoldCFException("Security decoding error: "+e.getMessage(),e);
                             }
-                            rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+                        }
+                        rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsArray,denyAclsArray);
+
+                        InputStream is = document.getInputStream();
+                        try {
+                            rd.setBinary(is, fileLenght);
                             try {
                                 activities.ingestDocumentWithException(_id, version, urlValue, rd);
                             } catch (IOException e) {

Modified: manifoldcf/branches/dev_1x/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/hdfs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/hdfs/HDFSRepositoryConnector.java Wed Oct  8 18:06:01 2014
@@ -386,27 +386,49 @@ public class HDFSRepositoryConnector ext
               continue;
             }
 
+            // It is a file to be indexed.
             long fileLength = fileStatus.getLen();
-            if (!activities.checkLengthIndexable(fileLength)) {
+            String fileName = fileStatus.getPath().getName();
+            String mimeType = mapExtensionToMimeType(fileStatus.getPath().getName());
+            Date modifiedDate = new Date(fileStatus.getModificationTime());
+            String uri;
+            if (convertPath != null) {
+              uri = convertToWGETURI(convertPath);
+            } else {
+              uri = fileStatus.getPath().toUri().toString();
+            }
+            
+            if (!activities.checkLengthIndexable(fileLength))
+            {
+              activities.noDocument(documentIdentifier,versionString);
+              continue;
+            }
+            
+            if (!activities.checkURLIndexable(uri))
+            {
+              activities.noDocument(documentIdentifier,versionString);
+              continue;
+            }
+            
+            if (!activities.checkMimeTypeIndexable(mimeType))
+            {
+              activities.noDocument(documentIdentifier,versionString);
+              continue;
+            }
+            
+            if (!activities.checkDateIndexable(modifiedDate))
+            {
               activities.noDocument(documentIdentifier,versionString);
               continue;
             }
-
-            // It is a file to be indexed.
             
             // Prepare the metadata part of RepositoryDocument
             RepositoryDocument data = new RepositoryDocument();
 
-            data.setFileName(fileStatus.getPath().getName());
-            data.setMimeType(mapExtensionToMimeType(fileStatus.getPath().getName()));
-            data.setModifiedDate(new Date(fileStatus.getModificationTime()));
+            data.setFileName(fileName);
+            data.setMimeType(mimeType);
+            data.setModifiedDate(modifiedDate);
 
-            String uri;
-            if (convertPath != null) {
-              uri = convertToWGETURI(convertPath);
-            } else {
-              uri = fileStatus.getPath().toUri().toString();
-            }
             data.addField("uri",uri);
 
             // We will record document fetch as an activity

Modified: manifoldcf/branches/dev_1x/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/jcifs/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/sharedrive/SharedDriveConnector.java Wed Oct  8 18:06:01 2014
@@ -782,131 +782,189 @@ public class SharedDriveConnector extend
               if (fileName != null && !file.isHidden())
               {
                 String uri = ingestionURI;
+                String fileNameString = file.getName();
+                Date lastModifiedDate = new Date(file.lastModified());
+                Date creationDate = new Date(file.createTime());
+                String contentType = mapExtensionToMimeType(fileNameString);
 
-                if (activities.checkURLIndexable(uri))
+                if (!activities.checkURLIndexable(uri))
                 {
-                  // Initialize repository document with common stuff, and find the URI
-                  RepositoryDocument rd = new RepositoryDocument();
-                  prepareForIndexing(rd,file,
-                    shareAllow,shareDeny,
-                    parentAllow,parentDeny,
-                    documentAllow,documentDeny,
-                    pathAttributeName,pathAttributeValue);
-
-                  // manipulate path to include the DFS alias, not the literal path
-                  // String newPath = matchPrefix + fileName.substring(matchReplace.length());
-                  String newPath = fileName;
-                  if (checkNeedFileData(newPath, spec))
-                  {
-                    if (Logging.connectors.isDebugEnabled())
-                      Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
+                  Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept URL ('"+uri+"')");
+                  activities.recordActivity(null,ACTIVITY_ACCESS,
+                    null,documentIdentifier,"Skip","Output connector refused URL",null);
+                  activities.noDocument(documentIdentifier,versionString);
+                  continue;
+                }
+
+                if (!activities.checkMimeTypeIndexable(contentType))
+                {
+                  Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept content type ('"+contentType+"')");
+                  activities.recordActivity(null,ACTIVITY_ACCESS,
+                    null,documentIdentifier,"Skip","Output connector refused mime type",null);
+                  activities.noDocument(documentIdentifier,versionString);
+                  continue;
+                }
 
-                    // Create a temporary file, and use that for the check and then the ingest
-                    File tempFile = File.createTempFile("_sdc_",null);
+                if (!activities.checkDateIndexable(lastModifiedDate))
+                {
+                  Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept date ("+lastModifiedDate+")");
+                  activities.recordActivity(null,ACTIVITY_ACCESS,
+                    null,documentIdentifier,"Skip","Output connector refused date",null);
+                  activities.noDocument(documentIdentifier,versionString);
+                  continue;
+                }
+
+                // Initialize repository document with common stuff, and find the URI
+                RepositoryDocument rd = new RepositoryDocument();
+                
+                //If using the lastAccess patched/Google version of jcifs then this can be uncommented
+                //Date lastAccessDate = new Date(file.lastAccess());
+                Integer attributes = file.getAttributes();
+                String shareName = file.getShare();
+
+                rd.setFileName(fileNameString);
+                if (contentType != null)
+                  rd.setMimeType(contentType);
+                rd.addField("lastModified", lastModifiedDate.toString());
+                rd.setModifiedDate(lastModifiedDate);
+                
+                // Add extra obtainable fields to the field map
+                rd.addField("createdOn", creationDate.toString());
+                rd.setCreatedDate(creationDate);
+
+                //rd.addField("lastAccess", lastModifiedDate.toString());
+                rd.addField("attributes", Integer.toString(attributes));
+                rd.addField("shareName", shareName);
+
+                setDocumentSecurity(rd,shareAllow,shareDeny,parentAllow,parentDeny,documentAllow,documentDeny);
+                setPathMetadata(rd,pathAttributeName,pathAttributeValue);
+
+                // manipulate path to include the DFS alias, not the literal path
+                // String newPath = matchPrefix + fileName.substring(matchReplace.length());
+                String newPath = fileName;
+                if (checkNeedFileData(newPath, spec))
+                {
+                  if (Logging.connectors.isDebugEnabled())
+                    Logging.connectors.debug("JCIFS: Local file data needed for '"+documentIdentifier+"'");
+
+                  // Create a temporary file, and use that for the check and then the ingest
+                  File tempFile = File.createTempFile("_sdc_",null);
+                  try
+                  {
+                    FileOutputStream os = new FileOutputStream(tempFile);
                     try
                     {
-                      FileOutputStream os = new FileOutputStream(tempFile);
+
+                      // Now, make a local copy so we can fingerprint
+                      InputStream inputStream = getFileInputStream(file);
                       try
                       {
-
-                        // Now, make a local copy so we can fingerprint
-                        InputStream inputStream = getFileInputStream(file);
-                        try
+                        // Copy!
+                        if (transferBuffer == null)
+                          transferBuffer = new byte[65536];
+                        while (true)
                         {
-                          // Copy!
-                          if (transferBuffer == null)
-                            transferBuffer = new byte[65536];
-                          while (true)
-                          {
-                            int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
-                            if (amt == -1)
-                              break;
-                            os.write(transferBuffer,0,amt);
-                          }
-                        }
-                        finally
-                        {
-                          inputStream.close();
+                          int amt = inputStream.read(transferBuffer,0,transferBuffer.length);
+                          if (amt == -1)
+                            break;
+                          os.write(transferBuffer,0,amt);
                         }
                       }
                       finally
                       {
-                        os.close();
+                        inputStream.close();
                       }
+                    }
+                    finally
+                    {
+                      os.close();
+                    }
 
-                      if (checkIngest(tempFile, newPath, spec, activities))
+                    if (checkIngest(tempFile, newPath, spec, activities))
+                    {
+                      long fileLength = tempFile.length();
+                      if (!activities.checkLengthIndexable(fileLength))
                       {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
-                        // OK, do ingestion itself!
-                        InputStream inputStream = new FileInputStream(tempFile);
-                        try
-                        {
-                          rd.setBinary(inputStream, tempFile.length());
-                          
-                          activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
-                        }
-                        finally
-                        {
-                          inputStream.close();
-                        }
-
-                        // I put this record here deliberately for two reasons:
-                        // (1) the other path includes ingestion time, and
-                        // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
-                        // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
-                        activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                          new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+                        Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
+                        activities.recordActivity(null,ACTIVITY_ACCESS,
+                          null,documentIdentifier,"Skip","Output connector refused length",null);
+                        activities.noDocument(documentIdentifier,versionString);
+                        continue;
+                      }
 
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("JCIFS: Decided to ingest '"+documentIdentifier+"'");
+                      // OK, do ingestion itself!
+                      InputStream inputStream = new FileInputStream(tempFile);
+                      try
+                      {
+                        rd.setBinary(inputStream, fileLength);
+                          
+                        activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                       }
-                      else
+                      finally
                       {
-                        // We must actively remove the document here, because the getDocumentVersions()
-                        // method has no way of signalling this, since it does not do the fingerprinting.
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
-                        activities.noDocument(documentIdentifier, versionString);
-                        // We should record the access here as well, since this is a non-exception way through the code path.
-                        // (I noticed that this was not being recorded in the history while fixing 25477.)
-                        activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                          new Long(tempFile.length()),documentIdentifier,"Success",null,null);
+                        inputStream.close();
                       }
-                    }
-                    finally
-                    {
-                      tempFile.delete();
-                    }
-                  }
-                  else
-                  {
-                    if (Logging.connectors.isDebugEnabled())
-                      Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
 
-                    // Presume that since the file was queued that it fulfilled the needed criteria.
-                    // Go off and ingest the fast way.
+                      // I put this record here deliberately for two reasons:
+                      // (1) the other path includes ingestion time, and
+                      // (2) if anything fails up to and during ingestion, I want THAT failure record to be written, not this one.
+                      // So, really, ACTIVITY_ACCESS is a bit more than just fetch for JCIFS...
+                      activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                        new Long(tempFile.length()),documentIdentifier,"Success",null,null);
 
-                    // Ingest the document.
-                    InputStream inputStream = getFileInputStream(file);
-                    try
-                    {
-                      rd.setBinary(inputStream, fileLength(file));
-                      
-                      activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
                     }
-                    finally
+                    else
                     {
-                      inputStream.close();
+                      // We must actively remove the document here, because the getDocumentVersions()
+                      // method has no way of signalling this, since it does not do the fingerprinting.
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("JCIFS: Decided to remove '"+documentIdentifier+"'");
+                      activities.noDocument(documentIdentifier, versionString);
+                      // We should record the access here as well, since this is a non-exception way through the code path.
+                      // (I noticed that this was not being recorded in the history while fixing 25477.)
+                      activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                        new Long(tempFile.length()),documentIdentifier,"Success",null,null);
                     }
-                    activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
-                      new Long(fileLength(file)),documentIdentifier,"Success",null,null);
+                  }
+                  finally
+                  {
+                    tempFile.delete();
                   }
                 }
                 else
                 {
-                  Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept it");
-                  activities.recordActivity(null,ACTIVITY_ACCESS,
-                    null,documentIdentifier,"Skip","Output connector refused",null);
-                  activities.noDocument(documentIdentifier,versionString);
+                  if (Logging.connectors.isDebugEnabled())
+                    Logging.connectors.debug("JCIFS: Local file data not needed for '"+documentIdentifier+"'");
+
+                  long fileLength = fileLength(file);
+                  if (!activities.checkLengthIndexable(fileLength))
+                  {
+                    Logging.connectors.debug("JCIFS: Skipping file because output connector cannot accept length ("+fileLength+")");
+                    activities.recordActivity(null,ACTIVITY_ACCESS,
+                      null,documentIdentifier,"Skip","Output connector refused length",null);
+                    activities.noDocument(documentIdentifier,versionString);
+                    continue;
+                  }
+
+                  // Presume that since the file was queued that it fulfilled the needed criteria.
+                  // Go off and ingest the fast way.
+                  
+                  // Ingest the document.
+                  InputStream inputStream = getFileInputStream(file);
+                  try
+                  {
+                    rd.setBinary(inputStream, fileLength);
+                      
+                    activities.ingestDocumentWithException(documentIdentifier, versionString, uri, rd);
+                  }
+                  finally
+                  {
+                    inputStream.close();
+                  }
+                  activities.recordActivity(new Long(startFetchTime),ACTIVITY_ACCESS,
+                    new Long(fileLength(file)),documentIdentifier,"Success",null,null);
                 }
               }
               else
@@ -1036,40 +1094,6 @@ public class SharedDriveConnector extend
   }
 
 
-  protected static void prepareForIndexing(RepositoryDocument rd, SmbFile file,
-    String[] shareAllow, String[] shareDeny, String[] parentAllow, String[] parentDeny, String[] allow, String[] deny,
-    String pathAttributeName, String pathAttributeValue)
-    throws ManifoldCFException, SmbException
-  {
-    String fileNameString = file.getName();
-    Date lastModifiedDate = new Date(file.lastModified());
-    Date creationDate = new Date(file.createTime());
-    //If using the lastAccess patched/Google version of jcifs then this can be uncommented
-    //Date lastAccessDate = new Date(file.lastAccess());
-    Integer attributes = file.getAttributes();
-    String shareName = file.getShare();
-
-    
-    String contentType = mapExtensionToMimeType(fileNameString);
-
-    rd.setFileName(fileNameString);
-    if (contentType != null)
-      rd.setMimeType(contentType);
-    rd.addField("lastModified", lastModifiedDate.toString());
-    rd.setModifiedDate(lastModifiedDate);
-    
-    // Add extra obtainable fields to the field map
-    rd.addField("createdOn", creationDate.toString());
-    rd.setCreatedDate(creationDate);
-
-    //rd.addField("lastAccess", lastModifiedDate.toString());
-    rd.addField("attributes", Integer.toString(attributes));
-    rd.addField("shareName", shareName);
-
-    setDocumentSecurity(rd,shareAllow,shareDeny,parentAllow,parentDeny,allow,deny);
-    setPathMetadata(rd,pathAttributeName,pathAttributeValue);
-  }
-  
   /** Map an extension to a mime type */
   protected static String mapExtensionToMimeType(String fileName)
   {

Modified: manifoldcf/branches/dev_1x/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/jdbc/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jdbc/JDBCConnector.java Wed Oct  8 18:06:01 2014
@@ -526,154 +526,167 @@ public class JDBCConnector extends org.a
                   if (o != null)
                     contentType = JDBCConnection.readAsString(o);
                   else
-                    contentType = null;
-                  
-                  if (contentType == null || activities.checkMimeTypeIndexable(contentType))
                   {
                     if (contents instanceof BinaryInput)
-                    {
-                      // An ingestion will take place for this document.
-                      RepositoryDocument rd = new RepositoryDocument();
+                      contentType = "application/octet-stream";
+                    else if (contents instanceof CharacterInput)
+                      contentType = "text/plain; charset=utf-8";
+                    else
+                      contentType = "text/plain";
+                  }
+                  
+                  if (!activities.checkMimeTypeIndexable(contentType))
+                  {
+                    Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of mime type - skipping");
+                    activities.noDocument(id,version);
+                    continue;
+                  }
+                  
+                  if (!activities.checkURLIndexable(url))
+                  {
+                    Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of url - skipping");
+                    activities.noDocument(id,version);
+                    continue;
+                  }
 
-                      // Default content type is application/octet-stream for binary data
-                      if (contentType == null)
-                        rd.setMimeType("application/octet-stream");
-                      else
-                        rd.setMimeType(contentType);
+                  // An ingestion will take place for this document.
+                  RepositoryDocument rd = new RepositoryDocument();
+                  rd.setMimeType(contentType);
                       
-                      applyAccessTokens(rd,ts);
-                      applyMetadata(rd,row);
+                  applyAccessTokens(rd,ts);
+                  applyMetadata(rd,row);
+
+                  if (contents instanceof BinaryInput)
+                  {
+
+                    BinaryInput bi = (BinaryInput)contents;
+                    long fileLength = bi.getLength();
+                    
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
 
-                      BinaryInput bi = (BinaryInput)contents;
+                    try
+                    {
+                      // Read the stream
+                      InputStream is = bi.getStream();
                       try
                       {
-                        // Read the stream
-                        InputStream is = bi.getStream();
-                        try
-                        {
-                          rd.setBinary(is,bi.getLength());
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (java.net.SocketTimeoutException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
-                      catch (InterruptedIOException e)
+                    }
+                    catch (java.net.SocketTimeoutException e)
+                    {
+                      throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    }
+                  }
+                  else if (contents instanceof CharacterInput)
+                  {
+                    CharacterInput ci = (CharacterInput)contents;
+                    long fileLength = ci.getUtf8StreamLength();
+                    
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
+                    
+                    try
+                    {
+                      // Read the stream
+                      InputStream is = ci.getUtf8Stream();
+                      try
                       {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (IOException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
                     }
-                    else if (contents instanceof CharacterInput)
+                    catch (java.net.SocketTimeoutException e)
+                    {
+                      throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
+                    }
+                    catch (InterruptedIOException e)
+                    {
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
                     {
-                      // An ingestion will take place for this document.
-                      RepositoryDocument rd = new RepositoryDocument();
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                    }
+                  }
+                  else
+                  {
+                    // Turn it into a string, and then into a stream
+                    String value = contents.toString();
+                    byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
+                    long fileLength = bytes.length;
 
-                      // Default content type is application/octet-stream for binary data
-                      if (contentType == null)
-                        rd.setMimeType("text/plain; charset=utf-8");
-                      else
-                        rd.setMimeType(contentType);
-                      
-                      applyAccessTokens(rd,ts);
-                      applyMetadata(rd,row);
+                    if (!activities.checkLengthIndexable(fileLength))
+                    {
+                      Logging.connectors.debug("JDBC: Document '"+id+"' excluded because of length - skipping");
+                      activities.noDocument(id, version);
+                      continue;
+                    }
 
-                      CharacterInput ci = (CharacterInput)contents;
+                    try
+                    {
+                      InputStream is = new ByteArrayInputStream(bytes);
                       try
                       {
-                        // Read the stream
-                        InputStream is = ci.getUtf8Stream();
-                        try
-                        {
-                          rd.setBinary(is,ci.getUtf8StreamLength());
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
-                      }
-                      catch (java.net.SocketTimeoutException e)
-                      {
-                        throw new ManifoldCFException("Socket timeout reading database data: "+e.getMessage(),e);
-                      }
-                      catch (InterruptedIOException e)
-                      {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                        rd.setBinary(is,fileLength);
+                        activities.ingestDocumentWithException(id, version, url, rd);
                       }
-                      catch (IOException e)
+                      finally
                       {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
+                        is.close();
                       }
                     }
-                    else
+                    catch (InterruptedIOException e)
                     {
-                      // Turn it into a string, and then into a stream
-                      String value = contents.toString();
-                      try
-                      {
-                        byte[] bytes = value.getBytes(StandardCharsets.UTF_8);
-                        RepositoryDocument rd = new RepositoryDocument();
-
-                        // Default content type is text/plain for character data
-                        if (contentType == null)
-                          rd.setMimeType("text/plain");
-                        else
-                          rd.setMimeType(contentType);
-                        
-                        applyAccessTokens(rd,ts);
-                        applyMetadata(rd,row);
-
-                        InputStream is = new ByteArrayInputStream(bytes);
-                        try
-                        {
-                          rd.setBinary(is,bytes.length);
-                          activities.ingestDocumentWithException(id, version, url, rd);
-                        }
-                        finally
-                        {
-                          is.close();
-                        }
-                      }
-                      catch (InterruptedIOException e)
-                      {
-                        throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
-                      }
-                      catch (IOException e)
-                      {
-                        throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
-                      }
+                      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+                    }
+                    catch (IOException e)
+                    {
+                      throw new ManifoldCFException("Error reading database data: "+e.getMessage(),e);
                     }
-                  }
-                  else
-                  {
-                    Logging.connectors.warn("JDBC: Document '"+id+"' excluded because of mime type - skipping");
-                    activities.noDocument(id,version);
                   }
                 }
                 else
                 {
-                  Logging.connectors.warn("JDBC: Document '"+id+"' seems to have null data - skipping");
+                  Logging.connectors.debug("JDBC: Document '"+id+"' seems to have null data - skipping");
                   activities.noDocument(id,version);
                 }
               }
               else
               {
-                Logging.connectors.warn("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
+                Logging.connectors.debug("JDBC: Document '"+id+"' has an illegal url: '"+url+"' - skipping");
                 activities.noDocument(id,version);
               }
             }
             else
             {
-              Logging.connectors.warn("JDBC: Document '"+id+"' has a null url - skipping");
+              Logging.connectors.debug("JDBC: Document '"+id+"' has a null url - skipping");
               activities.noDocument(id,version);
             }
           }

Modified: manifoldcf/branches/dev_1x/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java?rev=1630190&r1=1630189&r2=1630190&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/jira/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/jira/JiraRepositoryConnector.java Wed Oct  8 18:06:01 2014
@@ -1012,6 +1012,31 @@ public class JiraRepositoryConnector ext
                   + documentIdentifier + "'");
             }
 
+            // Now do standard stuff
+              
+            String mimeType = "text/plain";
+            Date createdDate = jiraFile.getCreatedDate();
+            Date modifiedDate = jiraFile.getUpdatedDate();
+            String documentURI = jiraFile.getSelf();
+
+            if (!activities.checkURLIndexable(documentURI))
+            {
+              activities.noDocument(documentIdentifier, versionString);
+              continue;
+            }
+            
+            if (!activities.checkMimeTypeIndexable(mimeType))
+            {
+              activities.noDocument(documentIdentifier, versionString);
+              continue;
+            }
+            
+            if (!activities.checkDateIndexable(modifiedDate))
+            {
+              activities.noDocument(documentIdentifier, versionString);
+              continue;
+            }
+            
             //otherwise process
             RepositoryDocument rd = new RepositoryDocument();
               
@@ -1023,12 +1048,6 @@ public class JiraRepositoryConnector ext
               denyAclsToUse = new String[0];
             rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclsToUse,denyAclsToUse);
 
-            // Now do standard stuff
-              
-            String mimeType = "text/plain";
-            Date createdDate = jiraFile.getCreatedDate();
-            Date modifiedDate = jiraFile.getUpdatedDate();
-
             rd.setMimeType(mimeType);
             if (createdDate != null)
               rd.setCreatedDate(createdDate);
@@ -1046,13 +1065,20 @@ public class JiraRepositoryConnector ext
               rd.addField(entry.getKey(), entry.getValue());
             }
 
-            String documentURI = jiraFile.getSelf();
             String document = getJiraBody(jiraFile);
             try {
               byte[] documentBytes = document.getBytes(StandardCharsets.UTF_8);
+              long fileLength = documentBytes.length;
+              
+              if (!activities.checkLengthIndexable(fileLength))
+              {
+                activities.noDocument(documentIdentifier, versionString);
+                continue;
+              }
+                
               InputStream is = new ByteArrayInputStream(documentBytes);
               try {
-                rd.setBinary(is, documentBytes.length);
+                rd.setBinary(is, fileLength);
                 activities.ingestDocumentWithException(documentIdentifier, versionString, documentURI, rd);
                 // No errors.  Record the fact that we made it.
                 errorCode = "OK";



Mime
View raw message