manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1624174 - /manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
Date Thu, 11 Sep 2014 00:40:46 GMT
Author: kwright
Date: Thu Sep 11 00:40:45 2014
New Revision: 1624174

URL: http://svn.apache.org/r1624174
Log:
Upgrade filesystem connector, part of CONNECTORS-977

Modified:
    manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java

Modified: manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java?rev=1624174&r1=1624173&r2=1624174&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
(original)
+++ manifoldcf/trunk/connectors/filesystem/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/filesystem/FileConnector.java
Thu Sep 11 00:40:45 2014
@@ -224,35 +224,26 @@ public class FileConnector extends org.a
     return "";
   }
   
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is therefore important
to perform
-  * as little work as possible here.
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the
results either added
+  * to the queue of documents for the current job, and/or entered into the incremental ingestion
manager.
+  * The document specification allows this class to filter what is done based on the job.
   * The connector will be connected before this method can be called.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by
this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved
for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates
that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework
actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a
dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion
will occur
-  * when the specification changes.  This is primarily useful for metadata.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set
of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
   *@param jobMode is an integer describing how the job is being run, whether continuous or
once-only.
   *@param usesDefaultAuthority will be true only if the authority in use for these documents
is the default one.
-  *@return the corresponding version strings, with null in the places where the document
no longer exists.
-  * Empty version strings indicate that there is no versioning ability for the corresponding
document, and the document
-  * will always be processed.
   */
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions,
IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
+  @Override
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses,
Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
-    int i = 0;
-    
-    String[] rval = new String[documentIdentifiers.length];
-    i = 0;
-    while (i < rval.length)
+    for (String documentIdentifier : documentIdentifiers)
     {
-      String documentIdentifier = documentIdentifiers[i];
       File file = new File(documentIdentifier);
       if (file.exists())
       {
@@ -264,70 +255,7 @@ public class FileConnector extends org.a
           //long lastModified = file.lastModified();
           //rval[i] = new Long(lastModified).toString();
 
-          // Signal that we don't have any versioning and that we should recheck always.
-          rval[i] = "";
-        }
-        else
-        {
-          // It's a file
-          long fileLength = file.length();
-          if (activities.checkLengthIndexable(fileLength))
-          {
-            // Get the file's modified date.
-            long lastModified = file.lastModified();
-            
-            // Check if the path is to be converted.  We record that info in the version
string so that we'll reindex documents whose
-            // URI's change.
-            String convertPath = findConvertPath(spec, file);
-            StringBuilder sb = new StringBuilder();
-            if (convertPath != null)
-            {
-              // Record the path.
-              sb.append("+");
-              pack(sb,convertPath,'+');
-            }
-            else
-              sb.append("-");
-            sb.append(new Long(lastModified).toString()).append(":").append(new Long(fileLength).toString());
-            rval[i] = sb.toString();
-          }
-          else
-            rval[i] = null;
-        }
-      }
-      else
-        rval[i] = null;
-      i++;
-    }
-    return rval;
-  }
-
-
-  /** Process a set of documents.
-  * This is the method that should cause each document to be fetched, processed, and the
results either added
-  * to the queue of documents for the current job, and/or entered into the incremental ingestion
manager.
-  * The document specification allows this class to filter what is done based on the job.
-  *@param documentIdentifiers is the set of document identifiers to process.
-  *@param activities is the interface this method should use to queue up new document references
-  * and ingest documents.
-  *@param spec is the document specification.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true
to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
-  */
-  @Override
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity
activities, DocumentSpecification spec, boolean[] scanOnly)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    int i = 0;
-    while (i < documentIdentifiers.length)
-    {
-      String version = versions[i];
-      String documentIdentifier = documentIdentifiers[i];
-      File file = new File(documentIdentifier);
-      if (file.exists())
-      {
-        if (file.isDirectory())
-        {
+          // No versioning; just reference children
           // Chained connectors scan parent nodes always
           // Queue up stuff for directory
           long startTime = System.currentTimeMillis();
@@ -362,31 +290,50 @@ public class FileConnector extends org.a
           {
             activities.recordActivity(new Long(startTime),ACTIVITY_READ,null,entityReference,errorCode,errorDesc,null);
           }
+          continue;
         }
         else
         {
-          if (!scanOnly[i])
+          // It's a file
+          String versionString;
+          String convertPath;
+          long fileLength = file.length();
+          if (activities.checkLengthIndexable(fileLength))
+          {
+            // Get the file's modified date.
+            long lastModified = file.lastModified();
+            
+            // Check if the path is to be converted.  We record that info in the version
string so that we'll reindex documents whose
+            // URI's change.
+            convertPath = findConvertPath(spec, file);
+            StringBuilder sb = new StringBuilder();
+            if (convertPath != null)
+            {
+              // Record the path.
+              sb.append("+");
+              pack(sb,convertPath,'+');
+            }
+            else
+              sb.append("-");
+            sb.append(new Long(lastModified).toString()).append(":").append(new Long(fileLength).toString());
+            versionString = sb.toString();
+          }
+          else
+          {
+            activities.deleteDocument(documentIdentifier);
+            continue;
+          }
+    
+          if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
           {
             // We've already avoided queuing documents that we don't want, based on file
specifications.
             // We still need to check based on file data.
             if (checkIngest(file,spec))
             {
-              
-              /*
-               * get filepathtouri value
-               */
-              String convertPath = null;
-              if (version.length() > 0 && version.startsWith("+"))
-              {
-                StringBuilder unpack = new StringBuilder();
-                unpack(unpack, version, 1, '+');
-                convertPath = unpack.toString();
-              }
-              
               long startTime = System.currentTimeMillis();
               String errorCode = "OK";
               String errorDesc = null;
-              Long fileLength = null;
+              Long fileLengthLong = null;
               String entityDescription = documentIdentifier;
               try
               {
@@ -396,9 +343,8 @@ public class FileConnector extends org.a
                   InputStream is = new FileInputStream(file);
                   try
                   {
-                    long fileBytes = file.length();
                     RepositoryDocument data = new RepositoryDocument();
-                    data.setBinary(is,fileBytes);
+                    data.setBinary(is,fileLength);
                     String fileName = file.getName();
                     data.setFileName(fileName);
                     data.setMimeType(mapExtensionToMimeType(fileName));
@@ -413,8 +359,8 @@ public class FileConnector extends org.a
                       data.addField("uri",file.toString());
                     }
                     // MHL for other metadata
-                    activities.ingestDocumentWithException(documentIdentifier,version,uri,data);
-                    fileLength = new Long(fileBytes);
+                    activities.ingestDocumentWithException(documentIdentifier,versionString,uri,data);
+                    fileLengthLong = new Long(fileLength);
                   }
                   finally
                   {
@@ -439,13 +385,17 @@ public class FileConnector extends org.a
               }
               finally
               {
-                activities.recordActivity(new Long(startTime),ACTIVITY_READ,fileLength,entityDescription,errorCode,errorDesc,null);
+                activities.recordActivity(new Long(startTime),ACTIVITY_READ,fileLengthLong,entityDescription,errorCode,errorDesc,null);
               }
             }
           }
         }
       }
-      i++;
+      else
+      {
+        activities.deleteDocument(documentIdentifier);
+        continue;
+      }
     }
   }
 
@@ -455,7 +405,7 @@ public class FileConnector extends org.a
   *@param documentIdentifier is the document identifier.
   *@return the part of the path to be converted, or null.
   */
-  protected static String findConvertPath(DocumentSpecification spec, File theFile)
+  protected static String findConvertPath(Specification spec, File theFile)
   {
     String fullpath = theFile.getAbsolutePath().replaceAll("\\\\","/");
     for (int j = 0; j < spec.getChildCount(); j++)
@@ -1090,7 +1040,7 @@ public class FileConnector extends org.a
   *@param documentSpecification is the specification.
   *@return true if it should be included.
   */
-  protected static boolean checkInclude(File file, String fileName, DocumentSpecification
documentSpecification)
+  protected static boolean checkInclude(File file, String fileName, Specification documentSpecification)
     throws ManifoldCFException
   {
     if (Logging.connectors.isDebugEnabled())
@@ -1195,7 +1145,7 @@ public class FileConnector extends org.a
   *@param file is the file.
   *@param documentSpecification is the specification.
   */
-  protected static boolean checkIngest(File file, DocumentSpecification documentSpecification)
+  protected static boolean checkIngest(File file, Specification documentSpecification)
     throws ManifoldCFException
   {
     // Since the only exclusions at this point are not based on file contents, this is a
no-op.



Mime
View raw message