manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1625103 - /manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Date Mon, 15 Sep 2014 18:18:59 GMT
Author: kwright
Date: Mon Sep 15 18:18:59 2014
New Revision: 1625103

URL: http://svn.apache.org/r1625103
Log:
Revamp wiki connector.  Part of CONNECTORS-977.

Modified:
    manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Modified: manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1625103&r1=1625102&r2=1625103&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Mon Sep 15 18:18:59 2014
@@ -899,94 +899,86 @@ public class WikiConnector extends org.a
     return new Long(seedTime).toString();
   }
 
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is
-  * therefore important to perform as little work as possible here.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion will occur
-  * when the specification changes.  This is primarily useful for metadata.
-  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
-  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
-  *@return the corresponding version strings, with null in the places where the document no longer exists.
-  * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
-  * will always be processed.
-  */
-  @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    Map<String,String> versions = new HashMap<String,String>();
-    getTimestamps(documentIdentifiers,versions,activities);
-    String[] rval = new String[documentIdentifiers.length];
-    for (int i = 0 ; i < rval.length ; i++)
-    {
-      rval[i] = versions.get(documentIdentifiers[i]);
-    }
-    return rval;
-  }
-  
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the results either added
   * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
   * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
   *@param documentIdentifiers is the set of document identifiers to process.
-  *@param versions is the corresponding document versions to process, as returned by getDocumentVersions() above.
-  *       The implementation may choose to ignore this parameter and always process the current version.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
   *@param activities is the interface this method should use to queue up new document references
   * and ingest documents.
-  *@param spec is the document specification.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
   */
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
-    DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+  @Override
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     // Forced acls
     String[] acls = getAcls(spec);
 
-    Map<String,String> urls = new HashMap<String,String>();
-    getDocURLs(documentIdentifiers,urls);
-    for (int i = 0 ; i < documentIdentifiers.length ; i++)
+    Map<String,String> versions = new HashMap<String,String>();
+    getTimestamps(documentIdentifiers,versions,activities);
+    
+    List<String> fetchDocuments = new ArrayList<String>();
+    for (String documentIdentifier : documentIdentifiers)
     {
-      if (!scanOnly[i])
+      String versionString = versions.get(documentIdentifier);
+      if (versionString == null)
       {
-        String url = urls.get(documentIdentifiers[i]);
-        if (url != null)
-          getDocInfo(documentIdentifiers[i], versions[i], url, activities, acls);
+        activities.deleteDocument(documentIdentifier);
+        continue;
       }
+      
+      if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
+        continue;
+      
+      fetchDocuments.add(documentIdentifier);
+    }
+    
+    if (fetchDocuments.size() == 0)
+      return;
+    
+    String[] fetchDocumentsArray = fetchDocuments.toArray(new String[0]);
+    Map<String,String> urls = new HashMap<String,String>();
+    getDocURLs(documentIdentifiers,urls);
+    for (String documentIdentifier : fetchDocumentsArray)
+    {
+      String url = urls.get(documentIdentifier);
+      String versionString = versions.get(documentIdentifier);
+      if (url != null)
+        getDocInfo(documentIdentifier, versionString, url, activities, acls);
+      else
+        activities.noDocument(documentIdentifier,versionString);
     }
-  }
 
+  }
+  
   /**
    * Grab forced acl out of document specification.
    *
    * @param spec is the document specification.
    * @return the acls.
    */
-  protected static String[] getAcls(DocumentSpecification spec) {
-    HashMap map = new HashMap();
-    int i = 0;
-    while (i < spec.getChildCount()) {
-      SpecificationNode sn = spec.getChild(i++);
+  protected static String[] getAcls(Specification spec) {
+    Set<String> aclMap = new HashSet<String>();
+    for (int i = 0; i < spec.getChildCount(); i++)
+    {
+      SpecificationNode sn = spec.getChild(i);
       if (sn.getType().equals("access")) {
         String token = sn.getAttributeValue("token");
-        map.put(token, token);
+        aclMap.add(token);
       }
     }
 
-    String[] rval = new String[map.size()];
-    Iterator iter = map.keySet().iterator();
-    i = 0;
-    while (iter.hasNext()) {
-      rval[i++] = (String) iter.next();
+    String[] rval = new String[aclMap.size()];
+    int j = 0;
+    for (String acl : aclMap)
+    {
+      rval[j++] = acl;
     }
     return rval;
   }
@@ -3213,10 +3205,10 @@ public class WikiConnector extends org.a
   /** Thread to execute a "get timestamp" operation.  This thread both executes the operation and parses the result. */
   protected static class ExecuteGetTimestampThread extends Thread
   {
-    protected HttpClient client;
-    protected HttpRequestBase executeMethod;
+    protected final HttpClient client;
+    protected final HttpRequestBase executeMethod;
     protected Throwable exception = null;
-    protected Map<String,String> versions;
+    protected final Map<String,String> versions;
     protected boolean loginNeeded = false;
 
     public ExecuteGetTimestampThread(HttpClient client, HttpRequestBase executeMethod, Map<String,String> versions)



Mime
View raw message