manifoldcf-commits mailing list archives

From kwri...@apache.org
Subject svn commit: r1626228 [10/10] - in /manifoldcf/branches/dev_1x: ./ connectors/alfresco/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/alfresco/ connectors/cmis/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/cmis/ conn...
Date Fri, 19 Sep 2014 14:22:28 GMT
Modified: manifoldcf/branches/dev_1x/connectors/sharepoint/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/sharepoint/XMLGenTest.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/sharepoint/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/sharepoint/XMLGenTest.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/sharepoint/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/sharepoint/XMLGenTest.java (original)
+++ manifoldcf/branches/dev_1x/connectors/sharepoint/connector/src/test/java/org/apache/manifoldcf/crawler/connectors/sharepoint/XMLGenTest.java Fri Sep 19 14:22:27 2014
@@ -46,10 +46,10 @@ public class XMLGenTest
   public void buildViewFieldsTest()
     throws Exception
   {
-    ArrayList list = new ArrayList();
+    List<String> list = new ArrayList<String>();
     list.add("foo");
     list.add("bar");
-    String viewFieldsXML = SPSProxyHelper.buildViewFields(list).get_any()[0].toString();
+    String viewFieldsXML = SPSProxyHelper.buildViewFields(list.toArray(new String[0])).get_any()[0].toString();
     assertEquals("<ViewFields><FieldRef Name=\"foo\"/><FieldRef Name=\"bar\"/></ViewFields>",viewFieldsXML);
   }
   

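For context on the hunk above: buildViewFields now takes a String[] rather than a raw ArrayList, so call sites convert explicitly. A minimal sketch of the revised call, mirroring the test (the helper's return type is left implicit, since the test itself only uses toString()):

    // Sketch: calling the revised SPSProxyHelper.buildViewFields(String[]) directly.
    String[] fieldNames = { "foo", "bar" };
    String viewFieldsXML = SPSProxyHelper.buildViewFields(fieldNames).get_any()[0].toString();
    // Expected, per the assertion above:
    // <ViewFields><FieldRef Name="foo"/><FieldRef Name="bar"/></ViewFields>
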
Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Sep 19 14:22:27 2014
@@ -560,26 +560,22 @@ public class WebcrawlerConnector extends
   protected static final int RESULT_RETRY_DOCUMENT = 3;
 
 
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is
-  * therefore important to perform as little work as possible here.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion will occur
-  * when the specification changes.  This is primarily useful for metadata.
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the results added
+  * to the queue of documents for the current job, entered into the incremental ingestion manager, or both.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
   *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
-  *@return the corresponding version strings, with null in the places where the document no longer exists.
-  * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
-  * will always be processed.
   */
   @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     getSession();
@@ -596,9 +592,8 @@ public class WebcrawlerConnector extends
     List<NameValue> namesAndValues = findMetadata(spec);
     // Create an array of name/value fixedlists
     String[] metadata = new String[namesAndValues.size()];
-    int k = 0;
     String[] fixedListStrings = new String[2];
-    while (k < metadata.length)
+    for (int k = 0; k < metadata.length; k++)
     {
       NameValue nv = namesAndValues.get(k);
       String name = nv.getName();
@@ -607,9 +602,20 @@ public class WebcrawlerConnector extends
       fixedListStrings[1] = value;
       StringBuilder newsb = new StringBuilder();
       packFixedList(newsb,fixedListStrings,'=');
-      metadata[k++] = newsb.toString();
+      metadata[k] = newsb.toString();
     }
     java.util.Arrays.sort(metadata);
+    Map<String,Set<String>> metaHash2 = new HashMap<String,Set<String>>();
+    for (NameValue nv : namesAndValues)
+    {
+      Set<String> hashValues = metaHash2.get(nv.getName());
+      if (hashValues == null)
+      {
+        hashValues = new HashSet<String>();
+        metaHash2.put(nv.getName(),hashValues);
+      }
+      hashValues.add(nv.getValue());
+    }
 
     // Get the excluded headers
     Set<String> excludedHeaders = findExcludedHeaders(spec);
@@ -621,8 +627,6 @@ public class WebcrawlerConnector extends
 
     String filterVersion = filter.getVersionString();
     
-    String[] rval = new String[documentIdentifiers.length];
-
     long currentTime = System.currentTimeMillis();
 
     // There are two ways to handle any document that's not available.  The first is to remove it.  The second is to keep it, but mark it with an empty version string.
@@ -632,12 +636,19 @@ public class WebcrawlerConnector extends
     // incapable of deleting documents.
     // Since the primary use of the crawler is expected to be repeated intranet crawls, I've chosen to optimize the crawler for accuracy rather than performance
     // - if the document is gone, I just remove it, and expect churn when recrawling activities occur.
-    int i = 0;
-    while (i < documentIdentifiers.length)
+    for (String documentIdentifier : documentIdentifiers)
     {
-      String documentIdentifier = documentIdentifiers[i];
       // Verify that the url is legal
-      if (filter.isDocumentAndHostLegal(documentIdentifier))
+      if (!filter.isDocumentAndHostLegal(documentIdentifier))
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: Removing url '"+documentIdentifier+"' because it's not in the set of allowed ones");
+        // The document should already have been filtered out when it was queued, so just delete it.
+        activities.deleteDocument(documentIdentifier);
+        continue;
+      }
+      
+      try
       {
         // The first thing we need to know is whether this url is part of a session-protected area.  We'll use that information
         // later to detect redirection to login.
@@ -648,7 +659,7 @@ public class WebcrawlerConnector extends
           if (sessionCredential != null)
             Logging.connectors.debug("Web: For document identifier '"+documentIdentifier+"' found session credential key '"+sessionCredential.getSequenceKey()+"'");
         }
-        
+          
         // Set up the initial state and state variables.
         int sessionState = SESSIONSTATE_NORMAL;
         String currentURI = documentIdentifier;
@@ -685,7 +696,7 @@ public class WebcrawlerConnector extends
           String checkSum = null;
           // The headers, which will be needed if resultSignal is RESULT_VERSION_NEEDED.
           Map<String,List<String>> headerData = null;
-          
+            
           while (true)
           {
             try
@@ -1020,7 +1031,7 @@ public class WebcrawlerConnector extends
                       }
                       // Otherwise, the last fetch stands on its own.  Fall through, and allow processing and link extraction
                     }
-
+                    
                     // Now, based on the session state and the document contents, decide how to proceed
                     if (resultSignal == RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
                     {
@@ -1158,12 +1169,15 @@ public class WebcrawlerConnector extends
           case RESULT_NO_DOCUMENT:
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("WEB: Removing url '"+documentIdentifier+"'"+((contextMessage!=null)?" because "+contextMessage:""),contextException);
-            rval[i] = null;
+            activities.deleteDocument(documentIdentifier);
             break;
           case RESULT_NO_VERSION:
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("WEB: Ignoring url '"+documentIdentifier+"'"+((contextMessage!=null)?" because "+contextMessage:""),contextException);
-            rval[i] = "";
+            
+            // We get here when a document didn't fetch.  There is no version, so record the empty
+            // version string: the document stays in the job queue but is removed from the index.
+            activities.noDocument(documentIdentifier,"");
             break;
           case RESULT_VERSION_NEEDED:
             // Calculate version from document data, which is presumed to be present.
@@ -1180,6 +1194,8 @@ public class WebcrawlerConnector extends
               sb.append('-');
 
             // Now, do the metadata.  This comes in two parts: first, the canned metadata, then the header data.
+            Map<String,Set<String>> metaHash = new HashMap<String,Set<String>>();
+            
             // They're all folded into the same part of the version string.
             int headerCount = 0;
             Iterator<String> headerIterator = headerData.keySet().iterator();
@@ -1199,9 +1215,16 @@ public class WebcrawlerConnector extends
               String lowerHeaderName = headerName.toLowerCase(Locale.ROOT);
               if (!reservedHeaders.contains(lowerHeaderName) && !excludedHeaders.contains(lowerHeaderName))
               {
+                Set<String> valueSet = metaHash.get(headerName);
+                if (valueSet == null)
+                {
+                  valueSet = new HashSet<String>();
+                  metaHash.put(headerName,valueSet);
+                }
                 List<String> headerValues = headerData.get(headerName);
                 for (String headerValue : headerValues)
                 {
+                  valueSet.add(headerValue);
                   fixedListStrings[0] = "header-"+headerName;
                   fixedListStrings[1] = headerValue;
                   StringBuilder newsb = new StringBuilder();
@@ -1216,21 +1239,147 @@ public class WebcrawlerConnector extends
               fullMetadata[headerCount++] = metadata[index++];
             }
             java.util.Arrays.sort(fullMetadata);
-            
+              
             packList(sb,fullMetadata,'+');
             // Done with the parseable part!  Add the checksum.
             sb.append(checkSum);
             // Add the filter version
             sb.append("+");
             sb.append(filterVersion);
-            rval[i] = sb.toString();
+              
+            String versionString = sb.toString();
+              
+            // Now, extract links.
+            // We'll call the "link extractor" series, so we can plug more stuff in over time.
+            boolean indexDocument = extractLinks(documentIdentifier,activities,filter);
+
+            // If the version is unchanged, we never ingest.  But all else (link extraction above) is the same.
+            if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
+              continue;
+            
+            // Consider this document for ingestion.
+            // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
+            // about.
+            String ingestURL;
+            if (indexDocument)
+              ingestURL = isDataIngestable(activities,documentIdentifier,filter);
+            else
+              ingestURL = null;
+
+            if (ingestURL == null)
+            {
+              // In case the indexability of the document changed, we still want to notify the incremental indexer.
+              // We do this by calling noDocument() with the version string.  If a document with this identifier was
+              // previously indexed, it will be removed.
+                
+              activities.noDocument(documentIdentifier,versionString);
+                
+              if (Logging.connectors.isDebugEnabled())
+                Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");
+              continue;
+            }
+            
+            // Ingest the document
+            if (Logging.connectors.isDebugEnabled())
+              Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
+
+            RepositoryDocument rd = new RepositoryDocument();
+
+            // Set the file name
+            String fileName = "";
+            try {
+              fileName = documentIdentifiertoFileName(documentIdentifier);
+            } catch (URISyntaxException e1) {
+              fileName = "";
+            }
+            if (fileName.length() > 0){
+              rd.setFileName(fileName);
+            }
+                
+            // Set the content type
+            rd.setMimeType(cache.getContentType(documentIdentifier));
+                
+            // Turn into acls and add into description
+            String[] denyAcls;
+            if (acls == null)
+              denyAcls = null;
+            else
+            {
+              if (acls.length > 0)
+                denyAcls = new String[]{defaultAuthorityDenyToken};
+              else
+                denyAcls = new String[0];
+            }
+            
+            if (acls != null && denyAcls != null)
+              rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
+
+            // Grab metadata
+            for (String key : metaHash.keySet())
+            {
+              Set<String> metaList = metaHash.get(key);
+              String[] values = new String[metaList.size()];
+              int k = 0;
+              for (String value : metaList)
+              {
+                values[k++] = value;
+              }
+              rd.addField(key,values);
+            }
+
+            // Grab forced names and values
+            for (String key : metaHash2.keySet())
+            {
+              Set<String> metaList = metaHash2.get(key);
+              String[] values = new String[metaList.size()];
+              int k = 0;
+              for (String value : metaList)
+              {
+                values[k++] = value;
+              }
+              rd.addField(key,values);
+            }
+
+            long length = cache.getDataLength(documentIdentifier);
+            InputStream is = cache.getData(documentIdentifier);
+
+            if (is != null)
+            {
+              try
+              {
+                rd.setBinary(is,length);
+                try
+                {
+                  activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
+                }
+                catch (IOException e)
+                {
+                  handleIOException(e,"reading data");
+                }
+              }
+              finally
+              {
+                try
+                {
+                  is.close();
+                }
+                catch (IOException e)
+                {
+                  handleIOException(e,"closing stream");
+                }
+              }
+            }
+            else
+              Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
+
+            // MHL
+              
             break;
           case RESULT_RETRY_DOCUMENT:
             // Document could not be processed right now.
             if (Logging.connectors.isDebugEnabled())
               Logging.connectors.debug("WEB: Retrying url '"+documentIdentifier+"' later"+((contextMessage!=null)?" because "+contextMessage:""),contextException);
             activities.retryDocumentProcessing(documentIdentifier);
-            rval[i] = null;
             break;
           default:
             throw new ManifoldCFException("Unexpected value for result signal: "+Integer.toString(resultSignal));
@@ -1246,16 +1395,11 @@ public class WebcrawlerConnector extends
           }
         }
       }
-      else
+      finally
       {
-        if (Logging.connectors.isDebugEnabled())
-          Logging.connectors.debug("WEB: Removing url '"+documentIdentifier+"' because it's not in the set of allowed ones");
-        // Use null because we should have already filtered when we queued.
-        rval[i] = null;
+        cache.deleteData(documentIdentifier);
       }
-      i++;
     }
-    return rval;
   }
 
   protected static String extractContentType(String contentType)
@@ -1297,190 +1441,6 @@ public class WebcrawlerConnector extends
     return contentType;
   }
   
-  /** Process a set of documents.
-  * This is the method that should cause each document to be fetched, processed, and the results either added
-  * to the queue of documents for the current job, and/or entered into the incremental ingestion manager.
-  * The document specification allows this class to filter what is done based on the job.
-  *@param documentIdentifiers is the set of document identifiers to process.
-  *@param activities is the interface this method should use to queue up new document references
-  * and ingest documents.
-  *@param spec is the document specification.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
-  */
-  @Override
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities, DocumentSpecification spec, boolean[] scanOnly)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    getSession();
-
-    DocumentURLFilter filter = new DocumentURLFilter(spec);
-
-    String[] fixedList = new String[2];
-
-    // We need to extract and ingest here.
-    int i = 0;
-    while (i < documentIdentifiers.length)
-    {
-      String documentIdentifier = documentIdentifiers[i];
-      String version = versions[i];
-      boolean doScanOnly = scanOnly[i];
-
-      if (version.length() == 0)
-      {
-        i++;
-        // Leave document in jobqueue, but do NOT get rid of it, or we will wind up seeing it queued again by
-        // somebody else.  We *do* have to signal the document to be removed from the index, however, or it will
-        // stick around until the job is deleted.
-        activities.noDocument(documentIdentifier,version);
-        continue;
-      }
-
-      // Now, extract links.
-      // We'll call the "link extractor" series, so we can plug more stuff in over time.
-      boolean indexDocument = extractLinks(documentIdentifier,activities,filter);
-
-      // If scanOnly is set, we never ingest.  But all else is the same.
-      if (!doScanOnly)
-      {
-        // Consider this document for ingestion.
-        // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
-        // about.
-        String ingestURL;
-        if (indexDocument)
-          ingestURL = isDataIngestable(activities,documentIdentifier,filter);
-        else
-          ingestURL = null;
-
-        if (ingestURL != null)
-        {
-          // Ingest the document
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
-
-          // Unpack the version string
-          ArrayList acls = new ArrayList();
-          StringBuilder denyAclBuffer = new StringBuilder();
-          ArrayList metadata = new ArrayList();
-          int index = unpackList(acls,version,0,'+');
-          if (index < version.length() && version.charAt(index++) == '+')
-          {
-            index = unpack(denyAclBuffer,version,index,'+');
-          }
-          index = unpackList(metadata,version,index,'+');
-
-          RepositoryDocument rd = new RepositoryDocument();
-
-          // Set the file name
-          String fileName = "";
-          try {
-            fileName = documentIdentifiertoFileName(documentIdentifier);
-          } catch (URISyntaxException e1) {
-            fileName = "";
-          }
-          if (fileName.length() > 0){
-            rd.setFileName(fileName);
-          }
-          
-          // Set the content type
-          rd.setMimeType(cache.getContentType(documentIdentifier));
-          
-          // Turn into acls and add into description
-          String[] aclArray = new String[acls.size()];
-          int j = 0;
-          while (j < aclArray.length)
-          {
-            aclArray[j] = (String)acls.get(j);
-            j++;
-          }
-          rd.setSecurityACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,aclArray);
-          if (denyAclBuffer.length() > 0)
-          {
-            String[] denyAclArray = new String[]{denyAclBuffer.toString()};
-            rd.setSecurityDenyACL(RepositoryDocument.SECURITY_TYPE_DOCUMENT,denyAclArray);
-          }
-
-          // Grab metadata
-          HashMap metaHash = new HashMap();
-          int k = 0;
-          while (k < metadata.size())
-          {
-            String metadataItem = (String)metadata.get(k++);
-            unpackFixedList(fixedList,metadataItem,0,'=');
-            HashMap hashValue = (HashMap)metaHash.get(fixedList[0]);
-            if (hashValue == null)
-            {
-              hashValue = new HashMap();
-              metaHash.put(fixedList[0],hashValue);
-            }
-            hashValue.put(fixedList[1],fixedList[1]);
-          }
-          Iterator metaIter = metaHash.keySet().iterator();
-          while (metaIter.hasNext())
-          {
-            String key = (String)metaIter.next();
-            HashMap metaList = (HashMap)metaHash.get(key);
-            String[] values = new String[metaList.size()];
-            Iterator iter = metaList.keySet().iterator();
-            k = 0;
-            while (iter.hasNext())
-            {
-              values[k] = (String)iter.next();
-              k++;
-            }
-            rd.addField(key,values);
-          }
-
-          long length = cache.getDataLength(documentIdentifier);
-          InputStream is = cache.getData(documentIdentifier);
-
-          if (is != null)
-          {
-            try
-            {
-              rd.setBinary(is,length);
-              try
-              {
-                activities.ingestDocumentWithException(documentIdentifier,version,ingestURL,rd);
-              }
-              catch (IOException e)
-              {
-                handleIOException(e,"reading data");
-              }
-            }
-            finally
-            {
-              try
-              {
-                is.close();
-              }
-              catch (IOException e)
-              {
-                handleIOException(e,"closing stream");
-              }
-            }
-          }
-          else
-            Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
-        }
-        else
-        {
-          // In case the indexability of the document changed, we still want to notify the incremental indexer.
-          // We do this by using a null url and a null repository document.  If a document with this identifier was
-          // previously indexed, it will be removed.
-          
-          activities.noDocument(documentIdentifier,version);
-          
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");
-        }
-      }
-
-
-      i++;
-    }
-  }
-
   protected static void handleIOException(IOException e, String context)
     throws ManifoldCFException, ServiceInterruption
   {
@@ -1494,30 +1454,6 @@ public class WebcrawlerConnector extends
       throw new ManifoldCFException("IO error "+context+": "+e.getMessage(),e);
   }
   
-  /** Free a set of documents.  This method is called for all documents whose versions have been fetched using
-  * the getDocumentVersions() method, including those that returned null versions.  It may be used to free resources
-  * committed during the getDocumentVersions() method.  It is guaranteed to be called AFTER any calls to
-  * processDocuments() for the documents in question.
-  *@param documentIdentifiers is the set of document identifiers.
-  *@param versions is the corresponding set of version identifiers (individual identifiers may be null).
-  */
-  @Override
-  public void releaseDocumentVersions(String[] documentIdentifiers, String[] versions)
-    throws ManifoldCFException
-  {
-    int i = 0;
-    while (i < documentIdentifiers.length)
-    {
-      String version = versions[i];
-      if (version != null)
-      {
-        String urlValue = documentIdentifiers[i];
-        cache.deleteData(urlValue);
-      }
-      i++;
-    }
-  }
-
   /** Get the maximum number of documents to amalgamate together into one batch, for this connector.
   *@return the maximum number. 0 indicates "unlimited".
   */
@@ -7508,7 +7444,7 @@ public class WebcrawlerConnector extends
   *@param spec is the document specification.
   *@return the acls.
   */
-  protected static String[] getAcls(DocumentSpecification spec)
+  protected static String[] getAcls(Specification spec)
   {
     Set<String> map = new HashSet<String>();
     int i = 0;
@@ -7533,7 +7469,7 @@ public class WebcrawlerConnector extends
   }
 
   /** Read a document specification to yield a map of name/value pairs for metadata */
-  protected static List<NameValue> findMetadata(DocumentSpecification spec)
+  protected static List<NameValue> findMetadata(Specification spec)
     throws ManifoldCFException
   {
     List<NameValue> rval = new ArrayList<NameValue>();
@@ -7553,7 +7489,7 @@ public class WebcrawlerConnector extends
   }
 
   /** Read a document specification to get a set of excluded headers */
-  protected static Set<String> findExcludedHeaders(DocumentSpecification spec)
+  protected static Set<String> findExcludedHeaders(Specification spec)
     throws ManifoldCFException
   {
     Set<String> rval = new HashSet<String>();
@@ -8053,7 +7989,7 @@ public class WebcrawlerConnector extends
     * This should be checked at save time to prevent errors.  Any syntax errors found here
     * will thus cause the include or exclude regexp to be skipped.
     */
-    public DocumentURLFilter(DocumentSpecification spec)
+    public DocumentURLFilter(Specification spec)
       throws ManifoldCFException
     {
       String includes = ".*";

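Taken together, the WebcrawlerConnector hunks above replace the old three-phase lifecycle (getDocumentVersions / processDocuments / releaseDocumentVersions) with a single processDocuments entry point. A condensed sketch of the new contract follows, using only activity calls that appear in this diff; the framework imports (org.apache.manifoldcf.crawler.interfaces.*) are assumed, and computeVersionString is a hypothetical stand-in for the connector's real version logic:

  @Override
  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
    throws ManifoldCFException, ServiceInterruption
  {
    for (String documentIdentifier : documentIdentifiers)
    {
      // Hypothetical helper: null means the document no longer exists.
      String versionString = computeVersionString(documentIdentifier);
      if (versionString == null)
      {
        activities.deleteDocument(documentIdentifier);
        continue;
      }
      // The framework compares against the stored version; skip unchanged documents.
      if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
        continue;
      RepositoryDocument rd = new RepositoryDocument();
      byte[] bytes = documentIdentifier.getBytes(StandardCharsets.UTF_8);
      rd.setBinary(new ByteArrayInputStream(bytes),bytes.length);
      try
      {
        activities.ingestDocumentWithException(documentIdentifier,versionString,
          "http://"+documentIdentifier,rd);
      }
      catch (IOException e)
      {
        throw new RuntimeException("Unexpected IOException from in-memory stream: "+e.getMessage(),e);
      }
    }
  }

Link extraction, ACL handling, and the per-document cache cleanup in the finally block are connector-specific and omitted here.
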
Propchange: manifoldcf/branches/dev_1x/connectors/wiki/
------------------------------------------------------------------------------
  Merged /manifoldcf/trunk/connectors/wiki:r1625103

Modified: manifoldcf/branches/dev_1x/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Fri Sep 19 14:22:27 2014
@@ -886,94 +886,86 @@ public class WikiConnector extends org.a
       listAllPages(activities,null,null,startTime,endTime);
   }
 
-  /** Get document versions given an array of document identifiers.
-  * This method is called for EVERY document that is considered. It is
-  * therefore important to perform as little work as possible here.
-  *@param documentIdentifiers is the array of local document identifiers, as understood by this connector.
-  *@param oldVersions is the corresponding array of version strings that have been saved for the document identifiers.
-  *   A null value indicates that this is a first-time fetch, while an empty string indicates that the previous document
-  *   had an empty version string.
-  *@param activities is the interface this method should use to perform whatever framework actions are desired.
-  *@param spec is the current document specification for the current job.  If there is a dependency on this
-  * specification, then the version string should include the pertinent data, so that reingestion will occur
-  * when the specification changes.  This is primarily useful for metadata.
-  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
-  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
-  *@return the corresponding version strings, with null in the places where the document no longer exists.
-  * Empty version strings indicate that there is no versioning ability for the corresponding document, and the document
-  * will always be processed.
-  */
-  @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    Map<String,String> versions = new HashMap<String,String>();
-    getTimestamps(documentIdentifiers,versions,activities);
-    String[] rval = new String[documentIdentifiers.length];
-    for (int i = 0 ; i < rval.length ; i++)
-    {
-      rval[i] = versions.get(documentIdentifiers[i]);
-    }
-    return rval;
-  }
-  
   /** Process a set of documents.
   * This is the method that should cause each document to be fetched, processed, and the results added
   * to the queue of documents for the current job, entered into the incremental ingestion manager, or both.
   * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
   *@param documentIdentifiers is the set of document identifiers to process.
-  *@param versions is the corresponding document versions to process, as returned by getDocumentVersions() above.
-  *       The implementation may choose to ignore this parameter and always process the current version.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
   *@param activities is the interface this method should use to queue up new document references
   * and ingest documents.
-  *@param spec is the document specification.
-  *@param scanOnly is an array corresponding to the document identifiers.  It is set to true to indicate when the processing
-  * should only find other references, and should not actually call the ingestion methods.
   *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
   */
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
-    DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+  @Override
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     // Forced acls
     String[] acls = getAcls(spec);
 
-    Map<String,String> urls = new HashMap<String,String>();
-    getDocURLs(documentIdentifiers,urls);
-    for (int i = 0 ; i < documentIdentifiers.length ; i++)
+    Map<String,String> versions = new HashMap<String,String>();
+    getTimestamps(documentIdentifiers,versions,activities);
+    
+    List<String> fetchDocuments = new ArrayList<String>();
+    for (String documentIdentifier : documentIdentifiers)
     {
-      if (!scanOnly[i])
+      String versionString = versions.get(documentIdentifier);
+      if (versionString == null)
       {
-        String url = urls.get(documentIdentifiers[i]);
-        if (url != null)
-          getDocInfo(documentIdentifiers[i], versions[i], url, activities, acls);
+        activities.deleteDocument(documentIdentifier);
+        continue;
       }
+      
+      if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
+        continue;
+      
+      fetchDocuments.add(documentIdentifier);
+    }
+    
+    if (fetchDocuments.size() == 0)
+      return;
+    
+    String[] fetchDocumentsArray = fetchDocuments.toArray(new String[0]);
+    Map<String,String> urls = new HashMap<String,String>();
+    getDocURLs(documentIdentifiers,urls);
+    for (String documentIdentifier : fetchDocumentsArray)
+    {
+      String url = urls.get(documentIdentifier);
+      String versionString = versions.get(documentIdentifier);
+      if (url != null)
+        getDocInfo(documentIdentifier, versionString, url, activities, acls);
+      else
+        activities.noDocument(documentIdentifier,versionString);
     }
-  }
 
+  }
+  
   /**
    * Grab forced acl out of document specification.
    *
    * @param spec is the document specification.
    * @return the acls.
    */
-  protected static String[] getAcls(DocumentSpecification spec) {
-    HashMap map = new HashMap();
-    int i = 0;
-    while (i < spec.getChildCount()) {
-      SpecificationNode sn = spec.getChild(i++);
+  protected static String[] getAcls(Specification spec) {
+    Set<String> aclMap = new HashSet<String>();
+    for (int i = 0; i < spec.getChildCount(); i++)
+    {
+      SpecificationNode sn = spec.getChild(i);
       if (sn.getType().equals("access")) {
         String token = sn.getAttributeValue("token");
-        map.put(token, token);
+        aclMap.add(token);
       }
     }
 
-    String[] rval = new String[map.size()];
-    Iterator iter = map.keySet().iterator();
-    i = 0;
-    while (iter.hasNext()) {
-      rval[i++] = (String) iter.next();
+    String[] rval = new String[aclMap.size()];
+    int j = 0;
+    for (String acl : aclMap)
+    {
+      rval[j++] = acl;
     }
     return rval;
   }
@@ -3222,10 +3214,10 @@ public class WikiConnector extends org.a
   /** Thread to execute a "get timestamp" operation.  This thread both executes the operation and parses the result. */
   protected static class ExecuteGetTimestampThread extends Thread
   {
-    protected HttpClient client;
-    protected HttpRequestBase executeMethod;
+    protected final HttpClient client;
+    protected final HttpRequestBase executeMethod;
     protected Throwable exception = null;
-    protected Map<String,String> versions;
+    protected final Map<String,String> versions;
     protected boolean loginNeeded = false;
 
     public ExecuteGetTimestampThread(HttpClient client, HttpRequestBase executeMethod, Map<String,String> versions)

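The WikiConnector hunk shows the batched reconciliation idiom that absorbs the old getDocumentVersions step: resolve versions for the whole batch, delete what is gone, then fetch only what checkDocumentNeedsReindexing reports as changed. Condensed from the diff (getTimestamps is the connector's own helper):

    // Phase 1: resolve versions for the whole batch.
    Map<String,String> versions = new HashMap<String,String>();
    getTimestamps(documentIdentifiers,versions,activities);

    // Phase 2: reconcile, keeping only documents that actually need a fetch.
    List<String> fetchDocuments = new ArrayList<String>();
    for (String documentIdentifier : documentIdentifiers)
    {
      String versionString = versions.get(documentIdentifier);
      if (versionString == null)
        activities.deleteDocument(documentIdentifier);          // page no longer exists
      else if (activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
        fetchDocuments.add(documentIdentifier);                 // changed since last crawl
    }
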
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/InterruptionRepositoryConnector.java Fri Sep 19 14:22:27 2014
@@ -54,48 +54,44 @@ public class InterruptionRepositoryConne
     }
   }
   
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the results added
+  * to the queue of documents for the current job, entered into the incremental ingestion manager, or both.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
+  */
   @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    String[] rval = new String[documentIdentifiers.length];
-    for (int i = 0; i < rval.length; i++)
-    {
-      rval[i] = "";
-    }
-    return rval;
-  }
-
-  @Override
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
-    DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     for (int i = 0; i < documentIdentifiers.length; i++)
     {
       String documentIdentifier = documentIdentifiers[i];
-      String version = versions[i];
-      if (!scanOnly[i])
+      if (documentIdentifier.equals("test0.txt"))
+      {
+        // This will emulate one particular document failing (and being skipped)
+        long currentTime = System.currentTimeMillis();
+        throw new ServiceInterruption("Pretending there's a service interruption",
+          null,currentTime+1000L,currentTime+5000L,10,false);
+      }
+      RepositoryDocument rd = new RepositoryDocument();
+      byte[] bytes = documentIdentifier.getBytes(StandardCharsets.UTF_8);
+      rd.setBinary(new ByteArrayInputStream(bytes),bytes.length);
+      try
+      {
+        activities.ingestDocumentWithException(documentIdentifier,"","http://"+documentIdentifier,rd);
+      }
+      catch (IOException e)
       {
-        if (documentIdentifier.equals("test0.txt"))
-        {
-          // This will emulate one particular document failing (and being skipped)
-          long currentTime = System.currentTimeMillis();
-          throw new ServiceInterruption("Pretending there's a service interruption",
-            null,currentTime+1000L,currentTime+5000L,10,false);
-        }
-        RepositoryDocument rd = new RepositoryDocument();
-        byte[] bytes = documentIdentifier.getBytes(StandardCharsets.UTF_8);
-        rd.setBinary(new ByteArrayInputStream(bytes),bytes.length);
-        try
-        {
-          activities.ingestDocumentWithException(documentIdentifier,version,"http://"+documentIdentifier,rd);
-        }
-        catch (IOException e)
-        {
-          throw new RuntimeException("Shouldn't be seeing IOException from binary array input stream: "+e.getMessage(),e);
-        }
+        throw new RuntimeException("Shouldn't be seeing IOException from binary array input stream: "+e.getMessage(),e);
       }
     }
   }

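An annotated reading of the ServiceInterruption thrown above may help; the per-argument meanings are an editor's interpretation of the constructor, not something this commit documents:

    long currentTime = System.currentTimeMillis();
    throw new ServiceInterruption(
      "Pretending there's a service interruption", // message recorded by the framework
      null,                  // underlying cause, if any
      currentTime + 1000L,   // presumed: earliest time to retry this document
      currentTime + 5000L,   // presumed: time after which retries are abandoned
      10,                    // presumed: maximum number of retries
      false);                // presumed: do not abort the job when retries run out
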
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/SchedulingRepositoryConnector.java Fri Sep 19 14:22:27 2014
@@ -60,22 +60,22 @@ public class SchedulingRepositoryConnect
     System.out.println("Seeding completed at "+System.currentTimeMillis());
   }
   
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the results added
+  * to the queue of documents for the current job, entered into the incremental ingestion manager, or both.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
+  */
   @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    String[] rval = new String[documentIdentifiers.length];
-    for (int i = 0; i < rval.length; i++)
-    {
-      rval[i] = "";
-    }
-    return rval;
-  }
-
-  @Override
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
-    DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     String documentsPerSeedString = params.getParameter("documentsperseed");
@@ -104,37 +104,34 @@ public class SchedulingRepositoryConnect
       }
       else
       {
-        if (!scanOnly[i])
+        System.out.println("Fetching "+documentIdentifier);
+        // Find the bin
+        String bin = documentIdentifier.substring(0,documentIdentifier.indexOf("/"));
+        // For now they are all the same
+        long binTimePerDocument = timePerDocument;
+        long now = System.currentTimeMillis();
+        long whenFetch;
+        synchronized (nextFetchTime)
+        {
+          Long time = nextFetchTime.get(bin);
+          if (time == null)
+            whenFetch = now;
+          else
+            whenFetch = time.longValue();
+          nextFetchTime.put(bin,new Long(whenFetch + binTimePerDocument));
+        }
+        if (whenFetch > now)
         {
-          System.out.println("Fetching "+documentIdentifier);
-          // Find the bin
-          String bin = documentIdentifier.substring(0,documentIdentifier.indexOf("/"));
-          // For now they are all the same
-          long binTimePerDocument = timePerDocument;
-          long now = System.currentTimeMillis();
-          long whenFetch;
-          synchronized (nextFetchTime)
+          System.out.println("Waiting "+(whenFetch-now)+" to fetch "+documentIdentifier);
+          try
           {
-            Long time = nextFetchTime.get(bin);
-            if (time == null)
-              whenFetch = now;
-            else
-              whenFetch = time.longValue();
-            nextFetchTime.put(bin,new Long(whenFetch + binTimePerDocument));
+            ManifoldCF.sleep(whenFetch-now);
           }
-          if (whenFetch > now)
+          catch (InterruptedException e)
           {
-            System.out.println("Waiting "+(whenFetch-now)+" to fetch "+documentIdentifier);
-            try
-            {
-              ManifoldCF.sleep(whenFetch-now);
-            }
-            catch (InterruptedException e)
-            {
-              throw new ManifoldCFException(e.getMessage(),ManifoldCFException.INTERRUPTED);
-            }
-            System.out.println("Wait complete for "+documentIdentifier);
+            throw new ManifoldCFException(e.getMessage(),ManifoldCFException.INTERRUPTED);
           }
+          System.out.println("Wait complete for "+documentIdentifier);
         }
       }
     }

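The de-indented SchedulingRepositoryConnector block above is a per-bin rate limiter: each bin's next allowed fetch time advances by a fixed interval as slots are claimed. A self-contained sketch of the same idiom (class and method names are illustrative, not the connector's):

import java.util.HashMap;
import java.util.Map;

// Per-bin throttle: callers claim a fetch slot, then sleep until it arrives.
public class BinThrottle
{
  private final Map<String,Long> nextFetchTime = new HashMap<String,Long>();
  private final long timePerDocument;

  public BinThrottle(long timePerDocument)
  {
    this.timePerDocument = timePerDocument;
  }

  /** Claim the next slot for a bin, sleeping if it lies in the future. */
  public void waitForSlot(String bin)
    throws InterruptedException
  {
    long whenFetch;
    long now = System.currentTimeMillis();
    synchronized (nextFetchTime)
    {
      Long time = nextFetchTime.get(bin);
      whenFetch = (time == null) ? now : time.longValue();
      nextFetchTime.put(bin, Long.valueOf(whenFetch + timePerDocument));
    }
    if (whenFetch > now)
      Thread.sleep(whenFetch - now);
  }
}
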
Modified: manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java?rev=1626228&r1=1626227&r2=1626228&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java (original)
+++ manifoldcf/branches/dev_1x/framework/pull-agent/src/test/java/org/apache/manifoldcf/crawler/tests/TestingRepositoryConnector.java Fri Sep 19 14:22:27 2014
@@ -54,41 +54,37 @@ public class TestingRepositoryConnector 
     }
   }
   
+  /** Process a set of documents.
+  * This is the method that should cause each document to be fetched, processed, and the results added
+  * to the queue of documents for the current job, entered into the incremental ingestion manager, or both.
+  * The document specification allows this class to filter what is done based on the job.
+  * The connector will be connected before this method can be called.
+  *@param documentIdentifiers is the set of document identifiers to process.
+  *@param statuses are the currently-stored document versions for each document in the set of document identifiers
+  * passed in above.
+  *@param activities is the interface this method should use to queue up new document references
+  * and ingest documents.
+  *@param jobMode is an integer describing how the job is being run, whether continuous or once-only.
+  *@param usesDefaultAuthority will be true only if the authority in use for these documents is the default one.
+  */
   @Override
-  public String[] getDocumentVersions(String[] documentIdentifiers, String[] oldVersions, IVersionActivity activities,
-    DocumentSpecification spec, int jobMode, boolean usesDefaultAuthority)
-    throws ManifoldCFException, ServiceInterruption
-  {
-    String[] rval = new String[documentIdentifiers.length];
-    for (int i = 0; i < rval.length; i++)
-    {
-      rval[i] = "";
-    }
-    return rval;
-  }
-
-  @Override
-  public void processDocuments(String[] documentIdentifiers, String[] versions, IProcessActivity activities,
-    DocumentSpecification spec, boolean[] scanOnly, int jobMode)
+  public void processDocuments(String[] documentIdentifiers, IExistingVersions statuses, Specification spec,
+    IProcessActivity activities, int jobMode, boolean usesDefaultAuthority)
     throws ManifoldCFException, ServiceInterruption
   {
     for (int i = 0; i < documentIdentifiers.length; i++)
     {
       String documentIdentifier = documentIdentifiers[i];
-      String version = versions[i];
-      if (!scanOnly[i])
+      RepositoryDocument rd = new RepositoryDocument();
+      byte[] bytes = documentIdentifier.getBytes(StandardCharsets.UTF_8);
+      rd.setBinary(new ByteArrayInputStream(bytes),bytes.length);
+      try
+      {
+        activities.ingestDocumentWithException(documentIdentifier,"","http://"+documentIdentifier,rd);
+      }
+      catch (IOException e)
       {
-        RepositoryDocument rd = new RepositoryDocument();
-        byte[] bytes = documentIdentifier.getBytes(StandardCharsets.UTF_8);
-        rd.setBinary(new ByteArrayInputStream(bytes),bytes.length);
-        try
-        {
-          activities.ingestDocumentWithException(documentIdentifier,version,"http://"+documentIdentifier,rd);
-        }
-        catch (IOException e)
-        {
-          throw new RuntimeException("Shouldn't be seeing IOException from binary array input stream: "+e.getMessage(),e);
-        }
+        throw new RuntimeException("Shouldn't be seeing IOException from binary array input stream: "+e.getMessage(),e);
       }
     }
   }


