manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1416199 [1/3] - in /manifoldcf/trunk: connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcraw...
Date Sun, 02 Dec 2012 16:58:00 GMT
Author: kwright
Date: Sun Dec  2 16:57:56 2012
New Revision: 1416199

URL: http://svn.apache.org/viewvc?rev=1416199&view=rev
Log:
Refactor webcrawler connector, part of CONNECTORS-574.

Added:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
  (with props)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
  (with props)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
  (with props)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
  (with props)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
  (with props)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
  (with props)
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockSessionWebService.java
  (with props)
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/SessionTester.java
  (with props)
Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/CredentialsDescription.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LoginParameters.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_en_US.properties
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/webcrawler/common_ja_JP.properties
    manifoldcf/trunk/tests/webcrawler/src/test/java/org/apache/manifoldcf/webcrawler_tests/MockWebService.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/CredentialsDescription.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/CredentialsDescription.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/CredentialsDescription.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/CredentialsDescription.java
Sun Dec  2 16:57:56 2012
@@ -108,6 +108,9 @@ public class CredentialsDescription
                 String authPageRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_URLREGEXP);
                 String pageType = child.getAttributeValue(WebcrawlerConfig.ATTR_TYPE);
                 String matchRegexp = child.getAttributeValue(WebcrawlerConfig.ATTR_MATCHREGEXP);
+		String overrideTargetURL = child.getAttributeValue(WebcrawlerConfig.ATTR_OVERRIDETARGETURL);
+		if (overrideTargetURL != null && overrideTargetURL.length() == 0)
+		  overrideTargetURL = null;
                 Pattern authPattern;
                 try
                 {
@@ -128,15 +131,19 @@ public class CredentialsDescription
                 }
                 if (pageType.equals(WebcrawlerConfig.ATTRVALUE_FORM))
                 {
-                  sc.addAuthPage(authPageRegexp,authPattern,null,null,matchRegexp,matchPattern,null,null);
+                  sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,matchRegexp,matchPattern,null,null,null,null);
                 }
                 else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_LINK))
                 {
-                  sc.addAuthPage(authPageRegexp,authPattern,matchRegexp,matchPattern,null,null,null,null);
+                  sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,matchRegexp,matchPattern,null,null,null,null,null,null);
                 }
                 else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_REDIRECTION))
                 {
-                  sc.addAuthPage(authPageRegexp,authPattern,null,null,null,null,matchRegexp,matchPattern);
+                  sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,matchRegexp,matchPattern,null,null);
+                }
+                else if (pageType.equals(WebcrawlerConfig.ATTRVALUE_CONTENT))
+                {
+                  sc.addAuthPage(authPageRegexp,authPattern,overrideTargetURL,null,null,null,null,null,null,matchRegexp,matchPattern);
                 }
                 else
                   throw new ManifoldCFException("Invalid page type: "+pageType);
@@ -315,38 +322,50 @@ public class CredentialsDescription
   protected static class SessionCredentialItem implements LoginParameters
   {
     /** url regexp */
-    protected String regexp;
+    protected final String regexp;
     /** Url match pattern */
-    protected Pattern pattern;
+    protected final Pattern pattern;
+    /** Override target URL */
+    protected final String overrideTargetURL;
     /** The preferred redirection regexp */
-    protected String preferredRedirectionRegexp;
+    protected final String preferredRedirectionRegexp;
     /** The preferred redirection pattern, or null if there's no preferred redirection */
-    protected Pattern preferredRedirectionPattern;
+    protected final Pattern preferredRedirectionPattern;
     /** The preferred link regexp */
-    protected String preferredLinkRegexp;
+    protected final String preferredLinkRegexp;
     /** The preferred link pattern, or null if there's no preferred link */
-    protected Pattern preferredLinkPattern;
+    protected final Pattern preferredLinkPattern;
     /** The form name regexp */
-    protected String formNameRegexp;
+    protected final String formNameRegexp;
     /** The form name pattern, or null if no form is expected */
-    protected Pattern formNamePattern;
+    protected final Pattern formNamePattern;
+    /** The content regexp */
+    protected final String contentRegexp;
+    /** The content pattern, or null if no content is being sought */
+    protected final Pattern contentPattern;
+    
     /** The list of the parameters we want to add for this pattern. */
-    protected List parameters = new ArrayList();
+    protected final List parameters = new ArrayList();
 
     /** Constructor */
     public SessionCredentialItem(String regexp, Pattern p,
+      String overrideTargetURL,
       String preferredLinkRegexp, Pattern preferredLinkPattern,
       String formNameRegexp, Pattern formNamePattern,
-      String preferredRedirectionRegexp, Pattern preferredRedirectionPattern)
+      String preferredRedirectionRegexp, Pattern preferredRedirectionPattern,
+      String contentRegexp, Pattern contentPattern)
     {
       this.regexp = regexp;
       this.pattern = p;
+      this.overrideTargetURL = overrideTargetURL;
       this.preferredLinkRegexp = preferredLinkRegexp;
       this.preferredLinkPattern = preferredLinkPattern;
       this.formNameRegexp = formNameRegexp;
       this.formNamePattern = formNamePattern;
       this.preferredRedirectionRegexp = preferredRedirectionRegexp;
       this.preferredRedirectionPattern = preferredRedirectionPattern;
+      this.contentRegexp = contentRegexp;
+      this.contentPattern = contentPattern;
     }
 
     /** Add parameter */
@@ -361,6 +380,13 @@ public class CredentialsDescription
       return pattern;
     }
 
+    /** Get the override target URL.
+    */
+    public String getOverrideTargetURL()
+    {
+      return overrideTargetURL;
+    }
+
     /** Get the preferred redirection pattern.
     */
     public Pattern getPreferredRedirectionPattern()
@@ -382,6 +408,13 @@ public class CredentialsDescription
       return formNamePattern;
     }
 
+    /** Get the content pattern.
+    */
+    public Pattern getContentPattern()
+    {
+      return contentPattern;
+    }
+
     /** Get the name of the i'th parameter.
     */
     public Pattern getParameterNamePattern(int index)
@@ -440,6 +473,14 @@ public class CredentialsDescription
       else if (!formNameRegexp.equals(sci.formNameRegexp))
         return false;
 
+      if (contentRegexp == null || sci.contentRegexp == null)
+      {
+        if (contentRegexp != sci.contentRegexp)
+          return false;
+      }
+      else if (!contentRegexp.equals(sci.contentRegexp))
+        return false;
+
       if (parameters.size() != sci.parameters.size())
         return false;
       int i = 0;
@@ -457,7 +498,8 @@ public class CredentialsDescription
     {
       int rval = regexp.hashCode() + ((preferredRedirectionRegexp==null)?0:preferredRedirectionRegexp.hashCode())
+
         ((preferredLinkRegexp==null)?0:preferredLinkRegexp.hashCode()) +
-        ((formNameRegexp==null)?0:formNameRegexp.hashCode());
+        ((formNameRegexp==null)?0:formNameRegexp.hashCode()) +
+	((contentRegexp==null)?0:contentRegexp.hashCode());
       int i = 0;
       while (i < parameters.size())
       {
@@ -539,15 +581,19 @@ public class CredentialsDescription
 
     /** Add an auth page */
     public void addAuthPage(String urlregexp, Pattern urlPattern,
+      String overrideTargetURL,
       String preferredLinkRegexp, Pattern preferredLinkPattern,
       String formNameRegexp, Pattern formNamePattern,
-      String preferredRedirectionRegexp, Pattern preferredRedirectionPattern)
+      String preferredRedirectionRegexp, Pattern preferredRedirectionPattern,
+      String contentRegexp, Pattern contentPattern)
       throws ManifoldCFException
     {
       sessionPages.put(urlregexp,new SessionCredentialItem(urlregexp,urlPattern,
+	overrideTargetURL,
         preferredLinkRegexp,preferredLinkPattern,
         formNameRegexp,formNamePattern,
-        preferredRedirectionRegexp,preferredRedirectionPattern));
+        preferredRedirectionRegexp,preferredRedirectionPattern,
+	contentRegexp,contentPattern));
     }
 
     /** Add a page parameter */

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,127 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+import java.util.regex.*;
+import java.util.*;
+
+/** This class is the handler for HTML content grepping during state transitions */
+public class FindContentHandler extends FindHandler implements IHTMLHandler
+{
+  protected final Pattern contentPattern;
+  protected final StringBuilder contentBuffer = new StringBuilder();
+
+  public FindContentHandler(String parentURI, Pattern contentPattern)
+  {
+    super(parentURI);
+    this.contentPattern = contentPattern;
+  }
+
+  /** Apply overrides */
+  public void applyOverrides(LoginParameters lp)
+    throws ManifoldCFException
+  {
+    if (targetURI != null && lp != null)
+    {
+      if (lp.getOverrideTargetURL() != null)
+        super.noteDiscoveredLink(lp.getOverrideTargetURL());
+    }
+  }
+
+  /** Note a character of text.
+  * Structured this way to keep overhead low for handlers that don't use text.
+  */
+  @Override
+  public void noteTextCharacter(char textCharacter)
+    throws ManifoldCFException
+  {
+    if (targetURI != null)
+      return;
+    // Build characters up into lines, and apply the regexp against them
+    if (textCharacter == '\t' || textCharacter >= ' ')
+      contentBuffer.append(textCharacter);
+    else
+    {
+      String bufferContents = contentBuffer.toString();
+      contentBuffer.setLength(0);
+      if (contentPattern.matcher(bufferContents).find())
+        targetURI = "";
+    }
+  }
+
+  /** Note a meta tag */
+  @Override
+  public void noteMetaTag(Map metaAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note the start of a form */
+  @Override
+  public void noteFormStart(Map formAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note an input tag */
+  @Override
+  public void noteFormInput(Map inputAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note the end of a form */
+  @Override
+  public void noteFormEnd()
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteAHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteLINKHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered IMG SRC */
+  @Override
+  public void noteIMGSRC(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered FRAME SRC */
+  @Override
+  public void noteFRAMESRC(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+
+}

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindContentHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,191 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+import java.util.regex.*;
+import java.util.*;
+
+/** This class is the handler for HTML form parsing during state transitions */
+public class FindHTMLFormHandler extends FindHandler implements IHTMLHandler
+{
+  protected final Pattern formNamePattern;
+  protected FormDataAccumulator discoveredFormData = null;
+  protected FormDataAccumulator currentFormData = null;
+
+  public FindHTMLFormHandler(String parentURI, Pattern formNamePattern)
+  {
+    super(parentURI);
+    this.formNamePattern = formNamePattern;
+  }
+
+  public void applyFormOverrides(LoginParameters lp)
+    throws ManifoldCFException
+  {
+    if (discoveredFormData != null && lp != null)
+    {
+      if (lp.getOverrideTargetURL() != null)
+      {
+        super.noteDiscoveredLink(lp.getOverrideTargetURL());
+        discoveredFormData.overrideActionURI(getTargetURI());
+      }
+      discoveredFormData.applyOverrides(lp);
+    }
+  }
+
+  public FormData getFormData()
+  {
+    return discoveredFormData;
+  }
+
+  /** Note a character of text.
+  * Structured this way to keep overhead low for handlers that don't use text.
+  */
+  @Override
+  public void noteTextCharacter(char textCharacter)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note a meta tag */
+  @Override
+  public void noteMetaTag(Map metaAttributes)
+    throws ManifoldCFException
+  {
+  }
+    
+  /** Note the start of a form */
+  @Override
+  public void noteFormStart(Map formAttributes)
+    throws ManifoldCFException
+  {
+    if (Logging.connectors.isDebugEnabled())
+      Logging.connectors.debug("WEB: Saw form with"+
+        " name "+((formAttributes.get("name")==null)?"null":"'"+formAttributes.get("name")+"'")
+
+        " id "+((formAttributes.get("id")==null)?"null":"'"+formAttributes.get("id")+"'")
+
+        " action "+((formAttributes.get("action")==null)?"null":"'"+formAttributes.get("action")+"'")
+      );
+
+    // Is this a form element we can use?
+    boolean canUse;
+    if (formNamePattern != null)
+    {
+      // Find the identifier we will use for the form.  If name isn't there,
+      // we use id.  If id isn't there, we use action.  The only other thing we
+      // could reasonably do is identify the form by its form elements.
+      String formName = (String)formAttributes.get("name");
+      if (formName == null)
+        formName = (String)formAttributes.get("id");
+      if (formName == null)
+        formName = (String)formAttributes.get("action");
+      if (formName == null)
+        formName = "";
+
+      Matcher m = formNamePattern.matcher(formName);
+      canUse = m.find();
+    }
+    else
+      canUse = true;
+
+    if (canUse)
+    {
+      String actionURI = (String)formAttributes.get("action");
+      if (actionURI == null)
+        // Action URI is THIS uri!
+        actionURI = parentURI;
+      else if (actionURI.length() == 0)
+        actionURI = "";
+      noteDiscoveredLink(actionURI);
+      actionURI = getTargetURI();
+      if (actionURI != null)
+      {
+        String method = (String)formAttributes.get("method");
+        if (method == null || method.length() == 0)
+          method = "get";
+        else
+          method = method.toLowerCase();
+
+        // Start a new form
+        currentFormData = new FormDataAccumulator(actionURI,method.equals("post")?FormData.SUBMITMETHOD_POST:FormData.SUBMITMETHOD_GET);
+
+      }
+    }
+  }
+
+  /** Note an input tag */
+  @Override
+  public void noteFormInput(Map inputAttributes)
+    throws ManifoldCFException
+  {
+    if (Logging.connectors.isDebugEnabled())
+    {
+      String type = (String)inputAttributes.get("type");
+      if (type == null)
+        type = "text";
+      String name = (String)inputAttributes.get("name");
+      if (name == null)
+        name = "(null)";
+      Logging.connectors.debug("WEB: Saw form element of type '"+type+"' name '"+name+"'");
+    }
+    if (currentFormData != null)
+      currentFormData.addElement(inputAttributes);
+  }
+
+  /** Note the end of a form */
+  @Override
+  public void noteFormEnd()
+    throws ManifoldCFException
+  {
+    if (currentFormData != null)
+    {
+      discoveredFormData = currentFormData;
+      currentFormData = null;
+    }
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteAHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteLINKHREF(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered IMG SRC */
+  @Override
+  public void noteIMGSRC(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered FRAME SRC */
+  @Override
+  public void noteFRAMESRC(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+}

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLFormHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,147 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+import java.util.regex.*;
+import java.util.*;
+
+/** This class is the handler for HTML parsing during state transitions */
+public class FindHTMLHrefHandler extends FindHandler implements IHTMLHandler
+{
+  protected final Pattern preferredLinkPattern;
+
+  public FindHTMLHrefHandler(String parentURI, Pattern preferredLinkPattern)
+  {
+    super(parentURI);
+    this.preferredLinkPattern = preferredLinkPattern;
+  }
+
+  /** Apply overrides */
+  public void applyOverrides(LoginParameters lp)
+    throws ManifoldCFException
+  {
+    if (targetURI != null && lp != null)
+    {
+      if (lp.getOverrideTargetURL() != null)
+        super.noteDiscoveredLink(lp.getOverrideTargetURL());
+    }
+  }
+
+  /** Note a character of text.
+  * Structured this way to keep overhead low for handlers that don't use text.
+  */
+  @Override
+  public void noteTextCharacter(char textCharacter)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note a meta tag */
+  @Override
+  public void noteMetaTag(Map metaAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note the start of a form */
+  @Override
+  public void noteFormStart(Map formAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note an input tag */
+  @Override
+  public void noteFormInput(Map inputAttributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note the end of a form */
+  @Override
+  public void noteFormEnd()
+    throws ManifoldCFException
+  {
+  }
+
+  /** Override noteDiscoveredLink */
+  @Override
+  public void noteDiscoveredLink(String rawURL)
+    throws ManifoldCFException
+  {
+    if (targetURI == null)
+    {
+      Logging.connectors.debug("WEB: Tried to match raw url '"+rawURL+"'");
+      super.noteDiscoveredLink(rawURL);
+      if (targetURI != null)
+      {
+        Logging.connectors.debug("WEB: Tried to match cooked url '"+targetURI+"'");
+        // Is this a link we can use?
+        boolean canUse;
+        if (preferredLinkPattern != null)
+        {
+          Matcher m = preferredLinkPattern.matcher(targetURI);
+          canUse = m.find();
+          Logging.connectors.debug("WEB: Preferred link lookup "+((canUse)?"matched":"didn't
match")+" '"+targetURI+"'");
+        }
+        else
+        {
+          Logging.connectors.debug("WEB: Preferred link lookup for '"+targetURI+"' had no
pattern to match");
+          canUse = true;
+        }
+        if (!canUse)
+          targetURI = null;
+      }
+    }
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteAHREF(String rawURL)
+    throws ManifoldCFException
+  {
+    noteDiscoveredLink(rawURL);
+  }
+
+  /** Note discovered href */
+  @Override
+  public void noteLINKHREF(String rawURL)
+    throws ManifoldCFException
+  {
+    noteDiscoveredLink(rawURL);
+  }
+
+  /** Note discovered IMG SRC */
+  @Override
+  public void noteIMGSRC(String rawURL)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Note discovered FRAME SRC */
+  @Override
+  public void noteFRAMESRC(String rawURL)
+    throws ManifoldCFException
+  {
+    noteDiscoveredLink(rawURL);
+  }
+
+}

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHTMLHrefHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,105 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This class is used to discover links in a session login context */
+public class FindHandler implements IDiscoveredLinkHandler
+{
+  protected String parentURI;
+  protected String targetURI = null;
+
+  public FindHandler(String parentURI)
+  {
+    this.parentURI = parentURI;
+  }
+
+  /** Inform the world of a discovered link.
+  *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is achieved.
+  */
+  @Override
+  public void noteDiscoveredLink(String rawURL)
+    throws ManifoldCFException
+  {
+    // Build a complete url, but don't filter or anything
+    try
+    {
+      java.net.URI url;
+      if (parentURI != null)
+      {
+        java.net.URI parentURL = new java.net.URI(parentURI);
+        url = parentURL.resolve(rawURL);
+      }
+      else
+        url = new java.net.URI(rawURL);
+
+      String protocol = url.getScheme();
+      String host = url.getHost();
+
+      // The new URL better darn well have a host and a protocol, and we only know how to deal with
+      // http and https.
+      if (protocol == null || host == null)
+      {
+        return;
+      }
+      if (WebcrawlerConnector.understoodProtocols.get(protocol) == null)
+      {
+        return;
+      }
+
+      String id = url.toASCIIString();
+      if (id == null)
+        return;
+
+      // As a last basic legality check, go through looking for illegal characters.
+      int i = 0;
+      while (i < id.length())
+      {
+        char x = id.charAt(i++);
+        // Only 7-bit ascii is allowed in URLs - and that has limits too (no control characters)
+        if (x < ' ' || x > 127)
+        {
+          return;
+        }
+      }
+
+      // Set the target.
+      targetURI = id;
+    }
+    catch (java.net.URISyntaxException e)
+    {
+      return;
+    }
+    catch (java.lang.IllegalArgumentException e)
+    {
+      return;
+    }
+    catch (java.lang.NullPointerException e)
+    {
+      // This gets tossed by url.toASCIIString() for reasons I don't understand, but which have to do with a malformed URL.
+      return;
+    }
+  }
+
+  public String getTargetURI()
+  {
+    return targetURI;
+  }
+}

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,79 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+
+import java.util.regex.*;
+
+/** This class is the handler for redirection handling during state transitions */
+public class FindPreferredRedirectionHandler extends FindHandler implements IRedirectionHandler
+{
+  protected Pattern redirectionURIPattern;
+
+  public FindPreferredRedirectionHandler(String parentURI, Pattern redirectionURIPattern)
+  {
+    super(parentURI);
+    this.redirectionURIPattern = redirectionURIPattern;
+  }
+
+  /** Apply overrides */
+  public void applyOverrides(LoginParameters lp)
+    throws ManifoldCFException
+  {
+    if (targetURI != null && lp != null)
+    {
+      if (lp.getOverrideTargetURL() != null)
+        super.noteDiscoveredLink(lp.getOverrideTargetURL());
+    }
+  }
+    
+  /** Override noteDiscoveredLink */
+  @Override
+  public void noteDiscoveredLink(String rawURL)
+    throws ManifoldCFException
+  {
+    if (targetURI == null)
+    {
+      Logging.connectors.debug("WEB: Tried to match raw url '"+rawURL+"'");
+      super.noteDiscoveredLink(rawURL);
+      if (targetURI != null)
+      {
+        Logging.connectors.debug("WEB: Tried to match cooked url '"+targetURI+"'");
+        // Is this a redirection link we can use?
+        boolean canUse;
+        if (redirectionURIPattern != null)
+        {
+          Matcher m = redirectionURIPattern.matcher(targetURI);
+          canUse = m.find();
+          Logging.connectors.debug("WEB: Redirection link lookup "+((canUse)?"matched":"didn't match")+" '"+targetURI+"'");
+        }
+        else
+        {
+          Logging.connectors.debug("WEB: Redirection link lookup for '"+targetURI+"' had no pattern to match");
+          canUse = true;
+        }
+        if (!canUse)
+          targetURI = null;
+      }
+    }
+  }
+}
+

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindPreferredRedirectionHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java?rev=1416199&view=auto
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
(added)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
Sun Dec  2 16:57:56 2012
@@ -0,0 +1,32 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This class is the handler for redirection parsing during state transitions */
+public class FindRedirectionHandler extends FindHandler implements IRedirectionHandler
+{
+  public FindRedirectionHandler(String parentURI)
+  {
+    super(parentURI);
+  }
+
+}
+

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FindRedirectionHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
Sun Dec  2 16:57:56 2012
@@ -114,6 +114,11 @@ public class FormDataAccumulator impleme
     }
   }
 
+  public void overrideActionURI(String overrideURI)
+  {
+    this.actionURI = overrideURI;
+  }
+  
   public void applyOverrides(LoginParameters lp)
   {
     // This map contains the control names we have ALREADY wiped clean.

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
Sun Dec  2 16:57:56 2012
@@ -187,6 +187,8 @@ public class FormParseState extends Link
       if (optionValueText != null)
         optionValueText.append(thisChar);
     }
+    else
+      handler.noteTextCharacter(thisChar);
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
Sun Dec  2 16:57:56 2012
@@ -52,4 +52,11 @@ public interface IHTMLHandler extends ID
   /** Note discovered FRAME SRC */
   public void noteFRAMESRC(String rawURL)
     throws ManifoldCFException;
+
+  /** Note a character of text.
+  * Structured this way to keep overhead low for handlers that don't use text.
+  */
+  public void noteTextCharacter(char textCharacter)
+    throws ManifoldCFException;
+
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
Sun Dec  2 16:57:56 2012
@@ -33,6 +33,7 @@ public class LinkParseState extends Meta
     this.handler = handler;
   }
 
+  @Override
   protected void noteNonscriptTag(String tagName, Map attributes)
     throws ManifoldCFException
   {

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LoginParameters.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LoginParameters.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LoginParameters.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LoginParameters.java
Sun Dec  2 16:57:56 2012
@@ -26,6 +26,10 @@ public interface LoginParameters
 {
   public static final String _rcsid = "@(#)$Id: LoginParameters.java 988245 2010-08-23 18:39:35Z kwright $";
 
+  /** Get the override target URL.
+  */
+  public String getOverrideTargetURL();
+
   /** Get the preferred redirection pattern.
   */
   public Pattern getPreferredRedirectionPattern();
@@ -38,6 +42,10 @@ public interface LoginParameters
   */
   public Pattern getFormNamePattern();
 
+  /** Get the content pattern.
+  */
+  public Pattern getContentPattern();
+  
   /** Get the number of parameters.
   */
   public int getParameterCount();

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
Sun Dec  2 16:57:56 2012
@@ -32,6 +32,7 @@ public class MetaParseState extends Scri
     this.handler = handler;
   }
 
+  @Override
   protected void noteNonscriptTag(String tagName, Map attributes)
     throws ManifoldCFException
   {

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
Sun Dec  2 16:57:56 2012
@@ -37,6 +37,7 @@ public class ScriptParseState extends Ba
 
   // Override methods having to do with notification of tag discovery
 
+  @Override
   protected void noteTag(String tagName, Map attributes)
     throws ManifoldCFException
   {
@@ -57,6 +58,7 @@ public class ScriptParseState extends Ba
     }
   }
 
+  @Override
   protected void noteEndTag(String tagName)
     throws ManifoldCFException
   {

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java?rev=1416199&r1=1416198&r2=1416199&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
(original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConfig.java
Sun Dec  2 16:57:56 2012
@@ -96,8 +96,12 @@ public class WebcrawlerConfig
   public static final String ATTRVALUE_LINK = "link";
   /** Authentication page type: Redirection */
   public static final String ATTRVALUE_REDIRECTION = "redirection";
+  /** Authentication page type: Content */
+  public static final String ATTRVALUE_CONTENT = "content";
   /** Form name or link target regexp for authentication page */
   public static final String ATTR_MATCHREGEXP = "match";
+  /** URL to fetch next in a sequence (an override) */
+  public static final String ATTR_OVERRIDETARGETURL = "overridetargeturl";
   /** Authentication parameter node */
   public static final String NODE_AUTHPARAMETER = "authparameter";
   /** Authentication parameter name regexp */



Mime
View raw message