incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1179076 - /incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Date Wed, 05 Oct 2011 07:34:08 GMT
Author: kwright
Date: Wed Oct  5 07:34:08 2011
New Revision: 1179076

URL: http://svn.apache.org/viewvc?rev=1179076&view=rev
Log:
Add session management and more http transaction infrastructure

Modified:
    incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1179076&r1=1179075&r2=1179076&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java (original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java Wed Oct  5 07:34:08 2011
@@ -31,8 +31,15 @@ import org.apache.manifoldcf.agents.comm
 import org.apache.manifoldcf.agents.common.XMLStringContext;
 import org.apache.manifoldcf.agents.common.XMLFileContext;
 
+import org.apache.commons.httpclient.*;
+import org.apache.commons.httpclient.methods.*;
+import org.apache.commons.httpclient.params.*;
+import org.apache.commons.httpclient.auth.*;
+import org.apache.commons.httpclient.protocol.*;
+
 import java.util.*;
 import java.io.*;
+import java.net.*;
 
 /** This is the repository connector for a wiki.
 */
@@ -41,17 +48,25 @@ public class WikiConnector extends org.a
   public static final String _rcsid = "@(#)$Id$";
 
   // Activities that we know about
+  
+  /** Fetch activity */
   protected final static String ACTIVITY_FETCH = "fetch document";
 
-  // Activities list
+  /** Activities list */
   protected static final String[] activitiesList = new String[]{ACTIVITY_FETCH};
 
-  // Parameters
-  protected String protocol = null;
+  /** Has setup been called? */
+  protected boolean hasBeenSetup = false;
+  
+  /** Server name */
   protected String server = null;
-  protected int port = -1;
-  protected String path = null;
   
+  /** Base URL */
+  protected String baseURL = null;
+  
+  /** Connection management */
+  protected MultiThreadedHttpConnectionManager connectionManager = null;
+
   /** Constructor.
   */
   public WikiConnector()
@@ -83,7 +98,125 @@ public class WikiConnector extends org.a
   public void connect(ConfigParams configParameters)
   {
     super.connect(configParameters);
-    server = configParameters.getParameter(WikiConfig.PARAM_SERVER);
+    server = params.getParameter(WikiConfig.PARAM_SERVER);
+  }
+
+  protected void getSession()
+    throws ManifoldCFException, ServiceInterruption
+  {
+    if (hasBeenSetup == false)
+    {
+      String protocol = params.getParameter(WikiConfig.PARAM_PROTOCOL);
+      if (protocol == null || protocol.length() == 0)
+        protocol = "http";
+      String portString = params.getParameter(WikiConfig.PARAM_PORT);
+      if (portString == null || portString.length() == 0)
+        portString = null;
+      String path = params.getParameter(WikiConfig.PARAM_PATH);
+      if (path == null)
+        path = "";
+      
+      baseURL = protocol + "://" + server + ((portString!=null)?":" + portString:"") + path + "/api.php?";
+
+      // Set up connection manager
+      connectionManager = new MultiThreadedHttpConnectionManager();
+      connectionManager.getParams().setMaxTotalConnections(1);
+
+      hasBeenSetup = true;
+    }
+  }
+  
+  /** Check status of connection.
+  */
+  @Override
+  public String check()
+    throws ManifoldCFException
+  {
+    try
+    {
+      // Destroy saved session setup and repeat it
+      hasBeenSetup = false;
+      getSession();
+
+      // Now, set up trial data fetch
+      String checkURL = getCheckURL();
+      
+      HttpClient client = getInitializedClient();
+      try
+      {
+        // Set up fetch using our special stuff if it's https
+        GetMethod method = new GetMethod(checkURL);
+        try
+        {
+          method.getParams().setParameter("http.socket.timeout", new Integer(300000));
+          int statusCode = executeMethodViaThread(client,method);
+          switch (statusCode)
+          {
+          case HttpStatus.SC_OK:
+            return super.check();
+
+          default:
+            return "Fetch test returned an unexpected response code of "+Integer.toString(statusCode);
+          }
+        }
+        catch (InterruptedException e)
+        {
+          // Drop the connection on the floor
+          method = null;
+          throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+        }
+        catch (java.net.SocketTimeoutException e)
+        {
+          return "Fetch test timed out reading from the Wiki server: "+e.getMessage();
+        }
+        catch (java.net.SocketException e)
+        {
+          return "Fetch test received a socket error reading from Wiki server: "+e.getMessage();
+        }
+        catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+        {
+          return "Fetch test connection timed out reading from Wiki server: "+e.getMessage();
+        }
+        catch (InterruptedIOException e)
+        {
+          throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+        }
+        catch (IOException e)
+        {
+          return "Fetch test had an IO failure: "+e.getMessage();
+        }
+        finally
+        {
+          if (method != null)
+            method.releaseConnection();
+        }
+      }
+      catch (IllegalStateException e)
+      {
+        return "Fetch test had a state exception talking to Livelink HTTP Server: "+e.getMessage();
+      }
+    }
+    catch (ServiceInterruption e)
+    {
+      return "Transient error: "+e.getMessage();
+    }
+    catch (ManifoldCFException e)
+    {
+      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+        throw e;
+      return "Error: "+e.getMessage();
+    }
+  }
+
+  /** This method is periodically called for all connectors that are connected but not
+  * in active use.
+  */
+  @Override
+  public void poll()
+    throws ManifoldCFException
+  {
+    if (connectionManager != null)
+      connectionManager.closeIdleConnections(60000L);
   }
 
   /** Close the connection.  Call this before discarding the connection.
@@ -92,7 +225,16 @@ public class WikiConnector extends org.a
   public void disconnect()
     throws ManifoldCFException
   {
+    hasBeenSetup = false;
     server = null;
+    baseURL = null;
+    
+    if (connectionManager != null)
+    {
+      connectionManager.shutdown();
+      connectionManager = null;
+    }
+
     super.disconnect();
   }
 
@@ -480,6 +622,118 @@ public class WikiConnector extends org.a
 
   // Protected static classes and methods
 
+  /** Create and initialize an HttpClient instance */
+  protected HttpClient getInitializedClient()
+    throws ServiceInterruption, ManifoldCFException
+  {
+    HttpClient client = new HttpClient(connectionManager);
+    return client;
+  }
+
+  /** Execute an HttpClient method via thread, so we don't get stuck in socket wait */
+  protected static int executeMethodViaThread(HttpClient client, HttpMethodBase executeMethod)
+    throws InterruptedException, IOException
+  {
+    ExecuteMethodThread t = new ExecuteMethodThread(client,executeMethod);
+    try
+    {
+      t.start();
+      t.join();
+      Throwable thr = t.getException();
+      if (thr != null)
+      {
+        if (thr instanceof IOException)
+          throw (IOException)thr;
+        else if (thr instanceof RuntimeException)
+          throw (RuntimeException)thr;
+        else
+          throw (Error)thr;
+      }
+      return t.getResponse();
+    }
+    catch (InterruptedException e)
+    {
+      t.interrupt();
+      // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
+      throw e;
+    }
+  }
+
+  /** Thread to execute an HttpClient method */
+  protected static class ExecuteMethodThread extends Thread
+  {
+    protected HttpClient client;
+    protected HttpMethodBase executeMethod;
+    protected Throwable exception = null;
+    protected int rval = 0;
+
+    public ExecuteMethodThread(HttpClient client, HttpMethodBase executeMethod)
+    {
+      super();
+      setDaemon(true);
+      this.client = client;
+      this.executeMethod = executeMethod;
+    }
+
+    public void run()
+    {
+      try
+      {
+        // Call the execute method appropriately
+        rval = client.executeMethod(executeMethod);
+      }
+      catch (Throwable e)
+      {
+        this.exception = e;
+      }
+    }
+
+    public Throwable getException()
+    {
+      return exception;
+    }
+
+    public int getResponse()
+    {
+      return rval;
+    }
+  }
+
+
+  /** Create a URL to obtain the next 500 pages.
+  */
+  protected String getListPagesURL(String startingTitle)
+    throws ManifoldCFException
+  {
+    try
+    {
+      return baseURL + "action=query&list=allpages" +
+        ((startingTitle!=null)?"&apfrom="+URLEncoder.encode(startingTitle,"utf-8"):"") +
+        "&aplimit=500";
+    }
+    catch (UnsupportedEncodingException e)
+    {
+      throw new ManifoldCFException(e.getMessage(),e);
+    }
+  }
+
+  /** Create a URL to obtain a page's browse URL, given the page ID.
+  */
+  protected String getGetURLURL(String documentIdentifier)
+    throws ManifoldCFException
+  {
+    return baseURL + "action=query&prop=info&pageids="+documentIdentifier+"&inprop=url";
+  }
+  
+  /** Parse the response to a GetURL request, of the form:
+  * <api>
+  *  <query>
+  *    <pages>
+  *      <page pageid="27697087" ns="0" title="API" touched="2011-09-27T07:00:55Z" lastrevid="367741756" counter="" length="70" redirect="" fullurl="http://en.wikipedia.org/wiki/API" editurl="http://en.wikipedia.org/w/index.php?title=API&amp;action=edit" />
+  *    </pages>
+  *  </query>
+  *</api>
+  */
   protected static String parseGetURLResponse(InputStream is, String documentIdentifier)
     throws ManifoldCFException, ServiceInterruption
   {
@@ -517,6 +771,7 @@ public class WikiConnector extends org.a
     }
   }
   
+  /** Class representing the "api" context of a "get url" response */
   protected static class WikiGetURLAPIContext extends SingleLevelContext
   {
     protected String fullURL = null;
@@ -548,6 +803,7 @@ public class WikiConnector extends org.a
 
   }
 
+  /** Class representing the "api/query" context of a "get url" response */
   protected static class WikiGetURLQueryContext extends SingleLevelContext
   {
     protected String fullURL = null;
@@ -579,6 +835,7 @@ public class WikiConnector extends org.a
     
   }
 
+  /** Class representing the "api/query/pages" context of a "get url" response */
   protected static class WikiGetURLPagesContext extends SingleLevelContext
   {
     protected String fullURL = null;
@@ -610,6 +867,7 @@ public class WikiConnector extends org.a
 
   }
 
+  /** Class representing the "api/query/pages/page" context of a "get url" response */
   protected static class WikiGetURLPageContext extends BaseProcessingContext
   {
     protected String fullURL = null;
@@ -638,4 +896,28 @@ public class WikiConnector extends org.a
     }
   }
 
+  /** Create a URL to obtain a page's timestamp, given the page ID.
+  */
+  protected String getGetTimestampURL(String documentIdentifier)
+    throws ManifoldCFException
+  {
+    return baseURL + "action=query&prop=revisions&pageids="+documentIdentifier+"&rvprop=timestamp";
+  }
+
+  /** Create a URL to obtain a page's metadata and content, given the page ID.
+  */
+  protected String getGetDocinfoURL(String documentIdentifier)
+    throws ManifoldCFException
+  {
+    return baseURL + "action=query&prop=revisions&pageids="+documentIdentifier+"&rvprop=user|comment|content";
+  }
+
+  /** Get a URL for a check operation.
+  */
+  protected String getCheckURL()
+    throws ManifoldCFException
+  {
+    return baseURL + "action=query&list=allpages&aplimit=1";
+  }
+  
 }



Mime
View raw message