incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1182600 - /incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Date Wed, 12 Oct 2011 22:28:57 GMT
Author: kwright
Date: Wed Oct 12 22:28:57 2011
New Revision: 1182600

URL: http://svn.apache.org/viewvc?rev=1182600&view=rev
Log:
Add logic for getting multiple urls during the processing phase.

Modified:
    incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java

Modified: incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java?rev=1182600&r1=1182599&r2=1182600&view=diff
==============================================================================
--- incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
(original)
+++ incubator/lcf/branches/CONNECTORS-256/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
Wed Oct 12 22:28:57 2011
@@ -1311,9 +1311,303 @@ public class WikiConnector extends org.a
   protected void getDocURLs(String[] documentIdentifiers, Map<String,String> urls)
     throws ManifoldCFException, ServiceInterruption
   {
-    // MHL
+    getSession();
+    HttpClient client = getInitializedClient();
+    HttpMethodBase executeMethod = getInitializedMethod(getGetDocURLsURL(documentIdentifiers));
+    try
+    {
+      ExecuteGetDocURLsThread t = new ExecuteGetDocURLsThread(client,executeMethod,urls);
+      try
+      {
+        t.start();
+        t.join(); // Wait for the worker thread to finish before inspecting its outcome
+        Throwable thr = t.getException();
+        if (thr != null)
+        {
+          if (thr instanceof ManifoldCFException)
+          {
+            if (((ManifoldCFException)thr).getErrorCode() == ManifoldCFException.INTERRUPTED)
+              throw new InterruptedException(thr.getMessage());
+            throw (ManifoldCFException)thr;
+          }
+          else if (thr instanceof ServiceInterruption)
+            throw (ServiceInterruption)thr;
+          else if (thr instanceof IOException)
+            throw (IOException)thr;
+          else if (thr instanceof RuntimeException)
+            throw (RuntimeException)thr;
+          else
+            throw (Error)thr; // Any remaining Throwable is treated as an Error
+        }
+      }
+      catch (ManifoldCFException e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (ServiceInterruption e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (IOException e)
+      {
+        t.interrupt();
+        throw e;
+      }
+      catch (InterruptedException e)
+      {
+        t.interrupt();
+        // We need the caller to abandon any connections left around, so rethrow in a way
that forces them to process the event properly.
+        throw e;
+      }
+    }
+    catch (InterruptedException e)
+    {
+      // Drop the connection on the floor: with executeMethod nulled, the finally block skips releaseConnection()
+      executeMethod = null;
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    catch (ManifoldCFException e)
+    {
+      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+        // Drop the connection on the floor (nulling executeMethod makes the finally block skip releaseConnection())
+        executeMethod = null;
+      throw e;
+    }
+    catch (java.net.SocketTimeoutException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("URL fetch timed out reading from the Wiki server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L
* 60000L,-1,false);
+    }
+    catch (java.net.SocketException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("URL fetch received a socket error reading from Wiki
server: "+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+    }
+    catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+    {
+      long currentTime = System.currentTimeMillis();
+      throw new ServiceInterruption("URL fetch connection timed out reading from Wiki server:
"+e.getMessage(),e,currentTime+300000L,currentTime+12L * 60000L,-1,false);
+    }
+    catch (InterruptedIOException e)
+    {
+      executeMethod = null;
+      throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+    }
+    catch (IOException e)
+    {
+      throw new ManifoldCFException("URL fetch had an IO failure: "+e.getMessage(),e);
+    }
+    finally
+    {
+      if (executeMethod != null)
+        executeMethod.releaseConnection();
+    }
   }
   
+  /** Create a URL to obtain the URLs of multiple pages, given their page IDs.
+  * The IDs are joined with "|" and passed as the pageids parameter of an action=query, prop=info, inprop=url request appended to baseURL. */
+  protected String getGetDocURLsURL(String[] documentIdentifiers)
+    throws ManifoldCFException
+  {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0 ; i < documentIdentifiers.length ; i++)
+    {
+      if (i > 0)
+        sb.append("|");
+      sb.append(documentIdentifiers[i]);
+    }
+    return baseURL + "action=query&prop=info&pageids="+sb.toString()+"&inprop=url";
+  }
+
+  /** Thread to execute a "get doc URLs" operation.  This thread both executes the operation
and parses the result.  Any Throwable raised inside run() is captured and made available through getException(). */
+  protected static class ExecuteGetDocURLsThread extends Thread
+  {
+    protected HttpClient client;
+    protected HttpMethodBase executeMethod;
+    protected Throwable exception = null;
+    protected Map<String,String> urls;
+
+    public ExecuteGetDocURLsThread(HttpClient client, HttpMethodBase executeMethod, Map<String,String>
urls)
+    {
+      super();
+      setDaemon(true);
+      this.client = client;
+      this.executeMethod = executeMethod;
+      this.urls = urls;
+    }
+
+    public void run()
+    {
+      try
+      {
+        // Execute the HTTP method; any status code other than 200 is reported as a ManifoldCFException
+        int rval = client.executeMethod(executeMethod);
+        if (rval != 200)
+          throw new ManifoldCFException("Unexpected response code: "+rval);
+        // Read the response body and hand it to the parser together with the urls map
+        InputStream is = executeMethod.getResponseBodyAsStream();
+        try
+        {
+          parseGetDocURLsResponse(is,urls);
+        }
+        finally
+        {
+          try
+          {
+            is.close();
+          }
+          catch (IllegalStateException e)
+          {
+            // Ignore this error
+          }
+        }
+      }
+      catch (Throwable e)
+      {
+        this.exception = e;
+      }
+    }
+
+    public Throwable getException()
+    {
+      return exception;
+    }
+
+  }
+
+  /** This method parses a "get doc URLs" response like the following:
+  * <api>
+  *   <query>
+  *     <pages>
+  *       <page pageid="27697087" ns="0" title="API" fullurl="..."/>
+  *     </pages>
+  *   </query>
+  * </api>
+  * The pageid and fullurl attributes are the ones consumed by the context classes that fill the urls map.  An IOException from the parser is rethrown as a ServiceInterruption. */
+  protected static void parseGetDocURLsResponse(InputStream is, Map<String,String>
urls)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Parse the document.  This will cause various things to occur, within the instantiated
XMLContext class.
+    XMLStream x = new XMLStream();
+    WikiGetDocURLsAPIContext c = new WikiGetDocURLsAPIContext(x,urls);
+    x.setContext(c);
+    try
+    {
+      try
+      {
+        x.parse(is);
+      }
+      catch (IOException e)
+      {
+        long time = System.currentTimeMillis();
+        throw new ServiceInterruption(e.getMessage(),e,time + 300000L,time + 12L * 60000L,-1,false);
+      }
+    }
+    finally
+    {
+      x.cleanup();
+    }
+  }
+
+  /** Class representing the "api" context of a "get doc URLs" response; its children are handled by WikiGetDocURLsQueryContext. */
+  protected static class WikiGetDocURLsAPIContext extends SingleLevelContext
+  {
+    protected Map<String,String> urls;
+    
+    public WikiGetDocURLsAPIContext(XMLStream theStream, Map<String,String> urls)
+    {
+      super(theStream,"api");
+      this.urls = urls;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String
qName, Attributes atts)
+    {
+      return new WikiGetDocURLsQueryContext(theStream,namespaceURI,localName,qName,atts,urls);
+    }
+    
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+
+  }
+
+  /** Class representing the "api/query" context of a "get doc URLs" response; its children are handled by WikiGetDocURLsPagesContext. */
+  protected static class WikiGetDocURLsQueryContext extends SingleLevelContext
+  {
+    protected Map<String,String> urls;
+    
+    public WikiGetDocURLsQueryContext(XMLStream theStream, String namespaceURI, String localName,
String qName, Attributes atts,
+      Map<String,String> urls)
+    {
+      super(theStream,namespaceURI,localName,qName,atts,"query");
+      this.urls = urls;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String
qName, Attributes atts)
+    {
+      return new WikiGetDocURLsPagesContext(theStream,namespaceURI,localName,qName,atts,urls);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+    
+  }
+
+  /** Class looking for the "api/query/pages" context of a "get doc URLs" response; its children are handled by WikiGetDocURLsPageContext. */
+  protected static class WikiGetDocURLsPagesContext extends SingleLevelContext
+  {
+    protected Map<String,String> urls;
+    
+    public WikiGetDocURLsPagesContext(XMLStream theStream, String namespaceURI, String localName,
String qName, Attributes atts,
+      Map<String,String> urls)
+    {
+      super(theStream,namespaceURI,localName,qName,atts,"pages");
+      this.urls = urls;
+    }
+
+    protected BaseProcessingContext createChild(String namespaceURI, String localName, String
qName, Attributes atts)
+    {
+      return new WikiGetDocURLsPageContext(theStream,namespaceURI,localName,qName,atts,urls);
+    }
+
+    protected void finishChild(BaseProcessingContext child)
+      throws ManifoldCFException
+    {
+    }
+  }
+
+  /** Class looking for the "api/query/pages/page" context of a "get doc URLs" response.  When beginTag sees a "page" tag with both pageid and fullurl attributes, it records pageid -> fullurl in the urls map.
*/
+  protected static class WikiGetDocURLsPageContext extends BaseProcessingContext
+  {
+    protected Map<String,String> urls;
+    
+    public WikiGetDocURLsPageContext(XMLStream theStream, String namespaceURI, String localName,
String qName, Attributes atts,
+      Map<String,String> urls)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+      this.urls = urls;
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes
atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      if (qName.equals("page"))
+      {
+        String pageID = atts.getValue("pageid");
+        String fullURL = atts.getValue("fullurl");
+        if (pageID != null && fullURL != null)
+          urls.put(pageID,fullURL);
+      }
+      return super.beginTag(namespaceURI,localName,qName,atts);
+    }
+    
+  }
+
   // -- Methods and classes to perform a "get Timestamp" operation. --
 
   /** Obtain document versions for a set of documents.



Mime
View raw message