incubator-connectors-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r1034705 [1/2] - in /incubator/lcf/site: publish/ src/documentation/content/xdocs/
Date Sat, 13 Nov 2010 09:08:31 GMT
Author: kwright
Date: Sat Nov 13 09:08:30 2010
New Revision: 1034705

URL: http://svn.apache.org/viewvc?rev=1034705&view=rev
Log:
More native site stuff, so it can be versioned properly in svn

Added:
    incubator/lcf/site/publish/concepts.html
    incubator/lcf/site/publish/concepts.pdf   (with props)
    incubator/lcf/site/publish/writing-authority-connectors.html
    incubator/lcf/site/publish/writing-authority-connectors.pdf   (with props)
    incubator/lcf/site/publish/writing-output-connectors.html
    incubator/lcf/site/publish/writing-output-connectors.pdf   (with props)
    incubator/lcf/site/publish/writing-repository-connectors.html
    incubator/lcf/site/publish/writing-repository-connectors.pdf   (with props)
    incubator/lcf/site/src/documentation/content/xdocs/concepts.xml   (with props)
    incubator/lcf/site/src/documentation/content/xdocs/writing-authority-connectors.xml   (with props)
    incubator/lcf/site/src/documentation/content/xdocs/writing-output-connectors.xml   (with props)
    incubator/lcf/site/src/documentation/content/xdocs/writing-repository-connectors.xml   (with props)
Modified:
    incubator/lcf/site/publish/developer-resources.html
    incubator/lcf/site/publish/developer-resources.pdf
    incubator/lcf/site/publish/end-user-documentation.pdf
    incubator/lcf/site/publish/faq.pdf
    incubator/lcf/site/publish/how-to-build-and-deploy.pdf
    incubator/lcf/site/publish/index.pdf
    incubator/lcf/site/publish/linkmap.pdf
    incubator/lcf/site/publish/mail.pdf
    incubator/lcf/site/publish/programmatic-operation.pdf
    incubator/lcf/site/publish/who.pdf
    incubator/lcf/site/src/documentation/content/xdocs/developer-resources.xml

Added: incubator/lcf/site/publish/concepts.html
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/concepts.html?rev=1034705&view=auto
==============================================================================
--- incubator/lcf/site/publish/concepts.html (added)
+++ incubator/lcf/site/publish/concepts.html Sat Nov 13 09:08:30 2010
@@ -0,0 +1,362 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9-dev">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Concepts</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://www.apache.org"><img class="logoImage" alt="Apache" src="images/apache_feather.gif" title="Apache Software Foundation"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://incubator.apache.org/lcf"><img class="logoImage" alt="Apache ManifoldCF" src="images/ManifoldCF-logo.PNG" title="ManifoldCF"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.lucidimagination.com/search/" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://cwiki.apache.org/confluence/display/CONNECTORS/Index">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">About</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a href="index.html">Welcome</a>
+</div>
+<div class="menuitem">
+<a href="who.html">Who We Are</a>
+</div>
+<div class="menuitem">
+<a href="mail.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/lucene/">Buy Stuff</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/sponsorship.html">Sponsor Apache</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/thanks.html">Sponsors of Apache</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="faq.html">Frequently Asked Questions</a>
+</div>
+<div class="menuitem">
+<a href="developer-resources.html">Developer/Integrator Resources</a>
+</div>
+<div class="menuitem">
+<a href="end-user-documentation.html">End-user Documentation</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Related-Projects</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://incubator.apache.org/droids/">Droids</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Java</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/lucene.net/">Lucene.Net</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/lucy/">Lucy</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/mahout/">Mahout</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/openrelevance/">Open Relevance</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/pylucene/">PyLucene</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/solr/">Solr</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/tika/">Tika</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="concepts.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Concepts</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Concepts">Concepts</a>
+<ul class="minitoc">
+<li>
+<a href="#ManifoldCF+document+model">ManifoldCF document model</a>
+</li>
+<li>
+<a href="#ManifoldCF+security+model">ManifoldCF security model</a>
+</li>
+<li>
+<a href="#ManifoldCF+conceptual+entities">ManifoldCF conceptual entities</a>
+<ul class="minitoc">
+<li>
+<a href="#Connectors">Connectors</a>
+</li>
+<li>
+<a href="#Connections">Connections</a>
+</li>
+<li>
+<a href="#Jobs">Jobs</a>
+</li>
+</ul>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="Concepts"></a>
+<h2 class="h3">Concepts</h2>
+<div class="section">
+<p>ManifoldCF is a crawler framework which is designed to meet several key goals.</p>
+<p></p>
+<ul>
+        
+<li>It's reliable, and resilient against being shut down or restarted</li>
+        
+<li>It's incremental, meaning that jobs describe a set of documents by some criteria, and are meant to be run again and again to pick up any differences</li>
+        
+<li>It supports connections to multiple kinds of repositories at the same time</li>
+        
+<li>It defines and fully supports a model of document security, so that each document listed in a search result from the back-end search engine is one that the current user is allowed to see</li>
+        
+<li>It operates with reasonable efficiency and throughput</li>
+        
+<li>Its memory usage characteristics are bounded and predictable in advance</li>
+      
+</ul>
+<p></p>
+<p>ManifoldCF meets many of its architectural goals by being implemented on top of a relational database.  The current implementation requires either PostgreSQL or the included Derby database.  Longer term, we may support other DB bindings.</p>
+<p></p>
+<a name="N10034"></a><a name="ManifoldCF+document+model"></a>
+<h3 class="h4">ManifoldCF document model</h3>
+<p></p>
+<p>Each document in ManifoldCF consists of some opaque binary data, plus some opaque associated metadata (which is described by name-value pairs), and is uniquely addressed by a URI.  The back-end search engines which ManifoldCF communicates with are all expected to support, to a greater or lesser degree, this model.</p>
+<p></p>
+<p>Documents may also have access tokens associated with them.  These access tokens are described more fully in the next section.</p>
+<p></p>
+<a name="N10047"></a><a name="ManifoldCF+security+model"></a>
+<h3 class="h4">ManifoldCF security model</h3>
+<p></p>
+<p>The ManifoldCF security model is based loosely on the standard authorization concepts and hierarchies found in Microsoft's Active Directory.  Active Directory is quite common in the kinds of environments where data repositories exist that are ripe for indexing.  Active Directory's authorization model is also easily used in a general way to represent authorization for a huge variety of third-party content repositories.</p>
+<p></p>
+<p>ManifoldCF defines a concept of an <em>access token</em>.  An access token, to ManifoldCF, is a string which is meaningful only to a specific connector or connectors.  This string describes the ability of a user to view (or not view) some set of documents.  For documents protected by Active Directory itself, an access token would be an Active Directory SID (e.g. "S-1-23-4-1-45").  But, for example, for documents protected by Livelink a wholly different string would be used.</p>
+<p></p>
+<p>In the ManifoldCF security model, it is the job of an <em>authority</em> to provide a list of access tokens for a given searching user.  Multiple authorities cooperate in that each one can add to the list of access tokens describing a given user's security.  The resulting access tokens are handed to the search engine as part of every search request, so that the search engine may properly exclude documents that the user is not allowed to see.</p>
+<p></p>
+<p>When document indexing is done, therefore, it is the job of the crawler to hand access tokens to the search engine, so that it may categorize the documents properly according to their accessibility.  Note that the access tokens so provided are meaningful only within the space of the governing authority.  Access tokens can be provided as "grant" tokens, or as "deny" tokens.  Finally, there are multiple levels of tokens, which correspond to Active Directory's concepts of "share" security, "directory" security, or "file" security.  (The latter concepts are rarely used except for documents that come from Windows or Samba systems.)</p>
+<p></p>
+<p>Once all these documents and their access tokens are handed to the search engine, it is the search engine's job to enforce security by excluding inappropriate documents from the search results.  For Solr 1.5, this infrastructure has been submitted in jira ticket SOLR-1895, found <a href="https://issues.apache.org/jira/browse/SOLR-1895">here</a>, where you can download a SearchComponent plug-in and simple instructions for setting up your copy of Solr to enforce ManifoldCF's model of document security.  Bear in mind that this plug-in is still not a complete solution, as it requires an authenticated user name to be passed to it from some upstream source, possibly a JAAS authenticator within an application server framework.</p>
+<p></p>
+<a name="N10073"></a><a name="ManifoldCF+conceptual+entities"></a>
+<h3 class="h4">ManifoldCF conceptual entities</h3>
+<p></p>
+<a name="N1007B"></a><a name="Connectors"></a>
+<h4>Connectors</h4>
+<p></p>
+<p>ManifoldCF defines three different kinds of connectors.  These are:</p>
+<p></p>
+<ul>
+            
+<li>Authority connectors</li>
+            
+<li>Repository connectors</li>
+            
+<li>Output connectors</li>
+          
+</ul>
+<p></p>
+<p>All connectors share certain characteristics.  First, they are pooled.  This means that ManifoldCF keeps configured and connected instances of a connector around for a while, and has the ability to limit the total number of such instances to within some upper limit.  Connector implementations have specific methods in them for managing their existence in the pools that ManifoldCF keeps them in.  Second, they are configurable.  The configuration description for a connector is an XML document, whose precise format is determined by the connector implementation.  A configured connector instance is called a <em>connection</em>, by common ManifoldCF convention.</p>
+<p></p>
+<p>The function of each type of connector is described below.</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+            
+<tr>
+<th colspan="1" rowspan="1">Connector type</th><th colspan="1" rowspan="1">Function</th>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">Authority connector</td><td colspan="1" rowspan="1">Furnishes a standard way of mapping a user name to access tokens that are meaningful for a given type of repository</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">Repository connector</td><td colspan="1" rowspan="1">Fetches documents from a specific kind of repository, such as SharePoint or off the web</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">Output connector</td><td colspan="1" rowspan="1">Pushes document ingestion requests and deletion requests to a specific kind of back end search engine or other entity, such as Lucene</td>
+</tr>
+          
+</table>
+<p></p>
+<a name="N100D1"></a><a name="Connections"></a>
+<h4>Connections</h4>
+<p></p>
+<p>As described above, a <em>connection</em> is a connector implementation plus connector-specific configuration information.  A user can define connections of all three types in the crawler UI.</p>
+<p></p>
+<p>The kind of information included in the configuration data for a connector typically describes the "how", as opposed to the "what".  For example, you'd configure a LiveLink connection by specifying how to talk to the LiveLink server.  You would <strong>not</strong> include information about which documents to select in such a configuration.</p>
+<p></p>
+<p>There is one difference between how you define a <em>repository connection</em>, vs. how you would define an <em>authority connection</em> or <em>output connection</em>.  The difference is that you must specify a governing authority connection for your repository connection.  This is because <strong>all</strong> documents ingested by ManifoldCF need to include appropriate access tokens, and those access tokens are specific to the governing authority.</p>
+<p></p>
+<a name="N100FB"></a><a name="Jobs"></a>
+<h4>Jobs</h4>
+<p></p>
+<p>A <em>job</em> in ManifoldCF parlance is a description of some kind of synchronization that needs to occur between a specified repository connection and a specified output connection.  A job includes the following:</p>
+<p></p>
+<ul>
+            
+<li>A verbal description</li>
+            
+<li>A repository connection (and thus implicitly an authority connection as well)</li>
+            
+<li>An output connection</li>
+            
+<li>A repository-connection-specific description of "what" documents and metadata the job applies to</li>
+            
+<li>A model for crawling: either "run to completion", or "run continuously"</li>
+            
+<li>A schedule for when the job will run: either within specified time windows, or on demand</li>
+          
+</ul>
+<p></p>
+<p>Jobs are allowed to share the same repository connection, and thus they can overlap in the set of documents they describe.  ManifoldCF permits this situation, although when it occurs it is probably an accident.</p>
+</div>
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2009, 2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: incubator/lcf/site/publish/concepts.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/concepts.pdf?rev=1034705&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/lcf/site/publish/concepts.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: incubator/lcf/site/publish/developer-resources.html
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/developer-resources.html?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
--- incubator/lcf/site/publish/developer-resources.html (original)
+++ incubator/lcf/site/publish/developer-resources.html Sat Nov 13 09:08:30 2010
@@ -242,7 +242,7 @@ document.write("Last Published: " + docu
 <a name="N1001D"></a><a name="understandingManifoldCF"></a>
 <h2 class="h3">Familiarize Yourself with the Concepts and Terminology</h2>
 <div class="section">
-<p>Like any software project, ManifoldCF has its own underlying concepts and terminology.  Read about that <a href="http://cwiki.apache.org/confluence/display/CONNECTORS/ManifoldCF+concepts">here</a>.
+<p>Like any software project, ManifoldCF has its own underlying concepts and terminology.  Read about that <a href="concepts.html">here</a>.
 	</p>
 </div>
     
@@ -304,15 +304,15 @@ document.write("Last Published: " + docu
 <ul>
 	    
 <li>
-<a href="http://cwiki.apache.org/confluence/display/CONNECTORS/How+to+Write+an+Output+Connector">How to write an output connector</a>
+<a href="writing-output-connectors.html">How to write an output connector</a>
 </li>
 	    
 <li>
-<a href="http://cwiki.apache.org/confluence/display/CONNECTORS/How+to+Write+an+Authority+Connector">How to write an authority connector</a>
+<a href="writing-authority-connectors.html">How to write an authority connector</a>
 </li>
 	    
 <li>
-<a href="http://cwiki.apache.org/confluence/display/CONNECTORS/How+to+Write+a+Repository+Connector">How to write a repository connector</a>
+<a href="writing-repository-connectors.html">How to write a repository connector</a>
 </li>
 	
 </ul>

Modified: incubator/lcf/site/publish/developer-resources.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/developer-resources.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/end-user-documentation.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/end-user-documentation.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Files incubator/lcf/site/publish/end-user-documentation.pdf (original) and incubator/lcf/site/publish/end-user-documentation.pdf Sat Nov 13 09:08:30 2010 differ

Modified: incubator/lcf/site/publish/faq.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/faq.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Files incubator/lcf/site/publish/faq.pdf (original) and incubator/lcf/site/publish/faq.pdf Sat Nov 13 09:08:30 2010 differ

Modified: incubator/lcf/site/publish/how-to-build-and-deploy.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/how-to-build-and-deploy.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/index.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/index.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/linkmap.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/linkmap.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/mail.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/mail.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/programmatic-operation.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/programmatic-operation.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Modified: incubator/lcf/site/publish/who.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/who.pdf?rev=1034705&r1=1034704&r2=1034705&view=diff
==============================================================================
Binary files - no diff available.

Added: incubator/lcf/site/publish/writing-authority-connectors.html
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/writing-authority-connectors.html?rev=1034705&view=auto
==============================================================================
--- incubator/lcf/site/publish/writing-authority-connectors.html (added)
+++ incubator/lcf/site/publish/writing-authority-connectors.html Sat Nov 13 09:08:30 2010
@@ -0,0 +1,423 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9-dev">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Writing authority connectors</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://www.apache.org"><img class="logoImage" alt="Apache" src="images/apache_feather.gif" title="Apache Software Foundation"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://incubator.apache.org/lcf"><img class="logoImage" alt="Apache ManifoldCF" src="images/ManifoldCF-logo.PNG" title="ManifoldCF"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.lucidimagination.com/search/" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://cwiki.apache.org/confluence/display/CONNECTORS/Index">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">About</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a href="index.html">Welcome</a>
+</div>
+<div class="menuitem">
+<a href="who.html">Who We Are</a>
+</div>
+<div class="menuitem">
+<a href="mail.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/lucene/">Buy Stuff</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/sponsorship.html">Sponsor Apache</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/thanks.html">Sponsors of Apache</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="faq.html">Frequently Asked Questions</a>
+</div>
+<div class="menuitem">
+<a href="developer-resources.html">Developer/Integrator Resources</a>
+</div>
+<div class="menuitem">
+<a href="end-user-documentation.html">End-user Documentation</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Related-Projects</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://incubator.apache.org/droids/">Droids</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Java</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/lucene.net/">Lucene.Net</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/lucy/">Lucy</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/mahout/">Mahout</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/openrelevance/">Open Relevance</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/pylucene/">PyLucene</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/solr/">Solr</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/tika/">Tika</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="writing-authority-connectors.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Writing authority connectors</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Writing+an+Authority+Connector">Writing an Authority Connector</a>
+<ul class="minitoc">
+<li>
+<a href="#Key+concepts">Key concepts</a>
+</li>
+<li>
+<a href="#Implementing+the+Authority+Connector+class">Implementing the Authority Connector class</a>
+<ul class="minitoc">
+<li>
+<a href="#Principle+methods">Principle methods</a>
+</li>
+<li>
+<a href="#Notes+on+connector+UI+methods">Notes on connector UI methods</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Implementation+support+provided+by+the+framework">Implementation support provided by the framework</a>
+</li>
+<li>
+<a href="#DO%27s+and+DON%27T+DO%27s">DO's and DON'T DO's</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="Writing+an+Authority+Connector"></a>
+<h2 class="h3">Writing an Authority Connector</h2>
+<div class="section">
+<p></p>
+<p>An authority connector to a repository allows a repository's security model to be enforced by a search engine.  Its only function is to convert a user name (which is often a Kerberos principal name) into a set of <em>access tokens</em>.</p>
+<p></p>
+<p>The definition of an access token within ManifoldCF for a given repository is completely defined by the connectors that deal with that repository, with one exception.  That exception is for Active Directory.  Active Directory is so prevalent as a repository authorization mechanism that ManifoldCF currently treats it as the "default" authority - that is, if you don't specify another authority when you define a repository connection, ManifoldCF presumes that you mean that Active Directory should be the controlling authority for the connection.  In that case, an access token is simply an Active Directory SID.</p>
+<p></p>
+<p>For those repositories that do not use Active Directory as their authorization mechanism, an authority connector should be written, along with the repository connector for the repository.  Access tokens in that case represent a contract between your implementation of the authority connector for the repository, and the repository connector for the repository.  They must work together to define access tokens that will limit document access when used properly within any search engine query.</p>
+<p></p>
+<p>As is the case with all connectors under the ManifoldCF umbrella, an authority connector consists of a single part, which is:</p>
+<p></p>
+<ul>
+        
+<li>A class implementing an interface (in this case, <em>org.apache.manifoldcf.authorities.interfaces.IAuthorityConnector</em>)</li>
+      
+</ul>
+<p></p>
+<a name="N10034"></a><a name="Key+concepts"></a>
+<h3 class="h4">Key concepts</h3>
+<p></p>
+<p>The authority connector abstraction makes use of, or introduces, the following concepts:</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+          
+<tr>
+<th colspan="1" rowspan="1">Concept</th><th colspan="1" rowspan="1">What it is</th>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Configuration parameters</td><td colspan="1" rowspan="1">A hierarchical structure, internally represented as an XML document, which describes a specific configuration of a specific authority connector, i.e. <strong>how</strong> the connector should do its job; see <em>org.apache.manifoldcf.core.interfaces.ConfigParams</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Authority connection</td><td colspan="1" rowspan="1">An authority connector instance that has been furnished with configuration data</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">User name</td><td colspan="1" rowspan="1">The name of a user, which is often a Kerberos principal name, e.g. <em>john@apache.org</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Access token</td><td colspan="1" rowspan="1">An arbitrary string, which is only meaningful within the context of a specific authority connector, that describes a quantum of authorization</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Connection management/threading/pooling model</td><td colspan="1" rowspan="1">How an individual authority connector class instance is managed and used</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Service interruption</td><td colspan="1" rowspan="1">A specific kind of exception that signals ManifoldCF that the repository is unavailable, and gives a best estimate of when it might become available again; see <em>org.apache.manifoldcf.agents.interfaces.ServiceInterruption</em></td>
+</tr>
+        
+</table>
+<p></p>
+<a name="N10096"></a><a name="Implementing+the+Authority+Connector+class"></a>
+<h3 class="h4">Implementing the Authority Connector class</h3>
+<p></p>
+<p>A very good place to start is to read the javadoc for the authority connector interface.  You will note that the javadoc describes the usage and pooling model for a connector class pretty thoroughly.  It is very important to understand the model thoroughly in order to write reliable connectors!  Use of static variables, for one thing, must be done in a very careful way, to avoid issues that would be hard to detect with a cursory test.</p>
+<p></p>
+<p>The second thing to do is to examine some of the provided authority connector implementations.  The Documentum connector, the LiveLink connector, the Memex connector, and the Meridio connector all include authority connectors which demonstrate (to some degree) the sorts of techniques you will need for an effective implementation.  You will also note that all of these connectors extend a framework-provided authority connector base class, found at <em>org.apache.manifoldcf.authorities.authorities.BaseAuthorityConnector</em>.  This base class furnishes some basic bookkeeping logic for managing the connector pool, as well as default implementations of some of the less typical functionality a connector may have.  For example, connectors are allowed to have database tables of their own, which are instantiated when the connector is registered, and are torn down when the connector is removed.  This is, however, not very typical, and the base implementation reflects that.</p>
+<p></p>
+<a name="N100AB"></a><a name="Principle+methods"></a>
+<h4>Principal methods</h4>
+<p></p>
+<p>The principal methods an implementer should be concerned with for creating an authority connector are the following:</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+            
+<tr>
+<th colspan="1" rowspan="1">Method</th><th colspan="1" rowspan="1">What it should do</th>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>getAuthorizationResponse()</strong></td><td colspan="1" rowspan="1">Obtain the authorization response, given a user name</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputConfigurationHeader()</strong></td><td colspan="1" rowspan="1">Output the head-section part of an authority connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputConfigurationBody()</strong></td><td colspan="1" rowspan="1">Output the body-section part of an authority connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>processConfigurationPost()</strong></td><td colspan="1" rowspan="1">Receive and process form data from an authority connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>viewConfiguration()</strong></td><td colspan="1" rowspan="1">Output the viewing HTML for an authority connection <em>ConfigParams</em> object</td>
+</tr>
+          
+</table>
+<p></p>
+<p>These methods come in two broad classes: (a) functional methods for doing the work of the connector; (b) UI methods for configuring a connection.  Together they do the heavy lifting of your connector.</p>
+<p></p>
+<p>The <em>getAuthorizationResponse()</em> method returns an <em>AuthorizationResponse</em> object, which can describe a number of conditions:</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+            
+<tr>
+<th colspan="1" rowspan="1">Condition</th><th colspan="1" rowspan="1">Meaning</th>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">RESPONSE_OK</td><td colspan="1" rowspan="1">The access tokens for the user were successfully obtained from the repository, and are being returned</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">RESPONSE_UNREACHABLE</td><td colspan="1" rowspan="1">The repository is currently unreachable, and appropriate disabling tokens are being returned</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">RESPONSE_USERNOTFOUND</td><td colspan="1" rowspan="1">The user was not found within the repository, and appropriate disabling tokens are being returned</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1">RESPONSE_USERUNAUTHORIZED</td><td colspan="1" rowspan="1">The user was found, but was in some way disabled, and appropriate disabling tokens are being returned</td>
+</tr>
+          
+</table>
+<p></p>
+<p></p>
+<p>In all cases, the connector returns access tokens.  But in the case where token lookup has failed in some way, it is the responsibility of the connector to ensure that inappropriate content is not viewed.  Usually, this is done by ingesting a "global deny" token attached to all documents from the given repository, and then having the associated authority connector return this global deny token when error conditions apply.</p>
+<p></p>
+<p></p>
+<a name="N1015B"></a><a name="Notes+on+connector+UI+methods"></a>
+<h4>Notes on connector UI methods</h4>
+<p></p>
+<p>The crawler UI uses a tabbed layout structure, and thus each of the UI methods must properly implement the tabbed model.  This means that the "header" methods above must add the desired tab names to a specified array, and the "body" methods must provide appropriate HTML which handles both the case where a tab is displayed, and where it is not displayed.  Also, it makes sense to use the appropriate CSS definitions, so that the connector UI pages have a similar look-and-feel to the rest of ManifoldCF's crawler UI.  We strongly suggest starting with the UI code of one of the supplied authority connectors, both for a description of the arguments to each method, and for some decent ideas of ways to organize your connector's UI code.</p>
+<p></p>
+<a name="N1016A"></a><a name="Implementation+support+provided+by+the+framework"></a>
+<h3 class="h4">Implementation support provided by the framework</h3>
+<p></p>
+<p>ManifoldCF's framework provides a number of helpful services designed to make the creation of a connector easier.  These services are summarized below.  (This is not an exhaustive list, by any means.)</p>
+<p></p>
+<ul>
+          
+<li>Lock management and synchronization (see <em>org.apache.manifoldcf.core.interfaces.LockManagerFactory</em>)</li>
+          
+<li>Cache management (see <em>org.apache.manifoldcf.core.interfaces.CacheManagerFactory</em>)</li>
+          
+<li>Local keystore management (see <em>org.apache.manifoldcf.core.KeystoreManagerFactory</em>)</li>
+          
+<li>Database management (see <em>org.apache.manifoldcf.core.DBInterfaceFactory</em>)</li>
+        
+</ul>
+<p></p>
+<p>For UI method support, these too are very useful:</p>
+<p></p>
+<ul>
+          
+<li>Multipart form processing (see <em>org.apache.manifoldcf.ui.multipart.MultipartWrapper</em>)</li>
+          
+<li>HTML encoding (see <em>org.apache.manifoldcf.ui.util.Encoder</em>)</li>
+          
+<li>HTML formatting (see <em>org.apache.manifoldcf.ui.util.Formatter</em>)</li>
+        
+</ul>
+<p></p>
+<a name="N101B1"></a><a name="DO%27s+and+DON%27T+DO%27s"></a>
+<h3 class="h4">DO's and DON'T DO's</h3>
+<p></p>
+<p>It's always a good idea to make use of an existing infrastructure component, if it's meant for that purpose, rather than inventing your own.  There are, however, some limitations we recommend you adhere to.</p>
+<p></p>
+<ul>
+          
+<li>DO make use of infrastructure components described in the section above</li>
+          
+<li>DON'T make use of infrastructure components that aren't mentioned, without checking first</li>
+          
+<li>NEVER write connector code that directly uses framework database tables, other than the ones installed and managed by your connector</li>
+        
+</ul>
+<p></p>
+<p>If you are tempted to violate these rules, it may well mean you don't understand something important.  At the very least, we'd like to know why.  Send email to connectors-dev@incubator.apache.org with a description of your problem and how you are tempted to solve it.</p>
+</div>
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2009, 2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: incubator/lcf/site/publish/writing-authority-connectors.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/writing-authority-connectors.pdf?rev=1034705&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/lcf/site/publish/writing-authority-connectors.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/lcf/site/publish/writing-output-connectors.html
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/writing-output-connectors.html?rev=1034705&view=auto
==============================================================================
--- incubator/lcf/site/publish/writing-output-connectors.html (added)
+++ incubator/lcf/site/publish/writing-output-connectors.html Sat Nov 13 09:08:30 2010
@@ -0,0 +1,443 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<META http-equiv="Content-Type" content="text/html; charset=UTF-8">
+<meta content="Apache Forrest" name="Generator">
+<meta name="Forrest-version" content="0.9-dev">
+<meta name="Forrest-skin-name" content="lucene">
+<title>Writing output connectors</title>
+<link type="text/css" href="skin/basic.css" rel="stylesheet">
+<link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet">
+<link media="print" type="text/css" href="skin/print.css" rel="stylesheet">
+<link type="text/css" href="skin/profile.css" rel="stylesheet">
+<script src="skin/getBlank.js" language="javascript" type="text/javascript"></script><script src="skin/getMenu.js" language="javascript" type="text/javascript"></script><script src="skin/fontsize.js" language="javascript" type="text/javascript"></script>
+<link rel="shortcut icon" href="images/favicon.ico">
+</head>
+<body onload="init()">
+<script type="text/javascript">ndeSetTextSize();</script>
+<div id="top">
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+<a href="http://www.apache.org/">Apache</a><script src="skin/breadcrumbs.js" language="JavaScript" type="text/javascript"></script>
+</div>
+<!--+
+    |header
+    +-->
+<div class="header">
+<!--+
+    |start group logo
+    +-->
+<div class="grouplogo">
+<a href="http://www.apache.org"><img class="logoImage" alt="Apache" src="images/apache_feather.gif" title="Apache Software Foundation"></a>
+</div>
+<!--+
+    |end group logo
+    +-->
+<!--+
+    |start Project Logo
+    +-->
+<div class="projectlogo">
+<a href="http://incubator.apache.org/lcf"><img class="logoImage" alt="Apache ManifoldCF" src="images/ManifoldCF-logo.PNG" title="ManifoldCF"></a>
+</div>
+<!--+
+    |end Project Logo
+    +-->
+<!--+
+    |start Search
+    +-->
+<div class="searchbox">
+<form action="http://www.lucidimagination.com/search/" method="get" class="roundtopsmall">
+<input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query" type="text" value="Search the site with Solr">&nbsp; 
+                    <input name="Search" value="Search" type="submit">
+</form>
+<div style="position: relative; top: -5px; left: -10px">Powered by <a href="http://www.lucidimagination.com" style="color: #033268">Lucid Imagination</a>
+</div>
+</div>
+<!--+
+    |end search
+    +-->
+<!--+
+    |start Tabs
+    +-->
+<ul id="tabs">
+<li class="current">
+<a class="selected" href="index.html">Main</a>
+</li>
+<li>
+<a class="unselected" href="http://cwiki.apache.org/confluence/display/CONNECTORS/Index">Wiki</a>
+</li>
+</ul>
+<!--+
+    |end Tabs
+    +-->
+</div>
+</div>
+<div id="main">
+<div id="publishedStrip">
+<!--+
+    |start Subtabs
+    +-->
+<div id="level2tabs"></div>
+<!--+
+    |end Endtabs
+    +-->
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<!--+
+    |breadtrail
+    +-->
+<div class="breadtrail">
+
+             &nbsp;
+           </div>
+<!--+
+    |start Menu, mainarea
+    +-->
+<!--+
+    |start Menu
+    +-->
+<div id="menu">
+<div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle">About</div>
+<div id="menu_1.1" class="menuitemgroup">
+<div class="menuitem">
+<a href="index.html">Welcome</a>
+</div>
+<div class="menuitem">
+<a href="who.html">Who We Are</a>
+</div>
+<div class="menuitem">
+<a href="mail.html">Mailing Lists</a>
+</div>
+<div class="menuitem">
+<a href="http://www.cafepress.com/lucene/">Buy Stuff</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/sponsorship.html">Sponsor Apache</a>
+</div>
+<div class="menuitem">
+<a href="http://www.apache.org/foundation/thanks.html">Sponsors of Apache</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle">Documentation</div>
+<div id="menu_1.2" class="menuitemgroup">
+<div class="menuitem">
+<a href="faq.html">Frequently Asked Questions</a>
+</div>
+<div class="menuitem">
+<a href="developer-resources.html">Developer/Integrator Resources</a>
+</div>
+<div class="menuitem">
+<a href="end-user-documentation.html">End-user Documentation</a>
+</div>
+</div>
+<div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle">Related-Projects</div>
+<div id="menu_1.3" class="menuitemgroup">
+<div class="menuitem">
+<a href="http://incubator.apache.org/droids/">Droids</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/java/">Java</a>
+</div>
+<div class="menuitem">
+<a href="http://incubator.apache.org/lucene.net/">Lucene.Net</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/lucy/">Lucy</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/mahout/">Mahout</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/nutch/">Nutch</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/openrelevance/">Open Relevance</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/pylucene/">PyLucene</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/solr/">Solr</a>
+</div>
+<div class="menuitem">
+<a href="http://lucene.apache.org/tika/">Tika</a>
+</div>
+</div>
+<div id="credit"></div>
+<div id="roundbottom">
+<img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"></div>
+<!--+
+  |alternative credits
+  +-->
+<div id="credit2"></div>
+</div>
+<!--+
+    |end Menu
+    +-->
+<!--+
+    |start content
+    +-->
+<div id="content">
+<div title="Portable Document Format" class="pdflink">
+<a class="dida" href="writing-output-connectors.pdf"><img alt="PDF -icon" src="skin/images/pdfdoc.gif" class="skin"><br>
+        PDF</a>
+</div>
+<h1>Writing output connectors</h1>
+<div id="minitoc-area">
+<ul class="minitoc">
+<li>
+<a href="#Writing+an+Output+Connector">Writing an Output Connector</a>
+<ul class="minitoc">
+<li>
+<a href="#Key+concepts">Key concepts</a>
+</li>
+<li>
+<a href="#Implementing+the+Output+Connector+class">Implementing the Output Connector class</a>
+<ul class="minitoc">
+<li>
+<a href="#Principle+methods">Principal methods</a>
+</li>
+<li>
+<a href="#Choosing+the+form+of+the+output+version+string">Choosing the form of the output version string</a>
+</li>
+<li>
+<a href="#Notes+on+connector+UI+methods">Notes on connector UI methods</a>
+</li>
+</ul>
+</li>
+<li>
+<a href="#Implementation+support+provided+by+the+framework">Implementation support provided by the framework</a>
+</li>
+<li>
+<a href="#DO%27s+and+DON%27T+DO%27s">DO's and DON'T DO's</a>
+</li>
+</ul>
+</li>
+</ul>
+</div> 
+    
+<a name="N1000D"></a><a name="Writing+an+Output+Connector"></a>
+<h2 class="h3">Writing an Output Connector</h2>
+<div class="section">
+<p></p>
+<p>An output connector furnishes the mechanism by which content that has been fetched from a repository gets handed to a back-end repository for processing.  It also furnishes a mechanism for removing previously-processed content from that back-end repository.</p>
+<p></p>
+<p>As is the case with all connectors under the ManifoldCF umbrella, an output connector consists of a single part, which is:</p>
+<p></p>
+<ul>
+        
+<li>A class implementing an interface (in this case, <em>org.apache.manifoldcf.agents.interfaces.IOutputConnector</em>)</li>
+      
+</ul>
+<p></p>
+<a name="N1002A"></a><a name="Key+concepts"></a>
+<h3 class="h4">Key concepts</h3>
+<p></p>
+<p>The output connector abstraction makes use of, or introduces, the following concepts:</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+          
+<tr>
+<th colspan="1" rowspan="1">Concept</th><th colspan="1" rowspan="1">What it is</th>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Configuration parameters</td><td colspan="1" rowspan="1">A hierarchical structure, internally represented as an XML document, which describes a specific configuration of a specific output connector, i.e. <strong>how</strong> the connector should do its job; see <em>org.apache.manifoldcf.core.interfaces.ConfigParams</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Output connection</td><td colspan="1" rowspan="1">An output connector instance that has been furnished with configuration data</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Document URI</td><td colspan="1" rowspan="1">The unique URI (or, in some cases, file IRI) of a document, which is meant to be displayed in search engine results as the link to the document</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Repository document</td><td colspan="1" rowspan="1">An object that describes a document's contents, including raw document data (as a stream), metadata (as either strings or streams), and access tokens; see <em>org.apache.manifoldcf.agents.interfaces.RepositoryDocument</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Connection management/threading/pooling model</td><td colspan="1" rowspan="1">How an individual output connector class instance is managed and used</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Activity infrastructure</td><td colspan="1" rowspan="1">The framework API provided to specific methods allowing those methods to perform specific actions within the framework, e.g. recording activities; see <em>org.apache.manifoldcf.agents.interfaces.IOutputAddActivity</em> and <em>org.apache.manifoldcf.agents.interfaces.IOutputRemoveActivity</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Output specification</td><td colspan="1" rowspan="1">A hierarchical structure, internally represented as an XML document, which describes <strong>what</strong> a specific output connector should do in the context of a specific job; see <em>org.apache.manifoldcf.agents.interfaces.OutputSpecification</em></td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Output version string</td><td colspan="1" rowspan="1">A simple string, used for comparison purposes, that allows ManifoldCF to figure out if an ingestion operation needs to be repeated as a result of changes to the output specification in effect for a document</td>
+</tr>
+          
+<tr>
+<td colspan="1" rowspan="1">Service interruption</td><td colspan="1" rowspan="1">A specific kind of exception that signals ManifoldCF that the output repository is unavailable, and gives a best estimate of when it might become available again; see <em>org.apache.manifoldcf.agents.interfaces.ServiceInterruption</em></td>
+</tr>
+        
+</table>
+<p></p>
+<p></p>
+<a name="N100B6"></a><a name="Implementing+the+Output+Connector+class"></a>
+<h3 class="h4">Implementing the Output Connector class</h3>
+<p></p>
+<p>A very good place to start is to read the javadoc for the output connector interface.  You will note that the javadoc describes the usage and pooling model for a connector class pretty thoroughly.  It is very important to understand the model thoroughly in order to write reliable connectors!  Use of static variables, for one thing, must be done in a very careful way, to avoid issues that would be hard to detect with a cursory test.</p>
+<p></p>
+<p>The second thing to do is to examine some of the provided output connector implementations.  The GTS connector, the SOLR connector, and the Null Output connector all are output connectors which demonstrate (to some degree) the sorts of techniques you will need for an effective implementation.  You will also note that all of these connectors extend a framework-provided output connector base class, found at <em>org.apache.manifoldcf.agents.output.BaseOutputConnector</em>.  This base class furnishes some basic bookkeeping logic for managing the connector pool, as well as default implementations of some of the less typical functionality a connector may have.  For example, connectors are allowed to have database tables of their own, which are instantiated when the connector is registered, and are torn down when the connector is removed.  This is, however, not very typical, and the base implementation reflects that.</p>
+<p></p>
+<a name="N100CB"></a><a name="Principle+methods"></a>
+<h4>Principal methods</h4>
+<p></p>
+<p>The principal methods an implementer should be concerned with for creating an output connector are the following:</p>
+<p></p>
+<table class="ForrestTable" cellspacing="1" cellpadding="4">
+            
+<tr>
+<th colspan="1" rowspan="1">Method</th><th colspan="1" rowspan="1">What it should do</th>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>checkDocumentIndexable()</strong></td><td colspan="1" rowspan="1">Decide whether a file is indexable or not</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>getOutputDescription()</strong></td><td colspan="1" rowspan="1">Use the supplied output specification to come up with an output version string</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>addOrReplaceDocument()</strong></td><td colspan="1" rowspan="1">Add or replace the specified document within the target repository, or signal if the document cannot be handled</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>removeDocument()</strong></td><td colspan="1" rowspan="1">Remove the specified document from the target repository</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputConfigurationHeader()</strong></td><td colspan="1" rowspan="1">Output the head-section part of an output connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputConfigurationBody()</strong></td><td colspan="1" rowspan="1">Output the body-section part of an output connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>processConfigurationPost()</strong></td><td colspan="1" rowspan="1">Receive and process form data from an output connection <em>ConfigParams</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>viewConfiguration()</strong></td><td colspan="1" rowspan="1">Output the viewing HTML for an output connection <em>ConfigParams</em> object</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputSpecificationHeader()</strong></td><td colspan="1" rowspan="1">Output the head-section part of an <em>OutputSpecification</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>outputSpecificationBody()</strong></td><td colspan="1" rowspan="1">Output the body-section part of an <em>OutputSpecification</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>processSpecificationPost()</strong></td><td colspan="1" rowspan="1">Receive and process form data from an <em>OutputSpecification</em> editing page</td>
+</tr>
+            
+<tr>
+<td colspan="1" rowspan="1"><strong>viewSpecification()</strong></td><td colspan="1" rowspan="1">Output the viewing page for an <em>OutputSpecification</em> object</td>
+</tr>
+          
+</table>
+<p></p>
+<p>These methods come in three broad classes: (a) functional methods for doing the work of the connector; (b) UI methods for configuring a connection; and (c) UI methods for editing the output specification for a job.  Together they do the heavy lifting of your connector.  But before you can write any code at all, you need to plan things out a bit.</p>
+<p></p>
+<a name="N10189"></a><a name="Choosing+the+form+of+the+output+version+string"></a>
+<h4>Choosing the form of the output version string</h4>
+<p></p>
+<p>The output version string is used by ManifoldCF to determine whether or not the output specification or configuration changed in such a way as to require that the document be reprocessed.  ManifoldCF therefore requests the output version string for any document that is ready for processing, and usually does not process the document again if the returned output version string agrees with the output version string it has stored.</p>
+<p></p>
+<p>Thinking about it more carefully, it is clear that what an output connector writer needs to do is include everything in the output version string that could potentially affect how the document gets ingested, save that which is specific to the repository connector.  That may include bits of output connector configuration information, as well as data from the output specification.  When it's time to ingest, it's usually the correct thing to do to obtain the necessary data for ingestion out of the output version string, rather than calculating it or fetching it anew, because that guarantees that the document processing was done in a manner that agrees with its recorded output version string, thus eliminating any chance of ManifoldCF getting confused.</p>
+<p></p>
+<a name="N1019C"></a><a name="Notes+on+connector+UI+methods"></a>
+<h4>Notes on connector UI methods</h4>
+<p></p>
+<p>The crawler UI uses a tabbed layout structure, and thus each of the UI methods must properly implement the tabbed model.  This means that the "header" methods above must add the desired tab names to a specified array, and the "body" methods must provide appropriate HTML which handles both the case where a tab is displayed, and where it is not displayed.  Also, it makes sense to use the appropriate CSS definitions, so that the connector UI pages have a similar look-and-feel to the rest of ManifoldCF's crawler UI.  We strongly suggest starting with the UI code of one of the supplied connectors, both for a description of the arguments to each method, and for some decent ideas of ways to organize your connector's UI code.</p>
+<p></p>
+<p>Please also note that it is good practice to name the form fields in your HTML in such a way that they cannot collide with form fields that may come from the framework's HTML or any specific repository connector's HTML.  The <em>OutputSpecification</em> HTML especially may be prone to collisions, because within any given job, this HTML is included in the same page as HTML from the chosen repository connector.</p>
+<p></p>
+<p></p>
+<a name="N101B5"></a><a name="Implementation+support+provided+by+the+framework"></a>
+<h3 class="h4">Implementation support provided by the framework</h3>
+<p></p>
+<p>ManifoldCF's framework provides a number of helpful services designed to make the creation of a connector easier.  These services are summarized below.  (This is not an exhaustive list, by any means.)</p>
+<p></p>
+<ul>
+          
+<li>Lock management and synchronization (see <em>org.apache.manifoldcf.core.interfaces.LockManagerFactory</em>)</li>
+          
+<li>Cache management (see <em>org.apache.manifoldcf.core.interfaces.CacheManagerFactory</em>)</li>
+          
+<li>Local keystore management (see <em>org.apache.manifoldcf.core.KeystoreManagerFactory</em>)</li>
+          
+<li>Database management (see <em>org.apache.manifoldcf.core.DBInterfaceFactory</em>)</li>
+        
+</ul>
+<p></p>
+<p>For UI method support, these too are very useful:</p>
+<p></p>
+<ul>
+          
+<li>Multipart form processing (see <em>org.apache.manifoldcf.ui.multipart.MultipartWrapper</em>)</li>
+          
+<li>HTML encoding (see <em>org.apache.manifoldcf.ui.util.Encoder</em>)</li>
+          
+<li>HTML formatting (see <em>org.apache.manifoldcf.ui.util.Formatter</em>)</li>
+        
+</ul>
+<p></p>
+<a name="N101FC"></a><a name="DO%27s+and+DON%27T+DO%27s"></a>
+<h3 class="h4">DO's and DON'T DO's</h3>
+<p></p>
+<p>It's always a good idea to make use of an existing infrastructure component, if it's meant for that purpose, rather than inventing your own.  There are, however, some limitations we recommend you adhere to.</p>
+<p></p>
+<ul>
+          
+<li>DO make use of infrastructure components described in the section above</li>
+          
+<li>DON'T make use of infrastructure components that aren't mentioned, without checking first</li>
+          
+<li>NEVER write connector code that directly uses framework database tables, other than the ones installed and managed by your connector</li>
+        
+</ul>
+<p></p>
+<p>If you are tempted to violate these rules, it may well mean you don't understand something important.  At the very least, we'd like to know why.  Send email to connectors-dev@incubator.apache.org with a description of your problem and how you are tempted to solve it.</p>
+</div>
+  
+</div>
+<!--+
+    |end content
+    +-->
+<div class="clearboth">&nbsp;</div>
+</div>
+<div id="footer">
+<!--+
+    |start bottomstrip
+    +-->
+<div class="lastmodified">
+<script type="text/javascript"><!--
+document.write("Last Published: " + document.lastModified);
+//  --></script>
+</div>
+<div class="copyright">
+        Copyright &copy;
+         2009, 2010 <a href="http://www.apache.org/licenses/">The Apache Software Foundation.</a>
+</div>
+<!--+
+    |end bottomstrip
+    +-->
+</div>
+</body>
+</html>

Added: incubator/lcf/site/publish/writing-output-connectors.pdf
URL: http://svn.apache.org/viewvc/incubator/lcf/site/publish/writing-output-connectors.pdf?rev=1034705&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/lcf/site/publish/writing-output-connectors.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream



Mime
View raw message