manifoldcf-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From kwri...@apache.org
Subject svn commit: r911418 [2/8] - in /incubator/lcf/trunk/modules: connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/ framework/agents/org/apache/lcf/agents/agentmanager/ framework/agents/org/apache/lcf/agents/incrementalingest/ fr...
Date Thu, 18 Feb 2010 14:31:31 GMT
Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java?rev=911418&r1=911417&r2=911418&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java Thu Feb 18 14:31:31 2010
@@ -34,891 +34,885 @@
 */
 public class RobotsManager extends org.apache.lcf.core.database.BaseTable
 {
-	public static final String _rcsid = "@(#)$Id$";
+        public static final String _rcsid = "@(#)$Id$";
 
-	// Robots cache class.  Only one needed.
-	protected static RobotsCacheClass robotsCacheClass = new RobotsCacheClass();
+        // Robots cache class.  Only one needed.
+        protected static RobotsCacheClass robotsCacheClass = new RobotsCacheClass();
 
-	// Database fields
-	protected final static String hostField = "hostname";
-	protected final static String robotsField = "robotsdata";
-	protected final static String expirationField = "expirationtime";
-
-	// Cache manager.  This handle is set up during the constructor.
-	ICacheManager cacheManager;
-
-	/** Constructor.  Note that one robotsmanager handle is only useful within a specific thread context,
-	* so the calling connector object logic must recreate the handle whenever the thread context changes.
-	*@param tc is the thread context.
-	*@param database is the database handle.
-	*/
-	public RobotsManager(IThreadContext tc, IDBInterface database)
-		throws LCFException
-	{
-		super(database,"robotsdata");
-		cacheManager = CacheManagerFactory.make(tc);
-	}
-
-	/** Install the manager.
-	*/
-	public void install()
-		throws LCFException
-	{
-		beginTransaction();
-		try
-		{
-			Map existing = getTableSchema(null,null);
-			if (existing == null)
-			{
-				// Install the table.
-				HashMap map = new HashMap();
-				map.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
-				map.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false));
-				map.put(robotsField,new ColumnDescription("BYTEA",false,true,null,null,false));
-				performCreate(map,null);
-			}
-		}
-		catch (LCFException e)
-		{
-			signalRollback();
-			throw e;
-		}
-		catch (Error e)
-		{
-			signalRollback();
-			throw e;
-		}
-		finally
-		{
-			endTransaction();
-		}
-	}
-
-	/** Uninstall the manager.
-	*/
-	public void deinstall()
-		throws LCFException
-	{
-		performDrop(null);
-	}
-
-
-	/** Read robots.txt data from the cache or from the database.
-	*@param hostName is the host for which the data is desired.
-	*@param currentTime is the time of the check.
-	*@return null if the record needs to be fetched, true if fetch is allowed.
-	*/
-	public Boolean checkFetchAllowed(String userAgent, String hostName, long currentTime, String pathString,
-		IVersionActivity activities)
-		throws LCFException
-	{
-		// Build description objects
-		HostDescription[] objectDescriptions = new HostDescription[1];
-		StringSetBuffer ssb = new StringSetBuffer();
-		ssb.add(getRobotsKey(hostName));
-		objectDescriptions[0] = new HostDescription(hostName,new StringSet(ssb));
-
-		HostExecutor exec = new HostExecutor(this,activities,objectDescriptions[0]);
-		cacheManager.findObjectsAndExecute(objectDescriptions,null,exec,getTransactionID());
-
-		// We do the expiration check here, rather than in the query, so that caching
-		// is possible.
-		RobotsData rd = exec.getResults();
-		if (rd == null || rd.getExpirationTime() <= currentTime)
-			return null;
-		return new Boolean(rd.isFetchAllowed(userAgent,pathString));
-	}
-
-	/** Write robots.txt, replacing any existing row.
-	*@param hostName is the host.
-	*@param expirationTime is the time this data should expire.
-	*@param data is the robots data stream.  May be null.
-	*/
-	public void writeRobotsData(String hostName, long expirationTime, InputStream data)
-		throws LCFException, IOException
-	{
-		TempFileInput tfi = null;
-		try
-		{
-			if (data != null)
-			{
-				try
-				{
-					tfi = new TempFileInput(data);
-				}
-				catch (LCFException e)
-				{
-					if (e.getErrorCode() == LCFException.INTERRUPTED)
-						throw e;
-					throw new IOException("Fetch failed: "+e.getMessage());
-				}
-			}
-			
-			StringSetBuffer ssb = new StringSetBuffer();
-			ssb.add(getRobotsKey(hostName));
-			StringSet cacheKeys = new StringSet(ssb);
-			ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID());
-			try
-			{
-				
-			    beginTransaction();
-			    try
-			    {
-				// See whether the instance exists
-				ArrayList params = new ArrayList();
-				params.add(hostName);
-				IResultSet set = performQuery("SELECT * FROM "+getTableName()+" WHERE "+
-					hostField+"=?",params,null,null);
-				HashMap values = new HashMap();
-				values.put(expirationField,new Long(expirationTime));
-				if (tfi != null)
-					values.put(robotsField,tfi);
-				if (set.getRowCount() > 0)
-				{
-					// Update
-					params.clear();
-					params.add(hostName);
-					performUpdate(values," WHERE "+hostField+"=?",params,null);
-				}
-				else
-				{
-					// Insert
-					values.put(hostField,hostName);
-					// We only need the general key because this is new.
-					performInsert(values,null);
-				}
-				cacheManager.invalidateKeys(ch);
-			    }
-			    catch (LCFException e)
-			    {
-				signalRollback();
-				throw e;
-			    }
-			    catch (Error e)
-			    {
-				signalRollback();
-				throw e;
-			    }
-			    finally
-			    {
-				endTransaction();
-			    }
-			}
-			finally
-			{
-				cacheManager.leaveCache(ch);
-			}
-		}
-		finally
-		{
-			if (tfi != null)
-				tfi.discard();
-		}
-	}
-
-	// Protected methods and classes
-
-	/** Construct a key which represents an individual host name.
-	*@param hostName is the name of the connector.
-	*@return the cache key.
-	*/
-	protected static String getRobotsKey(String hostName)
-	{
-		return "ROBOTS_"+hostName;
-	}
-
-	/** Read robots data, if it exists.
-	*@return null if the data doesn't exist at all.  Return robots data if it does.
-	*/
-	protected RobotsData readRobotsData(String hostName, IVersionActivity activities)
-		throws LCFException
-	{
-	    try
-	    {
-		ArrayList list = new ArrayList();
-		list.add(hostName);
-		IResultSet set = performQuery("SELECT "+robotsField+","+expirationField+" FROM "+getTableName()+
-			" WHERE "+hostField+"=?",list,null,null);
-		if (set.getRowCount() == 0)
-			return null;
-		if (set.getRowCount() > 1)
-			throw new LCFException("Unexpected number of robotsdata rows matching '"+hostName+"': "+Integer.toString(set.getRowCount()));
-		IResultRow row = set.getRow(0);
-		long expiration = ((Long)row.getValue(expirationField)).longValue();
-		BinaryInput bi = (BinaryInput)row.getValue(robotsField);
-		if (bi == null)
-			return new RobotsData(null,expiration,hostName,activities);
-		try
-		{
-			InputStream is = bi.getStream();
-			return new RobotsData(is,expiration,hostName,activities);
-		}
-		finally
-		{
-			bi.discard();
-		}
-	    }
-	    catch (InterruptedIOException e)
-	    {
-		throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
-	    }
-	    catch (IOException e)
-	    {
-		throw new LCFException("IO error reading robots data for "+hostName+": "+e.getMessage(),e);
-	    }
-	}
-
-	/** Convert a string from the robots file into a readable form that does NOT contain NUL characters (since postgresql does not accept those).
-	*/
-	protected static String makeReadable(String inputString)
-	{
-		StringBuffer sb = new StringBuffer();
-		int i = 0;
-		while (i < inputString.length())
-		{
-			char y = inputString.charAt(i++);
-			if (y >= ' ')
-				sb.append(y);
-			else
-			{
-				sb.append('^');
-				sb.append((char)(y + '@'));
-			}
-		}
-		return sb.toString();
-	}
-	
-	/** This is a cached data item.
-	*/
-	protected static class RobotsData
-	{
-		protected long expiration;
-		protected ArrayList records = null;
-
-		/** Constructor. */
-		public RobotsData(InputStream is, long expiration, String hostName, IVersionActivity activities)
-			throws IOException, LCFException
-		{
-			this.expiration = expiration;
-			if (is == null)
-			{
-				records = null;
-				return;
-			}
-			Reader r = new InputStreamReader(is,"utf-8");
-			try
-			{
-				BufferedReader br = new BufferedReader(r);
-				try
-				{
-					parseRobotsTxt(br,hostName,activities);
-				}
-				finally
-				{
-					br.close();
-				}
-			}
-			finally
-			{
-				r.close();
-			}
-		}
-
-		/** Check if fetch is allowed */
-		public boolean isFetchAllowed(String userAgent, String pathString)
-		{
-			if (records == null)
-				return true;
-
-			boolean wasDisallowed = false;
-			boolean wasAllowed = false;
-			
-			// First matching user-agent takes precedence, according to the following chunk of spec:
-			// "These name tokens are used in User-agent lines in /robots.txt to
-			// identify to which specific robots the record applies. The robot
-			// must obey the first record in /robots.txt that contains a User-
-			// Agent line whose value contains the name token of the robot as a
-			// substring. The name comparisons are case-insensitive. If no such
-			// record exists, it should obey the first record with a User-agent
-			// line with a "*" value, if present. If no record satisfied either
-			// condition, or no records are present at all, access is unlimited."
-
-			boolean sawAgent = false;
-			
-			String userAgentUpper = userAgent.toUpperCase();
-
-			int i = 0;
-			while (i < records.size())
-			{
-				Record r = (Record)records.get(i++);
-				if (r.isAgentMatch(userAgentUpper,false))
-				{
-					if (r.isDisallowed(pathString))
-						wasDisallowed = true;
-					if (r.isAllowed(pathString))
-						wasAllowed = true;
-					
-					sawAgent = true;
-					break;
-				}
-			}
-			if (sawAgent == false)
-			{
-				i = 0;
-				while (i < records.size())
-				{
-					Record r = (Record)records.get(i++);
-					if (r.isAgentMatch("*",true))
-					{
-						if (r.isDisallowed(pathString))
-							wasDisallowed = true;
-						if (r.isAllowed(pathString))
-							wasAllowed = true;
-						
-						sawAgent = true;
-						break;
-					}
-				}
-			}
-			
-			if (sawAgent == false)
-				return true;
-			
-			// Allowed always overrides disallowed
-			if (wasAllowed)
-				return true;
-			if (wasDisallowed)
-				return false;
-			
-			// No match -> crawl allowed
-			return true;
-		}
-
-		/** Get expiration */
-		public long getExpirationTime()
-		{
-			return expiration;
-		}
-
-		/** Parse the robots.txt file using a reader.
-		* Is NOT expected to close the stream.
-		*/
-		protected void parseRobotsTxt(BufferedReader r, String hostName, IVersionActivity activities)
-			throws IOException, LCFException
-		{
-			boolean parseCompleted = false;
-			boolean robotsWasHtml = false;
-			boolean foundErrors = false;
-			String description = null;
-
-			long startParseTime = System.currentTimeMillis();
-			try
-			{
-				records = new ArrayList();
-				Record record = null;
-				boolean seenAction = false;
-				while (true)
-				{
-					String x = r.readLine();
-					if (x == null)
-						break;
-					int numSignPos = x.indexOf("#");
-					if (numSignPos != -1)
-						x = x.substring(0,numSignPos);
-					String lowercaseLine = x.toLowerCase().trim();
-					if (lowercaseLine.startsWith("user-agent:"))
-					{
-						if (seenAction)
-						{
-							records.add(record);
-							record = null;
-							seenAction = false;
-						}
-						if (record == null)
-							record = new Record();
-
-						String agentName = x.substring("User-agent:".length()).trim();	
-						record.addAgent(agentName);
-					}
-					else if (lowercaseLine.startsWith("user-agent"))
-					{
-						if (seenAction)
-						{
-							records.add(record);
-							record = null;
-							seenAction = false;
-						}
-						if (record == null)
-							record = new Record();
-
-						String agentName = x.substring("User-agent".length()).trim();	
-						record.addAgent(agentName);
-					}
-					else if (lowercaseLine.startsWith("disallow:"))
-					{
-						if (record == null)
-						{
-							description = "Disallow without User-agent";
-							Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
-							foundErrors = true;
-						}
-						else
-						{
-							String disallowPath = x.substring("Disallow:".length()).trim();
-							// The spec says that a blank disallow means let everything through.
-							if (disallowPath.length() > 0)
-								record.addDisallow(disallowPath);
-							seenAction = true;
-						}
-					}
-					else if (lowercaseLine.startsWith("disallow"))
-					{
-						if (record == null)
-						{
-							description = "Disallow without User-agent";
-							Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
-							foundErrors = true;
-						}
-						else
-						{
-							String disallowPath = x.substring("Disallow".length()).trim();
-							// The spec says that a blank disallow means let everything through.
-							if (disallowPath.length() > 0)
-								record.addDisallow(disallowPath);
-							seenAction = true;
-						}
-					}
-					else if (lowercaseLine.startsWith("allow:"))
-					{
-						if (record == null)
-						{
-							description = "Allow without User-agent";
-							Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
-							foundErrors = true;
-						}
-						else
-						{
-							String allowPath = x.substring("Allow:".length()).trim();
-							// The spec says that a blank disallow means let everything through.
-							if (allowPath.length() > 0)
-								record.addAllow(allowPath);
-							seenAction = true;
-						}
-					}
-					else if (lowercaseLine.startsWith("allow"))
-					{
-						if (record == null)
-						{
-							description = "Allow without User-agent";
-							Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
-							foundErrors = true;
-						}
-						else
-						{
-							String allowPath = x.substring("Allow".length()).trim();
-							// The spec says that a blank disallow means let everything through.
-							if (allowPath.length() > 0)
-								record.addAllow(allowPath);
-							seenAction = true;
-						}
-					}
-					else if (lowercaseLine.startsWith("crawl-delay:"))
-					{
-						// We don't complain about this, but right now we don't listen to it either.
-					}
-					else if (lowercaseLine.startsWith("crawl-delay"))
-					{
-						// We don't complain about this, but right now we don't listen to it either.
-					}
-					else
-					{
-						// If it's not just a blank line, complain
-						if (x.trim().length() > 0)
-						{
-							String problemLine = makeReadable(x);
-							description = "Unknown robots.txt line: '"+problemLine+"'";
-							Logging.connectors.warn("Web: Unknown robots.txt line from '"+hostName+"': '"+problemLine+"'");
-							if (x.indexOf("<html") != -1 || x.indexOf("<HTML") != -1)
-							{
-								// Looks like some kind of an html file, probably as a result of a redirection, so just abort as if we have a page error
-								robotsWasHtml = true;
-								parseCompleted = true;
-								break;
-							}
-							foundErrors = true;
-						}
-					}
-				}
-				if (record != null)
-					records.add(record);
-				parseCompleted = true;
-			}
-			finally
-			{
-				// Log the fact that we attempted to parse robots.txt, as well as what happened
-				// These are the following situations we will report:
-				// (1) INCOMPLETE - Parsing did not complete - if the stream was interrupted
-				// (2) HTML - Robots was html - if the robots data seemed to be html
-				// (3) ERRORS - Robots had errors - if the robots data was accepted but had errors in it
-				// (4) SUCCESS - Robots parsed successfully - if the robots data was parsed without problem
-				String status;
-				if (parseCompleted)
-				{
-					if (robotsWasHtml)
-					{
-						status = "HTML";
-						description = "Robots file contained HTML, skipped";
-					}
-					else
-					{
-						if (foundErrors)
-						{
-							status = "ERRORS";
-							// description should already be set
-						}
-						else
-						{
-							status = "SUCCESS";
-							description = null;
-						}
-					}
-				}
-				else
-				{
-					status = "INCOMPLETE";
-					description = "Parsing was interrupted";
-				}
-				
-				activities.recordActivity(new Long(startParseTime),WebcrawlerConnector.ACTIVITY_ROBOTSPARSE,
-					null,hostName,status,description,null);
-
-			}
-		}
-
-	}
-
-	/** Check if path matches specification */
-	protected static boolean doesPathMatch(String path, String spec)
-	{
-		// For robots 1.0, this function would do just this:
-		// return path.startsWith(spec);
-		// However, we implement the "google bot" spec, which allows wildcard matches that are, in fact, regular-expression-like in some ways.
-		// The "specification" can be found here: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40367
-		return doesPathMatch(path,0,spec,0);
-	}
-
-	/** Recursive method for matching specification to path. */
-	protected static boolean doesPathMatch(String path, int pathIndex, String spec, int specIndex)
-	{
-		while (true)
-		{
-			if (specIndex == spec.length())
-				// Hit the end of the specification!  We're done.
-				return true;
-			char specChar = spec.charAt(specIndex++);
-			if (specChar == '*')
-			{
-				// Found a specification wildcard.
-				// Eat up all the '*' characters at this position - otherwise each additional one increments the exponent of how long this can take,
-				// making denial-of-service via robots parsing a possibility.
-				while (specIndex < spec.length())
-				{
-					if (spec.charAt(specIndex) != '*')
-						break;
-					specIndex++;
-				}
-				// It represents zero or more characters, so we must recursively try for a match against all remaining characters in the path string.
-				while (true)
-				{
-					boolean match = doesPathMatch(path,pathIndex,spec,specIndex);
-					if (match)
-						return true;
-					if (path.length() == pathIndex)
-						// Nothing further to try, and no match
-						return false;
-					pathIndex++;
-					// Try again
-				}
-			}
-			else if (specChar == '$' && specIndex == spec.length())
-			{
-				// Found a specification end-of-path character.
-				// (It can only be legitimately the last character of the specification.)
-				return pathIndex == path.length();
-			}
-			if (pathIndex == path.length())
-				// Hit the end of the path! (but not the end of the specification!)
-				return false;
-			if (path.charAt(pathIndex) != specChar)
-				return false;
-			// On to the next match
-			pathIndex++;
-		}
-	}
-
-	/** This is the object description for a robots host object.
-	* This is the key that is used to look up cached data.
-	*/
-	protected static class HostDescription extends org.apache.lcf.core.cachemanager.BaseDescription
-	{
-		protected String hostName;
-		protected String criticalSectionName;
-		protected StringSet cacheKeys;
-
-		public HostDescription(String hostName, StringSet invKeys)
-		{
-			super("robotscache");
-			this.hostName = hostName;
-			criticalSectionName = getClass().getName()+"-"+hostName;
-			cacheKeys = invKeys;
-		}
-
-		public String getHostName()
-		{
-			return hostName;
-		}
-
-		public int hashCode()
-		{
-			return hostName.hashCode();
-		}
-
-		public boolean equals(Object o)
-		{
-			if (!(o instanceof HostDescription))
-				return false;
-			HostDescription d = (HostDescription)o;
-			return d.hostName.equals(hostName);
-		}
-
-		public String getCriticalSectionName()
-		{
-			return criticalSectionName;
-		}
-
-		/** Get the cache keys for an object (which may or may not exist yet in
-		* the cache).  This method is called in order for cache manager to throw the correct locks.
-		* @return the object's cache keys, or null if the object should not
-		* be cached.
-		*/
-		public StringSet getObjectKeys()
-		{
-			return cacheKeys;
-		}
-
-		/** Get the object class for an object.  The object class is used to determine
-		* the group of objects treated in the same LRU manner.
-		* @return the newly created object's object class, or null if there is no
-		* such class, and LRU behavior is not desired.
-		*/
-		public ICacheClass getObjectClass()
-		{
-			return robotsCacheClass;
-		}
-	}
-
-	/** Cache class for robots.
-	* An instance of this class describes the cache class for robots data caching.  There's
-	* only ever a need for one, so that will be created statically.
-	*/
-	protected static class RobotsCacheClass implements ICacheClass
-	{
-		/** Get the name of the object class.
-		* This determines the set of objects that are treated in the same
-		* LRU pool.
-		*@return the class name.
-		*/
-		public String getClassName()
-		{
-			// We count all the robot data, so this is a constant string.
-			return "ROBOTSCLASS";
-		}
-
-		/** Get the maximum LRU count of the object class.
-		*@return the maximum number of the objects of the particular class
-		* allowed.
-		*/
-		public int getMaxLRUCount()
-		{
-			// Hardwired for the moment; 2000 robots data records will be cached,
-			// and no more.
-			return 2000;
-		}
-
-	}
-
-	/** This is the executor object for locating robots host objects.
-	* This object furnishes the operations the cache manager needs to rebuild objects that it needs that are
-	* not in the cache at the moment.
-	*/
-	protected static class HostExecutor extends org.apache.lcf.core.cachemanager.ExecutorBase
-	{
-		// Member variables
-		protected RobotsManager thisManager;
-		protected RobotsData returnValue;
-		protected HostDescription thisHost;
-		protected IVersionActivity activities;
-
-		/** Constructor.
-		*@param manager is the RobotsManager class instance.
-		*@param objectDescription is the desired object description.
-		*/
-		public HostExecutor(RobotsManager manager, IVersionActivity activities, HostDescription objectDescription)
-		{
-			super();
-			thisManager = manager;
-			this.activities = activities;
-			thisHost = objectDescription;
-			returnValue = null;
-		}
-
-		/** Get the result.
-		*@return the looked-up or read cached instance.
-		*/
-		public RobotsData getResults()
-		{
-			return returnValue;
-		}
-
-		/** Create a set of new objects to operate on and cache.  This method is called only
-		* if the specified object(s) are NOT available in the cache.  The specified objects
-		* should be created and returned; if they are not created, it means that the
-		* execution cannot proceed, and the execute() method will not be called.
-		* @param objectDescriptions is the set of unique identifier of the object.
-		* @return the newly created objects to cache, or null, if any object cannot be created.
-		*  The order of the returned objects must correspond to the order of the object descriptinos.
-		*/
-		public Object[] create(ICacheDescription[] objectDescriptions) throws LCFException
-		{
-			// I'm not expecting multiple values to be request, so it's OK to walk through the objects
-			// and do a request at a time.
-			RobotsData[] rval = new RobotsData[objectDescriptions.length];
-			int i = 0;
-			while (i < rval.length)
-			{
-				HostDescription desc = (HostDescription)objectDescriptions[i];
-				// I need to cache both the data and the expiration date, and pick up both when I
-				// do the query.  This is because I don't want to cache based on request time, since that
-				// would screw up everything!
-				rval[i] = thisManager.readRobotsData(desc.getHostName(),activities);
-				i++;
-			}
-
-			return rval;
-		}
-
-
-		/** Notify the implementing class of the existence of a cached version of the
-		* object.  The object is passed to this method so that the execute() method below
-		* will have it available to operate on.  This method is also called for all objects
-		* that are freshly created as well.
-		* @param objectDescription is the unique identifier of the object.
-		* @param cachedObject is the cached object.
-		*/
-		public void exists(ICacheDescription objectDescription, Object cachedObject) throws LCFException
-		{
-			// Cast what came in as what it really is
-			HostDescription objectDesc = (HostDescription)objectDescription;
-			RobotsData robotsData = (RobotsData)cachedObject;
-			if (objectDesc.equals(thisHost))
-				returnValue = robotsData;
-		}
-
-		/** Perform the desired operation.  This method is called after either createGetObject()
-		* or exists() is called for every requested object.
-		*/
-		public void execute() throws LCFException
-		{
-			// Does nothing; we only want to fetch objects in this cacher.
-		}
-
-
-	}
-
-	/** This class represents a record in a robots.txt file.  It contains one or
-	* more user-agents, and one or more disallows.
-	*/
-	protected static class Record
-	{
-		protected ArrayList userAgents = new ArrayList();
-		protected ArrayList disallows = new ArrayList();
-		protected ArrayList allows = new ArrayList();
-
-		/** Constructor.
-		*/
-		public Record()
-		{
-		}
-
-		/** Add a user-agent.
-		*/
-		public void addAgent(String agentName)
-		{
-			userAgents.add(agentName);
-		}
-
-		/** Add a disallow.
-		*/
-		public void addDisallow(String disallowPath)
-		{
-			disallows.add(disallowPath);
-		}
-
-		/** Add an allow.
-		*/
-		public void addAllow(String allowPath)
-		{
-			allows.add(allowPath);
-		}
-		
-		/** See if user-agent matches.
-		*/
-		public boolean isAgentMatch(String agentNameUpper, boolean exactMatch)
-		{
-			int i = 0;
-			while (i < userAgents.size())
-			{
-				String agent = ((String)userAgents.get(i++)).toUpperCase();
-				if (exactMatch && agent.trim().equals(agentNameUpper))
-					return true;
-				if (!exactMatch && agentNameUpper.indexOf(agent) != -1)
-					return true;
-			}
-			return false;
-		}
-
-		/** See if path is disallowed.  Only called if user-agent has already
-		* matched.  (This checks if there's an explicit match with one of the
-		* Disallows clauses.)
-		*/
-		public boolean isDisallowed(String path)
-		{
-			int i = 0;
-			while (i < disallows.size())
-			{
-				String disallow = (String)disallows.get(i++);
-				if (doesPathMatch(path,disallow))
-					return true;
-			}
-			return false;
-		}
-		
-		/** See if path is allowed.  Only called if user-agent has already
-		* matched.  (This checks if there's an explicit match with one of the
-		* Allows clauses).
-		*/
-		public boolean isAllowed(String path)
-		{
-			int i = 0;
-			while (i < allows.size())
-			{
-				String allow = (String)allows.get(i++);
-				if (doesPathMatch(path,allow))
-					return true;
-			}
-			return false;
-		}
-		
-	}
+        // Database fields
+        protected final static String hostField = "hostname";
+        protected final static String robotsField = "robotsdata";
+        protected final static String expirationField = "expirationtime";
+
+        // Cache manager.  This handle is set up during the constructor.
+        ICacheManager cacheManager;
+
+        /** Constructor.  Note that one robotsmanager handle is only useful within a specific thread context,
+        * so the calling connector object logic must recreate the handle whenever the thread context changes.
+        *@param tc is the thread context.
+        *@param database is the database handle.
+        */
+        public RobotsManager(IThreadContext tc, IDBInterface database)
+                throws LCFException
+        {
+                super(database,"robotsdata");
+                cacheManager = CacheManagerFactory.make(tc);
+        }
+
+        /** Install the manager.
+        */
+        public void install()
+                throws LCFException
+        {
+                // Standard practice: outer loop on install methods, no transactions
+                while (true)
+                {
+                        Map existing = getTableSchema(null,null);
+                        if (existing == null)
+                        {
+                                // Install the table.
+                                HashMap map = new HashMap();
+                                map.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
+                                map.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false));
+                                map.put(robotsField,new ColumnDescription("BYTEA",false,true,null,null,false));
+                                performCreate(map,null);
+                        }
+                        else
+                        {
+                                // Upgrade code, if needed, goes here
+                        }
+                        
+                        // Handle indexes, if needed
+                        
+                        break;
+                }
+        }
+
+        /** Uninstall the manager.
+        */
+        public void deinstall()
+                throws LCFException
+        {
+                performDrop(null);
+        }
+
+
+        /** Read robots.txt data from the cache or from the database.
+        *@param hostName is the host for which the data is desired.
+        *@param currentTime is the time of the check.
+        *@return null if the record needs to be fetched, true if fetch is allowed.
+        */
+        public Boolean checkFetchAllowed(String userAgent, String hostName, long currentTime, String pathString,
+                IVersionActivity activities)
+                throws LCFException
+        {
+                // Build description objects
+                HostDescription[] objectDescriptions = new HostDescription[1];
+                StringSetBuffer ssb = new StringSetBuffer();
+                ssb.add(getRobotsKey(hostName));
+                objectDescriptions[0] = new HostDescription(hostName,new StringSet(ssb));
+
+                HostExecutor exec = new HostExecutor(this,activities,objectDescriptions[0]);
+                cacheManager.findObjectsAndExecute(objectDescriptions,null,exec,getTransactionID());
+
+                // We do the expiration check here, rather than in the query, so that caching
+                // is possible.
+                RobotsData rd = exec.getResults();
+                if (rd == null || rd.getExpirationTime() <= currentTime)
+                        return null;
+                return new Boolean(rd.isFetchAllowed(userAgent,pathString));
+        }
+
+        /** Write robots.txt, replacing any existing row.
+        * The row update/insert and the cache key invalidation happen inside one transaction, between
+        * cacheManager.enterCache() and cacheManager.leaveCache() for this host's robots key.
+        *@param hostName is the host.
+        *@param expirationTime is the time this data should expire.
+        *@param data is the robots data stream.  May be null, in which case no value is written for the
+        * robots data column (so an updated row only gets a new expiration time).
+        *@throws IOException if building the TempFileInput from the stream fails for any reason other than interruption.
+        */
+        public void writeRobotsData(String hostName, long expirationTime, InputStream data)
+                throws LCFException, IOException
+        {
+                TempFileInput tfi = null;
+                try
+                {
+                        if (data != null)
+                        {
+                                try
+                                {
+                                        tfi = new TempFileInput(data);
+                                }
+                                catch (LCFException e)
+                                {
+                                        if (e.getErrorCode() == LCFException.INTERRUPTED)
+                                                throw e;
+                                        // Report as a fetch failure, but keep the original exception as the cause
+                                        // so the underlying stack trace is not lost.
+                                        IOException fetchFailure = new IOException("Fetch failed: "+e.getMessage());
+                                        fetchFailure.initCause(e);
+                                        throw fetchFailure;
+                                }
+                        }
+
+                        StringSetBuffer ssb = new StringSetBuffer();
+                        ssb.add(getRobotsKey(hostName));
+                        StringSet cacheKeys = new StringSet(ssb);
+                        ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID());
+                        try
+                        {
+                                beginTransaction();
+                                try
+                                {
+                                        // See whether the instance exists
+                                        ArrayList params = new ArrayList();
+                                        params.add(hostName);
+                                        IResultSet set = performQuery("SELECT * FROM "+getTableName()+" WHERE "+
+                                                hostField+"=?",params,null,null);
+                                        HashMap values = new HashMap();
+                                        values.put(expirationField,new Long(expirationTime));
+                                        if (tfi != null)
+                                                values.put(robotsField,tfi);
+                                        if (set.getRowCount() > 0)
+                                        {
+                                                // Update
+                                                params.clear();
+                                                params.add(hostName);
+                                                performUpdate(values," WHERE "+hostField+"=?",params,null);
+                                        }
+                                        else
+                                        {
+                                                // Insert
+                                                values.put(hostField,hostName);
+                                                // We only need the general key because this is new.
+                                                performInsert(values,null);
+                                        }
+                                        cacheManager.invalidateKeys(ch);
+                                }
+                                catch (LCFException e)
+                                {
+                                        signalRollback();
+                                        throw e;
+                                }
+                                catch (RuntimeException e)
+                                {
+                                        // Unchecked failures must roll back too, the same as LCFException and Error.
+                                        signalRollback();
+                                        throw e;
+                                }
+                                catch (Error e)
+                                {
+                                        signalRollback();
+                                        throw e;
+                                }
+                                finally
+                                {
+                                        endTransaction();
+                                }
+                        }
+                        finally
+                        {
+                                cacheManager.leaveCache(ch);
+                        }
+                }
+                finally
+                {
+                        // Always release the temporary copy of the stream, whether or not the write succeeded.
+                        if (tfi != null)
+                                tfi.discard();
+                }
+        }
+
+        // Protected methods and classes
+
+        /** Build the cache key that stands for one host's robots data.
+        * The key is the fixed prefix "ROBOTS_" followed by the host name.
+        *@param hostName is the name of the host.
+        *@return the cache key.
+        */
+        protected static String getRobotsKey(String hostName)
+        {
+                StringBuffer key = new StringBuffer("ROBOTS_");
+                key.append(hostName);
+                return key.toString();
+        }
+
+        /** Load the stored robots data row for a host and turn it into a RobotsData object.
+        *@param hostName is the host whose row is looked up.
+        *@param activities is passed on to the RobotsData constructor.
+        *@return null if there is no row for the host; otherwise the robots data built from the row
+        * (built from a null stream when the row has no robots data value).
+        */
+        protected RobotsData readRobotsData(String hostName, IVersionActivity activities)
+                throws LCFException
+        {
+                try
+                {
+                        ArrayList queryParams = new ArrayList();
+                        queryParams.add(hostName);
+                        IResultSet rows = performQuery("SELECT "+robotsField+","+expirationField+" FROM "+getTableName()+
+                                " WHERE "+hostField+"=?",queryParams,null,null);
+
+                        int rowCount = rows.getRowCount();
+                        if (rowCount == 0)
+                                return null;
+                        // More than one row for a host is not expected; report it rather than pick one.
+                        if (rowCount > 1)
+                                throw new LCFException("Unexpected number of robotsdata rows matching '"+hostName+"': "+Integer.toString(rowCount));
+
+                        IResultRow row = rows.getRow(0);
+                        Long expirationValue = (Long)row.getValue(expirationField);
+                        long expiration = expirationValue.longValue();
+                        BinaryInput robotsBlob = (BinaryInput)row.getValue(robotsField);
+                        if (robotsBlob != null)
+                        {
+                                try
+                                {
+                                        return new RobotsData(robotsBlob.getStream(),expiration,hostName,activities);
+                                }
+                                finally
+                                {
+                                        // Release the binary value whether or not parsing succeeded.
+                                        robotsBlob.discard();
+                                }
+                        }
+                        // Row exists but carries no robots data.
+                        return new RobotsData(null,expiration,hostName,activities);
+                }
+                catch (InterruptedIOException e)
+                {
+                        throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
+                }
+                catch (IOException e)
+                {
+                        throw new LCFException("IO error reading robots data for "+hostName+": "+e.getMessage(),e);
+                }
+        }
+
+        /** Convert a string from the robots file into a readable form that does NOT contain NUL characters (since postgresql does not accept those).
+        */
+        protected static String makeReadable(String inputString)
+        {
+                StringBuffer sb = new StringBuffer();
+                int i = 0;
+                while (i < inputString.length())
+                {
+                        char y = inputString.charAt(i++);
+                        if (y >= ' ')
+                                sb.append(y);
+                        else
+                        {
+                                sb.append('^');
+                                sb.append((char)(y + '@'));
+                        }
+                }
+                return sb.toString();
+        }
+        
+        /** This is a cached data item.
+        */
+        protected static class RobotsData
+        {
+                protected long expiration;
+                protected ArrayList records = null;
+
+                /** Constructor. */
+                public RobotsData(InputStream is, long expiration, String hostName, IVersionActivity activities)
+                        throws IOException, LCFException
+                {
+                        this.expiration = expiration;
+                        if (is == null)
+                        {
+                                records = null;
+                                return;
+                        }
+                        Reader r = new InputStreamReader(is,"utf-8");
+                        try
+                        {
+                                BufferedReader br = new BufferedReader(r);
+                                try
+                                {
+                                        parseRobotsTxt(br,hostName,activities);
+                                }
+                                finally
+                                {
+                                        br.close();
+                                }
+                        }
+                        finally
+                        {
+                                r.close();
+                        }
+                }
+
+                /** Check if fetch is allowed */
+                public boolean isFetchAllowed(String userAgent, String pathString)
+                {
+                        if (records == null)
+                                return true;
+
+                        boolean wasDisallowed = false;
+                        boolean wasAllowed = false;
+                        
+                        // First matching user-agent takes precedence, according to the following chunk of spec:
+                        // "These name tokens are used in User-agent lines in /robots.txt to
+                        // identify to which specific robots the record applies. The robot
+                        // must obey the first record in /robots.txt that contains a User-
+                        // Agent line whose value contains the name token of the robot as a
+                        // substring. The name comparisons are case-insensitive. If no such
+                        // record exists, it should obey the first record with a User-agent
+                        // line with a "*" value, if present. If no record satisfied either
+                        // condition, or no records are present at all, access is unlimited."
+
+                        boolean sawAgent = false;
+                        
+                        String userAgentUpper = userAgent.toUpperCase();
+
+                        int i = 0;
+                        while (i < records.size())
+                        {
+                                Record r = (Record)records.get(i++);
+                                if (r.isAgentMatch(userAgentUpper,false))
+                                {
+                                        if (r.isDisallowed(pathString))
+                                                wasDisallowed = true;
+                                        if (r.isAllowed(pathString))
+                                                wasAllowed = true;
+                                        
+                                        sawAgent = true;
+                                        break;
+                                }
+                        }
+                        if (sawAgent == false)
+                        {
+                                i = 0;
+                                while (i < records.size())
+                                {
+                                        Record r = (Record)records.get(i++);
+                                        if (r.isAgentMatch("*",true))
+                                        {
+                                                if (r.isDisallowed(pathString))
+                                                        wasDisallowed = true;
+                                                if (r.isAllowed(pathString))
+                                                        wasAllowed = true;
+                                                
+                                                sawAgent = true;
+                                                break;
+                                        }
+                                }
+                        }
+                        
+                        if (sawAgent == false)
+                                return true;
+                        
+                        // Allowed always overrides disallowed
+                        if (wasAllowed)
+                                return true;
+                        if (wasDisallowed)
+                                return false;
+                        
+                        // No match -> crawl allowed
+                        return true;
+                }
+
+                /** Get expiration */
+                public long getExpirationTime()
+                {
+                        return expiration;
+                }
+
+                /** Parse the robots.txt file using a reader.
+                * Is NOT expected to close the stream.
+                */
+                protected void parseRobotsTxt(BufferedReader r, String hostName, IVersionActivity activities)
+                        throws IOException, LCFException
+                {
+                        boolean parseCompleted = false;
+                        boolean robotsWasHtml = false;
+                        boolean foundErrors = false;
+                        String description = null;
+
+                        long startParseTime = System.currentTimeMillis();
+                        try
+                        {
+                                records = new ArrayList();
+                                Record record = null;
+                                boolean seenAction = false;
+                                while (true)
+                                {
+                                        String x = r.readLine();
+                                        if (x == null)
+                                                break;
+                                        int numSignPos = x.indexOf("#");
+                                        if (numSignPos != -1)
+                                                x = x.substring(0,numSignPos);
+                                        String lowercaseLine = x.toLowerCase().trim();
+                                        if (lowercaseLine.startsWith("user-agent:"))
+                                        {
+                                                if (seenAction)
+                                                {
+                                                        records.add(record);
+                                                        record = null;
+                                                        seenAction = false;
+                                                }
+                                                if (record == null)
+                                                        record = new Record();
+
+                                                String agentName = x.substring("User-agent:".length()).trim();	
+                                                record.addAgent(agentName);
+                                        }
+                                        else if (lowercaseLine.startsWith("user-agent"))
+                                        {
+                                                if (seenAction)
+                                                {
+                                                        records.add(record);
+                                                        record = null;
+                                                        seenAction = false;
+                                                }
+                                                if (record == null)
+                                                        record = new Record();
+
+                                                String agentName = x.substring("User-agent".length()).trim();	
+                                                record.addAgent(agentName);
+                                        }
+                                        else if (lowercaseLine.startsWith("disallow:"))
+                                        {
+                                                if (record == null)
+                                                {
+                                                        description = "Disallow without User-agent";
+                                                        Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+                                                        foundErrors = true;
+                                                }
+                                                else
+                                                {
+                                                        String disallowPath = x.substring("Disallow:".length()).trim();
+                                                        // The spec says that a blank disallow means let everything through.
+                                                        if (disallowPath.length() > 0)
+                                                                record.addDisallow(disallowPath);
+                                                        seenAction = true;
+                                                }
+                                        }
+                                        else if (lowercaseLine.startsWith("disallow"))
+                                        {
+                                                if (record == null)
+                                                {
+                                                        description = "Disallow without User-agent";
+                                                        Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+                                                        foundErrors = true;
+                                                }
+                                                else
+                                                {
+                                                        String disallowPath = x.substring("Disallow".length()).trim();
+                                                        // The spec says that a blank disallow means let everything through.
+                                                        if (disallowPath.length() > 0)
+                                                                record.addDisallow(disallowPath);
+                                                        seenAction = true;
+                                                }
+                                        }
+                                        else if (lowercaseLine.startsWith("allow:"))
+                                        {
+                                                if (record == null)
+                                                {
+                                                        description = "Allow without User-agent";
+                                                        Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+                                                        foundErrors = true;
+                                                }
+                                                else
+                                                {
+                                                        String allowPath = x.substring("Allow:".length()).trim();
+                                                        // The spec says that a blank disallow means let everything through.
+                                                        if (allowPath.length() > 0)
+                                                                record.addAllow(allowPath);
+                                                        seenAction = true;
+                                                }
+                                        }
+                                        else if (lowercaseLine.startsWith("allow"))
+                                        {
+                                                if (record == null)
+                                                {
+                                                        description = "Allow without User-agent";
+                                                        Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+                                                        foundErrors = true;
+                                                }
+                                                else
+                                                {
+                                                        String allowPath = x.substring("Allow".length()).trim();
+                                                        // The spec says that a blank disallow means let everything through.
+                                                        if (allowPath.length() > 0)
+                                                                record.addAllow(allowPath);
+                                                        seenAction = true;
+                                                }
+                                        }
+                                        else if (lowercaseLine.startsWith("crawl-delay:"))
+                                        {
+                                                // We don't complain about this, but right now we don't listen to it either.
+                                        }
+                                        else if (lowercaseLine.startsWith("crawl-delay"))
+                                        {
+                                                // We don't complain about this, but right now we don't listen to it either.
+                                        }
+                                        else
+                                        {
+                                                // If it's not just a blank line, complain
+                                                if (x.trim().length() > 0)
+                                                {
+                                                        String problemLine = makeReadable(x);
+                                                        description = "Unknown robots.txt line: '"+problemLine+"'";
+                                                        Logging.connectors.warn("Web: Unknown robots.txt line from '"+hostName+"': '"+problemLine+"'");
+                                                        if (x.indexOf("<html") != -1 || x.indexOf("<HTML") != -1)
+                                                        {
+                                                                // Looks like some kind of an html file, probably as a result of a redirection, so just abort as if we have a page error
+                                                                robotsWasHtml = true;
+                                                                parseCompleted = true;
+                                                                break;
+                                                        }
+                                                        foundErrors = true;
+                                                }
+                                        }
+                                }
+                                if (record != null)
+                                        records.add(record);
+                                parseCompleted = true;
+                        }
+                        finally
+                        {
+                                // Log the fact that we attempted to parse robots.txt, as well as what happened
+                                // These are the following situations we will report:
+                                // (1) INCOMPLETE - Parsing did not complete - if the stream was interrupted
+                                // (2) HTML - Robots was html - if the robots data seemed to be html
+                                // (3) ERRORS - Robots had errors - if the robots data was accepted but had errors in it
+                                // (4) SUCCESS - Robots parsed successfully - if the robots data was parsed without problem
+                                String status;
+                                if (parseCompleted)
+                                {
+                                        if (robotsWasHtml)
+                                        {
+                                                status = "HTML";
+                                                description = "Robots file contained HTML, skipped";
+                                        }
+                                        else
+                                        {
+                                                if (foundErrors)
+                                                {
+                                                        status = "ERRORS";
+                                                        // description should already be set
+                                                }
+                                                else
+                                                {
+                                                        status = "SUCCESS";
+                                                        description = null;
+                                                }
+                                        }
+                                }
+                                else
+                                {
+                                        status = "INCOMPLETE";
+                                        description = "Parsing was interrupted";
+                                }
+                                
+                                activities.recordActivity(new Long(startParseTime),WebcrawlerConnector.ACTIVITY_ROBOTSPARSE,
+                                        null,hostName,status,description,null);
+
+                        }
+                }
+
+        }
+
+        /** Check if path matches specification */
+        protected static boolean doesPathMatch(String path, String spec)
+        {
+                // For robots 1.0, this function would do just this:
+                // return path.startsWith(spec);
+                // However, we implement the "google bot" spec, which allows wildcard matches that are, in fact, regular-expression-like in some ways.
+                // The "specification" can be found here: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40367
+                return doesPathMatch(path,0,spec,0);
+        }
+
+        /** Recursive method for matching specification to path. */
+        protected static boolean doesPathMatch(String path, int pathIndex, String spec, int specIndex)
+        {
+                while (true)
+                {
+                        if (specIndex == spec.length())
+                                // Hit the end of the specification!  We're done.
+                                return true;
+                        char specChar = spec.charAt(specIndex++);
+                        if (specChar == '*')
+                        {
+                                // Found a specification wildcard.
+                                // Eat up all the '*' characters at this position - otherwise each additional one increments the exponent of how long this can take,
+                                // making denial-of-service via robots parsing a possibility.
+                                while (specIndex < spec.length())
+                                {
+                                        if (spec.charAt(specIndex) != '*')
+                                                break;
+                                        specIndex++;
+                                }
+                                // It represents zero or more characters, so we must recursively try for a match against all remaining characters in the path string.
+                                while (true)
+                                {
+                                        boolean match = doesPathMatch(path,pathIndex,spec,specIndex);
+                                        if (match)
+                                                return true;
+                                        if (path.length() == pathIndex)
+                                                // Nothing further to try, and no match
+                                                return false;
+                                        pathIndex++;
+                                        // Try again
+                                }
+                        }
+                        else if (specChar == '$' && specIndex == spec.length())
+                        {
+                                // Found a specification end-of-path character.
+                                // (It can only be legitimately the last character of the specification.)
+                                return pathIndex == path.length();
+                        }
+                        if (pathIndex == path.length())
+                                // Hit the end of the path! (but not the end of the specification!)
+                                return false;
+                        if (path.charAt(pathIndex) != specChar)
+                                return false;
+                        // On to the next match
+                        pathIndex++;
+                }
+        }
+
+        /** Object description for the robots data of one host.
+        * The cache manager uses it as the lookup key for cached data; two descriptions are equal
+        * exactly when their host names are equal.
+        */
+        protected static class HostDescription extends org.apache.lcf.core.cachemanager.BaseDescription
+        {
+                /** The host this description stands for. */
+                protected String hostName;
+                /** Critical section name: this class's name, a dash, and the host name. */
+                protected String criticalSectionName;
+                /** The cache keys handed to the constructor. */
+                protected StringSet cacheKeys;
+
+                /** Constructor.
+                *@param hostName is the host name.
+                *@param invKeys is the set of cache keys for the object.
+                */
+                public HostDescription(String hostName, StringSet invKeys)
+                {
+                        super("robotscache");
+                        this.hostName = hostName;
+                        this.cacheKeys = invKeys;
+                        this.criticalSectionName = getClass().getName()+"-"+hostName;
+                }
+
+                /** Get the host name.
+                *@return the host name.
+                */
+                public String getHostName()
+                {
+                        return hostName;
+                }
+
+                /** Hash on the host name, to agree with equals(). */
+                public int hashCode()
+                {
+                        return hostName.hashCode();
+                }
+
+                /** Equal only to another HostDescription with an equal host name. */
+                public boolean equals(Object o)
+                {
+                        return (o instanceof HostDescription) && ((HostDescription)o).hostName.equals(hostName);
+                }
+
+                /** Get the critical section name.
+                *@return the name built in the constructor.
+                */
+                public String getCriticalSectionName()
+                {
+                        return criticalSectionName;
+                }
+
+                /** Get the cache keys for an object (which may or may not exist yet in
+                * the cache).  This method is called in order for cache manager to throw the correct locks.
+                * @return the object's cache keys, or null if the object should not
+                * be cached.
+                */
+                public StringSet getObjectKeys()
+                {
+                        return cacheKeys;
+                }
+
+                /** Get the object class for an object.  The object class is used to determine
+                * the group of objects treated in the same LRU manner.
+                * @return the newly created object's object class, or null if there is no
+                * such class, and LRU behavior is not desired.
+                */
+                public ICacheClass getObjectClass()
+                {
+                        return robotsCacheClass;
+                }
+        }
+
+        /** Cache class for robots.
+        * Describes the LRU pool shared by all cached robots data.  Only one instance is ever
+        * needed, so it is created statically.
+        */
+        protected static class RobotsCacheClass implements ICacheClass
+        {
+                /** All robots data is counted together, so the class name is a constant string. */
+                private static final String CLASS_NAME = "ROBOTSCLASS";
+                /** Hardwired for the moment; 2000 robots data records will be cached, and no more. */
+                private static final int MAX_LRU_COUNT = 2000;
+
+                /** Get the name of the object class.
+                * This determines the set of objects that are treated in the same
+                * LRU pool.
+                *@return the class name.
+                */
+                public String getClassName()
+                {
+                        return CLASS_NAME;
+                }
+
+                /** Get the maximum LRU count of the object class.
+                *@return the maximum number of the objects of the particular class
+                * allowed.
+                */
+                public int getMaxLRUCount()
+                {
+                        return MAX_LRU_COUNT;
+                }
+
+        }
+
+        /** This is the executor object for locating robots host objects.
+        * This object furnishes the operations the cache manager needs to rebuild objects that it needs that are
+        * not in the cache at the moment.
+        */
+        protected static class HostExecutor extends org.apache.lcf.core.cachemanager.ExecutorBase
+        {
+                // Member variables
+                protected RobotsManager thisManager;
+                protected RobotsData returnValue;
+                protected HostDescription thisHost;
+                protected IVersionActivity activities;
+
+                /** Constructor.
+                *@param manager is the RobotsManager class instance.
+                *@param objectDescription is the desired object description.
+                */
+                public HostExecutor(RobotsManager manager, IVersionActivity activities, HostDescription objectDescription)
+                {
+                        super();
+                        thisManager = manager;
+                        this.activities = activities;
+                        thisHost = objectDescription;
+                        returnValue = null;
+                }
+
+                /** Get the result.
+                *@return the looked-up or read cached instance.
+                */
+                public RobotsData getResults()
+                {
+                        return returnValue;
+                }
+
+                /** Create a set of new objects to operate on and cache.  This method is called only
+                * if the specified object(s) are NOT available in the cache.  The specified objects
+                * should be created and returned; if they are not created, it means that the
+                * execution cannot proceed, and the execute() method will not be called.
+                * @param objectDescriptions is the set of unique identifiers of the objects.
+                * @return the newly created objects to cache, or null, if any object cannot be created.
+                *  The order of the returned objects must correspond to the order of the object descriptions.
+                */
+                public Object[] create(ICacheDescription[] objectDescriptions) throws LCFException
+                {
+                        // I'm not expecting multiple values to be requested, so it's OK to walk through the objects
+                        // and do a request at a time.
+                        RobotsData[] rval = new RobotsData[objectDescriptions.length];
+                        int i = 0;
+                        while (i < rval.length)
+                        {
+                                HostDescription desc = (HostDescription)objectDescriptions[i];
+                                // I need to cache both the data and the expiration date, and pick up both when I
+                                // do the query.  This is because I don't want to cache based on request time, since that
+                                // would screw up everything!
+                                rval[i] = thisManager.readRobotsData(desc.getHostName(),activities);
+                                i++;
+                        }
+
+                        return rval;
+                }
+
+
+                /** Notify the implementing class of the existence of a cached version of the
+                * object.  The object is passed to this method so that the execute() method below
+                * will have it available to operate on.  This method is also called for all objects
+                * that are freshly created as well.
+                * @param objectDescription is the unique identifier of the object.
+                * @param cachedObject is the cached object.
+                */
+                public void exists(ICacheDescription objectDescription, Object cachedObject) throws LCFException
+                {
+                        // Cast what came in as what it really is
+                        HostDescription objectDesc = (HostDescription)objectDescription;
+                        RobotsData robotsData = (RobotsData)cachedObject;
+                        if (objectDesc.equals(thisHost))
+                                returnValue = robotsData;
+                }
+
+                /** Perform the desired operation.  This method is called after either create()
+                * or exists() is called for every requested object.
+                */
+                public void execute() throws LCFException
+                {
+                        // Does nothing; we only want to fetch objects in this cacher.
+                }
+
+
+        }
+
+        /** This class represents a record in a robots.txt file.  It contains one or
+        * more user-agents, and one or more disallows, as well as any allows.
+        */
+        protected static class Record
+        {
+                protected ArrayList userAgents = new ArrayList();
+                protected ArrayList disallows = new ArrayList();
+                protected ArrayList allows = new ArrayList();
+
+                /** Constructor.
+                */
+                public Record()
+                {
+                }
+
+                /** Add a user-agent.
+                */
+                public void addAgent(String agentName)
+                {
+                        userAgents.add(agentName);
+                }
+
+                /** Add a disallow.
+                */
+                public void addDisallow(String disallowPath)
+                {
+                        disallows.add(disallowPath);
+                }
+
+                /** Add an allow.
+                */
+                public void addAllow(String allowPath)
+                {
+                        allows.add(allowPath);
+                }
+                
+                /** See if user-agent matches.
+                */
+                public boolean isAgentMatch(String agentNameUpper, boolean exactMatch)
+                {
+                        int i = 0;
+                        while (i < userAgents.size())
+                        {
+                                String agent = ((String)userAgents.get(i++)).toUpperCase();
+                                if (exactMatch && agent.trim().equals(agentNameUpper))
+                                        return true;
+                                if (!exactMatch && agentNameUpper.indexOf(agent) != -1)
+                                        return true;
+                        }
+                        return false;
+                }
+
+                /** See if path is disallowed.  Only called if user-agent has already
+                * matched.  (This checks if there's an explicit match with one of the
+                * Disallows clauses.)
+                */
+                public boolean isDisallowed(String path)
+                {
+                        int i = 0;
+                        while (i < disallows.size())
+                        {
+                                String disallow = (String)disallows.get(i++);
+                                if (doesPathMatch(path,disallow))
+                                        return true;
+                        }
+                        return false;
+                }
+                
+                /** See if path is allowed.  Only called if user-agent has already
+                * matched.  (This checks if there's an explicit match with one of the
+                * Allows clauses).
+                */
+                public boolean isAllowed(String path)
+                {
+                        int i = 0;
+                        while (i < allows.size())
+                        {
+                                String allow = (String)allows.get(i++);
+                                if (doesPathMatch(path,allow))
+                                        return true;
+                        }
+                        return false;
+                }
+                
+        }
 
 }

Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java?rev=911418&r1=911417&r2=911418&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java Thu Feb 18 14:31:31 2010
@@ -50,8 +50,8 @@
 	public void install()
 		throws LCFException
 	{
-		beginTransaction();
-		try
+		// We always use an outer loop, in case the upgrade will need it.
+		while (true)
 		{
 			// Check if table is already present
 			Map existing = getTableSchema(null,null);
@@ -61,20 +61,14 @@
 				map.put(classNameField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
 				performCreate(map,null);
 			}
-		}
-		catch (LCFException e)
-		{
-			signalRollback();
-			throw e;
-		}
-		catch (Error e)
-		{
-			signalRollback();
-			throw e;
-		}
-		finally
-		{
-			endTransaction();
+			else
+			{
+				// Any required upgrade code goes here.
+			}
+			
+			// Any index creation goes here.
+			
+			break;
 		}
 	}
 



Mime
View raw message