From droids-commits-return-60-apmail-incubator-droids-commits-archive=incubator.apache.org@incubator.apache.org Sat Nov 08 22:46:23 2008 Return-Path: Delivered-To: apmail-incubator-droids-commits-archive@locus.apache.org Received: (qmail 57741 invoked from network); 8 Nov 2008 22:46:23 -0000 Received: from hermes.apache.org (HELO mail.apache.org) (140.211.11.2) by minotaur.apache.org with SMTP; 8 Nov 2008 22:46:23 -0000 Received: (qmail 86423 invoked by uid 500); 8 Nov 2008 22:46:30 -0000 Delivered-To: apmail-incubator-droids-commits-archive@incubator.apache.org Received: (qmail 86395 invoked by uid 500); 8 Nov 2008 22:46:30 -0000 Mailing-List: contact droids-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: droids-dev@incubator.apache.org Delivered-To: mailing list droids-commits@incubator.apache.org Received: (qmail 86384 invoked by uid 99); 8 Nov 2008 22:46:30 -0000 Received: from athena.apache.org (HELO athena.apache.org) (140.211.11.136) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 08 Nov 2008 14:46:30 -0800 X-ASF-Spam-Status: No, hits=-2000.0 required=10.0 tests=ALL_TRUSTED X-Spam-Check-By: apache.org Received: from [140.211.11.4] (HELO eris.apache.org) (140.211.11.4) by apache.org (qpsmtpd/0.29) with ESMTP; Sat, 08 Nov 2008 22:45:20 +0000 Received: by eris.apache.org (Postfix, from userid 65534) id C3D16238896F; Sat, 8 Nov 2008 14:46:02 -0800 (PST) Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: svn commit: r712445 - in /incubator/droids/trunk/droids-norobots/src: main/java/org/apache/droids/norobots/NoRobotClient.java test/java/org/apache/droids/norobots/TestNorobotsClient.java test/resources/simple-robots.txt Date: Sat, 08 Nov 2008 22:46:02 -0000 To: droids-commits@incubator.apache.org From: olegk@apache.org X-Mailer: svnmailer-1.0.8 Message-Id: <20081108224602.C3D16238896F@eris.apache.org> X-Virus-Checked: Checked by ClamAV on apache.org Author: olegk Date: Sat Nov 8 14:46:02 2008 New Revision: 712445 URL: http://svn.apache.org/viewvc?rev=712445&view=rev Log: * Eliminated in-memory content buffering in the NoRobotClient * Added method to return a complete set of rules for all user agents Removed: incubator/droids/trunk/droids-norobots/src/test/resources/simple-robots.txt Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Modified: incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java?rev=712445&r1=712444&r2=712445&view=diff ============================================================================== --- incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java (original) +++ incubator/droids/trunk/droids-norobots/src/main/java/org/apache/droids/norobots/NoRobotClient.java Sat Nov 8 14:46:02 2008 @@ -29,13 +29,15 @@ import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; -import java.io.Reader; -import java.io.StringReader; -import java.io.StringWriter; import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; import java.net.URLDecoder; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; /** * A Client which may be used to decide which urls on a website @@ -65,8 +67,11 @@ throw new IllegalArgumentException("Content loader may not be null"); } this.contentLoader = contentLoader; - this.userAgent = userAgent; - this.rules = new RulesEngine(); + if (userAgent != null) { + this.userAgent = userAgent.toLowerCase(Locale.ENGLISH); + } else { + this.userAgent = null; + } } /** @@ -97,34 +102,44 @@ } public void parseText(InputStream instream) throws IOException { - StringWriter writer = new StringWriter(); - Reader reader = new InputStreamReader(instream, US_ASCII); + Map map = parse(instream); + this.rules = map.get(this.userAgent); + if (this.rules == null) { + this.rules = new RulesEngine(); + } + this.wildcardRules = map.get("*"); + if (this.wildcardRules == null) { + this.wildcardRules = new RulesEngine(); + } + } + + public static Map parse(InputStream instream) throws IOException { try { - char[] tmp = new char[2048]; - int l; - while ((l = reader.read(tmp)) != -1) { - writer.write(tmp, 0, l); - } + return doParse(instream); } finally { - reader.close(); + instream.close(); } - String txt = writer.toString(); - this.rules = parseTextForUserAgent(txt, this.userAgent); - this.wildcardRules = parseTextForUserAgent(txt, "*"); } + + enum ParserState + { + USER_AGENT_DEF, ALLOW_DISALLOW_DEF + } + + private static Map doParse(InputStream instream) throws IOException { - private RulesEngine parseTextForUserAgent(String txt, String userAgent) throws IOException { - - RulesEngine engine = new RulesEngine(); - + Map map = new HashMap(); // Classic basic parser style, read an element at a time, // changing a state variable [parsingAllowBlock] // take each line, one at a time - BufferedReader rdr = new BufferedReader( new StringReader(txt) ); + BufferedReader rdr = new BufferedReader(new InputStreamReader(instream, US_ASCII)); + + Set engines = new HashSet(); + + ParserState state = ParserState.ALLOW_DISALLOW_DEF; + String line = ""; - String value = null; - boolean parsingAllowBlock = false; while( (line = rdr.readLine()) != null ) { // trim whitespace from either side line = line.trim(); @@ -134,43 +149,38 @@ continue; } - // if User-agent == userAgent - // record the rest up until end or next User-agent - // then quit (? check spec) if(line.startsWith("User-agent:")) { - - if(parsingAllowBlock) { - // we've just finished reading allows/disallows - if(engine.isEmpty()) { - // multiple user agents in a line, let's - // wait til we get rules - continue; - } else { - break; - } + if (state == ParserState.ALLOW_DISALLOW_DEF) { + engines.clear(); } - - value = line.substring("User-agent:".length()).trim(); - if(value.equalsIgnoreCase(userAgent)) { - parsingAllowBlock = true; - continue; + state = ParserState.USER_AGENT_DEF; + String userAgent = line.substring("User-agent:".length()); + userAgent = userAgent.trim().toLowerCase(Locale.ENGLISH); + RulesEngine engine = map.get(userAgent); + if (engine == null) { + engine = new RulesEngine(); + map.put(userAgent, engine); } + engines.add(engine); } else { - // if not, then store if we're currently the user agent - if(parsingAllowBlock) { - if(line.startsWith("Allow:")) { - value = line.substring("Allow:".length()).trim(); - value = URLDecoder.decode(value, US_ASCII); + if (engines.isEmpty()) { + continue; + } + if(line.startsWith("Allow:")) { + state = ParserState.ALLOW_DISALLOW_DEF; + String value = line.substring("Allow:".length()).trim(); + value = URLDecoder.decode(value, US_ASCII); + for (RulesEngine engine: engines) { engine.allowPath( value ); - } else - if(line.startsWith("Disallow:")) { - value = line.substring("Disallow:".length()).trim(); - value = URLDecoder.decode(value, US_ASCII); + } + } else + if(line.startsWith("Disallow:")) { + state = ParserState.ALLOW_DISALLOW_DEF; + String value = line.substring("Disallow:".length()).trim(); + value = URLDecoder.decode(value, US_ASCII); + for (RulesEngine engine: engines) { engine.disallowPath( value ); - } else { - // ignore - continue; } } else { // ignore @@ -178,7 +188,7 @@ } } } - return engine; + return map; } /** Modified: incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java?rev=712445&r1=712444&r2=712445&view=diff ============================================================================== --- incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java (original) +++ incubator/droids/trunk/droids-norobots/src/test/java/org/apache/droids/norobots/TestNorobotsClient.java Sat Nov 8 14:46:02 2008 @@ -1,7 +1,8 @@ package org.apache.droids.norobots; +import java.io.ByteArrayInputStream; import java.net.URI; -import java.net.URL; +import java.util.Map; import junit.framework.Assert; @@ -9,14 +10,84 @@ public class TestNorobotsClient { + + @Test + public void testRobotsParsing() throws Exception { + String s = + "User-agent: *\r\n" + + "Disallow: /tmp/\r\n" + + "User-agent: BadRobot\r\n" + + "Disallow: /cgi-bin/\r\n" + + "Disallow: /blah/"; + Map map = NoRobotClient.parse( + new ByteArrayInputStream(s.getBytes("US-ASCII"))); + Assert.assertNotNull(map); + Assert.assertEquals(2, map.size()); + Assert.assertNotNull(map.get("*")); + Assert.assertNotNull(map.get("badrobot")); + Assert.assertNull(map.get("BadRobot")); + Assert.assertNull(map.get("wnatever")); + } + + @Test + public void testComplexRobotsParsing() throws Exception { + String s = + "User-agent: *\r\n" + + "Disallow: /tmp/\r\n" + + "User-agent: BadRobot1\r\n" + + "User-agent: BadRobot2\r\n" + + "User-agent: BadRobot3\r\n" + + "Disallow: /cgi-bin/\r\n" + + "Disallow: /blah/\r\n" + + "User-agent: BadRobot1\r\n" + + "Disallow: /yada/\r\n" + + "User-agent: BadRobot3\r\n" + + "Allow: /haha/"; + Map map = NoRobotClient.parse( + new ByteArrayInputStream(s.getBytes("US-ASCII"))); + Assert.assertNotNull(map); + Assert.assertEquals(4, map.size()); + Assert.assertNotNull(map.get("*")); + Assert.assertNotNull(map.get("badrobot1")); + Assert.assertNotNull(map.get("badrobot2")); + Assert.assertNotNull(map.get("badrobot3")); + Assert.assertNull(map.get("badrobot4")); + Assert.assertNull(map.get("wnatever")); + + RulesEngine e1 = map.get("*"); + Assert.assertEquals(Boolean.FALSE, e1.isAllowed("/tmp/")); + Assert.assertNull(e1.isAllowed("/blah/")); + Assert.assertNull(e1.isAllowed("/yada/")); + Assert.assertNull(e1.isAllowed("/haha/")); + + RulesEngine e2 = map.get("badrobot1"); + Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/cgi-bin/")); + Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/blah/")); + Assert.assertEquals(Boolean.FALSE, e2.isAllowed("/yada/")); + Assert.assertNull(e2.isAllowed("/haha/")); + + RulesEngine e3 = map.get("badrobot2"); + Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/cgi-bin/")); + Assert.assertEquals(Boolean.FALSE, e3.isAllowed("/blah/")); + Assert.assertNull(e3.isAllowed("/yada/")); + Assert.assertNull(e3.isAllowed("/haha/")); + + RulesEngine e4 = map.get("badrobot3"); + Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/cgi-bin/")); + Assert.assertEquals(Boolean.FALSE, e4.isAllowed("/blah/")); + Assert.assertNull(e4.isAllowed("/yada/")); + Assert.assertEquals(Boolean.TRUE, e4.isAllowed("/haha/")); + } @Test - public void testSimpleRobotsFile() throws Exception { - ClassLoader cl = getClass().getClassLoader(); - URL url = cl.getResource("simple-robots.txt"); - Assert.assertNotNull(url); + public void testSimpleRobotsCheck() throws Exception { + String s = + "User-agent: *\r\n" + + "Disallow: /cgi-bin/\r\n" + + "Disallow: /tmp/\r\n" + + "Disallow: /~mine/"; NoRobotClient nrc = new NoRobotClient(new SimpleContentLoader(), "whatever"); - nrc.parseText(url.openStream()); + nrc.parseText(new ByteArrayInputStream(s.getBytes("US-ASCII"))); Assert.assertTrue(nrc.isUrlAllowed(new URI("/whatever/"))); Assert.assertFalse(nrc.isUrlAllowed(new URI("/~mine/"))); Assert.assertFalse(nrc.isUrlAllowed(new URI("/tmp/")));