<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>commits@nutch.apache.org Archives</title>
<link rel="self" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/?format=atom"/>
<link href="http://mail-archives.apache.org/mod_mbox/nutch-commits/"/>
<id>http://mail-archives.apache.org/mod_mbox/nutch-commits/</id>
<updated>2013-05-21T08:33:14Z</updated>
<entry>
<title>svn commit: r1484638 - in /nutch/trunk: ./ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521013145.4BC0C23889F7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521013145-4BC0C23889F7@eris-apache-org%3e</id>
<updated>2013-05-21T01:31:44Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May 21 01:31:44 2013&#010;New Revision: 1484638&#010;&#010;URL: http://svn.apache.org/r1484638&#010;Log:&#010;NUTCH-1513 Support Robots.txt for Ftp urls&#010;&#010;Added:&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1484638&amp;r1=1484637&amp;r2=1484638&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue May 21 01:31:44 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1513 Support Robots.txt for Ftp urls (tejas)&#010;+&#010; * NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument&#010;(tejasp)&#010; &#010; * NUTCH-1053 Parsing of RSS feeds fails (tejasp)&#010;&#010;Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1484638&amp;r1=1484637&amp;r2=1484638&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;Tue May 21 01:31:44 2013&#010;@@ -131,8 +131,6 @@ public abstract class HttpBase implement&#010;   public Configuration getConf() {&#010;     return this.conf;&#010;   
}&#010;-   &#010;-  &#010;   &#010;   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {&#010;     &#010;&#010;Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1484638&amp;r1=1484637&amp;r2=1484638&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)&#010;+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue&#010;May 21 01:31:44 2013&#010;@@ -29,11 +29,9 @@ import org.apache.nutch.net.protocols.Re&#010; import org.apache.hadoop.conf.Configuration;&#010; &#010; import org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatus;&#010;-&#010; import crawlercommons.robots.BaseRobotRules;&#010; &#010; import java.net.URL;&#010;@@ -48,8 +46,6 @@ import java.io.IOException;&#010;  *                             {@code ftp.server.timeout}, {@code ftp.password}, &#010;  *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.&#010;  * For details see "FTP properties" section in {@code nutch-default.xml}.&#010;- *&#010;- * @author John Xing&#010;  */&#010; public class Ftp implements Protocol {&#010; &#010;@@ -84,9 +80,11 @@ public class Ftp implements Protocol {&#010; &#010;   private Configuration conf;&#010; &#010;+  private FtpRobotRulesParser robots = null;&#010; &#010;   // constructor&#010;   public Ftp() {&#010;+    robots = new FtpRobotRulesParser();&#010;   }&#010; &#010;   /** Set the timeout. 
*/&#010;@@ -240,6 +238,7 @@ public class Ftp implements Protocol {&#010;     this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);&#010;     this.keepConnection = conf.getBoolean("ftp.keep.connection", false);&#010;     this.followTalk = conf.getBoolean("ftp.follow.talk", false);&#010;+    this.robots.setConf(conf);&#010;   }&#010; &#010;   /**&#010;@@ -250,12 +249,10 @@ public class Ftp implements Protocol {&#010;   }&#010; &#010;   /** &#010;-   * Currently, no robots parsing is done for ftp protocol &#010;-   * and this returns a set of empty rules which will allow every url.&#010;-   * There a jira logged for the same NUTCH-1513&#010;+   * Get the robots rules for a given url&#010;    */&#010;   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;-    return RobotRulesParser.EMPTY_RULES;&#010;+    return robots.getRobotRulesSet(this, url);&#010;   }&#010; &#010;   public int getBufferSize() {&#010;&#010;Added: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1484638&amp;view=auto&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;(added)&#010;+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;Tue May 21 01:31:44 2013&#010;@@ -0,0 +1,99 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements.  
See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol.ftp;&#010;+&#010;+import java.net.URL;&#010;+&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.hadoop.io.Text;&#010;+import org.apache.nutch.crawl.CrawlDatum;&#010;+import org.apache.nutch.protocol.Protocol;&#010;+import org.apache.nutch.protocol.ProtocolOutput;&#010;+import org.apache.nutch.protocol.ProtocolStatus;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+&#010;+/**&#010;+ * This class is used for parsing robots for urls belonging to FTP protocol.&#010;+ * It extends the generic {@link RobotRulesParser} class and contains &#010;+ * Ftp protocol specific implementation for obtaining the robots file.&#010;+ */&#010;+public class FtpRobotRulesParser extends RobotRulesParser {&#010;+&#010;+  private static final String CONTENT_TYPE = "text/plain";&#010;+  public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class);&#010;+&#010;+  FtpRobotRulesParser() { }&#010;+&#010;+  public FtpRobotRulesParser(Configuration conf) 
{&#010;+    super(conf);&#010;+  }&#010;+&#010;+  /**&#010;+   * The hosts for which the caching of robots rules is yet to be done,&#010;+   * it sends a Ftp request to the host corresponding to the {@link URL} &#010;+   * passed, gets robots file, parses the rules and caches the rules object&#010;+   * to avoid re-work in future.&#010;+   * &#010;+   *  @param ftp The {@link Protocol} object&#010;+   *  @param url URL &#010;+   *  &#010;+   *  @return robotRules A {@link BaseRobotRules} object for the rules&#010;+   */&#010;+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {&#010;+&#010;+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case&#010;+    String host = url.getHost().toLowerCase();          // normalize to lower case&#010;+&#010;+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);&#010;+&#010;+    boolean cacheRule = true;&#010;+&#010;+    if (robotRules == null) {                     // cache miss&#010;+      if (LOG.isTraceEnabled())&#010;+        LOG.trace("cache miss " + url);&#010;+&#010;+      try {&#010;+        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());&#010;+        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new CrawlDatum());&#010;+        ProtocolStatus status = output.getStatus();&#010;+&#010;+        if (status.getCode() == ProtocolStatus.SUCCESS) {&#010;+          robotRules =  parseRules(url.toString(), output.getContent().getContent(), &#010;+                                  CONTENT_TYPE, agentNames);&#010;+        } else {                                       &#010;+          robotRules = EMPTY_RULES;                 // use default rules&#010;+        }&#010;+      } catch (Throwable t) {&#010;+        if (LOG.isInfoEnabled()) {&#010;+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());&#010;+        }&#010;+        cacheRule = false;&#010;+        robotRules = EMPTY_RULES;&#010;+      
}&#010;+&#010;+      if (cacheRule)&#010;+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host&#010;+    }&#010;+    return robotRules;&#010;+  }&#010;+}&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484637 - in /nutch/branches/2.x: CHANGES.txt src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521012955.AAFB723889F7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521012955-AAFB723889F7@eris-apache-org%3e</id>
<updated>2013-05-21T01:29:55Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May 21 01:29:55 2013&#010;New Revision: 1484637&#010;&#010;URL: http://svn.apache.org/r1484637&#010;Log:&#010;NUTCH-1513 Support Robots.txt for Ftp urls&#010;&#010;Added:&#010;    nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1484637&amp;r1=1484636&amp;r2=1484637&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Tue May 21 01:29:55 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1513 Support Robots.txt for Ftp urls (tejasp)&#010;+&#010; * NUTCH-1053 Parsing of RSS feeds fails (tejasp)&#010; &#010; * NUTCH-1563 FetchSchedule#getFields is never used by GeneratorJob (Feng)&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1484637&amp;r1=1484636&amp;r2=1484637&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;Tue May 21 01:29:55 2013&#010;@@ -28,7 +28,6 @@ import org.apache.commons.net.ftp.FTPFil&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.net.protocols.Response;&#010; import 
org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatusCodes;&#010;@@ -46,8 +45,6 @@ import crawlercommons.robots.BaseRobotRu&#010;  *                             {@code ftp.server.timeout}, {@code ftp.password}, &#010;  *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.&#010;  * For details see "FTP properties" section in {@code nutch-default.xml}.&#010;- * &#010;- * @author John Xing&#010;  */&#010; public class Ftp implements Protocol {&#010; &#010;@@ -89,8 +86,11 @@ public class Ftp implements Protocol {&#010; &#010;   private Configuration conf;&#010; &#010;+  private FtpRobotRulesParser robots = null;&#010;+      &#010;   // constructor&#010;   public Ftp() {&#010;+    robots = new FtpRobotRulesParser();&#010;   }&#010; &#010;   /** Set the timeout. */&#010;@@ -179,6 +179,7 @@ public class Ftp implements Protocol {&#010;     this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);&#010;     this.keepConnection = conf.getBoolean("ftp.keep.connection", false);&#010;     this.followTalk = conf.getBoolean("ftp.follow.talk", false);&#010;+    this.robots.setConf(conf);&#010;   }&#010; &#010;   /**&#010;@@ -257,11 +258,9 @@ public class Ftp implements Protocol {&#010;   }&#010; &#010;   /** &#010;-   * Currently, no robots parsing is done for ftp protocol &#010;-   * and this returns a set of empty rules which will allow every url.&#010;-   * There a jira logged for the same NUTCH-1513&#010;+   * Get the robots rules for a given url&#010;    */&#010;   public BaseRobotRules getRobotRules(String url, WebPage page) {&#010;-    return RobotRulesParser.EMPTY_RULES;&#010;+    return robots.getRobotRulesSet(this, url);&#010;   }&#010; }&#010;&#010;Added: 
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1484637&amp;view=auto&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;(added)&#010;+++ nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java&#010;Tue May 21 01:29:55 2013&#010;@@ -0,0 +1,100 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements.  See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  
You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol.ftp;&#010;+&#010;+import java.net.URL;&#010;+&#010;+import org.apache.commons.io.IOUtils;&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.nutch.protocol.Protocol;&#010;+import org.apache.nutch.protocol.ProtocolOutput;&#010;+import org.apache.nutch.protocol.ProtocolStatusCodes;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010;+import org.apache.nutch.storage.WebPage;&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+&#010;+/**&#010;+ * This class is used for parsing robots for urls belonging to FTP protocol.&#010;+ * It extends the generic {@link RobotRulesParser} class and contains &#010;+ * Ftp protocol specific implementation for obtaining the robots file.&#010;+ */&#010;+public class FtpRobotRulesParser extends RobotRulesParser {&#010;+&#010;+  private static final String CONTENT_TYPE = "text/plain";&#010;+  public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class);&#010;+&#010;+  FtpRobotRulesParser() { }&#010;+&#010;+  public FtpRobotRulesParser(Configuration conf) {&#010;+    super(conf);&#010;+  }&#010;+&#010;+  /**&#010;+   * The hosts for which the caching of robots rules is yet to be done,&#010;+   * it sends a Ftp request to the host corresponding to the {@link URL} &#010;+   * passed, gets robots file, parses the rules and caches the rules 
object&#010;+   * to avoid re-work in future.&#010;+   * &#010;+   *  @param ftp The {@link Protocol} object&#010;+   *  @param url URL &#010;+   *  &#010;+   *  @return robotRules A {@link BaseRobotRules} object for the rules&#010;+   */&#010;+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {&#010;+&#010;+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case&#010;+    String host = url.getHost().toLowerCase();          // normalize to lower case&#010;+&#010;+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);&#010;+&#010;+    boolean cacheRule = true;&#010;+&#010;+    if (robotRules == null) {                     // cache miss&#010;+&#010;+      if (LOG.isTraceEnabled())&#010;+        LOG.trace("cache miss " + url);&#010;+&#010;+      try {&#010;+        String robotsUrl = new URL(url, "/robots.txt").toString();        &#010;+        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new WebPage());&#010;+        int statusCode = output.getStatus().getCode();&#010;+&#010;+        if (statusCode == ProtocolStatusCodes.SUCCESS) {&#010;+          robotRules =  parseRules(url.toString(), output.getContent().getContent(), &#010;+                                  CONTENT_TYPE, agentNames);&#010;+        } else {                                       &#010;+          robotRules = EMPTY_RULES;                 // use default rules&#010;+        }&#010;+      } catch (Throwable t) {&#010;+        if (LOG.isInfoEnabled()) {&#010;+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());&#010;+        }&#010;+        cacheRule = false;&#010;+        robotRules = EMPTY_RULES;&#010;+      }&#010;+&#010;+      if (cacheRule)&#010;+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host&#010;+    }&#010;+    return robotRules;&#010;+  }&#010;+}&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484634 [2/2] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ ...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521011930.9ECB023889F7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521011930-9ECB023889F7@eris-apache-org%3e</id>
<updated>2013-05-21T01:19:27Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Modified: nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java (original)&#010;+++ nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java Tue&#010;May 21 01:19:26 2013&#010;@@ -42,8 +42,6 @@ import com.anotherbigidea.io.InStream;&#010; /**&#010;  * Parser for Flash SWF files. Loosely based on the sample in JavaSWF&#010;  * distribution.&#010;- * &#010;- * @author Andrzej Bialecki&#010;  */&#010; public class SWFParser implements Parser {&#010;   public static final Logger LOG = LoggerFactory.getLogger("org.apache.nutch.parse.swf");&#010;@@ -63,7 +61,7 @@ public class SWFParser implements Parser&#010;   public ParseResult getParse(Content content) {&#010; &#010;     String text = null;&#010;-    Vector outlinks = new Vector();&#010;+    Vector&lt;Outlink&gt; outlinks = new Vector&lt;Outlink&gt;();&#010; &#010;     try {&#010; &#010;@@ -120,6 +118,7 @@ public class SWFParser implements Parser&#010; &#010;     byte[] buf = new byte[in.available()];&#010;     in.read(buf);&#010;+    in.close();&#010;     SWFParser parser = new SWFParser();&#010;     ParseResult parseResult = parser.getParse(new Content("file:" + args[0], "file:" + args[0],&#010;                                           buf, "application/x-shockwave-flash",&#010;@@ -153,13 +152,13 @@ class ExtractText extends SWFTagTypesImp&#010;    * character codes for the correspnding font glyphs (An empty array denotes a&#010;    * System Font).&#010;    */&#010;-  protected HashMap fontCodes = new HashMap();&#010;+  protected HashMap&lt;Integer, int[]&gt; fontCodes = new HashMap&lt;Integer, 
int[]&gt;();&#010; &#010;-  public ArrayList strings = new ArrayList();&#010;+  public ArrayList&lt;String&gt; strings = new ArrayList&lt;String&gt;();&#010; &#010;-  public HashSet actionStrings = new HashSet();&#010;+  public HashSet&lt;String&gt; actionStrings = new HashSet&lt;String&gt;();&#010; &#010;-  public ArrayList urls = new ArrayList();&#010;+  public ArrayList&lt;String&gt; urls = new ArrayList&lt;String&gt;();&#010; &#010;   public ExtractText() {&#010;     super(null);&#010;@@ -167,7 +166,7 @@ class ExtractText extends SWFTagTypesImp&#010; &#010;   public String getText() {&#010;     StringBuffer res = new StringBuffer();&#010;-    Iterator it = strings.iterator();&#010;+    Iterator&lt;String&gt; it = strings.iterator();&#010;     while (it.hasNext()) {&#010;       if (res.length() &gt; 0) res.append(' ');&#010;       res.append(it.next());&#010;@@ -189,7 +188,7 @@ class ExtractText extends SWFTagTypesImp&#010;   public String[] getUrls() {&#010;     String[] res = new String[urls.size()];&#010;     int i = 0;&#010;-    Iterator it = urls.iterator();&#010;+    Iterator&lt;String&gt; it = urls.iterator();&#010;     while (it.hasNext()) {&#010;       res[i] = (String) it.next();&#010;       i++;&#010;@@ -350,26 +349,23 @@ class ExtractText extends SWFTagTypesImp&#010;  * ActionScript parser. This parser tries to extract free text embedded inside&#010;  * the script, but without polluting it too much with names of variables,&#010;  * methods, etc. 
Not ideal, but it works.&#010;- * &#010;- * @author Andrzej Bialecki&#010;  */&#010; class NutchSWFActions extends SWFActionBlockImpl implements SWFActions {&#010;-  private HashSet strings = null;&#010;+  private HashSet&lt;String&gt; strings = null;&#010; &#010;-  private ArrayList urls = null;&#010;+  private ArrayList&lt;String&gt; urls = null;&#010; &#010;   String[] dict = null;&#010; &#010;-  Stack stack = null;&#010;+  Stack&lt;Object&gt; stack = null;&#010; &#010;-  public NutchSWFActions(HashSet strings, ArrayList urls) {&#010;+  public NutchSWFActions(HashSet&lt;String&gt; strings, ArrayList&lt;String&gt; urls) {&#010;     this.strings = strings;&#010;     this.urls = urls;&#010;     stack = new SmallStack(100, strings);&#010;   }&#010; &#010;   public void lookupTable(String[] values) throws IOException {&#010;-    // System.out.println("-lookupTable: " + values.length);&#010;     for (int i = 0; i &lt; values.length; i++) {&#010;       if (!strings.contains(values[i])) strings.add(values[i]);&#010;     }&#010;@@ -378,7 +374,6 @@ class NutchSWFActions extends SWFActionB&#010;   }&#010; &#010;   public void defineLocal() throws IOException {&#010;-    // System.out.println("-defineLocal");&#010;     stack.pop();&#010;     super.defineLocal();&#010;   }&#010;@@ -398,69 +393,58 @@ class NutchSWFActions extends SWFActionB&#010;   }&#010; &#010;   public SWFActionBlock.TryCatchFinally _try(String var) throws IOException {&#010;-    // System.out.println("_try: var=" + var);&#010;     // stack.push(var);&#010;     strings.remove(var);&#010;     return super._try(var);&#010;   }&#010; &#010;   public void comment(String var) throws IOException {&#010;-    // System.out.println("-comment: var=" + var);&#010;     // stack.push(var);&#010;     strings.remove(var);&#010;     super.comment(var);&#010;   }&#010; &#010;   public void goToFrame(String var) throws IOException {&#010;-    // System.out.println("-goToFrame: var=" + var);&#010;     stack.push(var);&#010;  
   strings.remove(var);&#010;     super.gotoFrame(var);&#010;   }&#010; &#010;   public void ifJump(String var) throws IOException {&#010;-    // System.out.println("-ifJump: var=" + var);&#010;     strings.remove(var);&#010;     super.ifJump(var);&#010;   }&#010; &#010;   public void jump(String var) throws IOException {&#010;-    // System.out.println("-jump: var=" + var);&#010;     strings.remove(var);&#010;     super.jump(var);&#010;   }&#010; &#010;   public void jumpLabel(String var) throws IOException {&#010;-    // System.out.println("-jumpLabel: var=" + var);&#010;     strings.remove(var);&#010;     super.jumpLabel(var);&#010;   }&#010; &#010;   public void lookup(int var) throws IOException {&#010;-    // System.out.println("-lookup: var=" + var);&#010;     if (dict != null &amp;&amp; var &gt;= 0 &amp;&amp; var &lt; dict.length) {&#010;-      // System.out.println(" push " + dict[var]);&#010;       stack.push(dict[var]);&#010;     }&#010;     super.lookup(var);&#010;   }&#010; &#010;   public void push(String var) throws IOException {&#010;-    // System.out.println("-push: var=" + var);&#010;     stack.push(var);&#010;     strings.remove(var);&#010;     super.push(var);&#010;   }&#010; &#010;   public void setTarget(String var) throws IOException {&#010;-    // System.out.println("-setTarget: var=" + var);&#010;     stack.push(var);&#010;     strings.remove(var);&#010;     super.setTarget(var);&#010;   }&#010; &#010;   public SWFActionBlock startFunction(String var, String[] params) throws IOException {&#010;-    // System.out.println("-startFunction1: var=" + var);&#010;     stack.push(var);&#010;     strings.remove(var);&#010;     if (params != null) {&#010;@@ -472,7 +456,6 @@ class NutchSWFActions extends SWFActionB&#010;   }&#010; &#010;   public SWFActionBlock startFunction2(String var, int arg1, int arg2, String[] params, int[]&#010;arg3) throws IOException {&#010;-    // System.out.println("-startFunction2: var=" + var);&#010;     
stack.push(var);&#010;     strings.remove(var);&#010;     if (params != null) {&#010;@@ -484,74 +467,61 @@ class NutchSWFActions extends SWFActionB&#010;   }&#010; &#010;   public void waitForFrame(int num, String var) throws IOException {&#010;-    // System.out.println("-waitForFrame: var=" + var);&#010;     stack.push(var);&#010;     strings.remove(var);&#010;     super.waitForFrame(num, var);&#010;   }&#010; &#010;   public void waitForFrame(String var) throws IOException {&#010;-    // System.out.println("-waitForFrame: var=" + var);&#010;     stack.push(var);&#010;     strings.remove(var);&#010;     super.waitForFrame(var);&#010;   }&#010; &#010;   public void done() throws IOException {&#010;-    // System.out.println("-done");&#010;     while (stack.size() &gt; 0) {&#010;       strings.remove(stack.pop());&#010;     }&#010;   }&#010; &#010;   public SWFActionBlock start(int arg0, int arg1) throws IOException {&#010;-    // System.out.println("-start");&#010;     return this;&#010;   }&#010; &#010;   public SWFActionBlock start(int arg0) throws IOException {&#010;-    // System.out.println("-start");&#010;     return this;&#010;   }&#010; &#010;   public void add() throws IOException {&#010;-    // System.out.println("-add");&#010;     super.add();&#010;   }&#010; &#010;   public void asciiToChar() throws IOException {&#010;-    // System.out.println("-asciitochar");&#010;     super.asciiToChar();&#010;   }&#010; &#010;   public void asciiToCharMB() throws IOException {&#010;-    // System.out.println("-asciitocharMB");&#010;     super.asciiToCharMB();&#010;   }&#010; &#010;   public void push(int var) throws IOException {&#010;-    // System.out.println("-push(int)");&#010;     if (dict != null &amp;&amp; var &gt;= 0 &amp;&amp; var &lt; dict.length) {&#010;-      // System.out.println(" push " + dict[var]);&#010;       stack.push(dict[var]);&#010;     }&#010;     super.push(var);&#010;   }&#010; &#010;   public void callFunction() throws IOException 
{&#010;-    // System.out.println("-callFunction");&#010;     strings.remove(stack.pop());&#010;     super.callFunction();&#010;   }&#010; &#010;   public void callMethod() throws IOException {&#010;-    // System.out.println("-callMethod");&#010;     strings.remove(stack.pop());&#010;     super.callMethod();&#010;   }&#010; &#010;   public void getMember() throws IOException {&#010;-    // System.out.println("-getMember");&#010;     // 0: name&#010;     String val = (String) stack.pop();&#010;     strings.remove(val);&#010;@@ -560,116 +530,97 @@ class NutchSWFActions extends SWFActionB&#010; &#010;   public void setMember() throws IOException {&#010;     // 0: value -1: name&#010;-    String val = (String) stack.pop();&#010;+    stack.pop(); // value&#010;     String name = (String) stack.pop();&#010;-    // System.out.println("-setMember: name=" + name + ", val=" + val);&#010;     strings.remove(name);&#010;     super.setMember();&#010;   }&#010; &#010;   public void setProperty() throws IOException {&#010;-    // System.out.println("-setProperty");&#010;     super.setProperty();&#010;   }&#010; &#010;   public void setVariable() throws IOException {&#010;-    // System.out.println("-setVariable");&#010;     super.setVariable();&#010;   }&#010; &#010;   public void call() throws IOException {&#010;-    // System.out.println("-call");&#010;     strings.remove(stack.pop());&#010;     super.call();&#010;   }&#010; &#010;   public void setTarget() throws IOException {&#010;-    // System.out.println("-setTarget");&#010;     strings.remove(stack.pop());&#010;     super.setTarget();&#010;   }&#010; &#010;   public void pop() throws IOException {&#010;-    // System.out.println("-pop");&#010;     strings.remove(stack.pop());&#010;     super.pop();&#010;   }&#010; &#010;   public void push(boolean arg0) throws IOException {&#010;-    // System.out.println("-push(b)");&#010;     stack.push("" + arg0);&#010;     super.push(arg0);&#010;   }&#010; &#010;   public void 
push(double arg0) throws IOException {&#010;-    // System.out.println("-push(d)");&#010;     stack.push("" + arg0);&#010;     super.push(arg0);&#010;   }&#010; &#010;   public void push(float arg0) throws IOException {&#010;-    // System.out.println("-push(f)");&#010;     stack.push("" + arg0);&#010;     super.push(arg0);&#010;   }&#010; &#010;   public void pushNull() throws IOException {&#010;-    // System.out.println("-push(null)");&#010;     stack.push("");&#010;     super.pushNull();&#010;   }&#010; &#010;   public void pushRegister(int arg0) throws IOException {&#010;-    // System.out.println("-push(reg)");&#010;     stack.push("" + arg0);&#010;     super.pushRegister(arg0);&#010;   }&#010; &#010;   public void pushUndefined() throws IOException {&#010;-    // System.out.println("-push(undef)");&#010;     stack.push("???");&#010;     super.pushUndefined();&#010;   }&#010; &#010;   public void getProperty() throws IOException {&#010;-    // System.out.println("-getProperty");&#010;     stack.pop();&#010;     super.getProperty();&#010;   }&#010; &#010;   public void getVariable() throws IOException {&#010;-    // System.out.println("-getVariable");&#010;     strings.remove(stack.pop());&#010;     super.getVariable();&#010;   }&#010; &#010;   public void gotoFrame(boolean arg0) throws IOException {&#010;-    // System.out.println("-gotoFrame(b)");&#010;     stack.push("" + arg0);&#010;     super.gotoFrame(arg0);&#010;   }&#010; &#010;   public void gotoFrame(int arg0) throws IOException {&#010;-    // System.out.println("-gotoFrame(int)");&#010;     stack.push("" + arg0);&#010;     super.gotoFrame(arg0);&#010;   }&#010; &#010;   public void gotoFrame(String arg0) throws IOException {&#010;-    // System.out.println("-gotoFrame(string)");&#010;     stack.push("" + arg0);&#010;     strings.remove(arg0);&#010;     super.gotoFrame(arg0);&#010;   }&#010; &#010;   public void newObject() throws IOException {&#010;-    // System.out.println("-newObject");&#010;     
stack.pop();&#010;     super.newObject();&#010;   }&#010; &#010;   public SWFActionBlock startWith() throws IOException {&#010;-    // System.out.println("-startWith");&#010;     return this;&#010;   }&#010; &#010;@@ -678,13 +629,15 @@ class NutchSWFActions extends SWFActionB&#010; /*&#010;  * Small bottom-less stack.&#010;  */&#010;-class SmallStack extends Stack {&#010;+class SmallStack extends Stack&lt;Object&gt; {&#010;+&#010;+  private static final long serialVersionUID = 1L;&#010; &#010;   private int maxSize;&#010; &#010;-  private HashSet strings = null;&#010;+  private HashSet&lt;String&gt; strings = null;&#010; &#010;-  public SmallStack(int maxSize, HashSet strings) {&#010;+  public SmallStack(int maxSize, HashSet&lt;String&gt; strings) {&#010;     this.maxSize = maxSize;&#010;     this.strings = strings;&#010;   }&#010;&#010;Modified: nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;Tue May 21 01:19:26 2013&#010;@@ -32,13 +32,12 @@ import org.apache.nutch.parse.Parse;&#010; import org.apache.nutch.parse.ParseException;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;+import org.mortbay.log.Log;&#010; &#010; import junit.framework.TestCase;&#010; &#010; /** &#010;  * Unit tests for SWFParser.&#010;- *&#010;- * @author Andrzej Bialecki&#010;  */&#010; public class TestSWFParser extends TestCase {&#010; &#010;@@ -48,7 +47,6 @@ public class TestSWFParser extends TestC&#010;   &#010;   
private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"};&#010;   private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"};&#010;-  private String[] texts = new String[sampleTexts.length];&#010; &#010;   public TestSWFParser(String name) { &#010;     super(name);&#010;@@ -94,5 +92,4 @@ public class TestSWFParser extends TestC&#010;       assertTrue(sampleTexts[i].equals(text));&#010;     }&#010;   }&#010;-&#010; }&#010;&#010;Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java&#010;Tue May 21 01:19:26 2013&#010;@@ -58,7 +58,7 @@ class DOMBuilder&#010;   public DocumentFragment m_docFrag = null;&#010; &#010;   /** Vector of element nodes          */&#010;-  protected Stack m_elemStack = new Stack();&#010;+  protected Stack&lt;Element&gt; m_elemStack = new Stack&lt;Element&gt;();&#010; &#010;   /**&#010;    * DOMBuilder instance constructor... 
it will add the DOM nodes&#010;&#010;Modified: nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;Tue May 21 01:19:26 2013&#010;@@ -59,7 +59,8 @@ public class TikaParser implements org.a&#010; &#009;private HtmlParseFilters htmlParseFilters;&#010; &#009;private String cachingPolicy;&#010; &#010;-&#009;public ParseResult getParse(Content content) {&#010;+&#009;@SuppressWarnings("deprecation")&#010;+  public ParseResult getParse(Content content) {&#010; &#009;&#009;String mimeType = content.getContentType();&#010; &#010; &#009;&#009;URL base;&#010;&#010;Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java (original)&#010;+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java Tue&#010;May 21 01:19:26 2013&#010;@@ -18,15 +18,12 @@&#010; package org.apache.nutch.parse.zip;&#010; &#010; import java.io.ByteArrayInputStream;&#010;-import java.io.InputStream;&#010;-import java.util.Properties;&#010; import java.util.ArrayList;&#010; import java.util.List;&#010; &#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; 
&#010;-import org.apache.nutch.metadata.Metadata;&#010; import org.apache.nutch.net.protocols.Response;&#010; import org.apache.nutch.parse.Outlink;&#010; import org.apache.nutch.parse.ParseData;&#010;@@ -40,8 +37,6 @@ import org.apache.hadoop.conf.Configurat&#010; /**&#010;  * ZipParser class based on MSPowerPointParser class by Stephan Strittmatter.&#010;  * Nutch parse plugin for zip files - Content Type : application/zip&#010;- * &#010;- * @author Rohit Kulkarni &amp; Ashish Vaidya&#010;  */&#010; public class ZipParser implements Parser {&#010; &#010;@@ -57,17 +52,13 @@ public class ZipParser implements Parser&#010;     String resultText = null;&#010;     String resultTitle = null;&#010;     Outlink[] outlinks = null;&#010;-    List outLinksList = new ArrayList();&#010;-    Properties properties = null;&#010;+    List&lt;Outlink&gt; outLinksList = new ArrayList&lt;Outlink&gt;();&#010; &#010;     try {&#010;       final String contentLen = content.getMetadata().get(Response.CONTENT_LENGTH);&#010;       final int len = Integer.parseInt(contentLen);&#010;       if (LOG.isDebugEnabled()) { LOG.debug("ziplen: " + len); }&#010;       final byte[] contentInBytes = content.getContent();&#010;-      final ByteArrayInputStream bainput = new ByteArrayInputStream(&#010;-          contentInBytes);&#010;-      final InputStream input = bainput;&#010; &#010;       if (contentLen != null &amp;&amp; contentInBytes.length != len) {&#010;         return new ParseStatus(ParseStatus.FAILED,&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java&#010;(original)&#010;+++ 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java&#010;Tue May 21 01:19:26 2013&#010;@@ -77,9 +77,9 @@ public class Client extends FTP&#010;     private int __dataTimeout;&#010;     private int __passivePort;&#010;     private String __passiveHost;&#010;-    private int __fileType, __fileFormat;&#010;+//    private int __fileType, __fileFormat;&#010;     private boolean __remoteVerificationEnabled;&#010;-    private FTPFileEntryParser __entryParser;&#010;+//    private FTPFileEntryParser __entryParser;&#010;     private String __systemName;&#010; &#010;     // constructor&#010;@@ -95,10 +95,10 @@ public class Client extends FTP&#010;     {&#010;         __passiveHost        = null;&#010;         __passivePort        = -1;&#010;-        __fileType           = FTP.ASCII_FILE_TYPE;&#010;-        __fileFormat         = FTP.NON_PRINT_TEXT_FORMAT;&#010;         __systemName         = null;&#010;-        __entryParser        = null;&#010;+//        __fileType           = FTP.ASCII_FILE_TYPE;&#010;+//        __fileFormat         = FTP.NON_PRINT_TEXT_FORMAT;&#010;+//        __entryParser        = null;&#010;     }&#010; &#010;     // parse reply for pass()&#010;@@ -315,7 +315,7 @@ public class Client extends FTP&#010;     }&#010; &#010;     // retrieve list reply for path&#010;-    public void retrieveList(String path, List entries, int limit,&#010;+    public void retrieveList(String path, List&lt;FTPFile&gt; entries, int limit,&#010;       FTPFileEntryParser parser)&#010;       throws IOException,&#010;         FtpExceptionCanNotHaveDataConnection,&#010;@@ -331,7 +331,7 @@ public class Client extends FTP&#010;           new BufferedReader(new InputStreamReader(socket.getInputStream()));&#010; &#010;       // force-close data channel socket, when download limit is reached&#010;-      boolean mandatory_close = false;&#010;+//      boolean mandatory_close = false;&#010; &#010;       //List entries = new LinkedList();&#010;       int count 
= 0;&#010;@@ -348,7 +348,7 @@ public class Client extends FTP&#010;         // impose download limit if limit &gt;= 0, otherwise no limit&#010;         // here, cut off is up to the line when total bytes is just over limit&#010;         if (limit &gt;= 0 &amp;&amp; count &gt; limit) {&#010;-          mandatory_close = true;&#010;+//          mandatory_close = true;&#010;           break;&#010;         }&#010;         line = parser.readNextEntry(reader);&#010;@@ -403,7 +403,7 @@ public class Client extends FTP&#010;       // fixme, should we instruct server here for binary file type?&#010; &#010;       // force-close data channel socket&#010;-      boolean mandatory_close = false;&#010;+      // boolean mandatory_close = false;&#010; &#010;       int len; int count = 0;&#010;       byte[] buf =&#010;@@ -414,7 +414,7 @@ public class Client extends FTP&#010;         // here, cut off is exactly of limit bytes&#010;         if (limit &gt;= 0 &amp;&amp; count &gt; limit) {&#010;           os.write(buf,0,len-(count-limit));&#010;-          mandatory_close = true;&#010;+       //   mandatory_close = true;&#010;           break;&#010;         }&#010;         os.write(buf,0,len);&#010;@@ -502,8 +502,8 @@ public class Client extends FTP&#010;     {&#010;         if (FTPReply.isPositiveCompletion(type(fileType)))&#010;         {&#010;-            __fileType = fileType;&#010;-            __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;&#010;+/*            __fileType = fileType;&#010;+            __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;*/&#010;             return true;&#010;         }&#010;         return false;&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)&#010;+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue&#010;May 21 01:19:26 2013&#010;@@ -55,7 +55,7 @@ public class Ftp implements Protocol {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);&#010; &#010;-  static final int BUFFER_SIZE = 16384; // 16*1024 = 16384&#010;+  private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384&#010; &#010;   static final int MAX_REDIRECTS = 5;&#010; &#010;@@ -257,5 +257,9 @@ public class Ftp implements Protocol {&#010;   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;     return RobotRulesParser.EMPTY_RULES;&#010;   }&#010;+&#010;+  public int getBufferSize() {&#010;+    return BUFFER_SIZE;&#010;+  }&#010; }&#010; &#010;&#010;Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java&#010;Tue May 21 01:19:26 2013&#010;@@ -17,11 +17,9 @@&#010; &#010; package org.apache.nutch.protocol.ftp;&#010; &#010;-&#010; import org.apache.commons.net.ftp.FTP;&#010; import org.apache.commons.net.ftp.FTPFile;&#010; import org.apache.commons.net.ftp.FTPReply;&#010;-&#010; import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;&#010; import org.apache.commons.net.ftp.parser.ParserInitializationException;&#010; &#010;@@ -42,8 +40,7 @@ import java.util.LinkedList;&#010; import 
java.io.ByteArrayOutputStream;&#010; import java.io.IOException;&#010; &#010;-&#010;-/************************************&#010;+/**&#010;  * FtpResponse.java mimics ftp replies as http response.&#010;  * It tries its best to follow http's way for headers, response codes&#010;  * as well as exceptions.&#010;@@ -53,9 +50,7 @@ import java.io.IOException;&#010;  * and some important commons-net exceptions passed by Client.java&#010;  * must have been properly dealt with. They'd better not be leaked&#010;  * to the caller of this class.&#010;- *&#010;- * @author John Xing&#010;- ***********************************/&#010;+ */&#010; public class FtpResponse {&#010; &#010;   private String orig;&#010;@@ -146,7 +141,7 @@ public class FtpResponse {&#010;         // follow ftp talk?&#010;         if (ftp.followTalk)&#010;           ftp.client.addProtocolCommandListener(&#010;-            new PrintCommandListener(ftp.LOG));&#010;+            new PrintCommandListener(Ftp.LOG));&#010;       }&#010; &#010;       // quit from previous site if at a different site now&#010;@@ -284,8 +279,8 @@ public class FtpResponse {&#010;       }&#010;       &#010;     } catch (Exception e) {&#010;-      if (ftp.LOG.isWarnEnabled()) {&#010;-        ftp.LOG.warn("Error: ", e);&#010;+      if (Ftp.LOG.isWarnEnabled()) {&#010;+        Ftp.LOG.warn("Error: ", e);&#010;       }&#010;       // for any un-foreseen exception (run time exception or not),&#010;       // do ultimate clean and leave ftp.client for garbage collection&#010;@@ -312,11 +307,11 @@ public class FtpResponse {&#010;     throws IOException {&#010; &#010;     ByteArrayOutputStream os = null;&#010;-    List list = null;&#010;+    List&lt;FTPFile&gt; list = null;&#010; &#010;     try {&#010;       // first get its possible attributes&#010;-      list = new LinkedList();&#010;+      list = new LinkedList&lt;FTPFile&gt;();&#010;       ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);&#010; &#010;       FTPFile 
ftpFile = (FTPFile) list.get(0);&#010;@@ -329,7 +324,7 @@ public class FtpResponse {&#010;         code = 304;&#010;         return;&#010;       }&#010;-      os = new ByteArrayOutputStream(ftp.BUFFER_SIZE);&#010;+      os = new ByteArrayOutputStream(ftp.getBufferSize());&#010;       ftp.client.retrieveFile(path, os, ftp.maxContentLength);&#010; &#010;       this.content = os.toByteArray();&#010;@@ -414,7 +409,7 @@ public class FtpResponse {&#010;   // get ftp dir list as http response&#010;   private void getDirAsHttpResponse(String path, long lastModified)&#010;     throws IOException {&#010;-    List list = new LinkedList();&#010;+    List&lt;FTPFile&gt; list = new LinkedList&lt;FTPFile&gt;();&#010; &#010;     try {&#010; &#010;@@ -482,7 +477,7 @@ public class FtpResponse {&#010;   }&#010; &#010;   // generate html page from ftp dir list&#010;-  private byte[] list2html(List list, String path, boolean includeDotDot) {&#010;+  private byte[] list2html(List&lt;FTPFile&gt; list, String path, boolean includeDotDot)&#010;{&#010; &#010;     //StringBuffer x = new StringBuffer("&lt;!doctype html public \"-//ietf//dtd html//en\"&gt;&lt;html&gt;&lt;head&gt;");&#010;     StringBuffer x = new StringBuffer("&lt;html&gt;&lt;head&gt;");&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;Tue May 21 01:19:26 2013&#010;@@ -42,6 +42,8 @@ import org.apache.commons.httpclient.NTC&#010; import 
org.apache.commons.httpclient.auth.AuthScope;&#010; import org.apache.commons.httpclient.params.HttpConnectionManagerParams;&#010; import org.apache.commons.httpclient.protocol.Protocol;&#010;+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;&#010;+import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;&#010; &#010; // Nutch imports&#010; import org.apache.nutch.crawl.CrawlDatum;&#010;@@ -158,8 +160,8 @@ public class Http extends HttpBase {&#010;   private void configureClient() {&#010; &#010;     // Set up an HTTPS socket factory that accepts self-signed certs.&#010;-    Protocol https = new Protocol("https",&#010;-        new DummySSLProtocolSocketFactory(), 443);&#010;+    ProtocolSocketFactory factory = new SSLProtocolSocketFactory();&#010;+    Protocol https = new Protocol("https", factory, 443);&#010;     Protocol.registerProtocol("https", https);&#010; &#010;     HttpConnectionManagerParams params = connectionManager.getParams();&#010;@@ -174,7 +176,7 @@ public class Http extends HttpBase {&#010;     client.getParams().setConnectionManagerTimeout(timeout);&#010; &#010;     HostConfiguration hostConf = client.getHostConfiguration();&#010;-    ArrayList headers = new ArrayList();&#010;+    ArrayList&lt;Header&gt; headers = new ArrayList&lt;Header&gt;();&#010;     // Set the User Agent in the header&#010;     headers.add(new Header("User-Agent", userAgent));&#010;     // prefer English&#010;@@ -199,7 +201,7 @@ public class Http extends HttpBase {&#010; &#010;         NTCredentials proxyCredentials = new NTCredentials(&#010;             this.proxyUsername, this.proxyPassword,&#010;-            this.agentHost, this.proxyRealm);&#010;+            Http.agentHost, this.proxyRealm);&#010; &#010;         client.getState().setProxyCredentials(&#010;             proxyAuthScope, proxyCredentials);&#010;&#010;Modified: 
nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;Tue May 21 01:19:26 2013&#010;@@ -23,7 +23,6 @@ import junit.framework.TestCase;&#010; import org.mortbay.jetty.Server;&#010; import org.mortbay.jetty.bio.SocketConnector;&#010; import org.mortbay.jetty.handler.ContextHandler;&#010;-import org.mortbay.jetty.handler.ResourceHandler;&#010; import org.mortbay.jetty.servlet.ServletHandler;&#010; import org.mortbay.jetty.servlet.SessionHandler;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -32,8 +31,6 @@ import org.apache.nutch.net.protocols.Re&#010; &#010; /**&#010;  * Test cases for protocol-httpclient.&#010;- *&#010;- * @author Susam Pal&#010;  */&#010; public class TestProtocolHttpClient extends TestCase {&#010; &#010;&#010;Modified: nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java&#010;(original)&#010;+++ 
nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java&#010;Tue May 21 01:19:26 2013&#010;@@ -14,9 +14,6 @@&#010;  * See the License for the specific language governing permissions and&#010;  * limitations under the License.&#010;  */&#010;-&#010;-// $Id$&#010;-&#010; package org.apache.nutch.urlfilter.prefix;&#010; &#010; import org.slf4j.Logger;&#010;@@ -79,7 +76,7 @@ public class PrefixURLFilter implements &#010;     throws IOException {&#010;     &#010;     BufferedReader in=new BufferedReader(reader);&#010;-    List urlprefixes = new ArrayList();&#010;+    List&lt;String&gt; urlprefixes = new ArrayList&lt;String&gt;();&#010;     String line;&#010; &#010;     while((line=in.readLine())!=null) {&#010;&#010;Modified: nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java&#010;Tue May 21 01:19:26 2013&#010;@@ -181,7 +181,7 @@ public class SuffixURLFilter implements &#010;       return;&#010;     }&#010;     BufferedReader in = new BufferedReader(reader);&#010;-    List aSuffixes = new ArrayList();&#010;+    List&lt;String&gt; aSuffixes = new ArrayList&lt;String&gt;();&#010;     boolean allow = false;&#010;     boolean ignore = false;&#010;     String line;&#010;&#010;Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java (original)&#010;+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java Tue May 21 01:19:26&#010;2013&#010;@@ -44,9 +44,9 @@ public class TestCrawlDbMerger extends T&#010;           url21&#010;   };&#010;   &#010;-  TreeSet init1 = new TreeSet();&#010;-  TreeSet init2 = new TreeSet();&#010;-  HashMap expected = new HashMap();&#010;+  TreeSet&lt;String&gt; init1 = new TreeSet&lt;String&gt;();&#010;+  TreeSet&lt;String&gt; init2 = new TreeSet&lt;String&gt;();&#010;+  HashMap&lt;String, CrawlDatum&gt; expected = new HashMap&lt;String, CrawlDatum&gt;();&#010;   CrawlDatum cd1, cd2, cd3;&#010;   Configuration conf;&#010;   FileSystem fs;&#010;@@ -83,6 +83,7 @@ public class TestCrawlDbMerger extends T&#010;     fs.mkdirs(testDir);&#010;   }&#010;   &#010;+  @SuppressWarnings("deprecation")&#010;   public void tearDown() {&#010;     try {&#010;       if (fs.exists(testDir))&#010;@@ -93,6 +94,7 @@ public class TestCrawlDbMerger extends T&#010;     } catch (Exception e) { }&#010;   }&#010; &#010;+  @SuppressWarnings("deprecation")&#010;   public void testMerge() throws Exception {&#010;     Path crawldb1 = new Path(testDir, "crawldb1");&#010;     Path crawldb2 = new Path(testDir, "crawldb2");&#010;@@ -105,11 +107,11 @@ public class TestCrawlDbMerger extends T&#010;     LOG.fine("* reading crawldb: " + output);&#010;     reader = new CrawlDbReader();&#010;     String crawlDb = output.toString();&#010;-    Iterator it = expected.keySet().iterator();&#010;+    Iterator&lt;String&gt; it = expected.keySet().iterator();&#010;     while (it.hasNext()) {&#010;-      String url = (String)it.next();&#010;+      String url = it.next();&#010;       LOG.fine("url=" 
+ url);&#010;-      CrawlDatum cd = (CrawlDatum)expected.get(url);&#010;+      CrawlDatum cd = expected.get(url);&#010;       CrawlDatum res = reader.get(crawlDb, url, conf);&#010;       LOG.fine(" -&gt; " + res);&#010;       System.out.println("url=" + url);&#010;@@ -123,13 +125,13 @@ public class TestCrawlDbMerger extends T&#010;     fs.delete(testDir);&#010;   }&#010;   &#010;-  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, TreeSet init,&#010;CrawlDatum cd) throws Exception {&#010;+  private void createCrawlDb(Configuration config, FileSystem fs, Path crawldb, TreeSet&lt;String&gt;&#010;init, CrawlDatum cd) throws Exception {&#010;     LOG.fine("* creating crawldb: " + crawldb);&#010;     Path dir = new Path(crawldb, CrawlDb.CURRENT_NAME);&#010;     MapFile.Writer writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(),&#010;Text.class, CrawlDatum.class);&#010;-    Iterator it = init.iterator();&#010;+    Iterator&lt;String&gt; it = init.iterator();&#010;     while (it.hasNext()) {&#010;-      String key = (String)it.next();&#010;+      String key = it.next();&#010;       writer.append(new Text(key), cd);&#010;     }&#010;     writer.close();&#010;&#010;Modified: nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java (original)&#010;+++ nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java Tue May 21 01:19:26&#010;2013&#010;@@ -68,9 +68,9 @@ public class TestLinkDbMerger extends Te&#010;   String[] urls20_expected = urls11_expected;&#010;   String[] urls21_expected = urls21;&#010;   &#010;-  TreeMap init1 = new TreeMap();&#010;-  TreeMap init2 = new TreeMap();&#010;-  
HashMap expected = new HashMap();&#010;+  TreeMap&lt;String, String[]&gt; init1 = new TreeMap&lt;String, String[]&gt;();&#010;+  TreeMap&lt;String, String[]&gt; init2 = new TreeMap&lt;String, String[]&gt;();&#010;+  HashMap&lt;String, String[]&gt; expected = new HashMap&lt;String, String[]&gt;();&#010;   Configuration conf;&#010;   Path testDir;&#010;   FileSystem fs;&#010;@@ -116,16 +116,16 @@ public class TestLinkDbMerger extends Te&#010;     merger.merge(output, new Path[]{linkdb1, linkdb2}, false, false);&#010;     LOG.fine("* reading linkdb: " + output);&#010;     reader = new LinkDbReader(conf, output);&#010;-    Iterator it = expected.keySet().iterator();&#010;+    Iterator&lt;String&gt; it = expected.keySet().iterator();&#010;     while (it.hasNext()) {&#010;-      String url = (String)it.next();&#010;+      String url = it.next();&#010;       LOG.fine("url=" + url);&#010;-      String[] vals = (String[])expected.get(url);&#010;+      String[] vals = expected.get(url);&#010;       Inlinks inlinks = reader.getInlinks(new Text(url));&#010;       // may not be null&#010;       assertNotNull(inlinks);&#010;-      ArrayList links = new ArrayList();&#010;-      Iterator it2 = inlinks.iterator();&#010;+      ArrayList&lt;String&gt; links = new ArrayList&lt;String&gt;();&#010;+      Iterator&lt;?&gt; it2 = inlinks.iterator();&#010;       while (it2.hasNext()) {&#010;         Inlink in = (Inlink)it2.next();&#010;         links.add(in.getFromUrl());&#010;@@ -139,15 +139,15 @@ public class TestLinkDbMerger extends Te&#010;     fs.delete(testDir, true);&#010;   }&#010;   &#010;-  private void createLinkDb(Configuration config, FileSystem fs, Path linkdb, TreeMap init)&#010;throws Exception {&#010;+  private void createLinkDb(Configuration config, FileSystem fs, Path linkdb, TreeMap&lt;String,&#010;String[]&gt; init) throws Exception {&#010;     LOG.fine("* creating linkdb: " + linkdb);&#010;     Path dir = new Path(linkdb, LinkDb.CURRENT_NAME);&#010;     MapFile.Writer 
writer = new MapFile.Writer(config, fs, new Path(dir, "part-00000").toString(),&#010;Text.class, Inlinks.class);&#010;-    Iterator it = init.keySet().iterator();&#010;+    Iterator&lt;String&gt; it = init.keySet().iterator();&#010;     while (it.hasNext()) {&#010;-      String key = (String)it.next();&#010;+      String key = it.next();&#010;       Inlinks inlinks = new Inlinks();&#010;-      String[] vals = (String[])init.get(key);&#010;+      String[] vals = init.get(key);&#010;       for (int i = 0; i &lt; vals.length; i++) {&#010;         Inlink in = new Inlink(vals[i], vals[i]);&#010;         inlinks.add(in);&#010;&#010;Modified: nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java (original)&#010;+++ nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java Tue May 21 01:19:26&#010;2013&#010;@@ -35,13 +35,11 @@ import org.apache.nutch.util.NutchJob;&#010; &#010; /**&#010;  * Unit tests for the plugin system&#010;- * &#010;- * @author joa23&#010;  */&#010; public class TestPluginSystem extends TestCase {&#010;     private int fPluginCount;&#010; &#010;-    private LinkedList fFolders = new LinkedList();&#010;+    private LinkedList&lt;File&gt; fFolders = new LinkedList&lt;File&gt;();&#010;     private Configuration conf ;&#010;     private PluginRepository repository;&#010; &#010;@@ -62,11 +60,10 @@ public class TestPluginSystem extends Te&#010;      */&#010;     protected void tearDown() throws Exception {&#010;         for (int i = 0; i &lt; fFolders.size(); i++) {&#010;-            File folder = (File) fFolders.get(i);&#010;+            File folder = fFolders.get(i);&#010;             delete(folder);&#010;             
folder.delete();&#010;         }&#010;-&#010;     }&#010; &#010;     /**&#010;&#010;Modified: nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java (original)&#010;+++ nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java Tue May 21 01:19:26&#010;2013&#010;@@ -42,11 +42,11 @@ public class TestSegmentMerger extends T&#010;   public void setUp() throws Exception {&#010;     conf = NutchConfiguration.create();&#010;     fs = FileSystem.get(conf);&#010;-    long blkSize = fs.getDefaultBlockSize();&#010;     testDir = new Path(conf.get("hadoop.tmp.dir"), "merge-" + System.currentTimeMillis());&#010;     seg1 = new Path(testDir, "seg1");&#010;     seg2 = new Path(testDir, "seg2");&#010;     out = new Path(testDir, "out");&#010;+&#010;     // create large parse-text segments&#010;     System.err.println("Creating large segment 1...");&#010;     DecimalFormat df = new DecimalFormat("0000000");&#010;@@ -55,6 +55,9 @@ public class TestSegmentMerger extends T&#010;     MapFile.Writer w = new MapFile.Writer(conf, fs, ptPath.toString(), Text.class, ParseText.class);&#010;     long curSize = 0;&#010;     countSeg1 = 0;&#010;+    FileStatus fileStatus = fs.getFileStatus(ptPath);&#010;+    long blkSize = fileStatus.getBlockSize();&#010;+    &#010;     while (curSize &lt; blkSize * 2) {&#010;       k.set("seg1-" + df.format(countSeg1));&#010;       w.append(k, new ParseText("seg1 text " + countSeg1));&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484634 [1/2] - in /nutch/trunk: ./ src/java/org/apache/nutch/crawl/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/indexer/ src/java/org/apache/nutch/metadata/ src/java/org/apache/nutch/net/ src/java/org/apache/nutch/parse/ ...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521011930.9C1012388906@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521011930-9C1012388906@eris-apache-org%3e</id>
<updated>2013-05-21T01:19:27Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May 21 01:19:26 2013&#010;New Revision: 1484634&#010;&#010;URL: http://svn.apache.org/r1484634&#010;Log:&#010;NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/build.xml&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java&#010;    nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java&#010;    nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java&#010;    nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java&#010;    nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java&#010;    nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java&#010;    nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java&#010;    nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java&#010;    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java&#010;    nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java&#010;    nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java&#010;    nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java&#010;    nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java&#010;    
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java&#010;    nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java&#010;    nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilter.java&#010;    nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java&#010;    nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java&#010;    nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java&#010;    nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java&#010;    nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java&#010;    nutch/trunk/src/java/org/apache/nutch/tools/proxy/SegmentHandler.java&#010;    nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java&#010;    nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java&#010;    nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java&#010;    nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java&#010;    nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java&#010;    nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java&#010;    nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java&#010;    nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java&#010;    nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java&#010;    nutch/trunk/src/plugin/parse-swf/src/java/org/apache/nutch/parse/swf/SWFParser.java&#010;    
nutch/trunk/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/DOMBuilder.java&#010;    nutch/trunk/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;    nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipParser.java&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Client.java&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java&#010;    nutch/trunk/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;    nutch/trunk/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;    nutch/trunk/src/plugin/urlfilter-prefix/src/java/org/apache/nutch/urlfilter/prefix/PrefixURLFilter.java&#010;    nutch/trunk/src/plugin/urlfilter-suffix/src/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java&#010;    nutch/trunk/src/test/org/apache/nutch/crawl/TestCrawlDbMerger.java&#010;    nutch/trunk/src/test/org/apache/nutch/crawl/TestLinkDbMerger.java&#010;    nutch/trunk/src/test/org/apache/nutch/plugin/TestPluginSystem.java&#010;    nutch/trunk/src/test/org/apache/nutch/segment/TestSegmentMerger.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue May 21 01:19:26 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument (tejasp)&#010;+&#010; * NUTCH-1053 
Parsing of RSS feeds fails (tejasp)&#010; &#010; * Added crawler-commons dependency in pom.xml (tejasp)&#010;&#010;Modified: nutch/trunk/build.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/build.xml (original)&#010;+++ nutch/trunk/build.xml Tue May 21 01:19:26 2013&#010;@@ -57,6 +57,10 @@&#010;     &lt;/fileset&gt;&#010;   &lt;/path&gt;&#010; &#010;+  &lt;presetdef name="javac"&gt;&#010;+    &lt;javac includeantruntime="false" /&gt;&#010;+  &lt;/presetdef&gt;&#010;+&#010;   &lt;!-- ====================================================== --&gt;&#010;   &lt;!-- Stuff needed by all targets                            --&gt;&#010;   &lt;!-- ====================================================== --&gt;&#010;@@ -94,7 +98,7 @@&#010;      target="${javac.version}"&#010;      source="${javac.version}"&#010;      deprecation="${javac.deprecation}"&gt;&#010;-      &lt;compilerarg value="-Xlint"/&gt;&#010;+      &lt;compilerarg value="-Xlint:-path"/&gt;&#010;       &lt;classpath refid="classpath"/&gt;&#010;     &lt;/javac&gt;    &#010;   &lt;/target&gt;&#010;@@ -341,7 +345,7 @@&#010;      target="${javac.version}"&#010;      source="${javac.version}"&#010;      deprecation="${javac.deprecation}"&gt;&#010;-      &lt;compilerarg value="-Xlint"/&gt;&#010;+      &lt;compilerarg value="-Xlint:-path"/&gt;&#010;       &lt;classpath refid="test.classpath"/&gt;&#010;     &lt;/javac&gt;    &#010;   &lt;/target&gt;&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/FetchScheduleFactory.java Tue May 21 01:19:26 2013&#010;@@ -37,7 +37,7 @@ public class FetchScheduleFactory {&#010;     if (impl == null) {&#010;       try {&#010;         LOG.info("Using FetchSchedule impl: " + clazz);&#010;-        Class implClass = Class.forName(clazz);&#010;+        Class&lt;?&gt; implClass = Class.forName(clazz);&#010;         impl = (FetchSchedule)implClass.newInstance();&#010;         impl.setConf(conf);&#010;         objectCache.setObject(clazz, impl);&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Tue May 21 01:19:26 2013&#010;@@ -368,6 +368,7 @@ public class Generator extends Configure&#010;       super(Text.class);&#010;     }&#010; &#010;+    @SuppressWarnings("rawtypes" )&#010;     public int compare(WritableComparable a, WritableComparable b) {&#010;       Text url1 = (Text) a;&#010;       Text url2 = (Text) b;&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/Injector.java Tue May 21 01:19:26 2013&#010;@@ -60,7 +60,7 @@ public class Injector extends Configured&#010;   public static String 
nutchFixedFetchIntervalMDName = "nutch.fetchInterval.fixed";&#010; &#010;   /** Normalize and filter injected urls. */&#010;-  public static class InjectMapper implements Mapper&lt;WritableComparable, Text, Text, CrawlDatum&gt; {&#010;+  public static class InjectMapper implements Mapper&lt;WritableComparable&lt;?&gt;, Text, Text, CrawlDatum&gt; {&#010;     private URLNormalizers urlNormalizers;&#010;     private int interval;&#010;     private float scoreInjected;&#010;@@ -81,7 +81,7 @@ public class Injector extends Configured&#010; &#010;     public void close() {}&#010; &#010;-    public void map(WritableComparable key, Text value,&#010;+    public void map(WritableComparable&lt;?&gt; key, Text value,&#010;                     OutputCollector&lt;Text, CrawlDatum&gt; output, Reporter reporter)&#010;       throws IOException {&#010;       String url = value.toString();              // value is line of text&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Tue May 21 01:19:26 2013&#010;@@ -43,7 +43,7 @@ import java.io.Closeable;&#010; public class LinkDbReader extends Configured implements Tool, Closeable {&#010;   public static final Logger LOG = LoggerFactory.getLogger(LinkDbReader.class);&#010; &#010;-  private static final Partitioner&lt;WritableComparable, Writable&gt; PARTITIONER = new HashPartitioner&lt;WritableComparable, Writable&gt;();&#010;+  private static final Partitioner&lt;WritableComparable&lt;?&gt;, Writable&gt; PARTITIONER = new HashPartitioner&lt;WritableComparable&lt;?&gt;, Writable&gt;();&#010; &#010;   private FileSystem 
fs;&#010;   private Path directory;&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/MapWritable.java Tue May 21 01:19:26 2013&#010;@@ -79,9 +79,9 @@ public class MapWritable implements Writ&#010; &#010;   private ClassIdEntry fIdFirst;&#010; &#010;-  private static Map&lt;Class, Byte&gt; CLASS_ID_MAP = new HashMap&lt;Class, Byte&gt;();&#010;+  private static Map&lt;Class&lt;?&gt;, Byte&gt; CLASS_ID_MAP = new HashMap&lt;Class&lt;?&gt;, Byte&gt;();&#010; &#010;-  private static Map&lt;Byte, Class&gt; ID_CLASS_MAP = new HashMap&lt;Byte, Class&gt;();&#010;+  private static Map&lt;Byte, Class&lt;?&gt;&gt; ID_CLASS_MAP = new HashMap&lt;Byte, Class&lt;?&gt;&gt;();&#010; &#010;   static {&#010; &#010;@@ -101,7 +101,7 @@ public class MapWritable implements Writ&#010; &#010;   }&#010; &#010;-  private static void addToMap(Class clazz, Byte byteId) {&#010;+  private static void addToMap(Class&lt;?&gt; clazz, Byte byteId) {&#010;     CLASS_ID_MAP.put(clazz, byteId);&#010;     ID_CLASS_MAP.put(byteId, clazz);&#010;   }&#010;@@ -338,7 +338,7 @@ public class MapWritable implements Writ&#010;       // read class-id map&#010;       fIdCount = in.readByte();&#010;       byte id;&#010;-      Class clazz;&#010;+      Class&lt;?&gt; clazz;&#010;       for (int i = 0; i &lt; fIdCount; i++) {&#010;         try {&#010;           id = in.readByte();&#010;@@ -393,7 +393,7 @@ public class MapWritable implements Writ&#010;     }&#010;   }&#010; &#010;-  private byte addIdEntry(byte id, Class clazz) {&#010;+  private byte addIdEntry(byte id, Class&lt;?&gt; clazz) {&#010;     if (fIdFirst == 
null) {&#010;       fIdFirst = fIdLast = new ClassIdEntry(id, clazz);&#010;     } else {&#010;@@ -402,7 +402,7 @@ public class MapWritable implements Writ&#010;     return id;&#010;   }&#010; &#010;-  private byte getClassId(Class clazz) {&#010;+  private byte getClassId(Class&lt;?&gt; clazz) {&#010;     Byte classId = CLASS_ID_MAP.get(clazz);&#010;     if (classId != null) {&#010;       return classId.byteValue();&#010;@@ -438,8 +438,8 @@ public class MapWritable implements Writ&#010;       last = entry;&#010;       entry = entry.fNextEntry;&#010;     }&#010;-    Class keyClass = getClass(keyId);&#010;-    Class valueClass = getClass(valueId);&#010;+    Class&lt;?&gt; keyClass = getClass(keyId);&#010;+    Class&lt;?&gt; valueClass = getClass(valueId);&#010;     try {&#010;       return new KeyValueEntry((Writable) keyClass.newInstance(),&#010;           (Writable) valueClass.newInstance());&#010;@@ -449,8 +449,8 @@ public class MapWritable implements Writ&#010; &#010;   }&#010; &#010;-  private Class getClass(final byte id) throws IOException {&#010;-    Class clazz = ID_CLASS_MAP.get(new Byte(id));&#010;+  private Class&lt;?&gt; getClass(final byte id) throws IOException {&#010;+    Class&lt;?&gt; clazz = ID_CLASS_MAP.get(new Byte(id));&#010;     if (clazz == null) {&#010;       ClassIdEntry entry = fIdFirst;&#010;       while (entry != null) {&#010;@@ -502,14 +502,14 @@ public class MapWritable implements Writ&#010; &#010;   /** container for Id class tuples */&#010;   private class ClassIdEntry {&#010;-    public ClassIdEntry(byte id, Class clazz) {&#010;+    public ClassIdEntry(byte id, Class&lt;?&gt; clazz) {&#010;       fId = id;&#010;       fclazz = clazz;&#010;     }&#010; &#010;     private byte fId;&#010; &#010;-    private Class fclazz;&#010;+    private Class&lt;?&gt; fclazz;&#010; &#010;     private ClassIdEntry fNextIdEntry;&#010;   }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Tue May 21 01:19:26 2013&#010;@@ -19,12 +19,13 @@ package org.apache.nutch.crawl;&#010; import org.apache.hadoop.io.Writable;&#010; import org.apache.nutch.util.GenericWritableConfigurable;&#010; &#010;+@SuppressWarnings("unchecked")&#010; public class NutchWritable extends GenericWritableConfigurable {&#010; &#010;   private static Class&lt;? extends Writable&gt;[] CLASSES = null;&#010; &#010;   static {&#010;-    CLASSES = new Class[] {&#010;+    CLASSES = (Class&lt;? extends Writable&gt;[]) new Class&lt;?&gt;[] {&#010;       org.apache.hadoop.io.NullWritable.class,&#010;       org.apache.hadoop.io.BooleanWritable.class,&#010;       org.apache.hadoop.io.LongWritable.class,&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureComparator.java Tue May 21 01:19:26 2013&#010;@@ -19,7 +19,7 @@ package org.apache.nutch.crawl;&#010; &#010; import java.util.Comparator;&#010; &#010;-public class SignatureComparator implements Comparator {&#010;+public class SignatureComparator implements Comparator&lt;Object&gt; {&#010;   public int compare(Object o1, Object o2) {&#010;     return _compare(o1, o2);&#010;   }&#010;&#010;Modified: 
nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/SignatureFactory.java Tue May 21 01:19:26 2013&#010;@@ -47,7 +47,7 @@ public class SignatureFactory {&#010;         if (LOG.isInfoEnabled()) {&#010;           LOG.info("Using Signature impl: " + clazz);&#010;         }&#010;-        Class implClass = Class.forName(clazz);&#010;+        Class&lt;?&gt; implClass = Class.forName(clazz);&#010;         impl = (Signature)implClass.newInstance();&#010;         impl.setConf(conf);&#010;         objectCache.setObject(clazz, impl);&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java Tue May 21 01:19:26 2013&#010;@@ -45,7 +45,7 @@ import org.apache.nutch.util.*;&#010; &#010; &#010; /** The fetcher. Most of the work is done by plugins. 
*/&#010;-public class OldFetcher extends Configured implements Tool, MapRunnable&lt;WritableComparable, Writable, Text, NutchWritable&gt; { &#010;+public class OldFetcher extends Configured implements Tool, MapRunnable&lt;WritableComparable&lt;?&gt;, Writable, Text, NutchWritable&gt; { &#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(OldFetcher.class);&#010;   &#010;@@ -55,12 +55,11 @@ public class OldFetcher extends Configur&#010; &#010;   public static final String PROTOCOL_REDIR = "protocol";&#010; &#010;-  public static class InputFormat extends SequenceFileInputFormat&lt;WritableComparable, Writable&gt; {&#010;+  public static class InputFormat extends SequenceFileInputFormat&lt;WritableComparable&lt;?&gt;, Writable&gt; {&#010;     /** Don't split inputs, to keep things polite. */&#010;     public InputSplit[] getSplits(JobConf job, int nSplits)&#010;       throws IOException {&#010;       FileStatus[] files = listStatus(job);&#010;-      FileSystem fs = FileSystem.get(job);&#010;       InputSplit[] splits = new InputSplit[files.length];&#010;       for (int i = 0; i &lt; files.length; i++) {&#010;         FileStatus cur = files[i];&#010;@@ -71,7 +70,7 @@ public class OldFetcher extends Configur&#010;     }&#010;   }&#010; &#010;-  private RecordReader&lt;WritableComparable, Writable&gt; input;&#010;+  private RecordReader&lt;WritableComparable&lt;?&gt;, Writable&gt; input;&#010;   private OutputCollector&lt;Text, NutchWritable&gt; output;&#010;   private Reporter reporter;&#010; &#010;@@ -458,7 +457,7 @@ public class OldFetcher extends Configur&#010;     return conf.getBoolean("fetcher.store.content", true);&#010;   }&#010; &#010;-  public void run(RecordReader&lt;WritableComparable, Writable&gt; input, OutputCollector&lt;Text, NutchWritable&gt; output,&#010;+  public void run(RecordReader&lt;WritableComparable&lt;?&gt;, Writable&gt; input, OutputCollector&lt;Text, NutchWritable&gt; output,&#010;                   Reporter reporter) 
throws IOException {&#010; &#010;     this.input = input;&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/indexer/NutchField.java Tue May 21 01:19:26 2013&#010;@@ -28,8 +28,8 @@ import java.util.List;&#010; import org.apache.hadoop.io.*;&#010; &#010; /**&#010;- * This class represents a multi-valued field with a weight. Values are arbitrary&#010;- * objects.&#010;+ * This class represents a multi-valued field with a weight. &#010;+ * Values are arbitrary objects.&#010;  */&#010; public class NutchField implements Writable {&#010;   private float weight;&#010;@@ -44,7 +44,7 @@ public class NutchField implements Writa&#010;   public NutchField(Object value, float weight) {&#010;     this.weight = weight;&#010;     if (value instanceof Collection) {&#010;-      values.addAll((Collection&lt;Object&gt;)value);&#010;+      values.addAll((Collection&lt;?&gt;)value);&#010;     } else {&#010;       values.add(value);&#010;     }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/metadata/HttpHeaders.java Tue May 21 01:19:26 2013&#010;@@ -23,9 +23,6 @@ import org.apache.hadoop.io.Text;&#010;  *&#010;  * @see &lt;a href="http://rfc-ref.org/RFC-TEXTS/2616/"&gt;Hypertext 
Transfer&#010;  *      Protocol -- HTTP/1.1 (RFC 2616)&lt;/a&gt;&#010;- *&#010;- * @author Chris Mattmann&#010;- * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;  */&#010; public interface HttpHeaders {&#010; &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/metadata/Metadata.java Tue May 21 01:19:26 2013&#010;@@ -27,13 +27,8 @@ import java.util.Properties;&#010; import org.apache.hadoop.io.Text;&#010; import org.apache.hadoop.io.Writable;&#010; &#010;-&#010; /**&#010;  * A multi-valued metadata container.&#010;- *&#010;- * @author Chris Mattmann&#010;- * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;- *&#010;  */&#010; public class Metadata implements Writable, CreativeCommons,&#010; DublinCore, HttpHeaders, Nutch, Feed {&#010;@@ -128,7 +123,7 @@ DublinCore, HttpHeaders, Nutch, Feed {&#010;    * @param properties properties to copy from&#010;    */&#010;   public void setAll(Properties properties) {&#010;-    Enumeration names = properties.propertyNames();&#010;+    Enumeration&lt;?&gt; names = properties.propertyNames();&#010;     while (names.hasMoreElements()) {&#010;       String name = (String) names.nextElement();&#010;       metadata.put(name, new String[]{properties.getProperty(name)});&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java Tue May 21 01:19:26 2013&#010;@@ -52,9 +52,9 @@ public class SpellCheckedMetadata extend&#010; &#010;     // Uses following array to fill the metanames index and the&#010;     // metanames list.&#010;-    Class[] spellthese = {HttpHeaders.class};&#010;+    Class&lt;?&gt;[] spellthese = {HttpHeaders.class};&#010; &#010;-    for (Class spellCheckedNames : spellthese) {&#010;+    for (Class&lt;?&gt; spellCheckedNames : spellthese) {&#010;       for (Field field : spellCheckedNames.getFields()) {&#010;         int mods = field.getModifiers();&#010;         if (Modifier.isFinal(mods) &amp;&amp; Modifier.isPublic(mods)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/net/URLNormalizers.java Tue May 21 01:19:26 2013&#010;@@ -101,7 +101,7 @@ public final class URLNormalizers {&#010;   public static final Logger LOG = LoggerFactory.getLogger(URLNormalizers.class);&#010; &#010;   /* Empty extension list for caching purposes. 
*/&#010;-  private final List&lt;Extension&gt; EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;&#010;+  private final List&lt;Extension&gt; EMPTY_EXTENSION_LIST = Collections.&lt;Extension&gt;emptyList();&#010;   &#010;   private final URLNormalizer[] EMPTY_NORMALIZERS = new URLNormalizer[0];&#010; &#010;@@ -194,6 +194,7 @@ public final class URLNormalizers {&#010;    *         empty list.&#010;    * @throws PluginRuntimeException&#010;    */&#010;+  @SuppressWarnings("unchecked")&#010;   private List&lt;Extension&gt; getExtensions(String scope) {&#010;     ObjectCache objectCache = ObjectCache.get(conf);&#010;     List&lt;Extension&gt; extensions = &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/parse/HTMLMetaTags.java Tue May 21 01:19:26 2013&#010;@@ -188,7 +188,7 @@ public class HTMLMetaTags {&#010;             + ", refreshHref=" + refreshHref + "\n"&#010;             );&#010;     sb.append(" * general tags:\n");&#010;-    Iterator it = generalTags.keySet().iterator();&#010;+    Iterator&lt;Object&gt; it = generalTags.keySet().iterator();&#010;     while (it.hasNext()) {&#010;       String key = (String)it.next();&#010;       sb.append("   - " + key + "\t=\t" + generalTags.get(key) + "\n");&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Tue May 21 01:19:26 2013&#010;@@ -42,7 +42,7 @@ import java.util.Map.Entry;&#010; &#010; /* Parse content in a segment. */&#010; public class ParseSegment extends Configured implements Tool,&#010;-    Mapper&lt;WritableComparable, Content, Text, ParseImpl&gt;,&#010;+    Mapper&lt;WritableComparable&lt;?&gt;, Content, Text, ParseImpl&gt;,&#010;     Reducer&lt;Text, Writable, Text, Writable&gt; {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(ParseSegment.class);&#010;@@ -71,7 +71,7 @@ public class ParseSegment extends Config&#010;   &#010;   private Text newKey = new Text();&#010; &#010;-  public void map(WritableComparable key, Content content,&#010;+  public void map(WritableComparable&lt;?&gt; key, Content content,&#010;                   OutputCollector&lt;Text, ParseImpl&gt; output, Reporter reporter)&#010;     throws IOException {&#010;     // convert on the fly from old UTF8 keys&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserFactory.java Tue May 21 01:19:26 2013&#010;@@ -48,7 +48,7 @@ public final class ParserFactory {&#010;   public static final String DEFAULT_PLUGIN = "*";&#010;   &#010;   /** Empty extension list for caching purposes. 
*/&#010;-  private final List EMPTY_EXTENSION_LIST = Collections.EMPTY_LIST;&#010;+  private final List&lt;Extension&gt; EMPTY_EXTENSION_LIST = Collections.&lt;Extension&gt;emptyList();&#010;   &#010;   private Configuration conf;&#010;   private ExtensionPoint extensionPoint;&#010;@@ -57,9 +57,9 @@ public final class ParserFactory {&#010;   public ParserFactory(Configuration conf) {&#010;     this.conf = conf;&#010;     ObjectCache objectCache = ObjectCache.get(conf);&#010;-    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(&#010;-        Parser.X_POINT_ID);&#010;+    this.extensionPoint = PluginRepository.get(conf).getExtensionPoint(Parser.X_POINT_ID);&#010;     this.parsePluginList = (ParsePluginList)objectCache.getObject(ParsePluginList.class.getName());&#010;+    &#010;     if (this.parsePluginList == null) {&#010;       this.parsePluginList = new ParsePluginsReader().parse(conf);&#010;       objectCache.setObject(ParsePluginList.class.getName(), this.parsePluginList);&#010;@@ -121,8 +121,8 @@ public final class ParserFactory {&#010;     }&#010; &#010;     parsers = new Vector&lt;Parser&gt;(parserExts.size());&#010;-    for (Iterator i=parserExts.iterator(); i.hasNext(); ){&#010;-      Extension ext = (Extension) i.next();&#010;+    for (Iterator&lt;Extension&gt; i = parserExts.iterator(); i.hasNext(); ){&#010;+      Extension ext = i.next();&#010;       Parser p = null;&#010;       try {&#010;         //check to see if we've cached this parser instance yet&#010;@@ -212,6 +212,7 @@ public final class ParserFactory {&#010;    * @return a list of extensions to be used for this contentType.&#010;    *         If none, returns &lt;code&gt;null&lt;/code&gt;.&#010;    */&#010;+  @SuppressWarnings("unchecked")&#010;   protected List&lt;Extension&gt; getExtensions(String contentType) {&#010;     &#010;     ObjectCache objectCache = ObjectCache.get(conf);&#010;@@ -411,5 +412,4 @@ public final class ParserFactory {&#010;   private Extension 
getExtensionFromAlias(Extension[] list, String id) {&#010;     return getExtension(list, parsePluginList.getAliases().get(id));&#010;   }&#010;-&#010; }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/plugin/Extension.java Tue May 21 01:19:26 2013&#010;@@ -25,8 +25,6 @@ import org.apache.hadoop.conf.Configurab&#010;  * An &lt;code&gt;Extension&lt;/code&gt; is a kind of listener descriptor that will be&#010;  * installed on a concrete &lt;code&gt;ExtensionPoint&lt;/code&gt; that acts as kind of&#010;  * Publisher.&#010;- * &#010;- * @author joa23&#010;  */&#010; public class Extension {&#010;   private PluginDescriptor fDescriptor;&#010;@@ -153,7 +151,7 @@ public class Extension {&#010;     synchronized (getId()) {&#010;       try {&#010;         PluginClassLoader loader = fDescriptor.getClassLoader();&#010;-        Class extensionClazz = loader.loadClass(getClazz());&#010;+        Class&lt;?&gt; extensionClazz = loader.loadClass(getClazz());&#010;         // lazy loading of Plugin in case there is no instance of the plugin&#010;         // already.&#010;         this.pluginRepository.getPluginInstance(getDescriptor());&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java (original)&#010;+++ 
nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java Tue May 21 01:19:26 2013&#010;@@ -37,8 +37,6 @@ import org.apache.hadoop.conf.Configurat&#010;  * &lt;code&gt;ExtensionPoint&lt;/code&gt; and &lt;code&gt;Extension&lt;/code&gt;. To provide&#010;  * access to the meta data of a plugin via a descriptor allow a lazy loading&#010;  * mechanism.&#010;- * &#010;- * @author joa23&#010;  */&#010; public class PluginDescriptor {&#010;   private String fPluginPath;&#010;@@ -47,7 +45,7 @@ public class PluginDescriptor {&#010;   private String fVersion;&#010;   private String fName;&#010;   private String fProviderName;&#010;-  private HashMap fMessages = new HashMap();&#010;+  private HashMap&lt;String, ResourceBundle&gt; fMessages = new HashMap&lt;String, ResourceBundle&gt;();&#010;   private ArrayList&lt;ExtensionPoint&gt; fExtensionPoints = new ArrayList&lt;ExtensionPoint&gt;();&#010;   private ArrayList&lt;String&gt; fDependencies = new ArrayList&lt;String&gt;();&#010;   private ArrayList&lt;URL&gt; fExportedLibs = new ArrayList&lt;URL&gt;();&#010;@@ -338,8 +336,7 @@ public class PluginDescriptor {&#010;   public String getResourceString(String pKey, Locale pLocale)&#010;       throws IOException {&#010;     if (fMessages.containsKey(pLocale.toString())) {&#010;-      ResourceBundle bundle = (ResourceBundle) fMessages&#010;-          .get(pLocale.toString());&#010;+      ResourceBundle bundle = fMessages.get(pLocale.toString());&#010;       try {&#010;         return bundle.getString(pKey);&#010;       } catch (MissingResourceException e) {&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java 
(original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginRepository.java Tue May 21 01:19:26 2013&#010;@@ -39,8 +39,6 @@ import org.apache.nutch.util.NutchConfig&#010;  * descriptor represents all meta information about a plugin. So a plugin&#010;  * instance will be created later when it is required, this allow lazy plugin&#010;  * loading.&#010;- * &#010;- * @author joa23&#010;  */&#010; public class PluginRepository {&#010;   private static final WeakHashMap&lt;String, PluginRepository&gt; CACHE = new WeakHashMap&lt;String, PluginRepository&gt;();&#010;@@ -267,8 +265,8 @@ public class PluginRepository {&#010;       // Suggested by Stefan Groschupf &lt;sg@media-style.com&gt;&#010;       synchronized (pDescriptor) {&#010;         PluginClassLoader loader = pDescriptor.getClassLoader();&#010;-        Class pluginClass = loader.loadClass(pDescriptor.getPluginClass());&#010;-        Constructor constructor = pluginClass.getConstructor(new Class[] {&#010;+        Class&lt;?&gt; pluginClass = loader.loadClass(pDescriptor.getPluginClass());&#010;+        Constructor&lt;?&gt; constructor = pluginClass.getConstructor(new Class&lt;?&gt;[] {&#010;             PluginDescriptor.class, Configuration.class });&#010;         Plugin plugin = (Plugin) constructor.newInstance(new Object[] {&#010;             pDescriptor, this.conf });&#010;@@ -400,7 +398,7 @@ public class PluginRepository {&#010;     }&#010;     ClassLoader cl = d.getClassLoader();&#010;     // args[1] - class name&#010;-    Class clazz = null;&#010;+    Class&lt;?&gt; clazz = null;&#010;     try {&#010;       clazz = Class.forName(args[1], true, cl);&#010;     } catch (Exception e) {&#010;@@ -410,7 +408,7 @@ public class PluginRepository {&#010;     }&#010;     Method m = null;&#010;     try {&#010;-      m = clazz.getMethod("main", new Class[] { args.getClass() });&#010;+      m = clazz.getMethod("main", new Class&lt;?&gt;[] { args.getClass() });&#010;     } catch (Exception e) {&#010;       
System.err.println("Could not find the 'main(String[])' method in class "&#010;           + args[1] + ": " + e.getMessage());&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Tue May 21 01:19:26 2013&#010;@@ -433,12 +433,17 @@ public class LinkDumper&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg()&#010;-      .withDescription("the web graph database to use").create("webgraphdb");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the web graph database to use");&#010;+    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphDbOpts);&#010;+    &#010;     CommandLineParser parser = new GnuParser();&#010;     try {&#010; &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Tue May 21 01:19:26 2013&#010;@@ -287,12 +287,10 @@ public class LinkRank&#010;     implements Mapper&lt;Text, Node, Text, LongWritable&gt;,&#010;     Reducer&lt;Text, LongWritable, Text, LongWritable&gt; {&#010; &#010;-    private JobConf conf;&#010;     private static Text numNodes = new Text(NUM_NODES);&#010;     private static LongWritable one = new LongWritable(1L);&#010; &#010;     public void configure(JobConf conf) {&#010;-      this.conf = conf;&#010;     }&#010; &#010;     /**&#010;@@ -678,11 +676,15 @@ public class LinkRank&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webgraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(&#010;-      "the web graph db to use").create("webgraphdb");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the web graph db to use");&#010;+    Option webgraphOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webgraphOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Tue May 21 01:19:26 2013&#010;@@ -45,9 +45,7 @@ public class LoopReader extends Configur&#010;   private FileSystem fs;&#010;   private MapFile.Reader[] loopReaders;&#010;   &#010;-  public LoopReader() {&#010;-    &#010;-  }&#010;+  public LoopReader() { }&#010;   &#010;   public LoopReader(Configuration conf) {&#010;     super(conf);&#010;@@ -94,14 +92,21 @@ public class LoopReader extends Configur&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()&#010;-      .withDescription("the webgraphdb to use").create("webgraphdb");&#010;-    Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg()&#010;-      .withDescription("the url to dump").create("url");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the webgraphdb to use");&#010;+    Option webGraphOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphOpts);&#010;+    &#010;+    OptionBuilder.withArgName("url");&#010;+    OptionBuilder.hasOptionalArg();&#010;+    OptionBuilder.withDescription("the url to dump");&#010;+    Option urlOpts = 
OptionBuilder.create("url");&#010;     options.addOption(urlOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Tue May 21 01:19:26 2013&#010;@@ -583,11 +583,15 @@ public class Loops&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(&#010;-      "the web graph database to use").create("webgraphdb");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the web graph database to use");&#010;+    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphDbOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java Tue May 21 01:19:26 2013&#010;@@ -343,36 +343,57 @@ public class NodeDumper&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(&#010;-      "the web graph database to use").create("webgraphdb");&#010;-    Option inlinkOpts = OptionBuilder.withArgName("inlinks").withDescription(&#010;-      "show highest inlinks").create("inlinks");&#010;-    Option outlinkOpts = OptionBuilder.withArgName("outlinks").withDescription(&#010;-      "show highest outlinks").create("outlinks");&#010;-    Option scoreOpts = OptionBuilder.withArgName("scores").withDescription(&#010;-      "show highest scores").create("scores");&#010;-    Option topNOpts = OptionBuilder.withArgName("topn").hasOptionalArg().withDescription(&#010;-      "show topN scores").create("topn");&#010;-    Option outputOpts = OptionBuilder.withArgName("output").hasArg().withDescription(&#010;-      "the output directory to use").create("output");&#010;-    Option effOpts = OptionBuilder.withArgName("asEff").withDescription(&#010;-      "Solr ExternalFileField compatible output format").create("asEff");&#010;-    Option groupOpts = OptionBuilder.hasArgs(2).withDescription(&#010;-      "group &lt;host|domain&gt; &lt;sum|max&gt;").create("group");&#010;-    Option sequenceFileOpts = OptionBuilder.withArgName("asSequenceFile").withDescription(&#010;-      "whether to output as a sequencefile").create("asSequenceFile");&#010;-&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     
options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the web graph database to use");&#010;+    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphDbOpts);&#010;+    &#010;+    OptionBuilder.withArgName("inlinks");&#010;+    OptionBuilder.withDescription("show highest inlinks");&#010;+    Option inlinkOpts = OptionBuilder.create("inlinks");&#010;     options.addOption(inlinkOpts);&#010;+    &#010;+    OptionBuilder.withArgName("outlinks");&#010;+    OptionBuilder.withDescription("show highest outlinks");&#010;+    Option outlinkOpts = OptionBuilder.create("outlinks");&#010;     options.addOption(outlinkOpts);&#010;+    &#010;+    OptionBuilder.withArgName("scores");&#010;+    OptionBuilder.withDescription("show highest scores");&#010;+    Option scoreOpts = OptionBuilder.create("scores");&#010;     options.addOption(scoreOpts);&#010;+    &#010;+    OptionBuilder.withArgName("topn");&#010;+    OptionBuilder.hasOptionalArg();&#010;+    OptionBuilder.withDescription("show topN scores");&#010;+    Option topNOpts = OptionBuilder.create("topn");&#010;     options.addOption(topNOpts);&#010;+    &#010;+    OptionBuilder.withArgName("output");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the output directory to use");&#010;+    Option outputOpts = OptionBuilder.create("output");&#010;     options.addOption(outputOpts);&#010;+    &#010;+    OptionBuilder.withArgName("asEff");&#010;+    OptionBuilder.withDescription("Solr ExternalFileField compatible output format");&#010;+    Option effOpts = OptionBuilder.create("asEff");&#010;     options.addOption(effOpts);&#010;+    &#010;+    OptionBuilder.hasArgs(2);&#010;+    OptionBuilder.withDescription("group &lt;host|domain&gt; &lt;sum|max&gt;");&#010;+    Option groupOpts = OptionBuilder.create("group");&#010;     
options.addOption(groupOpts);&#010;+    &#010;+    OptionBuilder.withArgName("asSequenceFile");&#010;+    OptionBuilder.withDescription("whether to output as a sequencefile");&#010;+    Option sequenceFileOpts = OptionBuilder.create("asSequenceFile");&#010;     options.addOption(sequenceFileOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;@@ -388,7 +409,6 @@ public class NodeDumper&#010;       String webGraphDb = line.getOptionValue("webgraphdb");&#010;       boolean inlinks = line.hasOption("inlinks");&#010;       boolean outlinks = line.hasOption("outlinks");&#010;-      boolean scores = line.hasOption("scores");&#010; &#010;       long topN = (line.hasOption("topn")&#010;         ? Long.parseLong(line.getOptionValue("topn")) : Long.MAX_VALUE);&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Tue May 21 01:19:26 2013&#010;@@ -90,14 +90,21 @@ public class NodeReader extends Configur&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg()&#010;-      .withDescription("the webgraphdb to use").create("webgraphdb");&#010;-    Option urlOpts = OptionBuilder.withArgName("url").hasOptionalArg()&#010;-      .withDescription("the url to dump").create("url");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help 
message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the webgraphdb to use");&#010;+    Option webGraphOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphOpts);&#010;+    &#010;+    OptionBuilder.withArgName("url");&#010;+    OptionBuilder.hasOptionalArg();&#010;+    OptionBuilder.withDescription("the url to dump");&#010;+    Option urlOpts = OptionBuilder.create("url");&#010;     options.addOption(urlOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java Tue May 21 01:19:26 2013&#010;@@ -217,14 +217,21 @@ public class ScoreUpdater&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option crawlDbOpts = OptionBuilder.withArgName("crawldb").hasArg().withDescription(&#010;-      "the crawldb to use").create("crawldb");&#010;-    Option webGraphOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(&#010;-      "the webgraphdb to use").create("webgraphdb");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     
options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("crawldb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the crawldb to use");&#010;+    Option crawlDbOpts = OptionBuilder.create("crawldb");&#010;     options.addOption(crawlDbOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the webgraphdb to use");&#010;+    Option webGraphOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/WebGraph.java Tue May 21 01:19:26 2013&#010;@@ -405,28 +405,13 @@ public class WebGraph&#010;     extends Configured&#010;     implements Mapper&lt;Text, LinkDatum, Text, LinkDatum&gt; {&#010; &#010;-    private JobConf conf;&#010;     private long timestamp;&#010; &#010;     /**&#010;-     * Default constructor.&#010;-     */&#010;-    public InlinkDb() {&#010;-    }&#010;-&#010;-    /**&#010;-     * Configurable constructor.&#010;-     */&#010;-    public InlinkDb(Configuration conf) {&#010;-      setConf(conf);&#010;-    }&#010;-&#010;-    /**&#010;      * Configures job. 
Sets timestamp for all Inlink LinkDatum objects to the&#010;      * current system time.&#010;      */&#010;     public void configure(JobConf conf) {&#010;-      this.conf = conf;&#010;       timestamp = System.currentTimeMillis();&#010;     }&#010; &#010;@@ -461,30 +446,12 @@ public class WebGraph&#010;     extends Configured&#010;     implements Reducer&lt;Text, LinkDatum, Text, Node&gt; {&#010; &#010;-    private JobConf conf;&#010;-&#010;-    /**&#010;-     * Default constructor.&#010;-     */&#010;-    public NodeDb() {&#010;-    }&#010;-&#010;-    /**&#010;-     * Configurable constructor.&#010;-     */&#010;-    public NodeDb(Configuration conf) {&#010;-      setConf(conf);&#010;-    }&#010;-&#010;     /**&#010;      * Configures job.&#010;      */&#010;-    public void configure(JobConf conf) {&#010;-      this.conf = conf;&#010;-    }&#010;+    public void configure(JobConf conf) { }&#010; &#010;-    public void close() {&#010;-    }&#010;+    public void close() { }&#010; &#010;     /**&#010;      * Counts the number of inlinks and outlinks for each url and sets a default&#010;@@ -731,23 +698,37 @@ public class WebGraph&#010;     throws Exception {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option webGraphDbOpts = OptionBuilder.withArgName("webgraphdb").hasArg().withDescription(&#010;-      "the web graph database to use").create("webgraphdb");&#010;-    Option segOpts = OptionBuilder.withArgName("segment").hasArgs().withDescription(&#010;-      "the segment(s) to use").create("segment");&#010;-    Option segDirOpts = OptionBuilder.withArgName("segmentDir").hasArgs().withDescription(&#010;-      "the segment directory to use").create("segmentDir");&#010;-    Option normalizeOpts = OptionBuilder.withArgName("normalize").withDescription(&#010;-      "whether to use URLNormalizers on the URL's in the 
segment").create("normalize");&#010;-    Option filterOpts = OptionBuilder.withArgName("filter").withDescription(&#010;-      "whether to use URLFilters on the URL's in the segment").create("filter");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("webgraphdb");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the web graph database to use");&#010;+    Option webGraphDbOpts = OptionBuilder.create("webgraphdb");&#010;     options.addOption(webGraphDbOpts);&#010;+    &#010;+    OptionBuilder.withArgName("segment");&#010;+    OptionBuilder.hasArgs();&#010;+    OptionBuilder.withDescription("the segment(s) to use");&#010;+    Option segOpts = OptionBuilder.create("segment");&#010;     options.addOption(segOpts);&#010;+    &#010;+    OptionBuilder.withArgName("segmentDir");&#010;+    OptionBuilder.hasArgs();&#010;+    OptionBuilder.withDescription("the segment directory to use");&#010;+    Option segDirOpts = OptionBuilder.create("segmentDir");&#010;     options.addOption(segDirOpts);&#010;+    &#010;+    OptionBuilder.withArgName("normalize");&#010;+    OptionBuilder.withDescription("whether to use URLNormalizers on the URL's in the segment");&#010;+    Option normalizeOpts = OptionBuilder.create("normalize");&#010;     options.addOption(normalizeOpts);&#010;+    &#010;+    OptionBuilder.withArgName("filter");&#010;+    OptionBuilder.withDescription("whether to use URLFilters on the URL's in the segment");&#010;+    Option filterOpts = OptionBuilder.create("filter");&#010;     options.addOption(filterOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilter.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilter.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilter.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilter.java Tue May 21 01:19:26 2013&#010;@@ -18,7 +18,7 @@ package org.apache.nutch.segment;&#010; &#010; import java.util.Collection;&#010; &#010;-import org.apache.hadoop.io.WritableComparable;&#010;+import org.apache.hadoop.io.Text;&#010; import org.apache.nutch.crawl.CrawlDatum;&#010; import org.apache.nutch.parse.ParseData;&#010; import org.apache.nutch.parse.ParseText;&#010;@@ -41,7 +41,7 @@ public interface SegmentMergeFilter {&#010;    * @return &lt;tt&gt;true&lt;/tt&gt; values for this &lt;tt&gt;key&lt;/tt&gt; (URL) should be merged&#010;    *         into the new segment.&#010;    */&#010;-  public boolean filter(WritableComparable key, CrawlDatum generateData,&#010;+  public boolean filter(Text key, CrawlDatum generateData,&#010;       CrawlDatum fetchData, CrawlDatum sigData, Content content,&#010;       ParseData parseData, ParseText parseText, Collection&lt;CrawlDatum&gt; linked);&#010; }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMergeFilters.java Tue May 21 01:19:26 2013&#010;@@ -21,7 +21,7 @@ import java.util.Collection;&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; import 
org.apache.hadoop.conf.Configuration;&#010;-import org.apache.hadoop.io.WritableComparable;&#010;+import org.apache.hadoop.io.Text;&#010; import org.apache.nutch.crawl.CrawlDatum;&#010; import org.apache.nutch.net.URLFilter;&#010; import org.apache.nutch.parse.ParseData;&#010;@@ -65,16 +65,14 @@ public class SegmentMergeFilters {&#010;    * @return &lt;tt&gt;true&lt;/tt&gt; values for this &lt;tt&gt;key&lt;/tt&gt; (URL) should be merged&#010;    *         into the new segment.&#010;    */&#010;-  public boolean filter(WritableComparable key, CrawlDatum generateData,&#010;+  public boolean filter(Text key, CrawlDatum generateData,&#010;       CrawlDatum fetchData, CrawlDatum sigData, Content content,&#010;       ParseData parseData, ParseText parseText, Collection&lt;CrawlDatum&gt; linked) {&#010;     for (SegmentMergeFilter filter : filters) {&#010;       if (!filter.filter(key, generateData, fetchData, sigData, content,&#010;           parseData, parseText, linked)) {&#010;         if (LOG.isTraceEnabled())&#010;-          LOG&#010;-              .trace("Key " + key + " dropped by "&#010;-                  + filter.getClass().getName());&#010;+          LOG.trace("Key " + key + " dropped by " + filter.getClass().getName());&#010;         return false;&#010;       }&#010;     }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentMerger.java Tue May 21 01:19:26 2013&#010;@@ -16,6 +16,7 @@&#010;  */&#010; package org.apache.nutch.segment;&#010; &#010;+import java.io.Closeable;&#010; import java.io.IOException;&#010; import java.util.ArrayList;&#010; 
import java.util.HashMap;&#010;@@ -207,7 +208,7 @@ public class SegmentMerger extends Confi&#010;         MapFile.Writer pt_out = null;&#010;         SequenceFile.Writer g_out = null;&#010;         SequenceFile.Writer p_out = null;&#010;-        HashMap sliceWriters = new HashMap();&#010;+        HashMap&lt;String, Closeable&gt; sliceWriters = new HashMap&lt;String, Closeable&gt;();&#010;         String segmentName = job.get("segment.merger.segmentName");&#010;         &#010;         public void write(Text key, MetaWrapper wrapper) throws IOException {&#010;@@ -288,7 +289,7 @@ public class SegmentMerger extends Confi&#010;         }&#010; &#010;         public void close(Reporter reporter) throws IOException {&#010;-          Iterator&lt;Object&gt; it = sliceWriters.values().iterator();&#010;+          Iterator&lt;Closeable&gt; it = sliceWriters.values().iterator();&#010;           while (it.hasNext()) {&#010;             Object o = it.next();&#010;             if (o instanceof SequenceFile.Writer) {&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Tue May 21 01:19:26 2013&#010;@@ -80,10 +80,10 @@ public class SegmentReader extends Confi&#010;   private FileSystem fs;&#010; &#010;   public static class InputCompatMapper extends MapReduceBase implements&#010;-      Mapper&lt;WritableComparable, Writable, Text, NutchWritable&gt; {&#010;+      Mapper&lt;WritableComparable&lt;?&gt;, Writable, Text, NutchWritable&gt; {&#010;     private Text newKey = new Text();&#010; &#010;-    public void map(WritableComparable key, Writable 
value,&#010;+    public void map(WritableComparable&lt;?&gt; key, Writable value,&#010;         OutputCollector&lt;Text, NutchWritable&gt; collector, Reporter reporter) throws IOException {&#010;       // convert on the fly from old formats with UTF8 keys.&#010;       // UTF8 deprecated and replaced by Text.&#010;@@ -98,8 +98,8 @@ public class SegmentReader extends Confi&#010; &#010;   /** Implements a text output format */&#010;   public static class TextOutputFormat extends&#010;-      FileOutputFormat&lt;WritableComparable, Writable&gt; {&#010;-    public RecordWriter&lt;WritableComparable, Writable&gt; getRecordWriter(&#010;+      FileOutputFormat&lt;WritableComparable&lt;?&gt;, Writable&gt; {&#010;+    public RecordWriter&lt;WritableComparable&lt;?&gt;, Writable&gt; getRecordWriter(&#010;         final FileSystem fs, JobConf job,&#010;         String name, final Progressable progress) throws IOException {&#010; &#010;@@ -109,8 +109,8 @@ public class SegmentReader extends Confi&#010;       if (fs.exists(segmentDumpFile)) fs.delete(segmentDumpFile, true);&#010; &#010;       final PrintStream printStream = new PrintStream(fs.create(segmentDumpFile));&#010;-      return new RecordWriter&lt;WritableComparable, Writable&gt;() {&#010;-        public synchronized void write(WritableComparable key, Writable value) throws IOException {&#010;+      return new RecordWriter&lt;WritableComparable&lt;?&gt;, Writable&gt;() {&#010;+        public synchronized void write(WritableComparable&lt;?&gt; key, Writable value) throws IOException {&#010;           printStream.println(value);&#010;         }&#010; &#010;@@ -379,8 +379,8 @@ public class SegmentReader extends Confi&#010;   private List&lt;Writable&gt; getMapRecords(Path dir, Text key) throws Exception {&#010;     MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, dir, getConf());&#010;     ArrayList&lt;Writable&gt; res = new ArrayList&lt;Writable&gt;();&#010;-    Class keyClass = readers[0].getKeyClass();&#010;- 
   Class valueClass = readers[0].getValueClass();&#010;+    Class&lt;?&gt; keyClass = readers[0].getKeyClass();&#010;+    Class&lt;?&gt; valueClass = readers[0].getValueClass();&#010;     if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))&#010;       throw new IOException("Incompatible key (" + keyClass.getName() + ")");&#010;     Writable value = (Writable)valueClass.newInstance();&#010;@@ -403,8 +403,8 @@ public class SegmentReader extends Confi&#010;   private List&lt;Writable&gt; getSeqRecords(Path dir, Text key) throws Exception {&#010;     SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(getConf(), dir);&#010;     ArrayList&lt;Writable&gt; res = new ArrayList&lt;Writable&gt;();&#010;-    Class keyClass = readers[0].getKeyClass();&#010;-    Class valueClass = readers[0].getValueClass();&#010;+    Class&lt;?&gt; keyClass = readers[0].getKeyClass();&#010;+    Class&lt;?&gt; valueClass = readers[0].getValueClass();&#010;     if (!keyClass.getName().equals("org.apache.hadoop.io.Text"))&#010;       throw new IOException("Incompatible key (" + keyClass.getName() + ")");&#010;     Writable aKey = (Writable)keyClass.newInstance();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/tools/FreeGenerator.java Tue May 21 01:19:26 2013&#010;@@ -57,8 +57,6 @@ import org.apache.nutch.util.TimingUtil;&#010;  * This tool generates fetchlists (segments to be fetched) from plain text&#010;  * files containing one URL per line. 
It's useful when arbitrary URL-s need to&#010;  * be fetched without adding them first to the CrawlDb, or during testing.&#010;- * &#010;- * @author Andrzej Bialecki&#010;  */&#010; public class FreeGenerator extends Configured implements Tool {&#010;   private static final Logger LOG = LoggerFactory.getLogger(FreeGenerator.class);&#010;@@ -67,7 +65,7 @@ public class FreeGenerator extends Confi&#010;   private static final String NORMALIZE_KEY = "free.generator.normalize";&#010; &#010;   public static class FG extends MapReduceBase&#010;-  implements Mapper&lt;WritableComparable, Text, Text, Generator.SelectorEntry&gt;,&#010;+  implements Mapper&lt;WritableComparable&lt;?&gt;, Text, Text, Generator.SelectorEntry&gt;,&#010;   Reducer&lt;Text, Generator.SelectorEntry, Text, CrawlDatum&gt; {&#010;     private URLNormalizers normalizers = null;&#010;     private URLFilters filters = null;&#010;@@ -89,7 +87,7 @@ public class FreeGenerator extends Confi&#010;     &#010;     Generator.SelectorEntry entry = new Generator.SelectorEntry();&#010; &#010;-    public void map(WritableComparable key, Text value, OutputCollector&lt;Text,&#010;+    public void map(WritableComparable&lt;?&gt; key, Text value, OutputCollector&lt;Text,&#010;         Generator.SelectorEntry&gt; output, Reporter reporter) throws IOException {&#010;       // value is a line of text&#010;       String urlString = value.toString();&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/tools/ResolveUrls.java Tue May 21 01:19:26 2013&#010;@@ -77,7 +77,7 @@ public class ResolveUrls {&#010;         &#010;         // get 
the address by name and if no error is thrown then it &#010;         // is resolved successfully&#010;-        InetAddress ia = InetAddress.getByName(host);&#010;+        InetAddress.getByName(host);&#010;         LOG.info("Resolved: " + host);&#010;         numResolved.incrementAndGet();&#010;       }&#010;@@ -161,19 +161,25 @@ public class ResolveUrls {&#010;   public static void main(String[] args) {&#010; &#010;     Options options = new Options();&#010;-    Option helpOpts = OptionBuilder.withArgName("help").withDescription(&#010;-      "show this help message").create("help");&#010;-    Option urlOpts = OptionBuilder.withArgName("urls").hasArg().withDescription(&#010;-      "the urls file to check").create("urls");&#010;-    Option numThreadOpts = OptionBuilder.withArgName("numThreads").hasArgs().withDescription(&#010;-      "the number of threads to use").create("numThreads");&#010;+    OptionBuilder.withArgName("help");&#010;+    OptionBuilder.withDescription("show this help message");&#010;+    Option helpOpts = OptionBuilder.create("help");&#010;     options.addOption(helpOpts);&#010;+    &#010;+    OptionBuilder.withArgName("urls");&#010;+    OptionBuilder.hasArg();&#010;+    OptionBuilder.withDescription("the urls file to check");&#010;+    Option urlOpts = OptionBuilder.create("urls");&#010;     options.addOption(urlOpts);&#010;+    &#010;+    OptionBuilder.withArgName("numThreads");&#010;+    OptionBuilder.hasArgs();&#010;+    OptionBuilder.withDescription("the number of threads to use");&#010;+    Option numThreadOpts = OptionBuilder.create("numThreads");&#010;     options.addOption(numThreadOpts);&#010; &#010;     CommandLineParser parser = new GnuParser();&#010;     try {&#010;-&#010;       // parse out common line arguments&#010;       CommandLine line = parser.parse(options, args);&#010;       if (line.hasOption("help") || !line.hasOption("urls")) {&#010;@@ -196,5 +202,4 @@ public class ResolveUrls {&#010;       LOG.error("ResolveUrls: " + 
StringUtils.stringifyException(e));&#010;     }&#010;   }&#010;-&#010; }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/tools/proxy/SegmentHandler.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/proxy/SegmentHandler.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/tools/proxy/SegmentHandler.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/tools/proxy/SegmentHandler.java Tue May 21 01:19:26 2013&#010;@@ -42,8 +42,6 @@ import org.apache.hadoop.util.StringUtil&#010; import org.apache.nutch.crawl.CrawlDatum;&#010; import org.apache.nutch.metadata.Metadata;&#010; import org.apache.nutch.metadata.Nutch;&#010;-import org.apache.nutch.parse.ParseData;&#010;-import org.apache.nutch.parse.ParseText;&#010; import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.protocol.ProtocolStatus;&#010; import org.mortbay.jetty.Request;&#010;@@ -88,9 +86,8 @@ public class SegmentHandler extends Abst&#010;   &#010;   private static class Segment implements Closeable {&#010;     &#010;-    private static final Partitioner PARTITIONER = new HashPartitioner();&#010;+    private static final Partitioner&lt;Text,Writable&gt; PARTITIONER = new HashPartitioner&lt;Text,Writable&gt;();&#010; &#010;-    private FileSystem fs;&#010;     private Path segmentDir;&#010; &#010;     private Object cLock = new Object();&#010;@@ -102,7 +99,6 @@ public class SegmentHandler extends Abst&#010;     private Configuration conf;&#010; &#010;     public Segment(FileSystem fs, Path segmentDir, Configuration conf) throws IOException {&#010;-      this.fs = fs;&#010;       this.segmentDir = segmentDir;&#010;       this.conf = conf;&#010;     }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/util/GenericWritableConfigurable.java Tue May 21 01:19:26 2013&#010;@@ -41,7 +41,7 @@ public abstract class GenericWritableCon&#010;   @Override&#010;   public void readFields(DataInput in) throws IOException {&#010;     byte type = in.readByte();&#010;-    Class clazz = getTypes()[type];&#010;+    Class&lt;?&gt; clazz = getTypes()[type];&#010;     try {&#010;       set((Writable) clazz.newInstance());&#010;     } catch (Exception e) {&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/util/PrefixStringMatcher.java Tue May 21 01:19:26 2013&#010;@@ -45,11 +45,11 @@ public class PrefixStringMatcher extends&#010;    * @throws ClassCastException if any &lt;code&gt;Object&lt;/code&gt;s in the&#010;    * collection are not &lt;code&gt;String&lt;/code&gt;s&#010;    */&#010;-  public PrefixStringMatcher(Collection prefixes) {&#010;+  public PrefixStringMatcher(Collection&lt;String&gt; prefixes) {&#010;     super();&#010;-    Iterator iter= prefixes.iterator();&#010;+    Iterator&lt;String&gt; iter= prefixes.iterator();&#010;     while (iter.hasNext())&#010;-      addPatternForward((String)iter.next());&#010;+      addPatternForward(iter.next());&#010;   }&#010; &#010;   
/**&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/util/SuffixStringMatcher.java Tue May 21 01:19:26 2013&#010;@@ -41,11 +41,11 @@ public class SuffixStringMatcher extends&#010;    * &lt;code&gt;String&lt;/code&gt;s with any suffix in the supplied&#010;    * &lt;code&gt;Collection&lt;/code&gt;&#010;    */&#010;-  public SuffixStringMatcher(Collection suffixes) {&#010;+  public SuffixStringMatcher(Collection&lt;String&gt; suffixes) {&#010;     super();&#010;-    Iterator iter= suffixes.iterator();&#010;+    Iterator&lt;String&gt; iter= suffixes.iterator();&#010;     while (iter.hasNext())&#010;-      addPatternBackward((String)iter.next());&#010;+      addPatternBackward(iter.next());&#010;   }&#010; &#010;   /**&#010;&#010;Modified: nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java (original)&#010;+++ nutch/trunk/src/plugin/creativecommons/src/java/org/creativecommons/nutch/CCParseFilter.java Tue May 21 01:19:26 2013&#010;@@ -22,7 +22,6 @@ import org.apache.nutch.parse.*;&#010; import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.metadata.Metadata;&#010; import org.apache.hadoop.conf.Configuration;&#010;-import 
org.apache.hadoop.io.Text;&#010; &#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010;@@ -217,11 +216,6 @@ public class CCParseFilter implements Ht&#010;           if (!CC_NS.equals(predicateElement.getNamespaceURI())) {&#010;             continue;&#010;           }&#010;-          String predicate = predicateElement.getLocalName();&#010;-&#010;-          // object is rdf:resource from cc:xxx predicates&#010;-          String object =&#010;-            predicateElement.getAttributeNodeNS(RDF_NS, "resource").getValue();&#010;         &#010;           // add object and predicate to metadata&#010;           // metadata.put(object, predicate);&#010;@@ -234,22 +228,19 @@ public class CCParseFilter implements Ht&#010;       // get cc:Work nodes from rdf:RDF&#010;       NodeList works = rdf.getElementsByTagNameNS(CC_NS, "Work");&#010;       for (int i = 0; i &lt; works.getLength(); i++) {&#010;-        Element l = (Element)works.item(i);&#010;-        &#010;         // get dc:type nodes from cc:Work&#010;         NodeList types = rdf.getElementsByTagNameNS(DC_NS, "type");&#010;+        &#010;         for (int j = 0; j &lt; types.getLength(); j++) {&#010;           Element type = (Element)types.item(j);&#010;-          String workUri = &#010;-            type.getAttributeNodeNS(RDF_NS, "resource").getValue();&#010;-          this.workType = (String)WORK_TYPE_NAMES.get(workUri);&#010;-          break;&#010;+          String workUri = type.getAttributeNodeNS(RDF_NS, "resource").getValue();&#010;+          this.workType = WORK_TYPE_NAMES.get(workUri);&#010;         }&#010;       }&#010;     }&#010;   }&#010; &#010;-  private static final HashMap WORK_TYPE_NAMES = new HashMap();&#010;+  private static final HashMap&lt;String, String&gt; WORK_TYPE_NAMES = new HashMap&lt;String, String&gt;();&#010;   static {&#010;     WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/MovingImage", "video");&#010;     
WORK_TYPE_NAMES.put("http://purl.org/dc/dcmitype/StillImage", "image");&#010;&#010;Modified: nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)&#010;+++ nutch/trunk/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Tue May 21 01:19:26 2013&#010;@@ -109,7 +109,7 @@ public abstract class RegexURLFilterBase&#010;   &#010;   private static FilteredURL[] readURLFile(Reader reader) throws IOException {&#010;     BufferedReader in = new BufferedReader(reader);&#010;-    List list = new ArrayList();&#010;+    List&lt;FilteredURL&gt; list = new ArrayList&lt;FilteredURL&gt;();&#010;     String line;&#010;     while((line=in.readLine()) != null) {&#010;       if (line.length() != 0) {&#010;&#010;Modified: nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java (original)&#010;+++ nutch/trunk/src/plugin/microformats-reltag/src/java/org/apache/nutch/microformats/reltag/RelTagParser.java Tue May 21 01:19:26 2013&#010;@@ -42,15 +42,12 @@ import org.apache.nutch.util.StringUtil;&#010; &#010; // Hadoop imports&#010; import 
org.apache.hadoop.conf.Configuration;&#010;-import org.apache.hadoop.io.Text;&#010;-&#010; &#010; /**&#010;  * Adds microformat rel-tags of document if found.&#010;  *&#010;  * @see &lt;a href="http://www.microformats.org/wiki/rel-tag"&gt;&#010;  *      http://www.microformats.org/wiki/rel-tag&lt;/a&gt;&#010;- * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;  */&#010; public class RelTagParser implements HtmlParseFilter {&#010;   &#010;@@ -58,10 +55,8 @@ public class RelTagParser implements Htm&#010; &#010;   public final static String REL_TAG = "Rel-Tag";&#010;   &#010;-  &#010;   private Configuration conf = null;&#010;   &#010;-  &#010;   /**&#010;    * Scan the HTML document looking at possible rel-tags&#010;    */&#010;@@ -72,25 +67,25 @@ public class RelTagParser implements Htm&#010;     Parse parse = parseResult.get(content.getUrl());&#010;     // Trying to find the document's rel-tags&#010;     Parser parser = new Parser(doc);&#010;-    Set tags = parser.getRelTags();&#010;-    Iterator iter = tags.iterator();&#010;+    Set&lt;?&gt; tags = parser.getRelTags();&#010;+    Iterator&lt;?&gt; iter = tags.iterator();&#010;     Metadata metadata = parse.getData().getParseMeta();&#010;-    while (iter.hasNext()) {&#010;+    while (iter.hasNext())&#010;       metadata.add(REL_TAG, (String) iter.next());&#010;-    }&#010;+&#010;     return parseResult;&#010;   }&#010; &#010;   private static class Parser {&#010; &#010;-    Set tags = null;&#010;+    Set&lt;String&gt; tags = null;&#010;     &#010;     Parser(Node node) {&#010;-      tags = new TreeSet();&#010;+      tags = new TreeSet&lt;String&gt;();&#010;       parse(node);&#010;     }&#010;   &#010;-    Set getRelTags() {&#010;+    Set&lt;String&gt; getRelTags() {&#010;       return tags;&#010;     }&#010;     &#010;@@ -120,9 +115,8 @@ public class RelTagParser implements Htm&#010;       &#010;       // Recurse&#010;       NodeList children = node.getChildNodes();&#010;-      for (int i=0; children != null 
&amp;&amp; i&lt;children.getLength(); i++) {&#010;+      for (int i=0; children != null &amp;&amp; i&lt;children.getLength(); i++)&#010;         parse(children.item(i));&#010;-      }&#010;     }&#010;     &#010;     private final static String parseTag(String url) {&#010;@@ -140,11 +134,6 @@ public class RelTagParser implements Htm&#010;     &#010;   }&#010; &#010;-&#010;-  /* ----------------------------- *&#010;-   * &lt;implementation:Configurable&gt; *&#010;-   * ----------------------------- */&#010;-  &#010;   public void setConf(Configuration conf) {&#010;     this.conf = conf;&#010;   }&#010;@@ -152,9 +141,4 @@ public class RelTagParser implements Htm&#010;   public Configuration getConf() {&#010;     return this.conf;&#010;   }&#010;-  &#010;-  /* ------------------------------ *&#010;-   * &lt;/implementation:Configurable&gt; *&#010;-   * ------------------------------ */&#010;-  &#010; }&#010;&#010;Modified: nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java (original)&#010;+++ nutch/trunk/src/plugin/parse-ext/src/java/org/apache/nutch/parse/ext/ExtParser.java Tue May 21 01:19:26 2013&#010;@@ -21,14 +21,12 @@ import org.apache.nutch.protocol.Content&#010; import org.apache.nutch.parse.ParseResult;&#010; import org.apache.nutch.parse.ParseStatus;&#010; import org.apache.nutch.parse.Parser;&#010;-import org.apache.nutch.parse.Parse;&#010; import org.apache.nutch.parse.ParseData;&#010; import org.apache.nutch.parse.ParseImpl;&#010; import org.apache.nutch.parse.Outlink;&#010; import org.apache.nutch.parse.OutlinkExtractor;&#010; &#010; import 
org.apache.nutch.util.CommandRunner;&#010;-import org.apache.nutch.metadata.Metadata;&#010; import org.apache.nutch.net.protocols.Response;&#010; import org.apache.hadoop.conf.Configuration;&#010; &#010;@@ -59,12 +57,10 @@ public class ExtParser implements Parser&#010;   static final int TIMEOUT_DEFAULT = 30; // in seconds&#010; &#010;   // handy map from String contentType to String[] {command, timeoutString, encoding}&#010;-  Hashtable TYPE_PARAMS_MAP = new Hashtable();&#010;+  Hashtable&lt;String, String[]&gt; TYPE_PARAMS_MAP = new Hashtable&lt;String, String[]&gt;();&#010; &#010;   private Configuration conf;  &#010; &#010;-  private boolean loaded = false;&#010;-&#010;   public ExtParser () { }&#010; &#010;   public ParseResult getParse(Content content) {&#010;&#010;Modified: nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java (original)&#010;+++ nutch/trunk/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/DOMBuilder.java Tue May 21 01:19:26 2013&#010;@@ -58,7 +58,7 @@ public class DOMBuilder&#010;   public DocumentFragment m_docFrag = null;&#010; &#010;   /** Vector of element nodes          */&#010;-  protected Stack m_elemStack = new Stack();&#010;+  protected Stack&lt;Element&gt; m_elemStack = new Stack&lt;Element&gt;();&#010; &#010;   /**&#010;    * DOMBuilder instance constructor... 
it will add the DOM nodes&#010;&#010;Modified: nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java?rev=1484634&amp;r1=1484633&amp;r2=1484634&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java (original)&#010;+++ nutch/trunk/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java Tue May 21 01:19:26 2013&#010;@@ -42,7 +42,6 @@ import org.apache.nutch.parse.Parser;&#010; import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.apache.hadoop.conf.Configuration;&#010;-import org.apache.hadoop.io.Text;&#010; import org.apache.oro.text.regex.MatchResult;&#010; import org.apache.oro.text.regex.Pattern;&#010; import org.apache.oro.text.regex.PatternCompiler;&#010;@@ -60,9 +59,6 @@ import org.w3c.dom.NodeList;&#010;  * This class is a heuristic link extractor for JavaScript files and&#010;  * code snippets. The general idea of a two-pass regex matching comes from&#010;  * Heritrix. 
Parts of the code come from OutlinkExtractor.java&#010;- * by Stephan Strittmatter.&#010;- *&#010;- * @author Andrzej Bialecki &amp;lt;ab@getopt.org&amp;gt;&#010;  */&#010; public class JSParseFilter implements HtmlParseFilter, Parser {&#010;   public static final Logger LOG = LoggerFactory.getLogger(JSParseFilter.class);&#010;@@ -77,12 +73,12 @@ public class JSParseFilter implements Ht&#010;     Parse parse = parseResult.get(content.getUrl());&#010; &#010;     String url = content.getBaseUrl();&#010;-    ArrayList outlinks = new ArrayList();&#010;+    ArrayList&lt;Outlink&gt; outlinks = new ArrayList&lt;Outlink&gt;();&#010;     walk(doc, parse, metaTags, url, outlinks);&#010;     if (outlinks.size() &gt; 0) {&#010;       Outlink[] old = parse.getData().getOutlinks();&#010;       String title = parse.getData().getTitle();&#010;-      List list = Arrays.asList(old);&#010;+      List&lt;Outlink&gt; list = Arrays.asList(old);&#010;       outlinks.addAll(list);&#010;       ParseStatus status = parse.getData().getStatus();&#010;       String text = parse.getText();&#010;@@ -97,14 +93,14 @@ public class JSParseFilter implements Ht&#010;     return parseResult;&#010;   }&#010;   &#010;-  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List outlinks) {&#010;+  private void walk(Node n, Parse parse, HTMLMetaTags metaTags, String base, List&lt;Outlink&gt; outlinks) {&#010;     if (n instanceof Element) {&#010;       String name = n.getNodeName();&#010;       if (name.equalsIgnoreCase("script")) {&#010;-        String lang = null;&#010;+ /*       String lang = null;&#010;         Node lNode = n.getAttributes().getNamedItem("language");&#010;         if (lNode == null) lang = "javascript";&#010;-        else lang = lNode.getNodeValue();&#010;+        else lang = lNode.getNodeValue(); */&#010;         StringBuffer script = new StringBuffer();&#010;         NodeList nn = n.getChildNodes();&#010;         if (nn.getLength() &gt; 0) {&#010;@@ -183,7 
+179,7 @@ public class JSParseFilter implements Ht&#010;    */&#010;   private Outlink[] getJSLinks(String plainText, String anchor, String base) {&#010; &#010;-    final List outlinks = new ArrayList();&#010;+    final List&lt;Outlink&gt; outlinks = new ArrayList&lt;Outlink&gt;();&#010;     URL baseURL = null;&#010;     &#010;     try {&#010;@@ -265,7 +261,10 @@ public class JSParseFilter implements Ht&#010;     BufferedReader br = new BufferedReader(new InputStreamReader(in, "UTF-8"));&#010;     StringBuffer sb = new StringBuffer();&#010;     String line = null;&#010;-    while ((line = br.readLine()) != null) sb.append(line + "\n");&#010;+    while ((line = br.readLine()) != null) &#010;+      sb.append(line + "\n");&#010;+    br.close();&#010;+    &#010;     JSParseFilter parseFilter = new JSParseFilter();&#010;     parseFilter.setConf(NutchConfiguration.create());&#010;     Outlink[] links = parseFilter.getJSLinks(sb.toString(), "", args[1]);&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484628 - in /nutch/trunk: CHANGES.txt src/plugin/feed/ivy.xml</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521004034.A358123889DA@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521004034-A358123889DA@eris-apache-org%3e</id>
<updated>2013-05-21T00:40:34Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May 21 00:40:34 2013&#010;New Revision: 1484628&#010;&#010;URL: http://svn.apache.org/r1484628&#010;Log:&#010;NUTCH-1053 Parsing of RSS feeds fails&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/plugin/feed/ivy.xml&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1484628&amp;r1=1484627&amp;r2=1484628&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue May 21 00:40:34 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1053 Parsing of RSS feeds fails (tejasp)&#010;+&#010; * Added crawler-commons dependency in pom.xml (tejasp)&#010; &#010; * NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010;&#010;Modified: nutch/trunk/src/plugin/feed/ivy.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/feed/ivy.xml?rev=1484628&amp;r1=1484627&amp;r2=1484628&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/feed/ivy.xml (original)&#010;+++ nutch/trunk/src/plugin/feed/ivy.xml Tue May 21 00:40:34 2013&#010;@@ -37,6 +37,7 @@&#010; &#010;   &lt;dependencies&gt;&#010;     &lt;dependency org="rome" name="rome" rev="0.9" conf="*-&gt;master"/&gt;&#010;+    &lt;dependency org="org.jdom" name="jdom" rev="1.1" conf="*-&gt;master"/&gt;&#010;   &lt;/dependencies&gt;&#010;   &#010; &lt;/ivy-module&gt;&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484627 - in /nutch/branches/2.x: CHANGES.txt src/plugin/feed/ivy.xml</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130521004016.6D21B23888E3@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130521004016-6D21B23888E3@eris-apache-org%3e</id>
<updated>2013-05-21T00:40:16Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May 21 00:40:16 2013&#010;New Revision: 1484627&#010;&#010;URL: http://svn.apache.org/r1484627&#010;Log:&#010;NUTCH-1053 Parsing of RSS feeds fails&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/plugin/feed/ivy.xml&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1484627&amp;r1=1484626&amp;r2=1484627&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Tue May 21 00:40:16 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1053 Parsing of RSS feeds fails (tejasp)&#010;+&#010; * NUTCH-1563 FetchSchedule#getFields is never used by GeneratorJob (Feng)&#010; &#010; * NUTCH-1573 Upgrade to most recent JUnit 4.x to improve test flexibility (lewismc)&#010;&#010;Modified: nutch/branches/2.x/src/plugin/feed/ivy.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/feed/ivy.xml?rev=1484627&amp;r1=1484626&amp;r2=1484627&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/feed/ivy.xml (original)&#010;+++ nutch/branches/2.x/src/plugin/feed/ivy.xml Tue May 21 00:40:16 2013&#010;@@ -37,6 +37,7 @@&#010; &#010;   &lt;dependencies&gt;&#010;     &lt;dependency org="net.java.dev.rome" name="rome" rev="1.0.0" conf="*-&gt;master"/&gt;&#010;+    &lt;dependency org="org.jdom" name="jdom" rev="1.1" conf="*-&gt;master"/&gt;&#010;   &lt;/dependencies&gt;&#010;   &#010; &lt;/ivy-module&gt;&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484482 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/crawl/GeneratorJob.java</title>
<author><name>fenglu@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130520134412.42DCF23889DE@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130520134412-42DCF23889DE@eris-apache-org%3e</id>
<updated>2013-05-20T13:44:12Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: fenglu&#010;Date: Mon May 20 13:44:11 2013&#010;New Revision: 1484482&#010;&#010;URL: http://svn.apache.org/r1484482&#010;Log:&#010;NUTCH-1563 FetchSchedule#getFields is never used by GeneraterJob&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1484482&amp;r1=1484481&amp;r2=1484482&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon May 20 13:44:11 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1563 FetchSchedule#getFields is never used by GeneratorJob (Feng)&#010;+&#010; * NUTCH-1573 Upgrade to most recent JUnit 4.x to improve test flexibility (lewismc)&#010; &#010; * Added crawler-commons dependency in pom.xml (tejasp)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1484482&amp;r1=1484481&amp;r2=1484482&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Mon May 20 13:44:11&#010;2013&#010;@@ -24,7 +24,9 @@ import java.util.HashSet;&#010; import java.util.Map;&#010; import java.util.Random;&#010; import java.util.Set;&#010;+import java.util.Collection;&#010; &#010;+import org.apache.hadoop.mapreduce.Job;&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -152,6 +154,12 @@ public class GeneratorJob extends 
NutchT&#010;     setConf(conf);&#010;   }&#010; &#010;+  public Collection&lt;WebPage.Field&gt; getFields(Job job) {&#010;+    Collection&lt;WebPage.Field&gt; fields = new HashSet&lt;WebPage.Field&gt;(FIELDS);&#010;+    fields.addAll(FetchScheduleFactory.getFetchSchedule(job.getConfiguration()).getFields());&#010;+    return fields;&#010;+  }&#010;+&#010;   public Map&lt;String,Object&gt; run(Map&lt;String,Object&gt; args) throws Exception {&#010;     // map to inverted subset due for fetch, sort by score&#010;     Long topN = (Long)args.get(Nutch.ARG_TOPN);&#010;@@ -187,7 +195,8 @@ public class GeneratorJob extends NutchT&#010;     numJobs = 1;&#010;     currentJobNum = 0;&#010;     currentJob = new NutchJob(getConf(), "generate: " + batchId);&#010;-    StorageUtils.initMapperJob(currentJob, FIELDS, SelectorEntry.class,&#010;+    Collection&lt;WebPage.Field&gt; fields = getFields(currentJob);&#010;+    StorageUtils.initMapperJob(currentJob, fields, SelectorEntry.class,&#010;         WebPage.class, GeneratorMapper.class, SelectorEntryPartitioner.class, true);&#010;     StorageUtils.initReducerJob(currentJob, GeneratorReducer.class);&#010;     currentJob.waitForCompletion(true);&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484348 [1/2] - in /nutch/branches/2.x: ./ ivy/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/feed/src/test/org/apache/nutch/parse/feed/ src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/ src/plug...</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130519211158.893F12388906@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130519211158-893F12388906@eris-apache-org%3e</id>
<updated>2013-05-19T21:11:55Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Sun May 19 21:11:54 2013&#010;New Revision: 1484348&#010;&#010;URL: http://svn.apache.org/r1484348&#010;Log:&#010;NUTCH-1573 Upgrade to most recent JUnit 4.x to improve test flexibility&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/ivy/ivy.xml&#010;    nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java&#010;    nutch/branches/2.x/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java&#010;    nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java&#010;    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java&#010;    nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java&#010;    nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java&#010;    nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;    nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java&#010;    nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java&#010;    nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java&#010;    nutch/branches/2.x/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java&#010;    nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java&#010;    nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java&#010;    nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java&#010;    
nutch/branches/2.x/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java&#010;    nutch/branches/2.x/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java&#010;    nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java&#010;    nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;    nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java&#010;    nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java&#010;    nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java&#010;    nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java&#010;    nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java&#010;    nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java&#010;    nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java&#010;    
nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java&#010;    
nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Sun May 19 21:11:54 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1573 Upgrade to most recent JUnit 4.x to improve test flexibility (lewismc)&#010;+&#010; * Added crawler-commons dependency in pom.xml (tejasp)&#010; &#010; * NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010;&#010;Modified: nutch/branches/2.x/ivy/ivy.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/ivy/ivy.xml (original)&#010;+++ nutch/branches/2.x/ivy/ivy.xml Sun May 19 21:11:54 2013&#010;@@ -76,7 +76,7 @@&#010;     &lt;!--Configuration: test --&gt;&#010; &#010;     &lt;!--artifacts needed for testing --&gt;&#010;-    &lt;dependency org="junit" name="junit" rev="3.8.1" conf="test-&gt;default" /&gt;&#010;+    &lt;dependency org="junit" name="junit" rev="4.11" conf="test-&gt;default" /&gt;&#010; &#010;     &lt;dependency org="org.apache.hadoop" name="hadoop-test" rev="1.1.1" conf="test-&gt;default"&gt;&#010;       &lt;exclude org="net.sf.kosmosfs" name="kfs" /&gt;&#010;&#010;Modified: 
nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/creativecommons/src/test/org/creativecommons/nutch/TestCCParseFilter.java Sun May 19 21:11:54 2013&#010;@@ -17,29 +17,24 @@&#010; &#010; package org.creativecommons.nutch;&#010; &#010;-import org.apache.nutch.metadata.Metadata;&#010;-import org.apache.nutch.parse.Parse;&#010; import org.apache.nutch.parse.ParseUtil;&#010;-import org.apache.nutch.protocol.Content;&#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.tika.mime.MimeType;&#010;-&#010;-import java.util.Properties;&#010; import java.io.*;&#010;-import java.net.URL;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestCCParseFilter extends TestCase {&#010;+public class TestCCParseFilter {&#010; &#010; &#009;private static final File testDir = new File(&#010; &#009;&#009;&#009;System.getProperty("test.input"));&#010; &#010;+  @Test&#010; &#009;public void testPages() throws Exception {&#010; &#009;&#009;pageTest(new File(testDir, "anchor.html"), "http://foo.com/",&#010; &#009;&#009;&#009;&#009;"http://creativecommons.org/licenses/by-nc-sa/1.0", "a", null);&#010;@@ -56,7 +51,6 @@ public class TestCCParseFilter extends T&#010; &#009;public 
void pageTest(File file, String url, String license,&#010; &#009;&#009;&#009;String location, String type) throws Exception {&#010; &#010;-&#009;&#009;String contentType = "text/html";&#010; &#009;&#009;InputStream in = new FileInputStream(file);&#010; &#009;&#009;ByteArrayOutputStream out = new ByteArrayOutputStream(&#010; &#009;&#009;&#009;&#009;(int) file.length());&#010;&#010;Modified: nutch/branches/2.x/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/feed/src/test/org/apache/nutch/parse/feed/TestFeedParser.java Sun May 19 21:11:54 2013&#010;@@ -35,8 +35,8 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.protocol.ProtocolNotFound;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-// Junit imports&#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * &#010;@@ -45,7 +45,7 @@ import junit.framework.TestCase;&#010;  * Test Suite for the {@link FeedParser}.&#010;  * &#010;  */&#010;-public class TestFeedParser extends TestCase {&#010;+public class TestFeedParser {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010; &#010;@@ -67,7 +67,6 @@ public class TestFeedParser extends Test&#010;    *          The name of this {@link TestCase}.&#010;    */&#010;   public TestFeedParser(String name) {&#010;-    super(name);&#010;   }&#010; &#010;   /**&#010;@@ -85,6 +84,7 @@ public class TestFeedParser extends Test&#010;    * @throws ParseException&#010;    *           If the {@link 
Parser}Layer cannot be loaded.&#010;    */&#010;+  @Test&#010;   public void testParseFetchChannel() throws ProtocolNotFound, ParseException {&#010;     String urlString;&#010;     Protocol protocol;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java Sun May 19 21:11:54 2013&#010;@@ -16,14 +16,13 @@&#010;  */&#010; package org.apache.nutch.indexer.anchor;&#010; &#010;-import junit.framework.TestCase;&#010;-&#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.indexer.NutchDocument;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; &#010;@@ -35,7 +34,7 @@ import org.slf4j.LoggerFactory;&#010;  * @author lewismc&#010;  *&#010;  */&#010;-public class TestAnchorIndexingFilter extends TestCase {&#010;+public class TestAnchorIndexingFilter {&#010;   &#010;   public static final Logger LOG = LoggerFactory.getLogger(TestAnchorIndexingFilter.class);&#010; &#009;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java Sun May 19 21:11:54 2013&#010;@@ -20,13 +20,12 @@ import java.nio.ByteBuffer;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;-import org.apache.nutch.fetcher.FetcherJob;&#010; import org.apache.nutch.indexer.NutchDocument;&#010; import org.apache.nutch.metadata.Nutch;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.junit.Test;&#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit test case which tests&#010;@@ -39,7 +38,7 @@ import junit.framework.TestCase;&#010;  * @author lewismc&#010;  */&#010; &#010;-public class TestBasicIndexingFilter extends TestCase {&#010;+public class TestBasicIndexingFilter {&#010;   &#010;   @Test&#010;   public void testBasicFields() throws Exception {&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java (original)&#010;+++ 
nutch/branches/2.x/src/plugin/index-more/src/test/org/apache/nutch/indexer/more/TestMoreIndexingFilter.java Sun May 19 21:11:54 2013&#010;@@ -18,7 +18,8 @@ package org.apache.nutch.indexer.more;&#010; &#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -28,8 +29,9 @@ import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.EncodingDetector;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-public class TestMoreIndexingFilter extends TestCase {&#010;+public class TestMoreIndexingFilter {&#010; &#010;+  @Test&#010;   public void testContentType() throws IndexingException {&#010;     Configuration conf = NutchConfiguration.create();&#010;     assertContentType(conf, "text/html", "text/html");&#010;@@ -45,6 +47,7 @@ public class TestMoreIndexingFilter exte&#010;   /**&#010;    * @since NUTCH-901&#010;    */&#010;+  @Test&#010;   public void testNoParts(){&#010;      Configuration conf = NutchConfiguration.create();&#010;      conf.setBoolean("moreIndexingFilter.indexMimeTypeParts", false);&#010;&#010;Modified: nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/language-identifier/src/test/org/apache/nutch/analysis/lang/TestHTMLLanguageParser.java Sun May 19 21:11:54 2013&#010;@@ -21,7 +21,8 @@ import 
java.io.BufferedReader;&#010; import java.io.InputStreamReader;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.nutch.metadata.Metadata;&#010;@@ -32,7 +33,7 @@ import org.apache.nutch.util.EncodingDet&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.apache.tika.language.LanguageIdentifier;&#010; &#010;-public class TestHTMLLanguageParser extends TestCase {&#010;+public class TestHTMLLanguageParser {&#010; &#010;   private static Utf8 URL = new Utf8("http://foo.bar/");&#010; &#010;@@ -49,6 +50,7 @@ public class TestHTMLLanguageParser exte&#010;   /**&#010;    * Test parsing of language identifiers from html&#010;    **/&#010;+  @Test&#010;   public void testMetaHTMLParsing() {&#010; &#010;     try {&#010;@@ -71,6 +73,7 @@ public class TestHTMLLanguageParser exte&#010;   }&#010; &#010;   /** Test of &lt;code&gt;LanguageParser.parseLanguage(String)&lt;/code&gt; method. 
*/&#010;+  @Test&#010;   public void testParseLanguage() {&#010;     String tests[][] = { { "(SCHEME=ISO.639-1) sv", "sv" },&#010;         { "(SCHEME=RFC1766) sv-FI", "sv" }, { "(SCHEME=Z39.53) SWE", "sv" },&#010;@@ -98,6 +101,7 @@ public class TestHTMLLanguageParser exte&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testLanguageIndentifier() {&#010;     try {&#010;       long total = 0;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Sun May 19 21:11:54 2013&#010;@@ -17,8 +17,12 @@&#010; &#010; package org.apache.nutch.protocol.http.api;&#010; &#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+&#010; import crawlercommons.robots.BaseRobotRules;&#010;-import junit.framework.TestCase;&#010;+&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit test case which tests&#010;@@ -26,7 +30,7 @@ import junit.framework.TestCase;&#010;  * 2. 
that crawl delay is extracted correctly from the robots file&#010;  *&#010;  */&#010;-public class TestRobotRulesParser extends TestCase {&#010;+public class TestRobotRulesParser {&#010; &#010;   private static final String CONTENT_TYPE = "text/plain";&#010;   private static final String SINGLE_AGENT = "Agent1";&#010;@@ -72,14 +76,15 @@ public class TestRobotRulesParser extend&#010;   private HttpRobotRulesParser parser;&#010;   private BaseRobotRules rules;&#010; &#010;-  public TestRobotRulesParser(String name) {&#010;-    super(name);&#010;+  @Before&#010;+  public void setUp() {&#010;     parser = new HttpRobotRulesParser();&#010;   }&#010; &#010;   /**&#010;   * Test that the robots rules are interpreted correctly by the robots rules parser. &#010;   */&#010;+  @Test&#010;   public void testRobotsAgent() {&#010;     rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);&#010; &#010;@@ -104,6 +109,7 @@ public class TestRobotRulesParser extend&#010;   * Test that the crawl delay is extracted from the robots file for respective agent. &#010;   * If its not specified for a given agent, default value must be returned.&#010;   */&#010;+  @Test&#010;   public void testCrawlDelay() {&#010;     // for SINGLE_AGENT, the crawl delay of 10 sec ie. 
10000 msec must be returned by the parser&#010;     rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);&#010;&#010;Modified: nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java (original)&#010;+++ nutch/branches/2.x/src/plugin/lib-regex-filter/src/test/org/apache/nutch/urlfilter/api/RegexURLFilterBaseTest.java Sun May 19 21:11:54 2013&#010;@@ -24,23 +24,30 @@ import java.io.Reader;&#010; import java.util.ArrayList;&#010; import java.util.List;&#010; &#010;-// JUnit imports&#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010;-// Commons Logging imports&#010;+// Logging imports&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; &#010; // Nutch imports&#010; import org.apache.nutch.net.URLFilter;&#010;+//import org.apache.nutch.urlfilter.automaton.TestAutomatonURLFilter;&#010;+//import org.apache.nutch.urlfilter.regex.TestRegexURLFilter;&#010; &#010;+//import org.junit.runners.Suite;&#010;+//import org.junit.runner.RunWith;&#010; &#010; /**&#010;  * JUnit based test of class &lt;code&gt;RegexURLFilterBase&lt;/code&gt;.&#010;  *&#010;  * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;  */&#010;-public abstract class RegexURLFilterBaseTest extends TestCase {&#010;+&#010;+&#010;+//@RunWith(Suite.class)&#010;+//@Suite.SuiteClasses({TestAutomatonURLFilter.class, TestRegexURLFilter.class})&#010;+public abstract class RegexURLFilterBaseTest {&#010;   &#010;   /** My logger */&#010;   protected 
static final Logger LOG = LoggerFactory.getLogger(RegexURLFilterBaseTest.class);  &#010;@@ -48,10 +55,6 @@ public abstract class RegexURLFilterBase&#010;   private final static String SEPARATOR = System.getProperty("file.separator");  &#010;   private final static String SAMPLES = System.getProperty("test.data", ".");&#010;   &#010;-  public RegexURLFilterBaseTest(String testName) {&#010;-    super(testName);&#010;-  }&#010;-  &#010;   protected abstract URLFilter getURLFilter(Reader rules);&#010; &#010;   protected void bench(int loops, String file) {&#010;&#010;Modified: nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagIndexingFilter.java Sun May 19 21:11:54 2013&#010;@@ -24,8 +24,7 @@ import org.apache.nutch.indexer.NutchDoc&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.junit.Test;&#010;-&#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  *JUnit test case for {@link RelTagIndexingFilter} which &#010;@@ -34,7 +33,7 @@ import junit.framework.TestCase;&#010;  *@author lewismc&#010;  */&#010; &#010;-  public class TestRelTagIndexingFilter extends TestCase {&#010;+  public class TestRelTagIndexingFilter {&#010; &#010;   @Test&#010;   public void testRelTagFields() throws Exception {&#010;&#010;Modified: 
nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/microformats-reltag/src/test/org/apache/nutch/microformats/reltag/TestRelTagParser.java Sun May 19 21:11:54 2013&#010;@@ -32,20 +32,20 @@ import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.junit.Test;&#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * Junit test for {@link RelTagParser} based mainly John Xing's parser tests.&#010;  * We are not concerned with actual parse text within the sample file, instead&#010;  * we assert that the rel-tags we expect are found in the WebPage metadata.&#010;  * To check the parser is working as expected we unwrap the ByteBuffer obtained &#010;- * from metadata, the same type as  * we use in expected (String). So just the &#010;+ * from metadata, the same type as we use in expected (String). 
So just the &#010;  * other way around as we wrapped the metadata value.&#010;  * &#010;  * @author lewismc&#010;  *&#010;  */&#010;-public class TestRelTagParser extends TestCase {&#010;+public class TestRelTagParser {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010; &#010;@@ -61,39 +61,34 @@ public class TestRelTagParser extends Te&#010;   &#010;   private Configuration conf;&#010;   &#010;-  public TestRelTagParser(String name) {&#010;-    super(name);&#010;-  }&#010;-  &#010;   @Test&#010;-  public void testRelTagParser() throws ProtocolException, ParseException, IOException {&#010;-&#009;conf = NutchConfiguration.create();&#010;-&#009;conf.set("file.content.limit", "-1");&#010;-&#009;Parse parse;&#010;-&#009;String urlString = "file:" + sampleDir + fileSeparator + sampleFile;&#010;-&#010;-&#009;File file = new File(sampleDir + fileSeparator + sampleFile);&#010;-&#009;byte[] bytes = new byte[(int) file.length()];&#010;-&#009;DataInputStream in = new DataInputStream(new FileInputStream(file));&#010;-&#009;in.readFully(bytes);&#010;-&#009;in.close();&#010;-&#010;-&#009;WebPage page = new WebPage();&#010;-&#009;page.setBaseUrl(new Utf8(urlString));&#010;-&#009;page.setContent(ByteBuffer.wrap(bytes));&#010;-&#009;MimeUtil mimeutil = new MimeUtil(conf);&#010;-&#009;String mtype = mimeutil.getMimeType(file);&#010;-&#009;page.setContentType(new Utf8(mtype));&#010;-&#009;parse = new ParseUtil(conf).parse(urlString, page);&#010;-    &#010;-&#009;//begin assertion for tests&#010;-&#009;ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag"));&#010;-&#009;byte[] byteArray = new byte[bbuf.remaining()];&#010;-&#009;bbuf.get(byteArray);&#010;-&#009;String s = new String(byteArray);&#010;-&#009;//bbuf.flip();&#010;-&#009;assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", &#010;-&#009;  expectedRelTags, s);&#010;+  public void testRelTagParser() throws ParseException, ProtocolException, IOException {&#010;+   
 conf = NutchConfiguration.create();&#010;+    conf.set("file.content.limit", "-1");&#010;+    Parse parse;&#010;+    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;&#010;+&#010;+    File file = new File(sampleDir + fileSeparator + sampleFile);&#010;+    byte[] bytes = new byte[(int) file.length()];&#010;+    DataInputStream in = new DataInputStream(new FileInputStream(file));&#010;+    in.readFully(bytes);&#010;+    in.close();&#010;+&#010;+    WebPage page = new WebPage();&#010;+    page.setBaseUrl(new Utf8(urlString));&#010;+    page.setContent(ByteBuffer.wrap(bytes));&#010;+    MimeUtil mimeutil = new MimeUtil(conf);&#010;+    String mtype = mimeutil.getMimeType(file);&#010;+    page.setContentType(new Utf8(mtype));&#010;+    parse = new ParseUtil(conf).parse(urlString, page);&#010;+    //begin assertion for tests&#010;+    ByteBuffer bbuf = page.getFromMetadata(new Utf8("Rel-Tag"));&#010;+    byte[] byteArray = new byte[bbuf.remaining()];&#010;+    bbuf.get(byteArray);&#010;+    String s = new String(byteArray);&#010;+    //bbuf.flip();&#010;+    assertEquals("We expect 2 tab-separated rel-tag's extracted by the filter", &#010;+      expectedRelTags, s);&#010;   }&#010;   &#010; }&#010;\ No newline at end of file&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-ext/src/test/org/apache/nutch/parse/ext/TestExtParser.java Sun May 19 21:11:54 2013&#010;@@ -30,7 +30,8 @@ import org.apache.nutch.util.NutchConfig&#010; &#010; import 
org.apache.hadoop.io.Text;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import java.io.File;&#010; import java.io.FileOutputStream;&#010;@@ -47,7 +48,7 @@ import java.io.IOException;&#010;  *&#010;  * @author John Xing&#010;  */&#010;-public class TestExtParser extends TestCase {&#010;+public class TestExtParser {&#010;   private File tempFile = null;&#010;   private String urlString = null;&#010;   private Content content = null;&#010;@@ -58,7 +59,6 @@ public class TestExtParser extends TestC&#010;   private String expectedMD5sum = "df46711a1a48caafc98b1c3b83aa1526";&#010; &#010;   public TestExtParser(String name) { &#010;-    super(name); &#010;   }&#010; &#010;   protected void setUp() throws ProtocolException, IOException {&#010;@@ -95,6 +95,7 @@ public class TestExtParser extends TestC&#010;     //  tempFile.delete();&#010;   }&#010; &#010;+  @Test&#010;   public void testIt() throws ParseException {&#010;     String contentType;&#010; &#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestDOMContentUtils.java Sun May 19 21:11:54 2013&#010;@@ -17,8 +17,6 @@&#010; &#010; package org.apache.nutch.parse.html;&#010; &#010;-import junit.framework.TestCase;&#010;-&#010; import org.apache.nutch.parse.Outlink;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;@@ -34,10 
+32,14 @@ import org.xml.sax.*;&#010; import org.w3c.dom.*;&#010; import org.apache.html.dom.*;&#010; &#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010;+&#010; /** &#010;  * Unit tests for DOMContentUtils.&#010;  */&#010;-public class TestDOMContentUtils extends TestCase {&#010;+public class TestDOMContentUtils {&#010; &#010;   private static final String[] testPages= { &#010;     new String("&lt;html&gt;&lt;head&gt;&lt;title&gt; title &lt;/title&gt;&lt;script&gt; script &lt;/script&gt;"&#010;@@ -215,11 +217,8 @@ public class TestDOMContentUtils extends&#010;   private static Configuration conf;&#010;   private static DOMContentUtils utils = null;&#010;   &#010;-  public TestDOMContentUtils(String name) { &#010;-    super(name); &#010;-  }&#010;-&#010;-  private static void setup() {&#010;+  @Before&#010;+  public void setup() {&#010;     conf = NutchConfiguration.create();&#010;     conf.setBoolean("parser.html.form.use_action", true);&#010;     utils = new DOMContentUtils(conf);&#010;@@ -313,6 +312,7 @@ public class TestDOMContentUtils extends&#010;     return true;&#010;   }&#010; &#010;+  @Test&#010;   public void testGetText() {&#010;     if (testDOMs[0] == null) &#010;       setup();&#010;@@ -328,6 +328,7 @@ public class TestDOMContentUtils extends&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testGetTitle() {&#010;     if (testDOMs[0] == null) &#010;       setup();&#010;@@ -343,6 +344,7 @@ public class TestDOMContentUtils extends&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testGetOutlinks() {&#010;     if (testDOMs[0] == null) &#010;       setup();&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestRobotsMetaProcessor.java Sun May 19 21:11:54 2013&#010;@@ -17,8 +17,6 @@&#010; &#010; package org.apache.nutch.parse.html;&#010; &#010;-import junit.framework.TestCase;&#010;-&#010; import org.apache.nutch.parse.HTMLMetaTags;&#010; &#010; import java.io.ByteArrayInputStream;&#010;@@ -29,11 +27,11 @@ import org.xml.sax.*;&#010; import org.w3c.dom.*;&#010; import org.apache.html.dom.*;&#010; &#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010;+&#010; /** Unit tests for HTMLMetaProcessor. 
*/&#010;-public class TestRobotsMetaProcessor extends TestCase {&#010;-  public TestRobotsMetaProcessor(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestRobotsMetaProcessor {&#010; &#010;   /*&#010; &#010;@@ -126,6 +124,7 @@ public class TestRobotsMetaProcessor ext&#010; &#010;   private URL[][] currURLsAndAnswers;&#010; &#010;+  @Test&#010;   public void testRobotsMetaProcessor() {&#010;     DOMFragmentParser parser= new DOMFragmentParser();;&#010; &#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java Sun May 19 21:11:54 2013&#010;@@ -19,7 +19,6 @@ package org.apache.nutch.parse.js;&#010; import java.io.DataInputStream;&#010; import java.io.File;&#010; import java.io.FileInputStream;&#010;-import java.io.FileNotFoundException;&#010; import java.io.IOException;&#010; import java.nio.ByteBuffer;&#010; &#010;@@ -33,9 +32,9 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;+import org.junit.Before;&#010; import org.junit.Test;&#010;-&#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit test case for {@link JSParseFilter} which tests &#010;@@ -45,7 +44,7 @@ import junit.framework.TestCase;&#010;  * @author lewismc&#010;  */&#010; &#010;-public class TestJSParseFilter extends TestCase 
{&#010;+public class TestJSParseFilter {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010; &#010;@@ -58,50 +57,44 @@ public class TestJSParseFilter extends T&#010; &#009;  &#010;   private Configuration conf;&#010; &#009;  &#010;-  public TestJSParseFilter(String name) {&#010;-&#009;super(name);&#010;-  }&#010;-&#009;  &#010;-  protected void setUp() {&#010;-&#009;conf = NutchConfiguration.create();&#010;-&#009;conf.set("file.content.limit", "-1");&#010;+  @Before&#010;+  public void setUp() {&#010;+    conf = NutchConfiguration.create();&#010;+    conf.set("file.content.limit", "-1");&#010;   }&#010; &#010;-  protected void tearDown() {&#010;-  }&#009;&#010;-  &#010;   public Outlink[] getOutlinks(String[] sampleFiles) throws ProtocolException, ParseException, IOException {&#010;-&#009;String urlString;&#010;-&#009;Parse parse;&#010;+    String urlString;&#010;+    Parse parse;&#010; &#009;&#010;-&#009;urlString = "file:" + sampleDir + fileSeparator + sampleFiles;&#010;-&#009;File file = new File(urlString);&#010;-&#009;byte[] bytes = new byte[(int) file.length()];&#010;-&#009;DataInputStream dip = new DataInputStream(new FileInputStream(file));&#010;-&#009;dip.readFully(bytes);&#010;-&#009;dip.close();&#010;+    urlString = "file:" + sampleDir + fileSeparator + sampleFiles;&#010;+    File file = new File(urlString);&#010;+    byte[] bytes = new byte[(int) file.length()];&#010;+    DataInputStream dip = new DataInputStream(new FileInputStream(file));&#010;+    dip.readFully(bytes);&#010;+    dip.close();&#010;     &#010;-&#009;WebPage page = new WebPage();&#010;-&#009;page.setBaseUrl(new Utf8(urlString));&#010;-&#009;page.setContent(ByteBuffer.wrap(bytes));&#010;-&#009;MimeUtil mutil = new MimeUtil(conf);&#010;-&#009;String mime = mutil.getMimeType(file);&#010;-&#009;page.setContentType(new Utf8(mime));&#010;+    WebPage page = new WebPage();&#010;+    page.setBaseUrl(new Utf8(urlString));&#010;+    
page.setContent(ByteBuffer.wrap(bytes));&#010;+    MimeUtil mutil = new MimeUtil(conf);&#010;+    String mime = mutil.getMimeType(file);&#010;+    page.setContentType(new Utf8(mime));&#010; &#009;&#010;-&#009;parse = new ParseUtil(conf).parse(urlString, page);&#010;-&#009;return parse.getOutlinks();&#010;+    parse = new ParseUtil(conf).parse(urlString, page);&#010;+    return parse.getOutlinks();&#010;   }&#010;   &#010;   @Test&#010;   public void testOutlinkExtraction() throws ProtocolException, ParseException, IOException {&#010;-&#009;String[] filenames = new File(sampleDir).list();&#010;-&#009;for (int i = 0; i &lt; filenames.length; i++) {&#010;-&#009;  if (filenames[i].endsWith(".js") == true) {&#010;-&#009;&#009;assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles));&#010;-&#009;&#009;// temporarily disabled as a suitable pure JS file could not be be found.&#010;-&#009;    //} else {&#010;-&#009;&#009;//assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles));&#010;-&#009;  }&#010;+    String[] filenames = new File(sampleDir).list();&#010;+    for (int i = 0; i &lt; filenames.length; i++) {&#010;+      if (filenames[i].endsWith(".js") == true) {&#010;+        assertEquals("number of outlinks in .js test file should be 5", 5, getOutlinks(sampleFiles));&#010;+        // temporarily disabled as a suitable pure JS file could not be be found.&#010;+        //} else {&#010;+        //assertEquals("number of outlinks in .html file should be X", 5, getOutlinks(sampleFiles));&#010;+      }&#010;     }&#010;   }&#010; &#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-swf/src/test/org/apache/nutch/parse/swf/TestSWFParser.java Sun May 19 21:11:54 2013&#010;@@ -32,14 +32,15 @@ import org.apache.nutch.parse.ParseExcep&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** &#010;  * Unit tests for SWFParser.&#010;  *&#010;  * @author Andrzej Bialecki&#010;  */&#010;-public class TestSWFParser extends TestCase {&#010;+public class TestSWFParser {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010;   // This system property is defined in ./src/plugin/build-plugin.xml&#010;@@ -47,10 +48,8 @@ public class TestSWFParser extends TestC&#010;   &#010;   private String[] sampleFiles = new String[]{"test1.swf", "test2.swf", "test3.swf"};&#010;   private String[] sampleTexts = new String[]{"test1.txt", "test2.txt", "test3.txt"};&#010;-  private String[] texts = new String[sampleTexts.length];&#010;-&#010;+  &#010;   public TestSWFParser(String name) { &#010;-    super(name);&#010;     for (int i = 0; i &lt; sampleFiles.length; i++) {&#010;     try {&#010;       // read the test string&#010;@@ -74,6 +73,7 @@ public class TestSWFParser extends TestC&#010; &#010;   protected void tearDown() {}&#010; &#010;+  @Test&#010;   public void testIt() throws ProtocolException, ParseException {&#010;     String urlString;&#010;     Protocol protocol;&#010;&#010;Modified: 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java Sun May 19 21:11:54 2013&#010;@@ -17,7 +17,8 @@&#010; &#010; package org.apache.nutch.parse.tika;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.nutch.parse.Outlink;&#010; import org.apache.nutch.parse.tika.DOMBuilder;&#010;@@ -43,7 +44,7 @@ import java.util.StringTokenizer;&#010; /**&#010;  * Unit tests for DOMContentUtils.&#010;  */&#010;-public class DOMContentUtilsTest extends TestCase {&#010;+public class DOMContentUtilsTest {&#010; &#010; &#009;private static final String[] testPages = {&#010; &#009;&#009;&#009;// 0.&#010;@@ -200,7 +201,6 @@ public class DOMContentUtilsTest extends&#010; &#009;public static final Logger Logger = LoggerFactory.getLogger(DOMContentUtilsTest.class);&#010; &#010; &#009;public DOMContentUtilsTest(String name) {&#010;-&#009;&#009;super(name);&#010; &#009;}&#010; &#010; &#009;private static void setup() throws Exception {&#010;@@ -317,6 +317,7 @@ public class DOMContentUtilsTest extends&#010; &#009;&#009;return true;&#010; &#009;}&#010; &#010;+&#009;@Test&#010; &#009;public void testGetText() throws Exception {&#010; &#009;&#009;if (testDOMs[0] == null)&#010; &#009;&#009;&#009;setup();&#010;@@ -335,6 +336,7 @@ public class DOMContentUtilsTest extends&#010; &#010; &#009;// won't work with Tika - the title is stored in the 
metadata but&#010; &#009;// not put in the XHTML representation&#010;+&#009;@Test&#010; &#009;public void testGetTitle() throws Exception {&#010; &#009;&#009;if (testDOMs[0] == null)&#010; &#009;&#009;&#009;setup();&#010;@@ -351,6 +353,7 @@ public class DOMContentUtilsTest extends&#010; &#009;&#009;}&#010; &#009;}&#010; &#010;+&#009;@Test&#010; &#009;public void testGetOutlinks() throws Exception {&#010; &#009;&#009;if (testDOMs[0] == null)&#010; &#009;&#009;&#009;setup();&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestMSWordParser.java Sun May 19 21:11:54 2013&#010;@@ -23,7 +23,9 @@ import java.io.FileInputStream;&#010; import java.io.IOException;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -34,14 +36,13 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.tika.mime.MimeType;&#010; &#010; /**&#010;  * Unit tests for MSWordParser.&#010;  * &#010;  * @author John Xing&#010;  */&#010;-public class TestMSWordParser extends TestCase {&#010;+public class TestMSWordParser {&#010; &#010;     private String fileSeparator = 
System.getProperty("file.separator");&#010;     // This system property is defined in ./src/plugin/build-plugin.xml&#010;@@ -55,18 +56,12 @@ public class TestMSWordParser extends Te&#010; &#010;     private Configuration conf;&#010; &#010;-    public TestMSWordParser(String name) {&#010;-&#009;super(name);&#010;-    }&#010;-&#010;-    protected void setUp() {&#010;+    @Before&#010;+    public void setUp() {&#010; &#009;conf = NutchConfiguration.create();&#010; &#009;conf.set("file.content.limit", "-1");&#010;     }&#010; &#010;-    protected void tearDown() {&#010;-    }&#010;-&#010;     public String getTextContent(String fileName) throws ProtocolException,&#010; &#009;    ParseException, IOException {&#010; &#009;String urlString = sampleDir + fileSeparator + fileName;&#010;@@ -89,6 +84,7 @@ public class TestMSWordParser extends Te&#010; &#009;return parse.getText();&#010;     }&#010; &#010;+    @Test&#010;     public void testIt() throws ProtocolException, ParseException, IOException {&#010; &#009;for (int i = 0; i &lt; sampleFiles.length; i++) {&#010; &#009;    String found = getTextContent(sampleFiles[i]);&#010;@@ -97,6 +93,7 @@ public class TestMSWordParser extends Te&#010; &#009;}&#010;     }&#010; &#010;+    @Test&#010;     public void testOpeningDocs() throws ProtocolException, ParseException, IOException {&#010; &#009;String[] filenames = new File(sampleDir).list();&#010; &#009;for (int i = 0; i &lt; filenames.length; i++) {&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java (original)&#010;+++ 
nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestOOParser.java Sun May 19 21:11:54 2013&#010;@@ -24,7 +24,8 @@ import java.io.IOException;&#010; import java.io.InputStreamReader;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -35,92 +36,83 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.tika.mime.MimeType;&#010; &#010; /**&#010;  * Unit tests for OOParser.&#010;  * &#010;  * @author Andrzej Bialecki&#010;  */&#010;-public class TestOOParser extends TestCase {&#010;+public class TestOOParser {&#010; &#010;-    private String fileSeparator = System.getProperty("file.separator");&#010;-    // This system property is defined in ./src/plugin/build-plugin.xml&#010;-    private String sampleDir = System.getProperty("test.data", ".");&#010;-    // Make sure sample files are copied to "test.data" as specified in&#010;-    // ./src/plugin/parse-oo/build.xml during plugin compilation.&#010;-    private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };&#010;-&#010;-    private String sampleText = "ootest.txt";&#010;-&#010;-    private String expectedText;&#010;-&#010;-    public TestOOParser(String name) {&#010;-&#009;super(name);&#010;-&#009;try {&#010;-&#009;    // read the test string&#010;-&#009;    FileInputStream fis = new FileInputStream(sampleDir + fileSeparator&#010;-&#009;&#009;    + sampleText);&#010;-&#009;    StringBuffer sb = new StringBuffer();&#010;-&#009;    int len = 0;&#010;-&#009;    InputStreamReader isr = new InputStreamReader(fis, "UTF-8");&#010;-&#009;    char[] buf = new char[1024];&#010;-&#009;    while ((len = isr.read(buf)) &gt; 0) 
{&#010;-&#009;&#009;sb.append(buf, 0, len);&#010;-&#009;    }&#010;-&#009;    isr.close();&#010;-&#009;    expectedText = sb.toString();&#010;-&#009;    // normalize space&#010;-&#009;    expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");&#010;-&#009;} catch (Exception e) {&#010;-&#009;    e.printStackTrace();&#010;-&#009;}&#010;+  private String fileSeparator = System.getProperty("file.separator");&#010;+  // This system property is defined in ./src/plugin/build-plugin.xml&#010;+  private String sampleDir = System.getProperty("test.data", ".");&#010;+  // Make sure sample files are copied to "test.data" as specified in&#010;+  // ./src/plugin/parse-oo/build.xml during plugin compilation.&#010;+  private String[] sampleFiles = { "ootest.odt", "ootest.sxw" };&#010;+&#010;+  private String sampleText = "ootest.txt";&#010;+&#010;+  private String expectedText;&#010;+&#010;+  @Test&#010;+  public void testIt() throws ProtocolException, ParseException, IOException {&#010;+    String urlString;&#010;+    Parse parse;&#010;+    Configuration conf = NutchConfiguration.create();&#010;+    MimeUtil mimeutil = new MimeUtil(conf);&#010;+&#009;&#010;+    try {&#010;+      // read the test string&#010;+      FileInputStream fis = new FileInputStream(sampleDir + fileSeparator&#010;+        + sampleText);&#010;+      StringBuffer sb = new StringBuffer();&#010;+      int len = 0;&#010;+      InputStreamReader isr = new InputStreamReader(fis, "UTF-8");&#010;+      char[] buf = new char[1024];&#010;+      while ((len = isr.read(buf)) &gt; 0) {&#010;+        sb.append(buf, 0, len);&#010;+      }&#010;+      isr.close();&#010;+      expectedText = sb.toString();&#010;+      // normalize space&#010;+      expectedText = expectedText.replaceAll("[ \t\r\n]+", " ");&#010;+    } catch (Exception e) {&#010;+      e.printStackTrace();&#010;     }&#010; &#010;-    protected void setUp() {&#010;-    }&#010;+    System.out.println("Expected : " + expectedText);&#010; &#010;-    protected 
void tearDown() {&#010;-    }&#010;+    for (int i = 0; i &lt; sampleFiles.length; i++) {&#010;+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];&#010;+&#010;+      if (sampleFiles[i].startsWith("ootest") == false)&#010;+      continue;&#010;+&#010;+      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);&#010;+      byte[] bytes = new byte[(int) file.length()];&#010;+      DataInputStream in = new DataInputStream(new FileInputStream(file));&#010;+      in.readFully(bytes);&#010;+      in.close();&#010;+&#010;+      WebPage page = new WebPage();&#010;+      page.setBaseUrl(new Utf8(urlString));&#010;+      page.setContent(ByteBuffer.wrap(bytes));&#010;+      String mtype = mimeutil.getMimeType(file);&#010;+      page.setContentType(new Utf8(mtype));&#010;+&#010;+      parse = new ParseUtil(conf).parse(urlString, page);&#010;+&#010;+      String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();&#010; &#010;-    public void testIt() throws ProtocolException, ParseException, IOException {&#010;-&#009;String urlString;&#010;-&#009;Parse parse;&#010;-&#009;Configuration conf = NutchConfiguration.create();&#010;-&#009;MimeUtil mimeutil = new MimeUtil(conf);&#010;-&#010;-&#009;System.out.println("Expected : " + expectedText);&#010;-&#010;-&#009;for (int i = 0; i &lt; sampleFiles.length; i++) {&#010;-&#009;    urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];&#010;-&#010;-&#009;    if (sampleFiles[i].startsWith("ootest") == false)&#010;-&#009;&#009;continue;&#010;-&#010;-&#009;    File file = new File(sampleDir + fileSeparator + sampleFiles[i]);&#010;-&#009;    byte[] bytes = new byte[(int) file.length()];&#010;-&#009;    DataInputStream in = new DataInputStream(new FileInputStream(file));&#010;-&#009;    in.readFully(bytes);&#010;-&#009;    in.close();&#010;-&#010;-&#009;    WebPage page = new WebPage();&#010;-&#009;    page.setBaseUrl(new Utf8(urlString));&#010;-&#009;    
page.setContent(ByteBuffer.wrap(bytes));&#010;-&#009;    String mtype = mimeutil.getMimeType(file);&#010;-&#009;    page.setContentType(new Utf8(mtype));&#010;-&#010;-&#009;    parse = new ParseUtil(conf).parse(urlString, page);&#010;-&#010;-&#009;    String text = parse.getText().replaceAll("[ \t\r\n]+", " ").trim();&#010;-&#010;-&#009;    // simply test for the presence of a text - the ordering of the&#010;-&#009;    // elements&#010;-&#009;    // may differ from what was expected&#010;-&#009;    // in the previous tests&#010;-&#009;    assertTrue(text != null &amp;&amp; text.length() &gt; 0);&#010;+      // simply test for the presence of a text - the ordering of the&#010;+      // elements&#010;+      // may differ from what was expected&#010;+      // in the previous tests&#010;+      assertTrue(text != null &amp;&amp; text.length() &gt; 0);&#010; &#010;-&#009;    System.out.println("Found " + sampleFiles[i] + ": " + text);&#010;-&#009;}&#010;+      System.out.println("Found " + sampleFiles[i] + ": " + text);&#010;     }&#010;+  }&#010; &#010; }&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestPdfParser.java Sun May 19 21:11:54 2013&#010;@@ -23,7 +23,8 @@ import java.io.FileInputStream;&#010; import java.io.IOException;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; 
import org.apache.hadoop.conf.Configuration;&#010;@@ -34,14 +35,13 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.tika.mime.MimeType;&#010; &#010; /**&#010;  * Unit tests for PdfParser.&#010;  * &#010;  * @author John Xing&#010;  */&#010;-public class TestPdfParser extends TestCase {&#010;+public class TestPdfParser {&#010; &#010;     private String fileSeparator = System.getProperty("file.separator");&#010;     // This system property is defined in ./src/plugin/build-plugin.xml&#010;@@ -53,16 +53,7 @@ public class TestPdfParser extends TestC&#010; &#010;     private String expectedText = "A VERY SMALL PDF FILE";&#010; &#010;-    public TestPdfParser(String name) {&#010;-&#009;super(name);&#010;-    }&#010;-&#010;-    protected void setUp() {&#010;-    }&#010;-&#010;-    protected void tearDown() {&#010;-    }&#010;-&#010;+    @Test&#010;     public void testIt() throws ProtocolException, ParseException, IOException {&#010; &#009;String urlString;&#010; &#009;Parse parse;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java Sun May 19 21:11:54 2013&#010;@@ -23,9 +23,6 @@ import java.io.FileInputStream;&#010; import java.io.IOException;&#010; import java.nio.ByteBuffer;&#010; &#010;-import org.apache.nutch.protocol.ProtocolFactory;&#010;-import 
org.apache.nutch.protocol.Protocol;&#010;-import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.protocol.ProtocolException;&#010; &#010; import org.apache.nutch.parse.Parse;&#010;@@ -38,9 +35,8 @@ import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import org.apache.tika.mime.MimeType;&#010;-&#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * Unit tests for the RSS Parser based on John Xing's TestPdfParser class.&#010;@@ -48,7 +44,7 @@ import junit.framework.TestCase;&#010;  * @author mattmann&#010;  * @version 1.0&#010;  */&#010;-public class TestRSSParser extends TestCase {&#010;+public class TestRSSParser {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010; &#010;@@ -62,18 +58,6 @@ public class TestRSSParser extends TestC&#010; &#010;   /**&#010;    * &lt;p&gt;&#010;-   * Default constructor&#010;-   * &lt;/p&gt;&#010;-   * &#010;-   * @param name&#010;-   *          The name of the RSSParserTest&#010;-   */&#010;-  public TestRSSParser(String name) {&#010;-    super(name);&#010;-  }&#010;-&#010;-  /**&#010;-   * &lt;p&gt;&#010;    * The test method: tests out the following 2 asserts:&#010;    * &lt;/p&gt;&#010;    * &#010;@@ -83,9 +67,9 @@ public class TestRSSParser extends TestC&#010;    * file&lt;/li&gt;&#010;    * &lt;/ul&gt;&#010;    */&#010;+  @Test&#010;   public void testIt()throws ProtocolException, ParseException, IOException {&#010;     String urlString;&#010;-    Protocol protocol;&#010;     Parse parse;&#010; &#010;     Configuration conf = NutchConfiguration.create();&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRTFParser.java Sun May 19 21:11:54 2013&#010;@@ -23,7 +23,8 @@ import java.io.FileInputStream;&#010; import java.io.IOException;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -34,14 +35,13 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.tika.mime.MimeType;&#010; &#010; /**&#010;  * Unit tests for TestRTFParser. 
(Adapted from John Xing msword unit tests).&#010;  * &#010;  * @author Andy Hedges&#010;  */&#010;-public class TestRTFParser extends TestCase {&#010;+public class TestRTFParser {&#010; &#010;     private String fileSeparator = System.getProperty("file.separator");&#010;     // This system property is defined in ./src/plugin/build-plugin.xml&#010;@@ -51,16 +51,7 @@ public class TestRTFParser extends TestC&#010;     // Check ./src/plugin/parse-rtf/sample/README.txt for what they are.&#010;     private String rtfFile = "test.rtf";&#010; &#010;-    public TestRTFParser(String name) {&#010;-&#009;super(name);&#010;-    }&#010;-&#010;-    protected void setUp() {&#010;-    }&#010;-&#010;-    protected void tearDown() {&#010;-    }&#010;-&#010;+    @Test&#010;     public void testIt() throws ProtocolException, ParseException, IOException {&#010; &#010; &#009;String urlString;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java (original)&#010;+++ nutch/branches/2.x/src/plugin/parse-zip/src/test/org/apache/nutch/parse/zip/TestZipParser.java Sun May 19 21:11:54 2013&#010;@@ -29,15 +29,16 @@ import org.apache.hadoop.conf.Configurat&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010; import org.apache.hadoop.io.Text;&#010;+import org.junit.Test;&#010; &#010;-import junit.framework.TestCase;&#010;+import static org.junit.Assert.*;&#010; &#010; /** &#010;  * Based on Unit tests for MSWordParser by John Xing&#010;  *&#010;  * @author Rohit Kulkarni &amp; Ashish Vaidya&#010;  */&#010;-public class TestZipParser extends TestCase 
{&#010;+public class TestZipParser {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010;   // This system property is defined in ./src/plugin/build-plugin.xml&#010;@@ -50,13 +51,13 @@ public class TestZipParser extends TestC&#010;   private String expectedText = "textfile.txt This is text file number 1 ";&#010; &#010;   public TestZipParser(String name) { &#010;-    super(name); &#010;   }&#010; &#010;   protected void setUp() {}&#010; &#010;   protected void tearDown() {}&#010; &#010;+  @Test&#010;   public void testIt() throws ProtocolException, ParseException {&#010;     String urlString;&#010;     Protocol protocol;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java (original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-file/src/test/org/apache/nutch/protocol/file/TestProtocolFile.java Sun May 19 21:11:54 2013&#010;@@ -18,10 +18,8 @@&#010; package org.apache.nutch.protocol.file;&#010; &#010; // Hadoop imports&#010;-import junit.framework.TestCase;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010;-import org.apache.hadoop.io.Text;&#010; import org.apache.nutch.net.protocols.Response;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolException;&#010;@@ -31,6 +29,9 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.protocol.ProtocolStatusCodes;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;+import 
org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * @author mattmann&#010;@@ -40,7 +41,7 @@ import org.apache.nutch.util.NutchConfig&#010;  * Unit tests for the {@link File}Protocol.&#010;  * &lt;/p&gt;.&#010;  */&#010;-public class TestProtocolFile extends TestCase {&#010;+public class TestProtocolFile {&#010; &#010;   private String fileSeparator = System.getProperty("file.separator");&#010;   private String sampleDir = System.getProperty("test.data", ".");&#010;@@ -52,10 +53,12 @@ public class TestProtocolFile extends Te&#010;   &#010;   private Configuration conf;&#010;   &#010;-  protected void setUp() {&#010;+  @Before&#010;+  public void setUp() {&#010;     conf = NutchConfiguration.create();&#010;   }&#010; &#010;+  @Test&#010;   public void testSetContentType() throws ProtocolException {&#010;     for (String testTextFile : testTextFiles) {&#010;       setContentType(testTextFile);&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java (original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/test/org/apache/nutch/protocol/httpclient/TestProtocolHttpClient.java Sun May 19 21:11:54 2013&#010;@@ -20,7 +20,10 @@ package org.apache.nutch.protocol.httpcl&#010; import java.net.MalformedURLException;&#010; import java.net.URL;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.After;&#010;+import org.junit.Before;&#010;+import 
org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.net.protocols.Response;&#010;@@ -35,7 +38,7 @@ import org.mortbay.jetty.servlet.Servlet&#010;  * &#010;  * @author Susam Pal&#010;  */&#010;-public class TestProtocolHttpClient extends TestCase {&#010;+public class TestProtocolHttpClient {&#010; &#010; &#009;private Server server;&#010; &#009;private Configuration conf;&#010;@@ -43,7 +46,8 @@ public class TestProtocolHttpClient exte&#010; &#009;private int port;&#010; &#009;private Http http = new Http();&#010; &#010;-&#009;protected void setUp() throws Exception {&#010;+  @Before&#010;+&#009;public void setUp() throws Exception {&#010; &#010; &#009;&#009;server = new Server();&#010; &#009;&#009;&#010;@@ -69,7 +73,8 @@ public class TestProtocolHttpClient exte&#010; &#009;&#009;http.setConf(conf);&#010; &#009;}&#010; &#010;-&#009;protected void tearDown() throws Exception {&#010;+  @After&#010;+&#009;public void tearDown() throws Exception {&#010; &#009;&#009;server.stop();&#010; &#009;}&#010; &#010;@@ -79,6 +84,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testCookies() throws Exception {&#010; &#009;&#009;startServer(47500);&#010; &#009;&#009;fetchPage("/cookies.jsp", 200);&#010;@@ -92,6 +98,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testNoPreemptiveAuth() throws Exception {&#010; &#009;&#009;startServer(47500);&#010; &#009;&#009;fetchPage("/noauth.jsp", 200);&#010;@@ -104,6 +111,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; 
*/&#010;+&#009;@Test&#010; &#009;public void testDefaultCredentials() throws Exception {&#010; &#009;&#009;startServer(47502);&#010; &#009;&#009;fetchPage("/basic.jsp", 200);&#010;@@ -116,6 +124,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testBasicAuth() throws Exception {&#010; &#009;&#009;startServer(47500);&#010; &#009;&#009;fetchPage("/basic.jsp", 200);&#010;@@ -132,6 +141,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testOtherRealmsNoAuth() throws Exception {&#010; &#009;&#009;startServer(47501);&#010; &#009;&#009;fetchPage("/basic.jsp", 200);&#010;@@ -146,6 +156,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testDigestAuth() throws Exception {&#010; &#009;&#009;startServer(47500);&#010; &#009;&#009;fetchPage("/digest.jsp", 200);&#010;@@ -158,6 +169,7 @@ public class TestProtocolHttpClient exte&#010; &#009; * @throws Exception&#010; &#009; *             If an error occurs or the test case fails.&#010; &#009; */&#010;+&#009;@Test&#010; &#009;public void testNtlmAuth() throws Exception {&#010; &#009;&#009;startServer(47501);&#010; &#009;&#009;fetchPage("/ntlm.jsp", 200);&#010;&#010;Modified: nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java (original)&#010;+++ nutch/branches/2.x/src/plugin/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java Sun May 19 21:11:54 2013&#010;@@ -22,14 +22,16 @@ import java.util.Collection;&#010; &#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestSubcollection extends TestCase {&#010;+public class TestSubcollection {&#010;   &#010;   /**Test filtering logic&#010;    * &#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testFilter() throws Exception {&#010;     Subcollection sc=new Subcollection(NutchConfiguration.create());&#010;     sc.setWhiteList("www.nutch.org\nwww.apache.org");&#010;@@ -46,6 +48,7 @@ public class TestSubcollection extends T&#010;     assertEquals(null, sc.filter("http://www.google.com/"));&#010;   }&#010;   &#010;+  @Test&#010;   public void testInput(){&#010;     StringBuffer xml=new StringBuffer();&#010;     xml.append("&lt;?xml version=\"1.0\" encoding=\"UTF-8\"?&gt;");&#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java Sun May 19 21:11:54 2013&#010;@@ -20,14 +20,12 @@ package 
org.apache.nutch.urlfilter.autom&#010; import java.io.IOException;&#010; import java.io.Reader;&#010; &#010;-// JUnit imports&#010;-import junit.framework.Test;&#010;-import junit.framework.TestSuite;&#010;-import junit.textui.TestRunner;&#010; import org.apache.nutch.net.*;&#010; &#010; // Nutch imports&#010; import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; &#010; /**&#010;@@ -37,18 +35,6 @@ import org.apache.nutch.urlfilter.api.Re&#010;  */&#010; public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {&#010;   &#010;-  public TestAutomatonURLFilter(String testName) {&#010;-    super(testName);&#010;-  }&#010;-  &#010;-  public static Test suite() {&#010;-    return new TestSuite(TestAutomatonURLFilter.class);&#010;-  }&#010;-  &#010;-  public static void main(String[] args) {&#010;-    TestRunner.run(suite());&#010;-  }&#010;-&#010;   protected URLFilter getURLFilter(Reader rules) {&#010;     try {&#010;       return new AutomatonURLFilter(rules);&#010;@@ -58,6 +44,7 @@ public class TestAutomatonURLFilter exte&#010;     }&#010;   }&#010;   &#010;+  @Test&#010;   public void test() {&#010;     test("WholeWebCrawling");&#010;     test("IntranetCrawling");&#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlfilter-domain/src/test/org/apache/nutch/urlfilter/domain/TestDomainURLFilter.java Sun May 19 21:11:54 
2013&#010;@@ -16,25 +16,22 @@&#010;  */&#010; package org.apache.nutch.urlfilter.domain;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-public class TestDomainURLFilter&#010;-  extends TestCase {&#010;+public class TestDomainURLFilter {&#010; &#010;   protected static final Logger LOG = LoggerFactory.getLogger(TestDomainURLFilter.class);&#010; &#010;   private final static String SEPARATOR = System.getProperty("file.separator");&#010;   private final static String SAMPLES = System.getProperty("test.data", ".");&#010; &#010;-  public TestDomainURLFilter(String testName) {&#010;-    super(testName);&#010;-  }&#010;-&#010;+  @Test&#010;   public void testFilter()&#010;     throws Exception {&#010; &#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java Sun May 19 21:11:54 2013&#010;@@ -20,15 +20,11 @@ package org.apache.nutch.urlfilter.regex&#010; import java.io.IOException;&#010; import java.io.Reader;&#010; &#010;-// JUnit imports&#010;-import junit.framework.Test;&#010;-import junit.framework.TestSuite;&#010;-import junit.textui.TestRunner;&#010; import org.apache.nutch.net.*;&#010;-&#010;-// 
Nutch imports&#010; import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;&#010; &#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit based test of class &lt;code&gt;RegexURLFilter&lt;/code&gt;.&#010;@@ -37,18 +33,6 @@ import org.apache.nutch.urlfilter.api.Re&#010;  */&#010; public class TestRegexURLFilter extends RegexURLFilterBaseTest {&#010;   &#010;-  public TestRegexURLFilter(String testName) {&#010;-    super(testName);&#010;-  }&#010;-  &#010;-  public static Test suite() {&#010;-    return new TestSuite(TestRegexURLFilter.class);&#010;-  }&#010;-  &#010;-  public static void main(String[] args) {&#010;-    TestRunner.run(suite());&#010;-  }&#010;-&#010;   protected URLFilter getURLFilter(Reader rules) {&#010;     try {&#010;       return new RegexURLFilter(rules);&#010;@@ -58,6 +42,7 @@ public class TestRegexURLFilter extends &#010;     }&#010;   }&#010;   &#010;+  @Test&#010;   public void test() {&#010;     test("WholeWebCrawling");&#010;     test("IntranetCrawling");&#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java Sun May 19 21:11:54 2013&#010;@@ -19,10 +19,9 @@ package org.apache.nutch.urlfilter.suffi&#010; import java.io.IOException;&#010; import java.io.StringReader;&#010; &#010;-import junit.framework.Test;&#010;-import junit.framework.TestCase;&#010;-import 
junit.framework.TestSuite;&#010;-import junit.textui.TestRunner;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; &#010; /**&#010;@@ -30,7 +29,7 @@ import junit.textui.TestRunner;&#010;  *&#010;  * @author Andrzej Bialecki&#010;  */&#010;-public class TestSuffixURLFilter extends TestCase {&#010;+public class TestSuffixURLFilter {&#010;   private static final String suffixes =&#010;     "# this is a comment\n" +&#010;     "\n" +&#010;@@ -117,22 +116,12 @@ public class TestSuffixURLFilter extends&#010;   &#010;   private SuffixURLFilter filter = null;&#010;   &#010;-  public TestSuffixURLFilter(String testName) {&#010;-    super(testName);&#010;-  }&#010;-  &#010;-  public static Test suite() {&#010;-    return new TestSuite(TestSuffixURLFilter.class);&#010;-  }&#010;-  &#010;-  public static void main(String[] args) {&#010;-    TestRunner.run(suite());&#010;-  }&#010;-  &#010;+  @Before&#010;   public void setUp() throws IOException {&#010;     filter = new SuffixURLFilter(new StringReader(suffixes));&#010;   }&#010;   &#010;+  @Test&#010;   public void testModeAccept() {&#010;     filter.setIgnoreCase(false);&#010;     filter.setModeAccept(true);&#010;@@ -141,6 +130,7 @@ public class TestSuffixURLFilter extends&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testModeReject() {&#010;     filter.setIgnoreCase(false);&#010;     filter.setModeAccept(false);&#010;@@ -149,6 +139,7 @@ public class TestSuffixURLFilter extends&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testModeAcceptIgnoreCase() {&#010;     filter.setIgnoreCase(true);&#010;     filter.setModeAccept(true);&#010;@@ -157,6 +148,7 @@ public class TestSuffixURLFilter extends&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testModeRejectIgnoreCase() {&#010;     filter.setIgnoreCase(true);&#010;     filter.setModeAccept(false);&#010;@@ -165,6 +157,7 @@ public class TestSuffixURLFilter extends&#010;     
}&#010;   }&#010;   &#010;+  @Test&#010;   public void testModeAcceptAndNonPathFilter() {&#010;     filter.setModeAccept(true);&#010;     filter.setFilterFromPath(false);&#010;@@ -173,6 +166,7 @@ public class TestSuffixURLFilter extends&#010;     }&#010;   }&#010;   &#010;+  @Test&#010;   public void testModeAcceptAndPathFilter() {&#010;     filter.setModeAccept(true);&#010;     filter.setFilterFromPath(true);&#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlnormalizer-basic/src/test/org/apache/nutch/net/urlnormalizer/basic/TestBasicURLNormalizer.java Sun May 19 21:11:54 2013&#010;@@ -21,20 +21,23 @@ import org.apache.hadoop.conf.Configurat&#010; import org.apache.nutch.net.URLNormalizers;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** Unit tests for BasicURLNormalizer. 
*/&#010;-public class TestBasicURLNormalizer extends TestCase {&#010;+public class TestBasicURLNormalizer {&#010;   private BasicURLNormalizer normalizer;&#010;   private Configuration conf;&#010;   &#010;-  public TestBasicURLNormalizer(String name) {&#010;-    super(name);&#010;+  @Before&#010;+  public void setUp() {&#010;     normalizer = new BasicURLNormalizer();&#010;     conf = NutchConfiguration.create();&#010;     normalizer.setConf(conf);&#010;   }&#010; &#010;+  @Test&#010;   public void testNormalizer() throws Exception {&#010;     // check that leading and trailing spaces are removed&#010;     normalizeTest(" http://foo.com/ ", "http://foo.com/");&#010;@@ -109,11 +112,4 @@ public class TestBasicURLNormalizer exte&#010;     assertEquals(normal, normalizer.normalize(weird, URLNormalizers.SCOPE_DEFAULT));&#010;   }&#010; &#010;-  public static void main(String[] args) throws Exception {&#010;-    new TestBasicURLNormalizer("test").testNormalizer();&#010;-  }&#010;-&#010;-&#010;-&#010;-&#010; }&#010;\ No newline at end of file&#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlnormalizer-pass/src/test/org/apache/nutch/net/urlnormalizer/pass/TestPassURLNormalizer.java Sun May 19 21:11:54 2013&#010;@@ -23,10 +23,12 @@ import org.apache.hadoop.conf.Configurat&#010; import org.apache.nutch.net.URLNormalizers;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import 
junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestPassURLNormalizer extends TestCase {&#010;+public class TestPassURLNormalizer {&#010; &#010;+  @Test&#010;   public void testPassURLNormalizer() {&#010;     Configuration conf = NutchConfiguration.create();&#010;     &#010;&#010;Modified: nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java (original)&#010;+++ nutch/branches/2.x/src/plugin/urlnormalizer-regex/src/test/org/apache/nutch/net/urlnormalizer/regex/TestRegexURLNormalizer.java Sun May 19 21:11:54 2013&#010;@@ -24,22 +24,22 @@ import java.io.FileInputStream;&#010; import java.io.FileReader;&#010; import java.io.IOException;&#010; import java.io.InputStreamReader;&#010;-import java.io.Reader;&#010; import java.util.ArrayList;&#010; import java.util.HashMap;&#010; import java.util.Iterator;&#010; import java.util.List;&#010; &#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.net.URLNormalizers;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;-&#010; /** Unit tests for RegexUrlNormalizer. 
*/&#010;-public class TestRegexURLNormalizer extends TestCase {&#010;+public class TestRegexURLNormalizer {&#010;   private static final Logger LOG = LoggerFactory.getLogger(TestRegexURLNormalizer.class);&#010;   &#010;   private RegexURLNormalizer normalizer;&#010;@@ -51,8 +51,8 @@ public class TestRegexURLNormalizer exte&#010;   // Make sure sample files are copied to "test.data" as specified in&#010;   // ./src/plugin/urlnormalizer-regex/build.xml during plugin compilation.&#010;   &#010;-  public TestRegexURLNormalizer(String name) throws IOException {&#010;-    super(name);&#010;+  @Before&#010;+  public void setUp() throws IOException {&#010;     normalizer = new RegexURLNormalizer();&#010;     conf = NutchConfiguration.create();&#010;     normalizer.setConf(conf);&#010;@@ -77,11 +77,13 @@ public class TestRegexURLNormalizer exte&#010;     }&#010;   }&#010; &#010;+  @Test&#010;   public void testNormalizerDefault() throws Exception {&#010;     normalizeTest((NormalizedURL[])testData.get(URLNormalizers.SCOPE_DEFAULT),&#010;             URLNormalizers.SCOPE_DEFAULT);&#010;   }&#010; &#010;+  @Test&#010;   public void testNormalizerScope() throws Exception {&#010;     Iterator it = testData.keySet().iterator();&#010;     while (it.hasNext()) {&#010;@@ -139,43 +141,4 @@ public class TestRegexURLNormalizer exte&#010;     }&#010;     return (NormalizedURL[]) list.toArray(new NormalizedURL[list.size()]);&#010;   }  &#010;-&#010;-  public static void main(String[] args) throws Exception {&#010;-    if (args.length == 0) {&#010;-      System.err.println("TestRegexURLNormalizer [-bench &lt;iter&gt;] &lt;scope&gt;");&#010;-      System.exit(-1);&#010;-    }&#010;-    boolean bench = false;&#010;-    int iter = -1;&#010;-    String scope = null;&#010;-    for (int i = 0; i &lt; args.length; i++) {&#010;-      if (args[i].equals("-bench")) {&#010;-        bench = true;&#010;-        iter = Integer.parseInt(args[++i]);&#010;-      } else scope = args[i];&#010;-    }&#010;- 
   if (scope == null) {&#010;-      System.err.println("Missing required scope name.");&#010;-      System.exit(-1);&#010;-    }&#010;-    if (bench &amp;&amp; iter &lt; 0) {&#010;-      System.err.println("Invalid number of iterations: " + iter);&#010;-      System.exit(-1);&#010;-    }&#010;-    TestRegexURLNormalizer test = new TestRegexURLNormalizer("test");&#010;-    NormalizedURL[] urls = (NormalizedURL[])test.testData.get(scope);&#010;-    if (urls == null) {&#010;-      LOG.warn("Missing test data for scope '" + scope + "', using default scope.");&#010;-      scope = URLNormalizers.SCOPE_DEFAULT;&#010;-      urls = (NormalizedURL[])test.testData.get(scope);&#010;-    }&#010;-    if (bench) {&#010;-      test.bench(iter, scope);&#010;-    } else {&#010;-      test.normalizeTest(urls, scope);&#010;-    }&#010;-  }&#010;-&#010;-&#010;-&#010; }&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Sun May 19 21:11:54 2013&#010;@@ -28,15 +28,16 @@ import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.AbstractNutchTest;&#010; import org.apache.nutch.util.CrawlTestUtil;&#010; import org.apache.nutch.util.TableUtil;&#010;+import org.junit.After;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * Basic generator test. 1. Insert entries in webtable 2. Generates entries to&#010;  * fetch 3. Verifies that number of generated urls match 4. 
Verifies that&#010;  * highest scoring urls are generated&#010;  *&#010;- * @author nutch-dev &lt;nutch-dev at lucene.apache.org&gt;&#010;- * @param &lt;URLWebPage&gt;&#010;- *&#010;  */&#010; public class TestGenerator extends AbstractNutchTest {&#010; &#010;@@ -46,12 +47,25 @@ public class TestGenerator extends Abstr&#010;     WebPage.Field.MARKERS.getName(),&#010;     WebPage.Field.SCORE.getName()&#010;   };&#010;+  &#010;+  @Override&#010;+  @Before&#010;+  public void setUp() throws Exception{&#010;+    super.setUp();&#010;+  }&#010;+  &#010;+  @Override&#010;+  @After&#010;+  public void tearDown()throws Exception {&#010;+    super.tearDown();&#010;+  }&#010; &#010;   /**&#010;    * Test that generator generates fetchlist ordered by score (desc).&#010;    *&#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testGenerateHighest() throws Exception {&#010; &#010;     final int NUM_RESULTS = 2;&#010;@@ -111,6 +125,7 @@ public class TestGenerator extends Abstr&#010;    *&#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testGenerateHostLimit() throws Exception {&#010;     ArrayList&lt;URLWebPage&gt; list = new ArrayList&lt;URLWebPage&gt;();&#010; &#010;@@ -158,6 +173,7 @@ public class TestGenerator extends Abstr&#010;    *&#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testGenerateDomainLimit() throws Exception {&#010;     ArrayList&lt;URLWebPage&gt; list = new ArrayList&lt;URLWebPage&gt;();&#010; &#010;@@ -209,6 +225,7 @@ public class TestGenerator extends Abstr&#010;    * @throws Exception&#010;    * @throws IOException&#010;    */&#010;+  @Test&#010;   public void testFilter() throws IOException, Exception {&#010; &#010;     ArrayList&lt;URLWebPage&gt; list = new ArrayList&lt;URLWebPage&gt;();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestInjector.java Sun May 19 21:11:54 2013&#010;@@ -28,6 +28,8 @@ import org.apache.nutch.util.AbstractNut&#010; import org.apache.nutch.util.CrawlTestUtil;&#010; import org.apache.gora.util.ByteUtils;&#010; import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * Basic injector test: 1. Creates a text file with urls 2. Injects them into&#010;@@ -46,6 +48,7 @@ public class TestInjector extends Abstra&#010;     urlPath = new Path(testdir, "urls");&#010;   }&#010; &#010;+  @Test&#010;   public void testInject() throws Exception {&#010;     ArrayList&lt;String&gt; urls = new ArrayList&lt;String&gt;();&#010;     for (int i = 0; i &lt; 100; i++) {&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestSignatureFactory.java Sun May 19 21:11:54 2013&#010;@@ -19,10 +19,12 @@ package org.apache.nutch.crawl;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestSignatureFactory extends 
TestCase {&#010;+public class TestSignatureFactory {&#010; &#010;+  @Test&#010;   public void testGetSignature() {&#010;     Configuration conf=NutchConfiguration.create();&#010;     Signature signature1=SignatureFactory.getSignature(conf);&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestURLPartitioner.java Sun May 19 21:11:54 2013&#010;@@ -18,7 +18,8 @@ package org.apache.nutch.crawl;&#010; &#010; import java.net.MalformedURLException;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.io.IntWritable;&#010;@@ -33,11 +34,12 @@ import org.apache.nutch.util.TableUtil;&#010; /**&#010;  * Tests {@link URLPartitioner}&#010;  */&#010;-public class TestURLPartitioner extends TestCase {&#010;+public class TestURLPartitioner {&#010; &#010;   /**&#010;    * tests one reducer, everything goes into one partition, using host partitioner.&#010;    */&#010;+  @Test&#010;   public void testOneReducer() {&#010;     URLPartitioner partitioner = new URLPartitioner();&#010;     Configuration conf = NutchConfiguration.create();&#010;@@ -53,6 +55,7 @@ public class TestURLPartitioner extends &#010;   /**&#010;    * tests partitioning by host&#010;    */&#010;+  @Test&#010;   public void testModeHost() {&#010;     URLPartitioner partitioner = new URLPartitioner();&#010;     Configuration conf = NutchConfiguration.create();&#010;@@ -75,6 +78,7 @@ public class TestURLPartitioner extends 
&#010;   /**&#010;    * tests partitioning by domain&#010;    */&#010;+  @Test&#010;   public void testModeDomain() {&#010;     URLPartitioner partitioner = new URLPartitioner();&#010;     Configuration conf = NutchConfiguration.create();&#010;@@ -97,6 +101,7 @@ public class TestURLPartitioner extends &#010;   /**&#010;    * tests partitioning by IP&#010;    */&#010;+  @Test&#010;   public void testModeIP() {&#010;     URLPartitioner partitioner = new URLPartitioner();&#010;     Configuration conf = NutchConfiguration.create();&#010;@@ -117,10 +122,10 @@ public class TestURLPartitioner extends &#010;         partitionWithoutWWW, partitionWithWWW);&#010;   }&#010;   &#010;-  &#010;   /**&#010;    * Test the seed functionality, using host partitioner.&#010;    */&#010;+  @Test&#010;   public void testSeed() {&#010;     URLPartitioner partitioner = new URLPartitioner();&#010;     Configuration conf = NutchConfiguration.create();&#010;@@ -143,6 +148,7 @@ public class TestURLPartitioner extends &#010;   /**&#010;    * Tests the {@link SelectorEntryPartitioner}.&#010;    */&#010;+  @Test&#010;   public void testSelectorEntryPartitioner() {&#010;     //The reference partitioner&#010;     URLPartitioner refPartitioner = new URLPartitioner();&#010;@@ -174,6 +180,7 @@ public class TestURLPartitioner extends &#010;    * Tests the {@link FetchEntryPartitioner}&#010;    * @throws MalformedURLException &#010;    */&#010;+  @Test&#010;   public void testFetchEntryPartitioner() throws MalformedURLException {&#010;     //The reference partitioner&#010;     URLPartitioner refPartitioner = new URLPartitioner();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestUrlWithScore.java Sun May 19 21:11:54 2013&#010;@@ -22,7 +22,8 @@ import java.io.DataInputStream;&#010; import java.io.DataOutputStream;&#010; import java.io.IOException;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.hadoop.io.RawComparator;&#010; import org.apache.nutch.crawl.UrlWithScore;&#010;@@ -33,8 +34,9 @@ import org.apache.nutch.crawl.UrlWithSco&#010; /**&#010;  * Tests {@link UrlWithScore} with serialization, partitioning and sorting.&#010;  */&#010;-public class TestUrlWithScore extends TestCase {&#010;+public class TestUrlWithScore {&#010; &#010;+  @Test&#010;   public void testSerialization() throws IOException {&#010;     // create a key and test basic functionality&#010;     UrlWithScore keyOut = new UrlWithScore("http://example.org/", 1f);&#010;@@ -58,6 +60,7 @@ public class TestUrlWithScore extends Te&#010;     out.close();&#010;   }&#010;   &#010;+  @Test&#010;   public void testPartitioner() throws IOException {&#010;     UrlOnlyPartitioner part = new UrlOnlyPartitioner();&#010;     &#010;@@ -80,6 +83,7 @@ public class TestUrlWithScore extends Te&#010;     assertEquals(partForKey3, part.getPartition(k5, null, numReduces));&#010;   }&#010;   &#010;+  @Test&#010;   public void testUrlOnlySorting() throws IOException {&#010;     UrlOnlyComparator comp = new UrlOnlyComparator();&#010;     &#010;@@ -104,6 +108,7 @@ public class TestUrlWithScore extends Te&#010;     assertEquals(1, compareBothRegularAndRaw(comp, k5, k1));&#010;   }&#010;   &#010;+  @Test&#010;   public void testUrlScoreSorting() throws IOException {&#010;     UrlScoreComparator comp = new UrlScoreComparator();&#010;     &#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/fetcher/TestFetcher.java Sun May 19 21:11:54 2013&#010;@@ -30,6 +30,11 @@ import org.apache.nutch.util.AbstractNut&#010; import org.apache.nutch.util.CrawlTestUtil;&#010; import org.mortbay.jetty.Server;&#010; &#010;+import org.junit.After;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010;+&#010; /**&#010;  * Basic fetcher test&#010;  * 1. generate seedlist&#010;@@ -37,7 +42,6 @@ import org.mortbay.jetty.Server;&#010;  * 3. generate&#010;  * 3. fetch&#010;  * 4. Verify contents&#010;- * @author nutch-dev &lt;nutch-dev at lucene.apache.org&gt;&#010;  *&#010;  */&#010; public class TestFetcher extends AbstractNutchTest {&#010;@@ -47,6 +51,7 @@ public class TestFetcher extends Abstrac&#010;   Server server;&#010; &#010;   @Override&#010;+  @Before&#010;   public void setUp() throws Exception{&#010;     super.setUp();&#010;     urlPath = new Path(testdir, "urls");&#010;@@ -55,11 +60,13 @@ public class TestFetcher extends Abstrac&#010;   }&#010; &#010;   @Override&#010;+  @After&#010;   public void tearDown() throws Exception{&#010;     server.stop();&#010;     fs.delete(testdir, true);&#010;   }&#010; &#010;+  @Test&#010;   public void testFetch() throws Exception {&#010; &#010;     //generate seedlist&#010;@@ -124,6 +131,7 @@ public class TestFetcher extends Abstrac&#010;     urls.add("http://127.0.0.1:" + server.getConnectors()[0].getPort() + "/" + page);&#010;   }&#010; &#010;+  @Test&#010;   public void testAgentNameCheck() {&#010; &#010;     boolean failedNoAgentName = false;&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1484348 [2/2] - in /nutch/branches/2.x: ./ ivy/ src/plugin/creativecommons/src/test/org/creativecommons/nutch/ src/plugin/feed/src/test/org/apache/nutch/parse/feed/ src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/ src/plug...</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130519211158.8C1F523889CB@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130519211158-8C1F523889CB@eris-apache-org%3e</id>
<updated>2013-05-19T21:11:55Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Modified: nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/indexer/TestIndexingFilters.java Sun May&#010;19 21:11:54 2013&#010;@@ -16,7 +16,8 @@&#010;  */&#010; package org.apache.nutch.indexer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010;@@ -24,12 +25,13 @@ import org.apache.nutch.metadata.Metadat&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-public class TestIndexingFilters extends TestCase {&#010;+public class TestIndexingFilters {&#010; &#010;   /**&#010;    * Test behaviour when defined filter does not exist.&#010;    * @throws IndexingException&#010;    */&#010;+  @Test&#010;   public void testNonExistingIndexingFilter() throws IndexingException {&#010;     Configuration conf = NutchConfiguration.create();&#010;     conf.addResource("nutch-default.xml");&#010;@@ -50,6 +52,7 @@ public class TestIndexingFilters extends&#010;    * Test behaviour when NutchDOcument is null&#010;    * @throws IndexingException&#010;    */&#010;+  @Test&#010;   public void testNutchDocumentNullIndexingFilter() throws IndexingException{&#010;     Configuration conf = NutchConfiguration.create();&#010;     conf.addResource("nutch-default.xml");&#010;@@ -64,38 +67,38 @@ public class TestIndexingFilters extends&#010;     assertNull(doc);&#010;   }&#010; &#010;-    /**&#010;-     * Test behaviour when reset the index 
filter order will not take effect&#010;-     *&#010;-     * @throws IndexingException&#010;-     */&#010;-    public void testFilterCacheIndexingFilter() throws IndexingException{&#010;-        Configuration conf = NutchConfiguration.create();&#010;-        conf.addResource("nutch-default.xml");&#010;-        conf.addResource("crawl-tests.xml");&#010;-&#010;-        String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";&#010;-        conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);&#010;-&#010;-        IndexingFilters filters1 = new IndexingFilters(conf);&#010;-        WebPage page = new WebPage();&#010;-        page.setText(new Utf8("text"));&#010;-        page.setTitle(new Utf8("title"));&#010;-        NutchDocument fdoc1 = filters1.filter(new NutchDocument(),"http://www.example.com/",page);&#010;-&#010;-        // add another index filter&#010;-        String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";&#010;-        // set content metadata&#010;-        Metadata md = new Metadata();&#010;-        md.add("example","data");&#010;-        // set content metadata property defined in MetadataIndexer&#010;-        conf.set("index.content.md","example");&#010;-        // add MetadataIndxer filter&#010;-        conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);&#010;-        IndexingFilters filters2 = new IndexingFilters(conf);&#010;-        NutchDocument fdoc2 = filters2.filter(new NutchDocument(),"http://www.example.com/",page);&#010;-        assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());&#010;-    }&#010;+  /**&#010;+   * Test behaviour when reset the index filter order will not take effect&#010;+   *&#010;+   * @throws IndexingException&#010;+   */&#010;+  @Test&#010;+  public void testFilterCacheIndexingFilter() throws IndexingException{&#010;+    Configuration conf = NutchConfiguration.create();&#010;+    conf.addResource("nutch-default.xml");&#010;+    
conf.addResource("crawl-tests.xml");&#010;+&#010;+    String class1 = "org.apache.nutch.indexer.basic.BasicIndexingFilter";&#010;+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1);&#010; &#010;+    IndexingFilters filters1 = new IndexingFilters(conf);&#010;+    WebPage page = new WebPage();&#010;+    page.setText(new Utf8("text"));&#010;+    page.setTitle(new Utf8("title"));&#010;+    NutchDocument fdoc1 = filters1.filter(new NutchDocument(),"http://www.example.com/",page);&#010;+&#010;+    // add another index filter&#010;+    String class2 = "org.apache.nutch.indexer.metadata.MetadataIndexer";&#010;+    // set content metadata&#010;+    Metadata md = new Metadata();&#010;+    md.add("example","data");&#010;+    // set content metadata property defined in MetadataIndexer&#010;+    conf.set("index.content.md","example");&#010;+    // add MetadataIndxer filter&#010;+    conf.set(IndexingFilters.INDEXINGFILTER_ORDER, class1 + " " + class2);&#010;+    IndexingFilters filters2 = new IndexingFilters(conf);&#010;+    NutchDocument fdoc2 = filters2.filter(new NutchDocument(),"http://www.example.com/",page);&#010;+    assertEquals(fdoc1.getFieldNames().size(),fdoc2.getFieldNames().size());&#010;+  }&#010; &#010; }&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestMetadata.java Sun May 19 21:11:54&#010;2013&#010;@@ -22,30 +22,16 @@ import java.io.DataInputStream;&#010; import java.io.DataOutputStream;&#010; import java.io.IOException;&#010; import java.util.Properties;&#010;-import junit.framework.Test;&#010;-import 
junit.framework.TestCase;&#010;-import junit.framework.TestSuite;&#010;-import junit.textui.TestRunner;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit based tests of class {@link org.apache.nutch.metadata.Metadata}.&#010;  */&#010;-public class TestMetadata extends TestCase {&#010;+public class TestMetadata {&#010; &#010;   private static final String CONTENTTYPE = "contenttype";&#010; &#010;-  public TestMetadata(String testName) {&#010;-    super(testName);&#010;-  }&#010;-&#010;-  public static Test suite() {&#010;-    return new TestSuite(TestMetadata.class);&#010;-  }&#010;-&#010;-  public static void main(String[] args) {&#010;-    TestRunner.run(suite());&#010;-  }&#010;-  &#010;   /**&#010;    * Test to ensure that only non-null values get written when the&#010;    * {@link Metadata} object is written using a Writeable.&#010;@@ -53,6 +39,7 @@ public class TestMetadata extends TestCa&#010;    * @since NUTCH-406&#010;    * &#010;    */&#010;+  @Test&#010;   public void testWriteNonNull() {&#010;     Metadata met = new Metadata();&#010;     met.add(CONTENTTYPE, null);&#010;@@ -83,6 +70,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for the &lt;code&gt;add(String, String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testAdd() {&#010;     String[] values = null;&#010;     Metadata meta = new Metadata();&#010;@@ -112,6 +100,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for the &lt;code&gt;set(String, String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testSet() {&#010;     String[] values = null;&#010;     Metadata meta = new Metadata();&#010;@@ -138,6 +127,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;setAll(Properties)&lt;/code&gt; method. 
*/&#010;+  @Test&#010;   public void testSetProperties() {&#010;     String[] values = null;&#010;     Metadata meta = new Metadata();&#010;@@ -165,6 +155,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;get(String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testGet() {&#010;     Metadata meta = new Metadata();&#010;     assertNull(meta.get("a-name"));&#010;@@ -175,6 +166,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;isMultiValued()&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testIsMultiValued() {&#010;     Metadata meta = new Metadata();&#010;     assertFalse(meta.isMultiValued("key"));&#010;@@ -185,6 +177,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;names&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testNames() {&#010;     String[] names = null;&#010;     Metadata meta = new Metadata();&#010;@@ -201,6 +194,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;remove(String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testRemove() {&#010;     Metadata meta = new Metadata();&#010;     meta.remove("name-one");&#010;@@ -222,6 +216,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;equals(Object)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testObject() {&#010;     Metadata meta1 = new Metadata();&#010;     Metadata meta2 = new Metadata();&#010;@@ -247,6 +242,7 @@ public class TestMetadata extends TestCa&#010;   }&#010; &#010;   /** Test for &lt;code&gt;Writable&lt;/code&gt; implementation. 
*/&#010;+  @Test&#010;   public void testWritable() {&#010;     Metadata result = null;&#010;     Metadata meta = new Metadata();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/metadata/TestSpellCheckedMetadata.java Sun&#010;May 19 21:11:54 2013&#010;@@ -22,10 +22,8 @@ import java.io.DataInputStream;&#010; import java.io.DataOutputStream;&#010; import java.io.IOException;&#010; import java.util.Properties;&#010;-import junit.framework.Test;&#010;-import junit.framework.TestCase;&#010;-import junit.framework.TestSuite;&#010;-import junit.textui.TestRunner;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * JUnit based tests of class&#010;@@ -34,23 +32,13 @@ import junit.textui.TestRunner;&#010;  * @author Chris Mattmann&#010;  * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;  */&#010;-public class TestSpellCheckedMetadata extends TestCase {&#010; &#010;-  private static final int NUM_ITERATIONS = 10000;&#010;-&#010;-  public TestSpellCheckedMetadata(String testName) {&#010;-    super(testName);&#010;-  }&#010;+public class TestSpellCheckedMetadata {&#010; &#010;-  public static Test suite() {&#010;-    return new TestSuite(TestSpellCheckedMetadata.class);&#010;-  }&#010;-&#010;-  public static void main(String[] args) {&#010;-    TestRunner.run(suite());&#010;-  }&#010;+  private static final int NUM_ITERATIONS = 10000;&#010; &#010;   /** Test for the &lt;code&gt;getNormalizedName(String)&lt;/code&gt; method. 
*/&#010;+  @Test&#010;   public void testGetNormalizedName() {&#010;     assertEquals("Content-Type", SpellCheckedMetadata&#010;         .getNormalizedName("Content-Type"));&#010;@@ -67,6 +55,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010;   &#010;   /** Test for the &lt;code&gt;add(String, String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testAdd() {&#010;     String[] values = null;&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;@@ -96,6 +85,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for the &lt;code&gt;set(String, String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testSet() {&#010;     String[] values = null;&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;@@ -122,6 +112,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;setAll(Properties)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testSetProperties() {&#010;     String[] values = null;&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;@@ -149,6 +140,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;get(String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testGet() {&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;     assertNull(meta.get("a-name"));&#010;@@ -160,6 +152,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;isMultiValued()&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testIsMultiValued() {&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;     assertFalse(meta.isMultiValued("key"));&#010;@@ -170,6 +163,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;names&lt;/code&gt; method. 
*/&#010;+  @Test&#010;   public void testNames() {&#010;     String[] names = null;&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;@@ -186,6 +180,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;remove(String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testRemove() {&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;     meta.remove("name-one");&#010;@@ -207,6 +202,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;equals(Object)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testObject() {&#010;     SpellCheckedMetadata meta1 = new SpellCheckedMetadata();&#010;     SpellCheckedMetadata meta2 = new SpellCheckedMetadata();&#010;@@ -232,6 +228,7 @@ public class TestSpellCheckedMetadata ex&#010;   }&#010; &#010;   /** Test for &lt;code&gt;Writable&lt;/code&gt; implementation. */&#010;+  @Test&#010;   public void testWritable() {&#010;     SpellCheckedMetadata result = null;&#010;     SpellCheckedMetadata meta = new SpellCheckedMetadata();&#010;@@ -259,6 +256,7 @@ public class TestSpellCheckedMetadata ex&#010;    * IO Test method, usable only when you plan to do changes in metadata&#010;    * to measure relative performance impact.&#010;    */&#010;+  @Test&#010;   public final void testHandlingSpeed() {&#010;     SpellCheckedMetadata result;&#010;     long start = System.currentTimeMillis();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLFilters.java Sun May 19 
21:11:54&#010;2013&#010;@@ -19,14 +19,16 @@ package org.apache.nutch.net;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestURLFilters extends TestCase {&#010;+public class TestURLFilters {&#010; &#010;   /**&#010;    * Testcase for NUTCH-325.&#010;    * @throws URLFilterException&#010;    */&#010;+  @Test&#010;   public void testNonExistingUrlFilter() throws URLFilterException {&#010;     Configuration conf = NutchConfiguration.create();&#010;     String class1 = "NonExistingFilter";&#010;@@ -34,7 +36,12 @@ public class TestURLFilters extends Test&#010;     conf.set(URLFilters.URLFILTER_ORDER, class1 + " " + class2);&#010; &#010;     URLFilters normalizers = new URLFilters(conf);&#010;-    normalizers.filter("http://someurl/");&#010;+    assertNotNull(normalizers);&#010;+    try {&#010;+      normalizers.filter("http://someurl/");&#010;+    } catch (URLFilterException ufe) {&#010;+      fail(ufe.toString());&#010;+    }&#010;   }&#010; &#010; }&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/net/TestURLNormalizers.java Sun May 19 21:11:54&#010;2013&#010;@@ -21,10 +21,12 @@ import java.net.MalformedURLException;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static 
org.junit.Assert.*;&#010; &#010;-public class TestURLNormalizers extends TestCase {&#010;+public class TestURLNormalizers {&#010; &#010;+  @Test&#010;   public void testURLNormalizers() {&#010;     Configuration conf = NutchConfiguration.create();&#010;     String clazz1 = "org.apache.nutch.net.urlnormalizer.regex.RegexURLNormalizer";&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/parse/TestOutlinkExtractor.java Sun May 19&#010;21:11:54 2013&#010;@@ -22,7 +22,8 @@ import org.apache.nutch.parse.OutlinkExt&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /**&#010;  * TestCase to check regExp extraction of URLs.&#010;@@ -31,7 +32,7 @@ import junit.framework.TestCase;&#010;  * &#010;  * @version 1.0&#010;  */&#010;-public class TestOutlinkExtractor extends TestCase {&#010;+public class TestOutlinkExtractor {&#010; &#010;   private static Configuration conf = NutchConfiguration.create();&#010;   public void testGetNoOutlinks() {&#010;@@ -46,6 +47,7 @@ public class TestOutlinkExtractor extend&#010;     assertEquals(0, outlinks.length);&#010;   }&#010;   &#010;+  @Test&#010;   public void testGetOutlinksHttp() {&#010;     Outlink[] outlinks = OutlinkExtractor.getOutlinks(&#010;         "Test with http://www.nutch.org/index.html is it found? 
" +&#010;@@ -58,6 +60,7 @@ public class TestOutlinkExtractor extend&#010;     assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl());&#010;   }&#010;   &#010;+  @Test&#010;   public void testGetOutlinksHttp2() {&#010;     Outlink[] outlinks = OutlinkExtractor.getOutlinks(&#010;         "Test with http://www.nutch.org/index.html is it found? " +&#010;@@ -69,6 +72,8 @@ public class TestOutlinkExtractor extend&#010;     assertEquals("Wrong URL", "http://www.google.de", outlinks[1].getToUrl());&#010;     assertEquals("Wrong URL", "http://www.sybit.com/solutions/portals.html", outlinks[2].getToUrl());&#010;   }&#010;+  &#010;+  @Test&#010;   public void testGetOutlinksFtp() {&#010;     Outlink[] outlinks = OutlinkExtractor.getOutlinks(&#010;         "Test with ftp://www.nutch.org is it found? " +&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/parse/TestParserFactory.java Sun May 19 21:11:54&#010;2013&#010;@@ -17,8 +17,9 @@&#010; &#010; package org.apache.nutch.parse;&#010; &#010;-// JUnit imports&#010;-import junit.framework.TestCase;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; // Nutch imports&#010; import org.apache.nutch.plugin.Extension;&#010;@@ -29,17 +30,15 @@ import org.apache.nutch.util.NutchConfig&#010;  * Unit test for new parse plugin selection.&#010;  *&#010;  * @author Sebastien Le Callonnec&#010;- * @version 1.0&#010;  */&#010;-public class TestParserFactory extends TestCase {&#010;+public class 
TestParserFactory {&#010; &#009;&#010;   private Configuration conf;&#010;   private ParserFactory parserFactory;&#010;     &#010;-  public TestParserFactory(String name) { super(name); }&#010;-&#010;   /** Inits the Test Case with the test parse-plugin file */&#010;-  protected void setUp() throws Exception {&#010;+  @Before&#010;+  public void setUp() throws Exception {&#010;       conf = NutchConfiguration.create();&#010;       conf.set("plugin.includes", ".*");&#010;       conf.set("parse.plugin.file",&#010;@@ -48,6 +47,7 @@ public class TestParserFactory extends T&#010;   }&#010;     &#010;   /** Unit test for &lt;code&gt;getExtensions(String)&lt;/code&gt; method. */&#010;+  @Test&#010;   public void testGetExtensions() throws Exception {&#010;     Extension ext = parserFactory.getExtensions("text/html").get(0);&#010;     assertEquals("parse-tika", ext.getDescriptor().getPluginId());&#010;@@ -58,6 +58,7 @@ public class TestParserFactory extends T&#010;   }&#010;   &#010;   /** Unit test to check &lt;code&gt;getParsers&lt;/code&gt; method */&#010;+  @Test&#010;   public void testGetParsers() throws Exception {&#010;     Parser [] parsers = parserFactory.getParsers("text/html", "http://foo.com");&#010;     assertNotNull(parsers);&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/plugin/TestPluginSystem.java Sun May 19 21:11:54&#010;2013&#010;@@ -26,12 +26,14 @@ import java.util.LinkedList;&#010; import java.util.Locale;&#010; import java.util.Properties;&#010; &#010;-import junit.framework.TestCase;&#010;+import 
org.junit.After;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.mapred.JobConf;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;-import org.apache.nutch.util.NutchJob;&#010; import org.apache.nutch.util.NutchJobConf;&#010; &#010; /**&#010;@@ -39,14 +41,15 @@ import org.apache.nutch.util.NutchJobCon&#010;  * &#010;  * @author joa23&#010;  */&#010;-public class TestPluginSystem extends TestCase {&#010;+public class TestPluginSystem {&#010;     private int fPluginCount;&#010; &#010;-    private LinkedList fFolders = new LinkedList();&#010;+    private LinkedList&lt;File&gt; fFolders = new LinkedList&lt;File&gt;();&#010;     private Configuration conf ;&#010;     private PluginRepository repository;&#010; &#010;-    protected void setUp() throws Exception {&#010;+    @Before&#010;+    public void setUp() throws Exception {&#010;         this.conf = NutchConfiguration.create();&#010;         conf.set("plugin.includes", ".*");&#010; //        String string = this.conf.get("plugin.includes", "");&#010;@@ -56,12 +59,8 @@ public class TestPluginSystem extends Te&#010;         this.repository = PluginRepository.get(conf);&#010;     }&#010; &#010;-    /*&#010;-     * (non-Javadoc)&#010;-     * &#010;-     * @see junit.framework.TestCase#tearDown()&#010;-     */&#010;-    protected void tearDown() throws Exception {&#010;+    @After&#010;+    public void tearDown() throws Exception {&#010;         for (int i = 0; i &lt; fFolders.size(); i++) {&#010;             File folder = (File) fFolders.get(i);&#010;             delete(folder);&#010;@@ -72,6 +71,7 @@ public class TestPluginSystem extends Te&#010; &#010;     /**&#010;      */&#010;+    @Test&#010;     public void testPluginConfiguration() {&#010;         String string = getPluginFolder();&#010;         File file = new File(string);&#010;@@ -83,6 +83,7 @@ public class 
TestPluginSystem extends Te&#010; &#010;     /**&#010;      */&#010;+    @Test&#010;     public void testLoadPlugins() {&#010;         PluginDescriptor[] descriptors = repository&#010;                 .getPluginDescriptors();&#010;@@ -98,6 +99,7 @@ public class TestPluginSystem extends Te&#010;         }&#010;     }&#010; &#010;+    @Test&#010;     public void testRepositoryCache() {&#010;       Configuration config = NutchConfiguration.create();&#010;       PluginRepository repo = PluginRepository.get(config);&#010;@@ -117,6 +119,7 @@ public class TestPluginSystem extends Te&#010;     /**&#010;      *  &#010;      */&#010;+    @Test&#010;     public void testGetExtensionAndAttributes() {&#010;         String xpId = " sdsdsd";&#010;         ExtensionPoint extensionPoint =repository&#010;@@ -135,6 +138,7 @@ public class TestPluginSystem extends Te&#010;     /**&#010;      * @throws PluginRuntimeException&#010;      */&#010;+    @Test&#010;     public void testGetExtensionInstances() throws PluginRuntimeException {&#010;         Extension[] extensions = repository&#010;                 .getExtensionPoint(getGetExtensionId()).getExtensions();&#010;@@ -154,6 +158,7 @@ public class TestPluginSystem extends Te&#010;      * &#010;      *  &#010;      */&#010;+    @Test&#010;     public void testGetClassLoader() {&#010;         PluginDescriptor[] descriptors = repository&#010;                 .getPluginDescriptors();&#010;@@ -166,6 +171,7 @@ public class TestPluginSystem extends Te&#010;     /**&#010;      * @throws IOException&#010;      */&#010;+    @Test&#010;     public void testGetResources() throws IOException {&#010;         PluginDescriptor[] descriptors = repository&#010;                 .getPluginDescriptors();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestContent.java Sun May 19 21:11:54&#010;2013&#010;@@ -24,17 +24,17 @@ import org.apache.nutch.util.NutchConfig&#010; import org.apache.nutch.util.WritableTestUtils;&#010; import org.apache.tika.mime.MimeTypes;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; &#010; /** Unit tests for Content. */&#010; &#010;-public class TestContent extends TestCase {&#010;+public class TestContent {&#010; &#010;   private static Configuration conf = NutchConfiguration.create();&#010; &#010;-  public TestContent(String name) { super(name); }&#010;-&#010;+  @Test&#010;   public void testContent() throws Exception {&#010; &#010;     String page = "&lt;HTML&gt;&lt;BODY&gt;&lt;H1&gt;Hello World&lt;/H1&gt;&lt;P&gt;The Quick&#010;Brown Fox Jumped Over the Lazy Fox.&lt;/BODY&gt;&lt;/HTML&gt;";&#010;@@ -55,6 +55,7 @@ public class TestContent extends TestCas&#010;   }&#010; &#010;   /** Unit tests for getContentType(String, String, byte[]) method. 
*/&#010;+  @Test&#010;   public void testGetContentType() throws Exception {&#010;     Content c = null;&#010;     Metadata p = new Metadata();&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/protocol/TestProtocolFactory.java Sun May&#010;19 21:11:54 2013&#010;@@ -20,20 +20,24 @@ import org.apache.hadoop.conf.Configurat&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.apache.nutch.util.ObjectCache;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestProtocolFactory extends TestCase {&#010;+public class TestProtocolFactory {&#010; &#010;   Configuration conf;&#010;   ProtocolFactory factory;&#010;   &#010;-  protected void setUp() throws Exception {&#010;+  @Before&#010;+  public void setUp() throws Exception {&#010;     conf = NutchConfiguration.create();&#010;     conf.set("plugin.includes", ".*");&#010;     conf.set("http.agent.name", "test-bot");&#010;     factory=new ProtocolFactory(conf);&#010;   }&#010; &#010;+  @Test&#010;   public void testGetProtocol(){&#010; &#010;     //non existing protocol&#010;@@ -69,6 +73,7 @@ public class TestProtocolFactory extends&#010;     }&#010;   }&#010;   &#010;+  @Test&#010;   public void testContains(){&#010;     assertTrue(factory.contains("http", "http"));&#010;     assertTrue(factory.contains("http", "http,ftp"));&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/storage/TestGoraStorage.java Sun May 19 21:11:54&#010;2013&#010;@@ -35,16 +35,34 @@ import org.apache.nutch.util.AbstractNut&#010; import org.apache.nutch.util.CrawlTestUtil;&#010; import org.hsqldb.Server;&#010; &#010;+import org.junit.After;&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010;+&#010; /**&#010;  * Tests basic Gora functionality by writing and reading webpages.&#010;  */&#010; public class TestGoraStorage extends AbstractNutchTest {&#010; &#010;+  @Override&#010;+  @Before&#010;+  public void setUp() throws Exception {&#010;+    super.setUp();&#010;+  }&#010;+&#010;+  @Override&#010;+  @After&#010;+  public void tearDown() throws Exception {&#010;+    super.tearDown();&#010;+  }&#010;+  &#010;   /**&#010;    * Sequentially read and write pages to a store.&#010;    * &#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testSinglethreaded() throws Exception {&#010;     String id = "singlethread";&#010;     readWrite(id, webPageStore);&#010;@@ -87,6 +105,7 @@ public class TestGoraStorage extends Abs&#010;    * &#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testMultithreaded() throws Exception {&#010;     // create a fixed thread pool&#010;     int numThreads = 8;&#010;@@ -127,6 +146,7 @@ public class TestGoraStorage extends Abs&#010;    * &#010;    * @throws Exception&#010;    */&#010;+  @Test&#010;   public void testMultiProcess() throws Exception {&#010;     // create and start a hsql server, a stand-alone (memory backed) db&#010;     // (important: a stand-alone 
server should be used because simple&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/AbstractNutchTest.java Sun May 19 21:11:54&#010;2013&#010;@@ -16,8 +16,6 @@&#010;  */&#010; package org.apache.nutch.util;&#010; &#010;-import junit.framework.TestCase;&#010;-&#010; import org.apache.gora.store.DataStore;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.fs.FileSystem;&#010;@@ -29,7 +27,7 @@ import org.apache.nutch.storage.WebPage;&#010;  * This class provides common routines for setup/teardown of an in-memory data&#010;  * store.&#010;  */&#010;-public class AbstractNutchTest extends TestCase {&#010;+public class AbstractNutchTest {&#010; &#010;   protected Configuration conf;&#010;   protected FileSystem fs;&#010;@@ -37,9 +35,7 @@ public class AbstractNutchTest extends T&#010;   protected DataStore&lt;String, WebPage&gt; webPageStore;&#010;   protected boolean persistentDataStore = false;&#010; &#010;-  @Override&#010;   public void setUp() throws Exception {&#010;-    super.setUp();&#010;     conf = CrawlTestUtil.createConfiguration();&#010;     conf.set("storage.data.store.class", "org.apache.gora.sql.store.SqlStore");&#010;     fs = FileSystem.get(conf);&#010;@@ -47,7 +43,6 @@ public class AbstractNutchTest extends T&#010;         WebPage.class);&#010;   }&#010; &#010;-  @Override&#010;   public void tearDown() throws Exception {&#010;     // empty the database after test&#010;     if (!persistentDataStore) {&#010;@@ -55,7 +50,6 @@ public class AbstractNutchTest extends T&#010;       
webPageStore.flush();&#010;       webPageStore.close();&#010;     }&#010;-    super.tearDown();&#010;     fs.delete(testdir, true);&#010;   }&#010; &#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestEncodingDetector.java Sun May 19&#010;21:11:54 2013&#010;@@ -19,14 +19,15 @@ package org.apache.nutch.util;&#010; import java.io.UnsupportedEncodingException;&#010; import java.nio.ByteBuffer;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.avro.util.Utf8;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.net.protocols.Response;&#010; import org.apache.nutch.storage.WebPage;&#010; &#010;-public class TestEncodingDetector extends TestCase {&#010;+public class TestEncodingDetector {&#010;   private static Configuration conf = NutchConfiguration.create();&#010; &#010;   private static byte[] contentInOctets;&#010;@@ -39,10 +40,7 @@ public class TestEncodingDetector extend&#010;     }&#010;   }&#010; &#010;-  public TestEncodingDetector(String name) {&#010;-    super(name);&#010;-  }&#010;-&#010;+  @Test&#010;   public void testGuessing() {&#010;     // first disable auto detection&#010;     conf.setInt(EncodingDetector.MIN_CONFIDENCE_KEY, -1);&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestGZIPUtils.java Sun May 19 21:11:54&#010;2013&#010;@@ -17,15 +17,13 @@&#010; &#010; package org.apache.nutch.util;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import java.io.IOException;&#010; &#010; /** Unit tests for GZIPUtils methods. */&#010;-public class TestGZIPUtils extends TestCase {&#010;-  public TestGZIPUtils(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestGZIPUtils {&#010; &#010;   /* a short, highly compressable, string */&#010;   String SHORT_TEST_STRING= &#010;@@ -117,6 +115,7 @@ public class TestGZIPUtils extends TestC&#010; &#010;   // tests&#010; &#010;+  @Test&#010;   public void testZipUnzip() {&#010;     byte[] testBytes= SHORT_TEST_STRING.getBytes();&#010;     testZipUnzip(testBytes);&#010;@@ -126,6 +125,7 @@ public class TestGZIPUtils extends TestC&#010;     testZipUnzip(testBytes);&#010;   }&#010; &#010;+  @Test&#010;   public void testZipUnzipBestEffort() {&#010;     byte[] testBytes= SHORT_TEST_STRING.getBytes();&#010;     testZipUnzipBestEffort(testBytes);&#010;@@ -135,6 +135,7 @@ public class TestGZIPUtils extends TestC&#010;     testZipUnzipBestEffort(testBytes);&#010;   }&#010;   &#010;+  @Test&#010;   public void testTruncation() {&#010;     byte[] testBytes= SHORT_TEST_STRING.getBytes();&#010;     testTruncation(testBytes);&#010;@@ -144,6 +145,7 @@ public class TestGZIPUtils extends TestC&#010;     testTruncation(testBytes);&#010;   }&#010; &#010;+  @Test&#010;   public void testLimit() {&#010;     byte[] testBytes= 
SHORT_TEST_STRING.getBytes();&#010;     testLimit(testBytes);&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestNodeWalker.java Sun May 19 21:11:54&#010;2013&#010;@@ -18,20 +18,17 @@&#010; package org.apache.nutch.util;&#010; &#010; import java.io.ByteArrayInputStream;&#010;-import junit.framework.TestCase;&#010;+&#010;+import org.junit.Before;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; import org.apache.xerces.parsers.DOMParser;&#010; import org.w3c.dom.Node;&#010; import org.xml.sax.InputSource;&#010; &#010;-&#010;-&#010;-&#010; /** Unit tests for NodeWalker methods. 
*/&#010;-public class TestNodeWalker extends TestCase {&#010;-  public TestNodeWalker(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestNodeWalker {&#010; &#010;   /* a snapshot of the nutch webpage */&#010;   private final static String WEBPAGE= &#010;@@ -49,13 +46,15 @@ public class TestNodeWalker extends Test&#010; &#010;   private final static String[] ULCONTENT = new String[4];&#010;   &#010;-  protected void setUp() throws Exception{&#010;+  @Before&#010;+  public void setUp() throws Exception{&#010;     ULCONTENT[0]="crawl several billion pages per month" ;&#010;     ULCONTENT[1]="maintain an index of these pages" ;&#010;     ULCONTENT[2]="search that index up to 1000 times per second"  ;&#010;     ULCONTENT[3]="operate at minimal cost" ;&#010;   }&#010; &#010;+  @Test&#010;   public void testSkipChildren() {&#010;     DOMParser parser= new DOMParser();&#010;     &#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestPrefixStringMatcher.java Sun May&#010;19 21:11:54 2013&#010;@@ -17,13 +17,11 @@&#010; &#010; package org.apache.nutch.util;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** Unit tests for PrefixStringMatcher. 
*/&#010;-public class TestPrefixStringMatcher extends TestCase {&#010;-  public TestPrefixStringMatcher(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestPrefixStringMatcher {&#010; &#010;   private final static int NUM_TEST_ROUNDS= 20;&#010;   private final static int MAX_TEST_PREFIXES= 100;&#010;@@ -51,6 +49,7 @@ public class TestPrefixStringMatcher ext&#010;     return new String(chars);&#010;   }&#010;   &#010;+  @Test&#010;   public void testPrefixMatcher() {&#010;     int numMatches= 0;&#010;     int numInputsTested= 0;&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestStringUtil.java Sun May 19 21:11:54&#010;2013&#010;@@ -17,14 +17,13 @@&#010; &#010; package org.apache.nutch.util;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** Unit tests for StringUtil methods. 
*/&#010;-public class TestStringUtil extends TestCase {&#010;-  public TestStringUtil(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestStringUtil {&#010; &#010;+  @Test&#010;   public void testRightPad() {&#010;     String s= "my string";&#010; &#010;@@ -42,6 +41,7 @@ public class TestStringUtil extends Test&#010; &#010;   }&#010; &#010;+  @Test&#010;   public void testLeftPad() {&#010;     String s= "my string";&#010; &#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestSuffixStringMatcher.java Sun May&#010;19 21:11:54 2013&#010;@@ -17,13 +17,11 @@&#010; &#010; package org.apache.nutch.util;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** Unit tests for SuffixStringMatcher. 
*/&#010;-public class TestSuffixStringMatcher extends TestCase {&#010;-  public TestSuffixStringMatcher(String name) { &#010;-    super(name); &#010;-  }&#010;+public class TestSuffixStringMatcher {&#010; &#010;   private final static int NUM_TEST_ROUNDS= 20;&#010;   private final static int MAX_TEST_SUFFIXES= 100;&#010;@@ -51,6 +49,7 @@ public class TestSuffixStringMatcher ext&#010;     return new String(chars);&#010;   }&#010;   &#010;+  @Test&#010;   public void testSuffixMatcher() {&#010;     int numMatches= 0;&#010;     int numInputsTested= 0;&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestTableUtil.java Sun May 19 21:11:54&#010;2013&#010;@@ -17,9 +17,10 @@&#010; package org.apache.nutch.util;&#010; &#010; import org.apache.nutch.util.TableUtil;&#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010;-public class TestTableUtil extends TestCase {&#010;+public class TestTableUtil {&#010; &#010;   String urlString1 = "http://foo.com/";&#010;   String urlString2 = "http://foo.com:8900/";&#010;@@ -38,6 +39,7 @@ public class TestTableUtil extends TestC&#010;   String reversedUrlString6 = "com.foo:http";&#010;   String reversedUrlString7 = ":file/var/www/index.html";&#010; &#010;+  @Test&#010;   public void testReverseUrl() throws Exception {&#010;     assertReverse(urlString1, reversedUrlString1);&#010;     assertReverse(urlString2, reversedUrlString2);&#010;@@ -49,6 +51,7 @@ public class TestTableUtil extends TestC&#010;     assertReverse(urlString7, 
reversedUrlString7);&#010;   }&#010; &#010;+  @Test&#010;   public void testUnreverseUrl() throws Exception {&#010;     assertUnreverse(reversedUrlString1, urlString1);&#010;     assertUnreverse(reversedUrlString2, urlString2);&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java?rev=1484348&amp;r1=1484347&amp;r2=1484348&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/util/TestURLUtil.java Sun May 19 21:11:54&#010;2013&#010;@@ -19,18 +19,13 @@ package org.apache.nutch.util;&#010; &#010; import java.net.URL;&#010; &#010;-import junit.framework.TestCase;&#010;+import org.junit.Test;&#010;+import static org.junit.Assert.*;&#010; &#010; /** Test class for URLUtil */&#010;-public class TestURLUtil&#010;-  extends TestCase {&#010;-&#010;-  @Override&#010;-  protected void setUp()&#010;-    throws Exception {&#010;-    super.setUp();&#010;-  }&#010;+public class TestURLUtil {&#010; &#010;+  @Test&#010;   public void testGetDomainName()&#010;     throws Exception {&#010; &#010;@@ -81,6 +76,7 @@ public class TestURLUtil&#010; &#010;   }&#010; &#010;+  @Test&#010;   public void testGetDomainSuffix()&#010;     throws Exception {&#010;     URL url = null;&#010;@@ -133,6 +129,7 @@ public class TestURLUtil&#010; &#010;   }&#010; &#010;+  @Test&#010;   public void testGetHostBatches()&#010;     throws Exception {&#010;     URL url;&#010;@@ -165,6 +162,7 @@ public class TestURLUtil&#010; &#010;   }&#010; &#010;+  @Test&#010;   public void testChooseRepr()&#010;     throws Exception {&#010;     &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1480551 - in /nutch/trunk: CHANGES.txt default.properties pom.xml</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130509074940.846E82388906@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130509074940-846E82388906@eris-apache-org%3e</id>
<updated>2013-05-09T07:49:40Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Thu May  9 07:49:40 2013&#010;New Revision: 1480551&#010;&#010;URL: http://svn.apache.org/r1480551&#010;Log:&#010;Added crawler-commons dependency in pom.xml&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/default.properties&#010;    nutch/trunk/pom.xml&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1480551&amp;r1=1480550&amp;r2=1480551&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Thu May  9 07:49:40 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* Added crawler-commons dependency in pom.xml (tejasp)&#010;+&#010; * NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010; &#010; * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010;@@ -34,7 +36,7 @@ Nutch Change Log&#010; &#010; * NUTCH-1521 CrawlDbFilter pass null url to urlNormalizers (Lufeng via lewismc)&#010; &#010;-* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)&#010;+* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (tejasp)&#010; &#010; * NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)&#010; &#010;@@ -48,17 +50,17 @@ Nutch Change Log&#010; &#010; * NUTCH-1500 bin/crawl fails on step solrindex with wrong path to segment (Tristan Buckner,&#010;snagel)&#010; &#010;-* NUTCH-1274 Fix [cast] javac warnings (Tejas Patil via lewismc)&#010;+* NUTCH-1274 Fix [cast] javac warnings (tejasp via lewismc)&#010; &#010;-* NUTCH-1494 RSS feed plugin seems broken (Sourajit Basak, Tejas Patil and lewismc)&#010;+* NUTCH-1494 RSS feed plugin seems broken (Sourajit Basak, tejasp and lewismc)&#010; &#010;-* NUTCH-1127 JUnit test for urlfilter-validator (Tejas Patil via lewismc)&#010;+* NUTCH-1127 JUnit test for 
urlfilter-validator (tejasp via lewismc)&#010; &#010;-* NUTCH-1119 JUnit test for index-static (Tejas Patil via lewismc)&#010;+* NUTCH-1119 JUnit test for index-static (tejasp via lewismc)&#010; &#010; * NUTCH-1510 Upgrade to Hadoop 1.1.1 (markus)&#010; &#010;-* NUTCH-1118 JUnit test for index-basic (Tejas Patil via lewismc)&#010;+* NUTCH-1118 JUnit test for index-basic (tejasp via lewismc)&#010; &#010; * NUTCH-1331 limit crawler to defined depth (jnioche)&#010; &#010;&#010;Modified: nutch/trunk/default.properties&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1480551&amp;r1=1480550&amp;r2=1480551&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/default.properties (original)&#010;+++ nutch/trunk/default.properties Thu May  9 07:49:40 2013&#010;@@ -16,7 +16,7 @@&#010; name=apache-nutch&#010; version=1.7-SNAPSHOT&#010; final.name=${name}-${version}&#010;-year=2012&#010;+year=2013&#010; &#010; basedir = ./&#010; src.dir = ./src/java&#010;&#010;Modified: nutch/trunk/pom.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/pom.xml?rev=1480551&amp;r1=1480550&amp;r2=1480551&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/pom.xml (original)&#010;+++ nutch/trunk/pom.xml Thu May  9 07:49:40 2013&#010;@@ -93,6 +93,21 @@&#010; &#009;&#009;&#009;&lt;name&gt;Ferdy Galema&lt;/name&gt;&#010; &#009;&#009;&#009;&lt;email&gt;ferdy@apache.org&lt;/email&gt;&#010; &#009;&#009;&lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;tejasp&lt;/id&gt;&#010;+                        &lt;name&gt;Tejas Patil&lt;/name&gt;&#010;+                        &lt;email&gt;tejasp@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;kiranch&lt;/id&gt;&#010;+                        &lt;name&gt;Kiran 
Chitturi&lt;/name&gt;&#010;+                        &lt;email&gt;kiranch@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;fenglu&lt;/id&gt;&#010;+                        &lt;name&gt;Feng&lt;/name&gt;&#010;+                        &lt;email&gt;fenglu@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010; &#009;&lt;/developers&gt;&#010; &#009;&lt;build&gt;&#010; &#009;&#009;&lt;testSourceDirectory&gt;${basedir}/src/test&lt;/testSourceDirectory&gt;&#010;@@ -295,6 +310,12 @@&#010; &#009;&#009;&#009;&lt;version&gt;6.1.22&lt;/version&gt;&#010; &#009;&#009;&#009;&lt;optional&gt;true&lt;/optional&gt;&#010; &#009;&#009;&lt;/dependency&gt;&#010;+&#009;&#009;&lt;dependency&gt;&#010;+&#009;&#009;&#009;&lt;groupId&gt;com.google.code.crawler-commons&lt;/groupId&gt;&#010;+&#009;&#009;&#009;&lt;artifactId&gt;crawler-commons&lt;/artifactId&gt;&#010;+&#009;&#009;&#009;&lt;version&gt;0.2&lt;/version&gt;&#010;+                        &lt;scope&gt;compile&lt;/scope&gt;&#010;+&#009;&#009;&lt;/dependency&gt;&#010; &#009;&lt;/dependencies&gt;&#010; &lt;/project&gt;&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1480550 - in /nutch/branches/2.x: CHANGES.txt default.properties pom.xml</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130509074510.419FF2388906@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130509074510-419FF2388906@eris-apache-org%3e</id>
<updated>2013-05-09T07:45:10Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Thu May  9 07:45:09 2013&#010;New Revision: 1480550&#010;&#010;URL: http://svn.apache.org/r1480550&#010;Log:&#010;Added crawler-commons dependency in pom.xml&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/default.properties&#010;    nutch/branches/2.x/pom.xml&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480550&amp;r1=1480549&amp;r2=1480550&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Thu May  9 07:45:09 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* Added crawler-commons dependency in pom.xml (tejasp)&#010;+&#010; * NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010; &#010; * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010;@@ -46,11 +48,11 @@ Release 2.2 - Current Development&#010; &#010; * NUTCH-XX remove unused db.max.inlinks property in nutch-default.xml (lewismc)&#010; &#010;-* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (Tejas Patil)&#010;+* NUTCH-1284 Add site fetcher.max.crawl.delay as log output by default (tejasp)&#010; &#010; * NUTCH-1453 Substantiate tests for IndexingFilters (lufeng via lewismc)&#010; &#010;-* NUTCH-1274 Fix [cast] javac warnings (Tejas Patil via lewismc)&#010;+* NUTCH-1274 Fix [cast] javac warnings (tejasp via lewismc)&#010; &#010; * NUTCH-1516 Nutch 2.x pom.xml out of sync with ivy.xml (lewismc)&#010; &#010;&#010;Modified: nutch/branches/2.x/default.properties&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/default.properties?rev=1480550&amp;r1=1480549&amp;r2=1480550&amp;view=diff&#010;==============================================================================&#010;--- 
nutch/branches/2.x/default.properties (original)&#010;+++ nutch/branches/2.x/default.properties Thu May  9 07:45:09 2013&#010;@@ -17,7 +17,7 @@&#010; name=apache-nutch&#010; version=2.2-SNAPSHOT&#010; final.name=${name}-${version}&#010;-year=2012&#010;+year=2013&#010; &#010; basedir = ./&#010; src.dir = ./src/java&#010;&#010;Modified: nutch/branches/2.x/pom.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/pom.xml?rev=1480550&amp;r1=1480549&amp;r2=1480550&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/pom.xml (original)&#010;+++ nutch/branches/2.x/pom.xml Thu May  9 07:45:09 2013&#010;@@ -82,6 +82,21 @@&#010; &#009;&#009;&#009;&lt;name&gt;Lewis John McGibbney&lt;/name&gt;&#010; &#009;&#009;&#009;&lt;email&gt;lewismc@apache.org&lt;/email&gt;&#010; &#009;&#009;&lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;tejasp&lt;/id&gt;&#010;+                        &lt;name&gt;Tejas Patil&lt;/name&gt;&#010;+                        &lt;email&gt;tejasp@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;kiranch&lt;/id&gt;&#010;+                        &lt;name&gt;Kiran Chitturi&lt;/name&gt;&#010;+                        &lt;email&gt;kiranch@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010;+                &lt;developer&gt;&#010;+                        &lt;id&gt;fenglu&lt;/id&gt;&#010;+                        &lt;name&gt;Feng&lt;/name&gt;&#010;+                        &lt;email&gt;fenglu@apache.org&lt;/email&gt;&#010;+                &lt;/developer&gt;&#010; &#009;&lt;/developers&gt;&#010;         &lt;build&gt;&#010;           &lt;sourceDirectory&gt;src/java&lt;/sourceDirectory&gt;&#010;@@ -247,12 +262,6 @@&#010;                         &lt;optional&gt;true&lt;/optional&gt;&#010;                 &lt;/dependency&gt;&#010;       
          &lt;dependency&gt;&#010;-                        &lt;groupId&gt;org.jdom&lt;/groupId&gt;&#010;-                        &lt;artifactId&gt;jdom&lt;/artifactId&gt;&#010;-                        &lt;version&gt;1.1&lt;/version&gt;&#010;-                        &lt;optional&gt;true&lt;/optional&gt;&#010;-                &lt;/dependency&gt;&#010;-                &lt;dependency&gt;&#010;                         &lt;groupId&gt;org.apache.gora&lt;/groupId&gt;&#010;                         &lt;artifactId&gt;gora-sql&lt;/artifactId&gt;&#010;                         &lt;version&gt;0.1.1-incubating&lt;/version&gt;&#010;@@ -270,6 +279,12 @@&#010;                         &lt;version&gt;2.0.5&lt;/version&gt;&#010;                         &lt;optional&gt;true&lt;/optional&gt;&#010;                 &lt;/dependency&gt;&#010;+                &lt;dependency&gt;&#010;+                        &lt;groupId&gt;com.google.code.crawler-commons&lt;/groupId&gt;&#010;+                        &lt;artifactId&gt;crawler-commons&lt;/artifactId&gt;&#010;+                        &lt;version&gt;0.2&lt;/version&gt;&#010;+                        &lt;scope&gt;compile&lt;/scope&gt;&#010;+                &lt;/dependency&gt;&#010;         &lt;/dependencies&gt;&#010; &lt;/project&gt;&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1480485 - in /nutch/trunk: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java</title>
<author><name>snagel@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130508220454.24A9D2388962@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130508220454-24A9D2388962@eris-apache-org%3e</id>
<updated>2013-05-08T22:04:53Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: snagel&#010;Date: Wed May  8 22:04:53 2013&#010;New Revision: 1480485&#010;&#010;URL: http://svn.apache.org/r1480485&#010;Log:&#010;NUTCH-956 solrindex issues&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/conf/schema-solr4.xml&#010;    nutch/trunk/conf/schema.xml&#010;    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1480485&amp;r1=1480484&amp;r2=1480485&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Wed May  8 22:04:53 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010;+&#010; * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010; &#010; * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010;&#010;Modified: nutch/trunk/conf/schema-solr4.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema-solr4.xml?rev=1480485&amp;r1=1480484&amp;r2=1480485&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/conf/schema-solr4.xml (original)&#010;+++ nutch/trunk/conf/schema-solr4.xml Wed May  8 22:04:53 2013&#010;@@ -345,6 +345,9 @@&#010; &#010;     &lt;!-- fields for creativecommons plugin --&gt;&#010;     &lt;field name="cc" type="string" stored="true" indexed="true" multiValued="true"/&gt;&#010;+&#010;+    &lt;!-- fields for tld plugin --&gt;    &#010;+    &lt;field name="tld" type="string" stored="false" indexed="false"/&gt;&#010;  &lt;/fields&gt;&#010;  &lt;uniqueKey&gt;id&lt;/uniqueKey&gt;&#010;  &lt;defaultSearchField&gt;text&lt;/defaultSearchField&gt;&#010;&#010;Modified: 
nutch/trunk/conf/schema.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/conf/schema.xml?rev=1480485&amp;r1=1480484&amp;r2=1480485&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/conf/schema.xml (original)&#010;+++ nutch/trunk/conf/schema.xml Wed May  8 22:04:53 2013&#010;@@ -114,6 +114,9 @@&#010;         &lt;!-- fields for creativecommons plugin --&gt;&#010;         &lt;field name="cc" type="string" stored="true" indexed="true"&#010;             multiValued="true"/&gt;&#010;+            &#010;+        &lt;!-- fields for tld plugin --&gt;    &#010;+        &lt;field name="tld" type="string" stored="false" indexed="false"/&gt;&#010;     &lt;/fields&gt;&#010;     &lt;uniqueKey&gt;id&lt;/uniqueKey&gt;&#010;     &lt;defaultSearchField&gt;content&lt;/defaultSearchField&gt;&#010;&#010;Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480485&amp;r1=1480484&amp;r2=1480485&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;Wed May  8 22:04:53 2013&#010;@@ -52,12 +52,13 @@ import org.apache.commons.lang.StringUti&#010; import org.apache.commons.lang.time.DateUtils;&#010; &#010; /**&#010;- * Add (or reset) a few metaData properties as respective fields&#010;- * (if they are available), so that they can be displayed by more.jsp&#010;- * (called by search.jsp).&#010;- *&#010;- * content-type is indexed to support query by type:&#010;- * last-modifed is indexed to support query by date:&#010;+ * Add (or reset) a few metaData 
properties as respective fields (if they are&#010;+ * available), so that they can be accurately used within the search index.&#010;+ * &#010;+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length&#010;from the HTTP&#010;+ * header, 'type' field is indexed to support query by type and finally the 'title' field&#010;is an attempt &#010;+ * to reset the title if a content-disposition hint exists. The logic is that such a presence&#010;is indicative &#010;+ * that the content provider wants the filename therein to be used as the title.&#010;  *&#010;  * Still need to make content-length searchable!&#010;  *&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1480484 - in /nutch/branches/2.x: CHANGES.txt conf/schema-solr4.xml conf/schema.xml src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java</title>
<author><name>snagel@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130508220404.CC5D42388962@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130508220404-CC5D42388962@eris-apache-org%3e</id>
<updated>2013-05-08T22:04:04Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: snagel&#010;Date: Wed May  8 22:04:04 2013&#010;New Revision: 1480484&#010;&#010;URL: http://svn.apache.org/r1480484&#010;Log:&#010;NUTCH-956 solrindex issues&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/conf/schema-solr4.xml&#010;    nutch/branches/2.x/conf/schema.xml&#010;    nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1480484&amp;r1=1480483&amp;r2=1480484&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Wed May  8 22:04:04 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-956 solrindex issues: add field tld to Solr schema (Alexis via lewismc, snagel)&#010;+&#010; * NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010; &#010; * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010;&#010;Modified: nutch/branches/2.x/conf/schema-solr4.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1480484&amp;r1=1480483&amp;r2=1480484&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/schema-solr4.xml (original)&#010;+++ nutch/branches/2.x/conf/schema-solr4.xml Wed May  8 22:04:04 2013&#010;@@ -346,6 +346,9 @@&#010; &#010;     &lt;!-- fields for creativecommons plugin --&gt;&#010;     &lt;field name="cc" type="string" stored="true" indexed="true" multiValued="true"/&gt;&#010;+&#010;+    &lt;!-- fields for tld plugin --&gt;    &#010;+    &lt;field name="tld" type="string" stored="false" indexed="false"/&gt;&#010;  &lt;/fields&gt;&#010;  &lt;uniqueKey&gt;id&lt;/uniqueKey&gt;&#010;  
&lt;defaultSearchField&gt;text&lt;/defaultSearchField&gt;&#010;&#010;Modified: nutch/branches/2.x/conf/schema.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema.xml?rev=1480484&amp;r1=1480483&amp;r2=1480484&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/schema.xml (original)&#010;+++ nutch/branches/2.x/conf/schema.xml Wed May  8 22:04:04 2013&#010;@@ -114,6 +114,9 @@&#010;         &lt;!-- fields for creativecommons plugin --&gt;&#010;         &lt;field name="cc" type="string" stored="true" indexed="true"&#010;             multiValued="true"/&gt;&#010;+            &#010;+        &lt;!-- fields for tld plugin --&gt;    &#010;+        &lt;field name="tld" type="string" stored="false" indexed="false"/&gt;&#010;     &lt;/fields&gt;&#010;     &lt;uniqueKey&gt;id&lt;/uniqueKey&gt;&#010;     &lt;defaultSearchField&gt;content&lt;/defaultSearchField&gt;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1480484&amp;r1=1480483&amp;r2=1480484&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;Wed May  8 22:04:04 2013&#010;@@ -44,10 +44,12 @@ import org.slf4j.LoggerFactory;&#010; &#010; /**&#010;  * Add (or reset) a few metaData properties as respective fields (if they are&#010;- * available), so that they can be displayed by more.jsp (called by search.jsp).&#010;+ * available), so that they can be accurately used within the search index.&#010;  * &#010;- * 
content-type is indexed to support query by type: last-modifed is indexed to&#010;- * support query by date:&#010;+ * 'lastModifed' is indexed to support query by date, 'contentLength' obtains content length&#010;from the HTTP&#010;+ * header, 'type' field is indexed to support query by type and finally the 'title' field&#010;is an attempt &#010;+ * to reset the title if a content-disposition hint exists. The logic is that such a presence&#010;is indicative &#010;+ * that the content provider wants the filename therein to be used as the title.&#010;  * &#010;  * Still need to make content-length searchable!&#010;  * &#010;@@ -171,7 +173,9 @@ public class MoreIndexingFilter implemen&#010;    */&#010;   private NutchDocument addType(NutchDocument doc, WebPage page, String url) {&#010;     String mimeType = null;&#010;-    Utf8 contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));&#010;+    Utf8 contentType = page.getContentType();&#010;+    if (contentType == null)&#010;+    &#009;contentType = page.getFromHeaders(new Utf8(HttpHeaders.CONTENT_TYPE));&#010;     if (contentType == null) {&#010;       // Note by Jerome Charron on 20050415:&#010;       // Content Type not solved by a previous plugin&#010;@@ -194,13 +198,11 @@ public class MoreIndexingFilter implemen&#010;       return doc;&#010;     }&#010; &#010;-    //String scontentType = mimeType.getName();&#010;-&#010;     doc.add("type", mimeType);&#010; &#010;     // Check if we need to split the content type in sub parts&#010;-    if ( null != contentType &amp;&amp; conf.getBoolean("moreIndexingFilter.indexMimeTypeParts",&#010;true)) {&#010;-      String[] parts = getParts(contentType.toString());&#010;+    if (conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {&#010;+      String[] parts = getParts(mimeType);&#010; &#010;       for(String part: parts) {&#010;         doc.add("type", part);&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1479775 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/Fetcher.java src/java/org/apache/nutch/fetcher/OldFetcher.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130507064215.52FE1238896F@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130507064215-52FE1238896F@eris-apache-org%3e</id>
<updated>2013-05-07T06:42:15Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May  7 06:42:14 2013&#010;New Revision: 1479775&#010;&#010;URL: http://svn.apache.org/r1479775&#010;Log:&#010;NUTCH-1277 Fix [fallthrough] javac warnings&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&#010;    nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1479775&amp;r1=1479774&amp;r2=1479775&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue May  7 06:42:14 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010;+&#010; * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010; &#010; * NUTCH-1334 NPE in FetcherOutputFormat (jnioche via tejasp)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1479775&amp;r1=1479774&amp;r2=1479775&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Tue May  7 06:42:14 2013&#010;@@ -627,6 +627,7 @@ public class Fetcher extends Configured &#010;       outlinksDepthDivisor = conf.getInt("fetcher.follow.outlinks.depth.divisor", 2);&#010;     }&#010; &#010;+    @SuppressWarnings("fallthrough")&#010;     public void run() {&#010;       activeThreads.incrementAndGet(); // count threads&#010; &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java?rev=1479775&amp;r1=1479774&amp;r2=1479775&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/fetcher/OldFetcher.java Tue May  7 06:42:14 2013&#010;@@ -111,6 +111,7 @@ public class OldFetcher extends Configur&#010;       this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);&#010;     }&#010; &#010;+    @SuppressWarnings("fallthrough")&#010;     public void run() {&#010;       synchronized (OldFetcher.this) {activeThreads++;} // count threads&#010;       &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1479774 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/api/impl/RAMJobManager.java src/java/org/apache/nutch/fetcher/FetcherReducer.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130507064135.1D9CB2388847@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130507064135-1D9CB2388847@eris-apache-org%3e</id>
<updated>2013-05-07T06:41:34Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue May  7 06:41:34 2013&#010;New Revision: 1479774&#010;&#010;URL: http://svn.apache.org/r1479774&#010;Log:&#010;NUTCH-1277 Fix [fallthrough] javac warnings&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1479774&amp;r1=1479773&amp;r2=1479774&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Tue May  7 06:41:34 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1277 Fix [fallthrough] javac warnings (tejasp)&#010;+&#010; * NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010; &#010; * NUTCH-1273 Fix [deprecation] javac warnings (lewsimc + tejasp)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java?rev=1479774&amp;r1=1479773&amp;r2=1479774&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/api/impl/RAMJobManager.java Tue May  7 06:41:34&#010;2013&#010;@@ -110,6 +110,7 @@ public class RAMJobManager implements Jo&#010;   }&#010;   &#010;   @Override&#010;+  @SuppressWarnings("fallthrough")&#010;   public List&lt;JobStatus&gt; list(String crawlId, State state) throws Exception {&#010;     List&lt;JobStatus&gt; res = new ArrayList&lt;JobStatus&gt;();&#010;     if (state == 
null) state = State.ANY;&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1479774&amp;r1=1479773&amp;r2=1479774&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Tue May  7 06:41:34&#010;2013&#010;@@ -451,6 +451,7 @@ extends GoraReducer&lt;IntWritable, FetchEn&#010;     }&#010; &#010;     @Override&#010;+    @SuppressWarnings("fallthrough")&#010;     public void run() {&#010;       activeThreads.incrementAndGet(); // count threads&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1478939 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java src/java/org/apache/nutch/crawl/Generator.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130503192743.0AB832388A64@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130503192743-0AB832388A64@eris-apache-org%3e</id>
<updated>2013-05-03T19:27:42Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Fri May  3 19:27:39 2013&#010;New Revision: 1478939&#010;&#010;URL: http://svn.apache.org/r1478939&#010;Log:&#010;NUTCH-1514 Phase out the deprecated configuration properties (if possible)&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/conf/nutch-default.xml&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1478939&amp;r1=1478938&amp;r2=1478939&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Fri May  3 19:27:39 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010;+&#010; * NUTCH-1334 NPE in FetcherOutputFormat (jnioche via tejasp)&#010; &#010; * NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil (tejasp)&#010;&#010;Modified: nutch/trunk/conf/nutch-default.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1478939&amp;r1=1478938&amp;r2=1478939&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/conf/nutch-default.xml (original)&#010;+++ nutch/trunk/conf/nutch-default.xml Fri May  3 19:27:39 2013&#010;@@ -325,13 +325,6 @@&#010; &lt;!-- web db properties --&gt;&#010; &#010; &lt;property&gt;&#010;-  &lt;name&gt;db.default.fetch.interval&lt;/name&gt;&#010;-  &lt;value&gt;30&lt;/value&gt;&#010;-  &lt;description&gt;(DEPRECATED) The default number of days between re-fetches of a page.&#010;-  &lt;/description&gt;&#010;-&lt;/property&gt;&#010;-&#010;-&lt;property&gt;&#010;   &lt;name&gt;db.fetch.interval.default&lt;/name&gt;&#010;   
&lt;value&gt;2592000&lt;/value&gt;&#010;   &lt;description&gt;The default number of seconds between re-fetches of a page (30 days).&#010;@@ -611,14 +604,6 @@&#010; &lt;/property&gt;&#010; &#010; &lt;property&gt;&#010;-  &lt;name&gt;generate.max.per.host&lt;/name&gt;&#010;-  &lt;value&gt;-1&lt;/value&gt;&#010;-  &lt;description&gt;(Deprecated). Use generate.max.count and generate.count.mode instead.&#010;-  The maximum number of urls per host in a single&#010;-  fetchlist.  -1 if unlimited.&lt;/description&gt;&#010;-&lt;/property&gt;&#010;-&#010;-&lt;property&gt;&#010;   &lt;name&gt;generate.min.score&lt;/name&gt;&#010;   &lt;value&gt;0&lt;/value&gt;&#010;   &lt;description&gt;Select only entries with a score larger than&#010;@@ -698,8 +683,7 @@&#010;   &lt;name&gt;fetcher.threads.per.queue&lt;/name&gt;&#010;   &lt;value&gt;1&lt;/value&gt;&#010;   &lt;description&gt;This number is the maximum number of threads that&#010;-    should be allowed to access a queue at one time. Replaces &#010;-    deprecated parameter 'fetcher.threads.per.host'.&#010;+    should be allowed to access a queue at one time.&#010;    &lt;/description&gt;&#010; &lt;/property&gt;&#010; &#010;@@ -707,8 +691,7 @@&#010;   &lt;name&gt;fetcher.queue.mode&lt;/name&gt;&#010;   &lt;value&gt;byHost&lt;/value&gt;&#010;   &lt;description&gt;Determines how to put URLs into queues. Default value is 'byHost', &#010;-  also takes 'byDomain' or 'byIP'. Replaces the deprecated parameter &#010;-  'fetcher.threads.per.host.by.ip'.&#010;+  also takes 'byDomain' or 'byIP'. 
&#010;   &lt;/description&gt;&#010; &lt;/property&gt;&#010; &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java?rev=1478939&amp;r1=1478938&amp;r2=1478939&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/AbstractFetchSchedule.java Fri May  3 19:27:39&#010;2013&#010;@@ -47,12 +47,8 @@ public abstract class AbstractFetchSched&#010;   public void setConf(Configuration conf) {&#010;     super.setConf(conf);&#010;     if (conf == null) return;&#010;-    int oldDefaultInterval = conf.getInt("db.default.fetch.interval", 0);&#010;     defaultInterval = conf.getInt("db.fetch.interval.default", 0);&#010;-    if (oldDefaultInterval &gt; 0 &amp;&amp; defaultInterval == 0) defaultInterval = oldDefaultInterval&#010;* SECONDS_PER_DAY;&#010;-    int oldMaxInterval = conf.getInt("db.max.fetch.interval", 0);&#010;     maxInterval = conf.getInt("db.fetch.interval.max", 0 );&#010;-    if (oldMaxInterval &gt; 0 &amp;&amp; maxInterval == 0) maxInterval = oldMaxInterval *&#010;FetchSchedule.SECONDS_PER_DAY;&#010;     LOG.info("defaultInterval=" + defaultInterval);&#010;     LOG.info("maxInterval=" + maxInterval);&#010;   }&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1478939&amp;r1=1478938&amp;r2=1478939&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Fri May  3 19:27:39 2013&#010;@@ -76,7 +76,6 @@ public class Generator 
extends Configure&#010;   &#010;   // deprecated parameters &#010;   public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";&#010;-  public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";&#010; &#010;   public static class SelectorEntry implements Writable {&#010;     public Text url;&#010;@@ -140,10 +139,7 @@ public class Generator extends Configure&#010;       curTime = job.getLong(GENERATOR_CUR_TIME, System.currentTimeMillis());&#010;       limit = job.getLong(GENERATOR_TOP_N, Long.MAX_VALUE) / job.getNumReduceTasks();&#010;       maxCount = job.getInt(GENERATOR_MAX_COUNT, -1);&#010;-      // back compatibility with old param&#010;-      int oldMaxPerHost = job.getInt(GENERATE_MAX_PER_HOST, -1);&#010;-      if (maxCount==-1 &amp;&amp; oldMaxPerHost!=-1){&#010;-        maxCount = oldMaxPerHost;&#010;+      if (maxCount==-1){&#010;         byDomain = false;&#010;       }&#010;       if (GENERATOR_COUNT_VALUE_DOMAIN.equals(job.get(GENERATOR_COUNT_MODE))) byDomain =&#010;true;&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1478938 - /nutch/branches/2.x/conf/nutch-default.xml</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130503192714.1E127238899C@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130503192714-1E127238899C@eris-apache-org%3e</id>
<updated>2013-05-03T19:27:14Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Fri May  3 19:27:06 2013&#010;New Revision: 1478938&#010;&#010;URL: http://svn.apache.org/r1478938&#010;Log:&#010;NUTCH-1514 Phase out the deprecated configuration properties (if possible)&#010;&#010;Modified:&#010;    nutch/branches/2.x/conf/nutch-default.xml&#010;&#010;Modified: nutch/branches/2.x/conf/nutch-default.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1478938&amp;r1=1478937&amp;r2=1478938&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/nutch-default.xml (original)&#010;+++ nutch/branches/2.x/conf/nutch-default.xml Fri May  3 19:27:06 2013&#010;@@ -318,13 +318,6 @@&#010; &lt;!-- web db properties --&gt;&#010; &#010; &lt;property&gt;&#010;-  &lt;name&gt;db.default.fetch.interval&lt;/name&gt;&#010;-  &lt;value&gt;30&lt;/value&gt;&#010;-  &lt;description&gt;(DEPRECATED) The default number of days between re-fetches of a page.&#010;-  &lt;/description&gt;&#010;-&lt;/property&gt;&#010;-&#010;-&lt;property&gt;&#010;   &lt;name&gt;db.fetch.interval.default&lt;/name&gt;&#010;   &lt;value&gt;2592000&lt;/value&gt;&#010;   &lt;description&gt;The default number of seconds between re-fetches of a page (30 days).&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1478937 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/crawl/GeneratorJob.java src/test/org/apache/nutch/crawl/TestGenerator.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201305.mbox/%3c20130503192453.7D78D238899C@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130503192453-7D78D238899C@eris-apache-org%3e</id>
<updated>2013-05-03T19:24:53Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Fri May  3 19:24:41 2013&#010;New Revision: 1478937&#010;&#010;URL: http://svn.apache.org/r1478937&#010;Log:&#010;NUTCH-1514 Phase out the deprecated configuration properties (if possible)&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java&#010;    nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1478937&amp;r1=1478936&amp;r2=1478937&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Fri May  3 19:24:41 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1514 Phase out the deprecated configuration properties (if possible) (tejasp)&#010;+&#010; * NUTCH-1273 Fix [deprecation] javac warnings (lewsimc + tejasp)&#010; &#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java?rev=1478937&amp;r1=1478936&amp;r2=1478937&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/GeneratorJob.java Fri May  3 19:24:41&#010;2013&#010;@@ -236,7 +236,7 @@ public class GeneratorJob extends NutchT&#010;       System.out.println("    -noFilter      - do not activate the filter plugin to filter&#010;the url, default is true ");&#010;       System.out.println("    -noNorm        - do not activate the normalizer plugin to 
normalize&#010;the url, default is true ");&#010;       System.out.println("    -adddays       - Adds numDays to the current time to facilitate&#010;crawling urls already");&#010;-      System.out.println("                     fetched sooner then db.default.fetch.interval.&#010;Default value is 0.");&#010;+      System.out.println("                     fetched sooner then db.fetch.interval.default.&#010;Default value is 0.");&#010;       System.out.println("----------------------");&#010;       System.out.println("Please set the params.");&#010;       return -1;&#010;&#010;Modified: nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java?rev=1478937&amp;r1=1478936&amp;r2=1478937&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java (original)&#010;+++ nutch/branches/2.x/src/test/org/apache/nutch/crawl/TestGenerator.java Fri May  3 19:24:41&#010;2013&#010;@@ -107,7 +107,7 @@ public class TestGenerator extends Abstr&#010;   }&#010; &#010;   /**&#010;-   * Test that generator obeys the property "generate.max.per.host".&#010;+   * Test that generator obeys the property "generate.max.count" and "generate.count.mode".&#010;    *&#010;    * @throws Exception&#010;    */&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1477847 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130430214219.7B69223889E7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130430214219-7B69223889E7@eris-apache-org%3e</id>
<updated>2013-04-30T21:42:19Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue Apr 30 21:42:19 2013&#010;New Revision: 1477847&#010;&#010;URL: http://svn.apache.org/r1477847&#010;Log:&#010;NUTCH-1334 NPE in FetcherOutputFormat&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1477847&amp;r1=1477846&amp;r2=1477847&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue Apr 30 21:42:19 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1334 NPE in FetcherOutputFormat (jnioche via tejasp)&#010;+&#010; * NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil (tejasp)&#010; &#010; * NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java?rev=1477847&amp;r1=1477846&amp;r2=1477847&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java Tue Apr 30 21:42:19&#010;2013&#010;@@ -98,9 +98,9 @@ public class FetcherOutputFormat impleme&#010;           &#010;           if (w instanceof CrawlDatum)&#010;             fetchOut.append(key, w);&#010;-          else if (w instanceof Content)&#010;+          else if (w instanceof Content &amp;&amp; contentOut != null)&#010;             contentOut.append(key, w);&#010;-          else if (w instanceof Parse)&#010;+          else if (w instanceof Parse &amp;&amp; parseOut 
!= null)&#010;             parseOut.write(key, (Parse)w);&#010;         }&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1477821 - in /nutch/trunk: ./ src/java/org/apache/nutch/util/ src/plugin/index-more/src/java/org/apache/nutch/indexer/more/ src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ src/plugin/protocol-file/src/java/org/apache/nutch/protoc...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130430205148.D6C5323889FD@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130430205148-D6C5323889FD@eris-apache-org%3e</id>
<updated>2013-04-30T20:51:48Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue Apr 30 20:51:44 2013&#010;New Revision: 1477821&#010;&#010;URL: http://svn.apache.org/r1477821&#010;Log:&#010;NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java&#010;    nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;    nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java&#010;    nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1477821&amp;r1=1477820&amp;r2=1477821&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Tue Apr 30 20:51:44 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1549 Fix deprecated use of Tika MimeType API in o.a.n.util.MimeUtil (tejasp)&#010;+&#010; * NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010; &#010; * NUTCH-829 duplicate hadoop temp files (Mike Baranczak, lewismc, tejasp)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java?rev=1477821&amp;r1=1477820&amp;r2=1477821&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/util/MimeUtil.java Tue Apr 30 20:51:44 2013&#010;@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configurat&#010; &#010; // Tika imports&#010; import org.apache.tika.Tika;&#010;+import 
org.apache.tika.config.TikaConfig;&#010; import org.apache.tika.mime.MimeType;&#010; import org.apache.tika.mime.MimeTypeException;&#010; import org.apache.tika.mime.MimeTypes;&#010;@@ -169,12 +170,19 @@ public final class MimeUtil {&#010;         || (type != null &amp;&amp; type.getName().equals(MimeTypes.OCTET_STREAM))) {&#010;       // If no mime-type header, or cannot find a corresponding registered&#010;       // mime-type, then guess a mime-type from the url pattern&#010;-      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes&#010;-          .getMimeType(url) : type;&#010;+      try {&#010;+        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();&#010;+        Tika tika = new Tika(tikaConfig);&#010;+        retType = tika.detect(url) != null ? tika.detect(url) : null;&#010;+      } catch (Exception e) {&#010;+        String message = "Problem loading default Tika configuration";&#010;+        LOG.error(message, e);&#010;+        throw new RuntimeException(e);&#010;+      }&#010;+    } else {&#010;+        retType = type.getName();&#010;     }&#010; &#010;-    retType= type.getName();&#010;-&#010;     // if magic is enabled use mime magic to guess if the mime type returned&#010;     // from the magic guess is different than the one that's already set so far&#010;     // if it is, and it's not the default mime type, then go with the mime type&#010;&#010;Modified: nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1477821&amp;r1=1477820&amp;r2=1477821&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;(original)&#010;+++ 
nutch/trunk/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;Tue Apr 30 20:51:44 2013&#010;@@ -16,8 +16,6 @@&#010;  */&#010; package org.apache.nutch.indexer.more;&#010; &#010;-import org.apache.tika.mime.MimeType;&#010;-&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; &#010;@@ -36,19 +34,16 @@ import org.apache.nutch.crawl.CrawlDatum&#010; import org.apache.nutch.crawl.Inlinks;&#010; import org.apache.nutch.parse.ParseData;&#010; import org.apache.nutch.util.MimeUtil;&#010;+import org.apache.tika.Tika;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.io.Text;&#010; import org.apache.hadoop.io.Writable;&#010; &#010; import java.text.ParseException;&#010;-import java.text.SimpleDateFormat;&#010; &#010; import java.io.BufferedReader;&#010;-import java.io.FileReader;&#010; import java.io.IOException;&#010;-import java.io.Reader;&#010;-import java.io.StringReader;&#010; import java.util.Date;&#010; import java.util.regex.*;&#010; import java.util.HashMap;&#010;@@ -72,11 +67,9 @@ import org.apache.commons.lang.time.Date&#010; public class MoreIndexingFilter implements IndexingFilter {&#010;   public static final Logger LOG = LoggerFactory.getLogger(MoreIndexingFilter.class);&#010; &#010;-  /** A flag that tells if magic resolution must be performed */&#010;-  private boolean MAGIC;&#010;-&#010;   /** Get the MimeTypes resolver instance. 
*/&#010;   private MimeUtil MIME;&#010;+  private Tika tika = new Tika();&#010; &#010;   /** Map for mime-type substitution */&#010;   private HashMap&lt;String,String&gt; mimeMap = null;&#010;@@ -114,7 +107,6 @@ public class MoreIndexingFilter implemen&#010; &#010;     // un-stored, indexed and un-tokenized&#010;     doc.add("date", new Date(time));&#010;-&#010;     return doc;&#010;   }&#010; &#010;@@ -220,7 +212,8 @@ public class MoreIndexingFilter implemen&#010;       // } else {&#010;       //   contentType = MIME.getMimeType(url);&#010;       // }&#010;-      mimeType = MIME.getMimeType(url);&#010;+&#010;+      mimeType = tika.detect(url);&#010;     } else {&#010;       mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType));&#010;     }&#010;@@ -240,7 +233,6 @@ public class MoreIndexingFilter implemen&#010;     }&#010; &#010;     contentType = mimeType;&#010;-&#010;     doc.add("type", contentType);&#010; &#010;     // Check if we need to split the content type in sub parts&#010;&#010;Modified: nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java?rev=1477821&amp;r1=1477820&amp;r2=1477821&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/parse-zip/src/java/org/apache/nutch/parse/zip/ZipTextExtractor.java&#010;Tue Apr 30 20:51:44 2013&#010;@@ -38,13 +38,10 @@ import org.apache.nutch.net.protocols.Re&#010; import org.apache.nutch.parse.Parse;&#010; import org.apache.nutch.parse.ParseData;&#010; import org.apache.nutch.parse.ParseUtil;&#010;-import org.apache.nutch.parse.ParseImpl;&#010; import org.apache.nutch.parse.ParseException;&#010; import org.apache.nutch.parse.Outlink;&#010; import 
org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.util.MimeUtil;&#010;-&#010;-&#010;+import org.apache.tika.Tika;&#010; &#010; /**&#010;  *&#010;@@ -52,26 +49,18 @@ import org.apache.nutch.util.MimeUtil;&#010;  */&#010; public class ZipTextExtractor {&#010;   &#010;-  /** Get the MimeTypes resolver instance. */&#010;-  private MimeUtil MIME;&#010;-  &#010;   public static final Logger LOG = LoggerFactory.getLogger(ZipTextExtractor.class);&#010; &#010;   private Configuration conf;&#010;-  &#010;-  &#010;+&#010;   /** Creates a new instance of ZipTextExtractor */&#010;   public ZipTextExtractor(Configuration conf) {&#010;     this.conf = conf;&#010;-    this.MIME = new MimeUtil(conf);&#010;   }&#010;   &#010;-  public String extractText(InputStream input, String url, List outLinksList) throws IOException&#010;{&#010;+  public String extractText(InputStream input, String url, List&lt;Outlink&gt; outLinksList)&#010;throws IOException {&#010;     String resultText = "";&#010;-    byte temp;&#010;-    &#010;     ZipInputStream zin = new ZipInputStream(input);&#010;-    &#010;     ZipEntry entry;&#010;     &#010;     while ((entry = zin.getNextEntry()) != null) {&#010;@@ -93,7 +82,8 @@ public class ZipTextExtractor {&#010;         int i = fname.lastIndexOf('.');&#010;         if (i != -1) {&#010;           // Trying to resolve the Mime-Type&#010;-          String contentType = MIME.getMimeType(fname);&#010;+          Tika tika = new Tika();&#010;+          String contentType = tika.detect(fname);&#010;           try {&#010;             Metadata metadata = new Metadata();&#010;             metadata.set(Response.CONTENT_LENGTH, Long.toString(entry.getSize()));&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1477821&amp;r1=1477820&amp;r2=1477821&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;Tue Apr 30 20:51:44 2013&#010;@@ -224,7 +224,7 @@ public class FileResponse {&#010;     headers.set(Response.LAST_MODIFIED,&#010;         HttpDateFormat.toString(f.lastModified()));&#010; &#010;-    String mimeType = MIME.getMimeType(f);&#010;+    String mimeType = tika.detect(f);&#010; &#010;     headers.set(Response.CONTENT_TYPE, mimeType != null ? mimeType : "");&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1477792 - in /nutch/branches/2.x: ./ ivy/ src/java/org/apache/nutch/plugin/ src/java/org/apache/nutch/util/ src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/ src/plugin/...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130430193624.76D9023888CD@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130430193624-76D9023888CD@eris-apache-org%3e</id>
<updated>2013-04-30T19:36:24Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Tue Apr 30 19:36:23 2013&#010;New Revision: 1477792&#010;&#010;URL: http://svn.apache.org/r1477792&#010;Log:&#010;NUTCH-1273 Fix [deprecation] javac warnings&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/ivy/ivy.xml&#010;    nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;    nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;    nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;    nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Tue Apr 30 19:36:23 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1273 Fix [deprecation] javac warnings (lewsimc + tejasp)&#010;+&#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010; &#010; * NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010;&#010;Modified: nutch/branches/2.x/ivy/ivy.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/ivy/ivy.xml 
(original)&#010;+++ nutch/branches/2.x/ivy/ivy.xml Tue Apr 30 19:36:23 2013&#010;@@ -65,6 +65,7 @@&#010; &#010;     &lt;dependency org="xerces" name="xercesImpl" rev="2.9.1" /&gt;&#010;     &lt;dependency org="xerces" name="xmlParserAPIs" rev="2.6.2" /&gt;&#010;+    &lt;dependency org="xalan" name="serializer" rev="2.7.1" /&gt;&#010;     &lt;dependency org="oro" name="oro" rev="2.0.8" /&gt;&#010; &#010;     &lt;dependency org="org.jdom" name="jdom" rev="1.1" conf="*-&gt;default" /&gt;&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginDescriptor.java Tue Apr 30 19:36:23&#010;2013&#010;@@ -219,7 +219,7 @@ public class PluginDescriptor {&#010;    */&#010;   public void addExportedLibRelative(String pLibPath)&#010;       throws MalformedURLException {&#010;-    URL url = new File(getPluginPath() + File.separator + pLibPath).toURL();&#010;+    URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL();&#010;     fExportedLibs.add(url);&#010;   }&#010; &#010;@@ -248,7 +248,7 @@ public class PluginDescriptor {&#010;    */&#010;   public void addNotExportedLibRelative(String pLibPath)&#010;       throws MalformedURLException {&#010;-    URL url = new File(getPluginPath() + File.separator + pLibPath).toURL();&#010;+    URL url = new File(getPluginPath() + File.separator + pLibPath).toURI().toURL();&#010;     fNotExportedLibs.add(url);&#010;   }&#010; &#010;@@ -279,7 +279,7 @@ public class PluginDescriptor {&#010;     try {&#010;       for (File file2 : file.listFiles()) {&#010;         if 
(file2.getAbsolutePath().endsWith("properties"))&#010;-          arrayList.add(file2.getParentFile().toURL());&#010;+          arrayList.add(file2.getParentFile().toURI().toURL());&#010;       }&#010;     } catch (MalformedURLException e) {&#010;       LOG.debug(getPluginId() + " " + e.toString());&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/plugin/PluginManifestParser.java Tue Apr&#010;30 19:36:23 2013&#010;@@ -147,7 +147,7 @@ public class PluginManifestParser {&#010;   private PluginDescriptor parseManifestFile(String pManifestPath)&#010;       throws MalformedURLException, SAXException, IOException,&#010;       ParserConfigurationException {&#010;-    Document document = parseXML(new File(pManifestPath).toURL());&#010;+    Document document = parseXML(new File(pManifestPath).toURI().toURL());&#010;     String pPath = new File(pManifestPath).getParent();&#010;     return parsePlugin(document, pPath);&#010;   }&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/util/MimeUtil.java Tue Apr 30 19:36:23 2013&#010;@@ -25,6 +25,7 @@ import org.apache.hadoop.conf.Configurat&#010; &#010; // Tika imports&#010; 
import org.apache.tika.Tika;&#010;+import org.apache.tika.config.TikaConfig;&#010; import org.apache.tika.mime.MimeType;&#010; import org.apache.tika.mime.MimeTypeException;&#010; import org.apache.tika.mime.MimeTypes;&#010;@@ -33,7 +34,7 @@ import org.apache.tika.mime.MimeTypesFac&#010; // Slf4j logging imports&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010;-&#010;+ &#010; // imported for Javadoc&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; &#010;@@ -169,11 +170,19 @@ public final class MimeUtil {&#010;         || (type != null &amp;&amp; type.getName().equals(MimeTypes.OCTET_STREAM))) {&#010;       // If no mime-type header, or cannot find a corresponding registered&#010;       // mime-type, then guess a mime-type from the url pattern&#010;-      type = this.mimeTypes.getMimeType(url) != null ? this.mimeTypes&#010;-          .getMimeType(url) : type;&#010;-    }&#010; &#010;-    retType= type.getName();&#010;+      try {&#010;+        TikaConfig tikaConfig = TikaConfig.getDefaultConfig();&#010;+        Tika tika = new Tika(tikaConfig);&#010;+        retType = tika.detect(url) != null ? 
tika.detect(url) : null;&#010;+      } catch (Exception e) {&#010;+        String message = "Problem loading default Tika configuration";&#010;+        LOG.error(message, e);&#010;+        throw new RuntimeException(e);&#010;+      }&#010;+    } else {&#010;+        retType = type.getName();&#010;+    }&#010; &#010;     // if magic is enabled use mime magic to guess if the mime type returned&#010;     // from the magic guess is different than the one that's already set so far&#010;@@ -257,6 +266,4 @@ public final class MimeUtil {&#010;       return null;&#010;     }&#010;   }&#010;-&#010;-&#010; }&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/java/org/apache/nutch/parse/tika/TikaParser.java&#010;Tue Apr 30 19:36:23 2013&#010;@@ -50,6 +50,7 @@ import org.apache.nutch.util.MimeUtil;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; import org.apache.nutch.util.TableUtil;&#010; import org.apache.tika.metadata.Metadata;&#010;+import org.apache.tika.metadata.TikaCoreProperties;&#010; import org.apache.tika.mime.MimeType;&#010; import org.apache.tika.parser.ParseContext;&#010; import org.apache.tika.parser.Parser;&#010;@@ -164,7 +165,8 @@ public class TikaParser implements org.a&#010;     // populate Nutch metadata with Tika metadata&#010;     String[] TikaMDNames = tikamd.names();&#010;     for (String tikaMDName : TikaMDNames) {&#010;-      if (tikaMDName.equalsIgnoreCase(Metadata.TITLE)) continue;&#010;+      if 
(tikaMDName.equalsIgnoreCase(TikaCoreProperties.TITLE.toString()))&#010;+      continue;&#010;       // TODO what if multivalued?&#010;       page.putToMetadata(new Utf8(tikaMDName), ByteBuffer.wrap(Bytes.toBytes(tikamd&#010;           .get(tikaMDName))));&#010;&#010;Modified: nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/DOMContentUtilsTest.java&#010;Tue Apr 30 19:36:23 2013&#010;@@ -29,7 +29,7 @@ import org.apache.nutch.util.NutchConfig&#010; import org.apache.tika.metadata.Metadata;&#010; import org.apache.tika.parser.ParseContext;&#010; import org.apache.tika.parser.Parser;&#010;-import org.apache.xml.serialize.DOMSerializerImpl;&#010;+import org.apache.xml.serializer.dom3.LSSerializerImpl;&#010; &#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010;@@ -229,9 +229,9 @@ public class DOMContentUtilsTest extends&#010; &#009;&#009;&#009;&#009;fail("caught exception: " + e);&#010; &#009;&#009;&#009;}&#010; &#009;&#009;&#009;testDOMs[i] = root;&#010;-&#009;&#009;&#009;DOMSerializerImpl ds = new DOMSerializerImpl();&#010;+&#009;&#009;&#009;LSSerializerImpl lsi = new LSSerializerImpl();&#010; &#009;&#009;&#009;System.out.println("input " + i + ": '" + testPages[i] + "'");&#010;-&#009;&#009;&#009;System.out.println("output " + i + ": '" + ds.writeToString(root)&#010;+&#009;&#009;&#009;System.out.println("output " + i + ": '" + lsi.writeToString(root)&#010; &#009;&#009;&#009;&#009;&#009;+ "'");&#010; &#010; 
&#009;&#009;}&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java&#010;Tue Apr 30 19:36:23 2013&#010;@@ -143,7 +143,7 @@ public class FileResponse {&#010;       if (!f.equals(f.getCanonicalFile())) {&#010;         // set headers&#010;         //hdrs.put("Location", f.getCanonicalFile().toURI());&#010;-        headers.set(Response.LOCATION, f.getCanonicalFile().toURL().toString());&#010;+        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());&#010; &#010;         this.code = 300;  // http redirect&#010;         return;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java?rev=1477792&amp;r1=1477791&amp;r2=1477792&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-httpclient/src/java/org/apache/nutch/protocol/httpclient/Http.java&#010;Tue Apr 30 19:36:23 2013&#010;@@ -45,6 +45,8 @@ import org.apache.commons.httpclient.NTC&#010; import org.apache.commons.httpclient.auth.AuthScope;&#010; import 
org.apache.commons.httpclient.params.HttpConnectionManagerParams;&#010; import org.apache.commons.httpclient.protocol.Protocol;&#010;+import org.apache.commons.httpclient.protocol.ProtocolSocketFactory;&#010;+import org.apache.commons.httpclient.protocol.SSLProtocolSocketFactory;&#010; &#010; // Nutch imports&#010; import org.apache.nutch.storage.WebPage;&#010;@@ -124,7 +126,7 @@ public class Http extends HttpBase {&#010; &#009; */&#010; &#009;public void setConf(Configuration conf) {&#010; &#009;&#009;super.setConf(conf);&#010;-&#009;&#009;this.conf = conf;&#010;+&#009;&#009;Http.conf = conf;&#010; &#009;&#009;this.maxThreadsTotal = conf.getInt("fetcher.threads.fetch", 10);&#010; &#009;&#009;this.proxyUsername = conf.get("http.proxy.username", "");&#010; &#009;&#009;this.proxyPassword = conf.get("http.proxy.password", "");&#010;@@ -178,8 +180,8 @@ public class Http extends HttpBase {&#010; &#009;private void configureClient() {&#010; &#010; &#009;&#009;// Set up an HTTPS socket factory that accepts self-signed certs.&#010;-&#009;&#009;Protocol https = new Protocol("https",&#010;-&#009;&#009;&#009;&#009;new DummySSLProtocolSocketFactory(), 443);&#010;+&#009;  ProtocolSocketFactory factory = new SSLProtocolSocketFactory();&#010;+&#009;&#009;Protocol https = new Protocol("https", factory, 443);&#010; &#009;&#009;Protocol.registerProtocol("https", https);&#010; &#010; &#009;&#009;HttpConnectionManagerParams params = connectionManager.getParams();&#010;@@ -195,7 +197,7 @@ public class Http extends HttpBase {&#010; &#009;&#009;client.getParams().setConnectionManagerTimeout(timeout);&#010; &#010; &#009;&#009;HostConfiguration hostConf = client.getHostConfiguration();&#010;-&#009;&#009;ArrayList headers = new ArrayList();&#010;+&#009;&#009;ArrayList&lt;Header&gt; headers = new ArrayList&lt;Header&gt;();&#010; &#009;&#009;// Set the User Agent in the header&#010; &#009;&#009;headers.add(new Header("User-Agent", userAgent));&#010; &#009;&#009;// prefer English&#010;@@ 
-222,7 +224,7 @@ public class Http extends HttpBase {&#010; &#009;&#009;&#009;&#009;&#009;&#009;this.proxyPort, this.proxyRealm);&#010; &#010; &#009;&#009;&#009;&#009;NTCredentials proxyCredentials = new NTCredentials(&#010;-&#009;&#009;&#009;&#009;&#009;&#009;this.proxyUsername, this.proxyPassword, this.agentHost,&#010;+&#009;&#009;&#009;&#009;&#009;&#009;this.proxyUsername, this.proxyPassword, Http.agentHost,&#010; &#009;&#009;&#009;&#009;&#009;&#009;this.proxyRealm);&#010; &#010; &#009;&#009;&#009;&#009;client.getState().setProxyCredentials(proxyAuthScope,&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1477319 - in /nutch/branches/2.x: ./ ivy/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/a...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130429202653.E9DDD2388A32@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130429202653-E9DDD2388A32@eris-apache-org%3e</id>
<updated>2013-04-29T20:26:53Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Mon Apr 29 20:26:52 2013&#010;New Revision: 1477319&#010;&#010;URL: http://svn.apache.org/r1477319&#010;Log:&#010;NUTCH-1031 Delegate parsing of robots.txt to crawler-commons&#010;&#010;Added:&#010;    nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java&#010;    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;Removed:&#010;    nutch/branches/2.x/src/java/org/apache/nutch/protocol/EmptyRobotRules.java&#010;    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/ivy/ivy.xml&#010;    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java&#010;    nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;    nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;    nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;    nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;    nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon Apr 29 20:26:52 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010;+&#010; * NUTCH-346 Improve 
readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010; &#010; * NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010;&#010;Modified: nutch/branches/2.x/ivy/ivy.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/ivy/ivy.xml?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/ivy/ivy.xml (original)&#010;+++ nutch/branches/2.x/ivy/ivy.xml Mon Apr 29 20:26:52 2013&#010;@@ -70,6 +70,7 @@&#010;     &lt;dependency org="org.jdom" name="jdom" rev="1.1" conf="*-&gt;default" /&gt;&#010; &#010;     &lt;dependency org="com.google.guava" name="guava" rev="11.0.2" /&gt;&#010;+    &lt;dependency org="com.google.code.crawler-commons" name="crawler-commons" rev="0.2"&#010;/&gt;&#010; &#010;     &lt;!--Configuration: test --&gt;&#010; &#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherReducer.java Mon Apr 29 20:26:52&#010;2013&#010;@@ -61,6 +61,8 @@ import org.apache.nutch.util.TableUtil;&#010; import org.apache.nutch.util.URLUtil;&#010; import org.slf4j.Logger;&#010; &#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010; public class FetcherReducer&#010; extends GoraReducer&lt;IntWritable, FetchEntry, String, WebPage&gt; {&#010; &#010;@@ -152,9 +154,6 @@ extends GoraReducer&lt;IntWritable, FetchEn&#010;       return "FetchItem [queueID=" + queueID + ", url=" + url + ", u=" + u&#010;           + ", page=" + page + "]";&#010;     }&#010;-    &#010;-    
&#010;-&#010;   }&#010; &#010;   /**&#010;@@ -489,8 +488,8 @@ extends GoraReducer&lt;IntWritable, FetchEn&#010; &#010;             // fetch the page&#010;             final Protocol protocol = this.protocolFactory.getProtocol(fit.url);&#010;-            final RobotRules rules = protocol.getRobotRules(fit.url, fit.page);&#010;-            if (!rules.isAllowed(fit.u)) {&#010;+            final BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.page);&#010;+            if (!rules.isAllowed(fit.u.toString())) {&#010;               // unblock&#010;               fetchQueues.finishFetchItem(fit, true);&#010;               if (LOG.isDebugEnabled()) {&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/Protocol.java Mon Apr 29 20:26:52&#010;2013&#010;@@ -23,6 +23,8 @@ import org.apache.hadoop.conf.Configurab&#010; import org.apache.nutch.plugin.FieldPluggable;&#010; import org.apache.nutch.storage.WebPage;&#010; &#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010; /** A retriever of url content.  Implemented by protocol extensions. */&#010; public interface Protocol extends FieldPluggable, Configurable {&#010;   /** The name of the extension point. 
*/&#010;@@ -46,7 +48,8 @@ public interface Protocol extends FieldP&#010;    */&#010;   public final static String CHECK_ROBOTS = "protocol.plugin.check.robots";&#010; &#010;-  /** Returns the {@link Content} for a fetchlist entry.&#010;+  /*&#010;+   * Returns the {@link Content} for a fetchlist entry.&#010;    */&#010;   ProtocolOutput getProtocolOutput(String url, WebPage page);&#010; &#010;@@ -56,5 +59,5 @@ public interface Protocol extends FieldP&#010;    * @param page&#010;    * @return robot rules (specific for this url or default), never null&#010;    */&#010;-  RobotRules getRobotRules(String url, WebPage page);&#010;+  BaseRobotRules getRobotRules(String url, WebPage page);&#010; }&#010;&#010;Added: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1477319&amp;view=auto&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java (added)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java Mon Apr 29&#010;20:26:52 2013&#010;@@ -0,0 +1,195 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements. See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  
You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol;&#010;+&#010;+// JDK imports&#010;+import java.io.File;&#010;+import java.io.FileReader;&#010;+import java.io.LineNumberReader;&#010;+import java.net.URL;&#010;+import java.util.ArrayList;&#010;+import java.util.Hashtable;&#010;+import java.util.StringTokenizer;&#010;+&#010;+// Commons Logging imports&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+// Nutch imports&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.hadoop.conf.Configurable;&#010;+import org.apache.hadoop.io.Text;&#010;+&#010;+import com.google.common.io.Files;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;&#010;+import crawlercommons.robots.SimpleRobotRulesParser;&#010;+&#010;+/**&#010;+ * This class uses crawler-commons for handling the parsing of {@code robots.txt} files.&#010;+ * It emits SimpleRobotRules objects, which describe the download permissions&#010;+ * as described in SimpleRobotRulesParser.&#010;+ */&#010;+public abstract class RobotRulesParser implements Configurable {&#010;+&#010;+  public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);&#010;+&#010;+  protected static final Hashtable&lt;String, BaseRobotRules&gt; CACHE = new Hashtable&lt;String,&#010;BaseRobotRules&gt; ();&#010;+&#010;+  /**&#010;+   *  A {@link BaseRobotRules} object appropriate for 
use&#010;+   *  when the {@code robots.txt} file is empty or missing;&#010;+   *  all requests are allowed.&#010;+   */&#010;+  public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);&#010;+&#010;+  /**&#010;+   *  A {@link BaseRobotRules} object appropriate for use when the &#010;+   *  {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}&#010;+   *  response; all requests are disallowed. &#010;+   */&#010;+  public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);&#010;+&#010;+  private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();&#010;+  private Configuration conf;&#010;+  protected String agentNames;&#010;+&#010;+  public RobotRulesParser() { }&#010;+&#010;+  public RobotRulesParser(Configuration conf) {&#010;+    setConf(conf);&#010;+  }&#010;+&#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;+  public void setConf(Configuration conf) {&#010;+    this.conf = conf;&#010;+&#010;+    // Grab the agent names we advertise to robots files.&#010;+    String agentName = conf.get("http.agent.name");&#010;+    if (null == agentName) {&#010;+      throw new RuntimeException("Agent name not configured!");&#010;+    }&#010;+&#010;+    String agentNames = conf.get("http.robots.agents");&#010;+    StringTokenizer tok = new StringTokenizer(agentNames, ",");&#010;+    ArrayList&lt;String&gt; agents = new ArrayList&lt;String&gt;();&#010;+    while (tok.hasMoreTokens()) {&#010;+      agents.add(tok.nextToken().trim());&#010;+    }&#010;+&#010;+    /**&#010;+     * If there are no agents for robots-parsing, use the&#010;+     * default agent-string. 
If both are present, our agent-string&#010;+     * should be the first one we advertise to robots-parsing.&#010;+     */&#010;+    if (agents.size() == 0) {&#010;+      if (LOG.isErrorEnabled()) {&#010;+        LOG.error("No agents listed in 'http.robots.agents' property!");&#010;+      }&#010;+    } else { &#010;+      StringBuffer combinedAgentsString = new StringBuffer(agentName);&#010;+      int index = 0;&#010;+&#010;+      if ((agents.get(0)).equalsIgnoreCase(agentName))&#010;+        index++;&#010;+      else if (LOG.isErrorEnabled()) {&#010;+        LOG.error("Agent we advertise (" + agentName&#010;+            + ") not listed first in 'http.robots.agents' property!");&#010;+      }&#010;+&#010;+      // append all the agents from the http.robots.agents property&#010;+      for(; index &lt; agents.size(); index++) {&#010;+        combinedAgentsString.append(", " + agents.get(index));&#010;+      }&#010;+&#010;+      // always make sure "*" is included in the end&#010;+      combinedAgentsString.append(", *");&#010;+      this.agentNames = combinedAgentsString.toString();&#010;+    }&#010;+  }&#010;+&#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;+  public Configuration getConf() {&#010;+    return conf;&#010;+  }&#010;+&#010;+  /**&#010;+   * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons&#010;+   *    &#010;+   * @param url A string containing url&#010;+   * @param content Contents of the robots file in a byte array &#010;+   * @param contentType The &#010;+   * @param robotName A string containing value of  &#010;+   * @return BaseRobotRules object &#010;+   */&#010;+  public BaseRobotRules parseRules (String url, byte[] content, String contentType, String&#010;robotName) {&#010;+    return robotParser.parseContent(url, content, contentType, robotName); &#010;+  }&#010;+&#010;+  public BaseRobotRules getRobotRulesSet(Protocol protocol, String url) {&#010;+    URL u = null;&#010;+    
try {&#010;+      u = new URL(url);&#010;+    } catch (Exception e) {&#010;+      return EMPTY_RULES;&#010;+    }&#010;+    return getRobotRulesSet(protocol, u);&#010;+  }&#010;+&#010;+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);&#010;+&#010;+  /** command-line main for testing */&#010;+  public static void main(String[] argv) {&#010;+&#010;+    if (argv.length &lt; 3) {&#010;+      System.err.println("Usage: RobotRulesParser &lt;robots-file&gt; &lt;url-file&gt; &lt;agent-names&gt;\n");&#010;+      System.err.println("    &lt;robots-file&gt; - Input robots.txt file which will be parsed.");&#010;+      System.err.println("    &lt;url-file&gt;    - Contains input URLs (1 per line) which&#010;are tested against the rules.");&#010;+      System.err.println("    &lt;agent-names&gt; - Input agent name. Multiple agent names&#010;can be specified using spaces.");&#010;+      System.exit(-1);&#010;+    }&#010;+&#010;+    try {&#010;+      StringBuilder agentNames = new StringBuilder();&#010;+      for(int counter = 2; counter &lt; argv.length; counter++) &#010;+        agentNames.append(argv[counter]).append(",");&#010;+&#010;+      agentNames.deleteCharAt(agentNames.length()-1);&#010;+&#010;+      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));&#010;+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain",&#010;agentNames.toString());&#010;+&#010;+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));&#010;+      String testPath = testsIn.readLine().trim();&#010;+      while (testPath != null) {&#010;+        System.out.println( (rules.isAllowed(testPath) ? 
"allowed" : "not allowed") +&#010;+            ":\t" + testPath);&#010;+        testPath = testsIn.readLine();&#010;+      }&#010;+      testsIn.close();&#010;+    } catch (Exception e) {&#010;+      e.printStackTrace();&#010;+    }&#010;+  }&#010;+}&#010;&#010;Modified: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -32,23 +32,21 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatusCodes;&#010; import org.apache.nutch.protocol.ProtocolStatusUtils;&#010;-import org.apache.nutch.protocol.RobotRules;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.GZIPUtils;&#010; import org.apache.nutch.util.DeflateUtils;&#010; import org.apache.nutch.util.MimeUtil;&#010; &#010;-/**&#010;- * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;- */&#010;-public abstract class HttpBase implements Protocol {&#010;+// crawler-commons imports&#010;+import crawlercommons.robots.BaseRobotRules;&#010; &#010;+public abstract class HttpBase implements Protocol {&#010; &#010;   public static final int BUFFER_SIZE = 8 * 1024;&#010; &#010;   private static final byte[] EMPTY_CONTENT = new byte[0];&#010; &#010;-  private RobotRulesParser robots = null;&#010;+  private HttpRobotRulesParser robots = null;&#010; &#010;   /** The proxy hostname. 
*/&#010;   protected String proxyHost = null;&#010;@@ -102,7 +100,7 @@ public abstract class HttpBase implement&#010;     if (logger != null) {&#010;       this.logger = logger;&#010;     }&#010;-    robots = new RobotRulesParser();&#010;+    robots = new HttpRobotRulesParser();&#010;   }&#010; &#010;   // Inherited Javadoc&#010;@@ -128,13 +126,10 @@ public abstract class HttpBase implement&#010;     return this.conf;&#010;   }&#010; &#010;-&#010;-&#010;   public ProtocolOutput getProtocolOutput(String url, WebPage page) {&#010; &#010;     try {&#010;       URL u = new URL(url);&#010;-      String host = null;&#010;       Response response = getResponse(u, page, false); // make a request&#010;       int code = response.getCode();&#010;       byte[] content = response.getContent();&#010;@@ -145,7 +140,6 @@ public abstract class HttpBase implement&#010; &#010;       if (code == 200) { // got a good response&#010;         return new ProtocolOutput(c); // return it&#010;-&#010;       } else if (code == 410) { // page is gone&#010;         return new ProtocolOutput(c,&#010;             ProtocolStatusUtils.makeStatus(ProtocolStatusCodes.GONE, "Http: " + code + "&#010;url=" + url));&#010;@@ -206,8 +200,6 @@ public abstract class HttpBase implement&#010;   /* -------------------------- *&#010;    * &lt;/implementation:Protocol&gt; *&#010;    * -------------------------- */&#010;-&#010;-&#010;   public String getProxyHost() {&#010;     return proxyHost;&#010;   }&#010;@@ -367,10 +359,6 @@ public abstract class HttpBase implement&#010;         url = args[i];&#010;     }&#010; &#010;-    //    if (verbose) {&#010;-    //      LOGGER.setLevel(Level.FINE);&#010;-    //    }&#010;-&#010;     ProtocolOutput out = http.getProtocolOutput(url, new WebPage());&#010;     Content content = out.getContent();&#010; &#010;@@ -383,17 +371,14 @@ public abstract class HttpBase implement&#010;       String text = new String(content.getContent());&#010;       System.out.println(text);&#010;    
 }&#010;-&#010;   }&#010; &#010;-&#010;   protected abstract Response getResponse(URL url,&#010;       WebPage page, boolean followRedirects)&#010;   throws ProtocolException, IOException;&#010; &#010;   @Override&#010;-  public RobotRules getRobotRules(String url, WebPage page) {&#010;+  public BaseRobotRules getRobotRules(String url, WebPage page) {&#010;     return robots.getRobotRulesSet(this, url);&#010;   }&#010;-&#010; }&#010;&#010;Added: nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1477319&amp;view=auto&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;(added)&#010;+++ nutch/branches/2.x/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -0,0 +1,126 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements.  See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  
You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol.http.api;&#010;+&#010;+import java.net.URL;&#010;+&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.nutch.net.protocols.Response;&#010;+import org.apache.nutch.protocol.Protocol;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010;+import org.apache.nutch.storage.WebPage;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+&#010;+/**&#010;+ * This class is used for parsing robots for urls belonging to HTTP protocol.&#010;+ * It extends the generic {@link RobotRulesParser} class and contains &#010;+ * Http protocol specific implementation for obtaining the robots file.&#010;+ */&#010;+public class HttpRobotRulesParser extends RobotRulesParser {&#010;+  &#010;+  public static final Logger LOG = LoggerFactory.getLogger(HttpRobotRulesParser.class);&#010;+  protected boolean allowForbidden = false;&#010;+&#010;+  HttpRobotRulesParser() { }&#010;+&#010;+  public HttpRobotRulesParser(Configuration conf) {&#010;+    super(conf);&#010;+    allowForbidden = conf.getBoolean("http.robots.403.allow", false);&#010;+  }&#010;+&#010;+  /**&#010;+   * The hosts for which the caching of robots rules is yet to be done,&#010;+   * it sends a Http request to the host corresponding to the {@link URL} &#010;+   * passed, gets robots file, parses the rules and caches the rules object&#010;+   * to avoid 
re-work in future.&#010;+   * &#010;+   *  @param http The {@link Protocol} object&#010;+   *  @param url URL &#010;+   *  &#010;+   *  @return robotRules A {@link BaseRobotRules} object for the rules&#010;+   */&#010;+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {&#010;+&#010;+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case&#010;+    String host = url.getHost().toLowerCase();          // normalize to lower case&#010;+&#010;+    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);&#010;+&#010;+    boolean cacheRule = true;&#010;+    &#010;+    if (robotRules == null) {                     // cache miss&#010;+      URL redir = null;&#010;+      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }&#010;+      try {&#010;+        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),&#010;+                                             new WebPage(), true);&#010;+        // try one level of redirection ?&#010;+        if (response.getCode() == 301 || response.getCode() == 302) {&#010;+          String redirection = response.getHeader("Location");&#010;+          if (redirection == null) {&#010;+            // some versions of MS IIS are known to mangle this header&#010;+            redirection = response.getHeader("location");&#010;+          }&#010;+          if (redirection != null) {&#010;+            if (!redirection.startsWith("http")) {&#010;+              // RFC says it should be absolute, but apparently it isn't&#010;+              redir = new URL(url, redirection);&#010;+            } else {&#010;+              redir = new URL(redirection);&#010;+            }&#010;+            &#010;+            response = ((HttpBase)http).getResponse(redir, new WebPage(), true);&#010;+          }&#010;+        }&#010;+&#010;+        if (response.getCode() == 200)               // found rules: parse them&#010;+          robotRules =  parseRules(url.toString(), 
response.getContent(), &#010;+                                   response.getHeader("Content-Type"), &#010;+                                   agentNames);&#010;+&#010;+        else if ( (response.getCode() == 403) &amp;&amp; (!allowForbidden) )&#010;+          robotRules = FORBID_ALL_RULES;            // use forbid all&#010;+        else if (response.getCode() &gt;= 500) {&#010;+          cacheRule = false;&#010;+          robotRules = EMPTY_RULES;&#010;+        }else                                        &#010;+          robotRules = EMPTY_RULES;                 // use default rules&#010;+      } catch (Throwable t) {&#010;+        if (LOG.isInfoEnabled()) {&#010;+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());&#010;+        }&#010;+        cacheRule = false;&#010;+        robotRules = EMPTY_RULES;&#010;+      }&#010;+&#010;+      if (cacheRule) {&#010;+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host&#010;+        if (redir != null &amp;&amp; !redir.getHost().equals(host)) {&#010;+          // cache also for the redirected host&#010;+          CACHE.put(protocol + ":" + redir.getHost(), robotRules);&#010;+        }&#010;+      }&#010;+    }&#010;+    return robotRules;&#010;+  }&#010;+}&#010;&#010;Modified: nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -17,292 +17,100 
@@&#010; &#010; package org.apache.nutch.protocol.http.api;&#010; &#010;-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;&#010;-&#010;+import crawlercommons.robots.BaseRobotRules;&#010; import junit.framework.TestCase;&#010; &#010;+/**&#010;+ * JUnit test case which tests&#010;+ * 1. that robots filtering is performed correctly as per the agent name&#010;+ * 2. that crawl delay is extracted correctly from the robots file&#010;+ *&#010;+ */&#010; public class TestRobotRulesParser extends TestCase {&#010;-  private static final String LF= "\n";&#010;-  private static final String CR= "\r";&#010;-  private static final String CRLF= "\r\n";&#010;+&#010;+  private static final String CONTENT_TYPE = "text/plain";&#010;+  private static final String SINGLE_AGENT = "Agent1";&#010;+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";&#010;+  private static final String UNKNOWN_AGENT = "AgentABC";&#010;+  private static final String CR = "\r";&#010;   &#010;-  private static final boolean[] ACCEPT_ALL = {&#010;-    true,   // "/a",&#009;      &#010;-    true,   // "/a/",&#009;      &#010;-    true,   // "/a/bloh/foo.html"&#010;-    true,   // "/b",&#009;      &#010;-    true,   // "/b/a",&#009;      &#010;-    true,   // "/b/a/index.html",&#010;-    true,   // "/b/b/foo.html",  &#010;-    true,   // "/c",&#009;      &#010;-    true,   // "/c/a",&#009;      &#010;-    true,   // "/c/a/index.html",&#010;-    true,   // "/c/b/foo.html",  &#010;-    true,   // "/d",&#009;      &#010;-    true,   // "/d/a",&#009;      &#010;-    true,   // "/e/a/index.html",&#010;-    true,   // "/e/d",&#009;      &#010;-    true,   // "/e/d/foo.html",  &#010;-    true,   // "/e/doh.html",    &#010;-    true,   // "/f/index.html",  &#010;-    true,   // "/foo/bar.html",  &#010;-    true,   // "/f/",&#010;-  };&#010;+  private static final String ROBOTS_STRING = &#010;+      "User-Agent: Agent1 #foo" + CR &#010;+      + "Disallow: /a" + CR &#010;+      + 
"Disallow: /b/a" + CR &#010;+      + "#Disallow: /c" + CR &#010;+      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec&#010;+      + "" + CR &#010;+      + "" + CR &#010;+      + "User-Agent: Agent2" + CR &#010;+      + "Disallow: /a/bloh" + CR &#010;+      + "Disallow: /c" + CR&#010;+      + "Disallow: /foo" + CR&#010;+      + "Crawl-delay: 20" + CR&#010;+      + "" + CR &#010;+      + "User-Agent: *" + CR &#010;+      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents&#010;   &#010;-  private static final String[] ROBOTS_STRINGS= new String[] {&#010;-    "User-Agent: Agent1 #foo" + CR &#010;-    + "Disallow: /a" + CR &#010;-    + "Disallow: /b/a" + CR &#010;-    + "#Disallow: /c" + CR &#010;-    + "" + CR &#010;-    + "" + CR &#010;-    + "User-Agent: Agent2 Agent3#foo" + CR &#010;-    + "User-Agent: Agent4" + CR &#010;-    + "Disallow: /d" + CR &#010;-    + "Disallow: /e/d/" + CR&#010;-    + "" + CR &#010;-    + "User-Agent: *" + CR &#010;-    + "Disallow: /foo/bar/" + CR,&#010;-    null  // Used to test EMPTY_RULES&#010;+  private static final String[] TEST_PATHS = new String[] {&#010;+    "http://example.com/a",&#010;+    "http://example.com/a/bloh/foo.html",&#010;+    "http://example.com/b",&#010;+    "http://example.com/c",&#010;+    "http://example.com/b/a/index.html",&#010;+    "http://example.com/foo/bar/baz.html"&#010;+  };&#010;+&#010;+  private static final boolean[] RESULTS = new boolean[] {&#010;+    false,  //  /a&#010;+    false,  //  /a/bloh/foo.html&#010;+    true,   //  /b&#010;+    true,   //  /c&#010;+    false,  //  /b/a/index.html&#010;+    true    //  /foo/bar/baz.html&#010;   };&#010; &#010;-  private static final String[] AGENT_STRINGS= new String[] {&#010;-    "Agent1",&#010;-    "Agent2",&#010;-    "Agent3",&#010;-    "Agent4",&#010;-    "Agent5",&#010;-  };&#010;-&#010;-  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {&#010;-    { &#010;-      false, &#010;-      false,&#010;-  
    false,&#010;-      false,&#010;-      true,&#010;-    },&#010;-    { &#010;-      false, &#010;-      false,&#010;-      false,&#010;-      false,&#010;-      true,&#010;-    }    &#010;-  };&#010;+  private HttpRobotRulesParser parser;&#010;+  private BaseRobotRules rules;&#010; &#010;-  private static final String[] TEST_PATHS= new String[] {&#010;-    "/a",&#010;-    "/a/",&#010;-    "/a/bloh/foo.html",&#010;-    "/b",&#010;-    "/b/a",&#010;-    "/b/a/index.html",&#010;-    "/b/b/foo.html",&#010;-    "/c",&#010;-    "/c/a",&#010;-    "/c/a/index.html",&#010;-    "/c/b/foo.html",&#010;-    "/d",&#010;-    "/d/a",&#010;-    "/e/a/index.html",&#010;-    "/e/d",&#010;-    "/e/d/foo.html",&#010;-    "/e/doh.html",&#010;-    "/f/index.html",&#010;-    "/foo/bar/baz.html",  &#010;-    "/f/",&#010;-  };&#010;-&#010;-  private static final boolean[][][] ALLOWED= new boolean[][][] {&#010;-    { // ROBOTS_STRINGS[0]&#010;-      { // Agent1&#010;-&#009;false,  // "/a",&#009;      &#010;-&#009;false,  // "/a/",&#009;      &#010;-&#009;false,  // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;false,  // "/b/a",&#009;      &#010;-&#009;false,  // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;true,   // "/d",&#009;      &#010;-&#009;true,   // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;true,   // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      }, &#010;-      { // Agent2&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      
&#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      },&#010;-      { // Agent3&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      },&#010;-      { // Agent4&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // 
"/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      },&#010;-      { // Agent5/"*"&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;true,   // "/d",&#009;      &#010;-&#009;true,   // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;true,   // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;false,  // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      }&#010;-    },&#010;-    { // ROBOTS_STRINGS[1]&#010;-      ACCEPT_ALL, // Agent 1&#010;-      ACCEPT_ALL, // Agent 2&#010;-      ACCEPT_ALL, // Agent 3&#010;-      ACCEPT_ALL, // Agent 4&#010;-      ACCEPT_ALL, // Agent 5&#010;-    }&#010;-  };&#010;- &#010;   public TestRobotRulesParser(String name) {&#010;     super(name);&#010;+    parser = new HttpRobotRulesParser();&#010;   }&#010; &#010;-  public void testRobotsOneAgent() {&#010;-    for (int i= 0; i &lt; ROBOTS_STRINGS.length; i++) {&#010;-      for (int j= 0; j &lt; AGENT_STRINGS.length; j++) {&#010;-&#009;testRobots(i, new String[] { AGENT_STRINGS[j] },&#010;-&#009;&#009;   TEST_PATHS, ALLOWED[i][j]);&#010;-      
}&#010;+  /**&#010;+  * Test that the robots rules are interpreted correctly by the robots rules parser. &#010;+  */&#010;+  public void testRobotsAgent() {&#010;+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE,&#010;SINGLE_AGENT);&#010;+&#010;+    for(int counter = 0; counter &lt; TEST_PATHS.length; counter++) {&#010;+      assertTrue("testing on agent (" + SINGLE_AGENT + "), and " &#010;+              + "path " + TEST_PATHS[counter] &#010;+              + " got " + rules.isAllowed(TEST_PATHS[counter]),&#010;+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);&#010;     }&#010;-  }&#010; &#010;-  public void testRobotsTwoAgents() {&#010;-    for (int i= 0; i &lt; ROBOTS_STRINGS.length; i++) {&#010;-      for (int j= 0; j &lt; AGENT_STRINGS.length; j++) {&#010;-&#009;for (int k= 0; k &lt; AGENT_STRINGS.length; k++) {&#010;-&#009;  int key= j;&#010;-&#009;  if (NOT_IN_ROBOTS_STRING[i][j])&#010;-&#009;    key= k;&#010;-&#009;  testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },&#010;-&#009;&#009;     TEST_PATHS, ALLOWED[i][key]);&#010;-&#009;}&#010;-      }&#010;-    }&#010;-  }&#010;-  &#010;-  public void testCrawlDelay() {&#010;-    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });&#010;-    String delayRule1 = "User-agent: nutchbot" + CR +&#010;-                        "Crawl-delay: 10" + CR +&#010;-                        "User-agent: foobot" + CR +&#010;-                        "Crawl-delay: 20" + CR +&#010;-                        "User-agent: *" + CR + &#010;-                        "Disallow:/baz" + CR;&#010;-    String delayRule2 = "User-agent: foobot" + CR +&#010;-                        "Crawl-delay: 20" + CR +&#010;-                        "User-agent: *" + CR + &#010;-                        "Disallow:/baz" + CR;&#010;-    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());&#010;-    long crawlDelay = rules.getCrawlDelay();&#010;-    
assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));&#010;-    rules = p.parseRules(delayRule2.getBytes());&#010;-    crawlDelay = rules.getCrawlDelay();&#010;-    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));&#010;-  }&#010;+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE,&#010;MULTIPLE_AGENTS);&#010; &#010;-  // helper&#010;-&#010;-  public void testRobots(int robotsString, String[] agents, String[] paths, &#010;-&#009;&#009;&#009; boolean[] allowed) {&#010;-    String agentsString= agents[0];&#010;-    for (int i= 1; i &lt; agents.length; i++)&#010;-      agentsString= agentsString + "," + agents[i];&#010;-    RobotRulesParser p= new RobotRulesParser(agents);&#010;-    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null&#010;-                                     ? ROBOTS_STRINGS[robotsString].getBytes()&#010;-                                     : null);&#010;-    for (int i= 0; i &lt; paths.length; i++) {&#010;-      assertTrue("testing robots file "+robotsString+", on agents ("&#010;-&#009;&#009; + agentsString + "), and path " + TEST_PATHS[i] + "; got " &#010;-&#009;&#009; + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF&#010;-&#009;&#009;&#009;&#009;   + rules,&#010;-&#009;&#009; rules.isAllowed(TEST_PATHS[i]) == allowed[i]);&#010;+    for(int counter = 0; counter &lt; TEST_PATHS.length; counter++) {&#010;+      assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " &#010;+              + "path " + TEST_PATHS[counter] &#010;+              + " got " + rules.isAllowed(TEST_PATHS[counter]),&#010;+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);&#010;     }&#010;   }&#010; &#010;-&#010;-  &#010;+  /**&#010;+  * Test that the crawl delay is extracted from the robots file for respective agent. 
&#010;+  * If its not specified for a given agent, default value must be returned.&#010;+  */&#010;+  public void testCrawlDelay() {&#010;+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 10000 msec must be returned by the&#010;parser&#010;+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);&#010;+    assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay()&#010;== 10000));&#010;+    &#010;+    // for UNKNOWN_AGENT, the default crawl delay must be returned.&#010;+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);&#010;+    assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay()&#010;== Long.MIN_VALUE));&#010;+  }&#010; }&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -14,7 +14,6 @@&#010;  * See the License for the specific language governing permissions and&#010;  * limitations under the License.&#010;  */&#010;-&#010; package org.apache.nutch.protocol.file;&#010; &#010; import java.net.URL;&#010;@@ -23,28 +22,30 @@ import java.util.HashSet;&#010; &#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010;+&#010; import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.hadoop.io.Text;&#010;+&#010; import org.apache.nutch.net.protocols.Response;&#010; import 
org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.EmptyRobotRules;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatusCodes;&#010; import org.apache.nutch.protocol.ProtocolStatusUtils;&#010;-import org.apache.nutch.protocol.RobotRules;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.storage.ProtocolStatus;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.storage.WebPage.Field;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-/************************************&#010;- * File.java deals with file: scheme.&#010;- * &#010;- * Configurable parameters are defined under "FILE properties" section in&#010;- * ./conf/nutch-default.xml or similar.&#010;- * &#010;- * @author John Xing&#010;- ***********************************/&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010;+/**&#010;+ * This class is a protocol plugin used for file: scheme.&#010;+ * It creates {@link FileResponse} object and gets the content of the url from it.&#010;+ * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} &#010;+ * in nutch-default.xml defined under "file properties" section.&#010;+ */&#010; public class File implements Protocol {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(File.class);&#010;@@ -65,14 +66,40 @@ public class File implements Protocol {&#010;   private Configuration conf;&#010; &#010;   // constructor&#010;-  public File() {&#010;-  }&#010;+  public File() { }&#010; &#010;-  /** Set the point at which content is truncated. 
*/&#010;-  public void setMaxContentLength(int length) {&#010;-    maxContentLength = length;&#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;+  public void setConf(Configuration conf) {&#010;+    this.conf = conf;&#010;+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);&#010;+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);&#010;   }&#010;-&#010;+  &#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;+  public Configuration getConf() {&#010;+    return this.conf;&#010;+  }&#010;+    &#010;+  /** &#010;+   * Set the point at which content is truncated. &#010;+   */&#010;+  public void setMaxContentLength(int maxContentLength) {&#010;+    this.maxContentLength = maxContentLength;&#010;+  }&#010;+  &#010;+  /** &#010;+   * Creates a {@link FileResponse} object corresponding to the url and &#010;+   * return a {@link ProtocolOutput} object as per the content received&#010;+   * &#010;+   * @param url Text containing the url&#010;+   * @param datum The CrawlDatum object corresponding to the url&#010;+   * &#010;+   * @return {@link ProtocolOutput} object for the content of the file indicated by url&#010;+   */&#010;   public ProtocolOutput getProtocolOutput(String url, WebPage page) {&#010;     String urlString = url.toString();&#010;     try {&#010;@@ -82,16 +109,11 @@ public class File implements Protocol {&#010; &#010;       while (true) {&#010;         FileResponse response;&#010;-        response = new FileResponse(u, page, this, getConf()); // make&#010;-        // a&#010;-        // request&#010;-&#010;+        response = new FileResponse(u, page, this, getConf()); // make a request&#010;         int code = response.getCode();&#010; &#010;         if (code == 200) { // got a good response&#010;-          return new ProtocolOutput(response.toContent()); // return&#010;-          // it&#010;-&#010;+          return new ProtocolOutput(response.toContent()); // return 
it&#010;         } else if (code &gt;= 300 &amp;&amp; code &lt; 400) { // handle redirect&#010;           if (redirects == MAX_REDIRECTS)&#010;             throw new FileException("Too many redirects: " + url);&#010;@@ -114,16 +136,13 @@ public class File implements Protocol {&#010;   }&#010; &#010;   @Override&#010;-  public RobotRules getRobotRules(String url, WebPage page) {&#010;-    return EmptyRobotRules.RULES;&#010;-  }&#010;-&#010;-  @Override&#010;   public Collection&lt;Field&gt; getFields() {&#010;     return FIELDS;&#010;   }&#010; &#010;-  /** For debugging. */&#010;+  /** &#010;+   * Quick way for running this class. Useful for debugging. &#010;+   */&#010;   public static void main(String[] args) throws Exception {&#010;     int maxContentLength = Integer.MIN_VALUE;&#010;     boolean dumpContent = false;&#010;@@ -154,9 +173,6 @@ public class File implements Protocol {&#010;     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength&#010;       file.setMaxContentLength(maxContentLength);&#010; &#010;-    // set log level&#010;-    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));&#010;-&#010;     Content content = file.getProtocolOutput(urlString, new WebPage())&#010;         .getContent();&#010; &#010;@@ -172,13 +188,11 @@ public class File implements Protocol {&#010;     file = null;&#010;   }&#010; &#010;-  public void setConf(Configuration conf) {&#010;-    this.conf = conf;&#010;-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);&#010;-    this.crawlParents = conf.getBoolean("file.crawl.parent", true);&#010;-  }&#010;-&#010;-  public Configuration getConf() {&#010;-    return this.conf;&#010;-  }&#010;+  /** &#010;+   * No robots parsing is done for file protocol. 
&#010;+   * So this returns a set of empty rules which will allow every url.&#010;+   */&#010;+  public BaseRobotRules getRobotRules(String url, WebPage page) {&#010;+    return RobotRulesParser.EMPTY_RULES;&#010;+  }   &#010; }&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -28,23 +28,27 @@ import org.apache.commons.net.ftp.FTPFil&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.nutch.net.protocols.Response;&#010; import org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.EmptyRobotRules;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatusCodes;&#010; import org.apache.nutch.protocol.ProtocolStatusUtils;&#010;-import org.apache.nutch.protocol.RobotRules;&#010; import org.apache.nutch.storage.ProtocolStatus;&#010; import org.apache.nutch.storage.WebPage;&#010; &#010;-/************************************&#010;- * Ftp.java deals with ftp: scheme.&#010;- * &#010;- * Configurable parameters are defined under "FTP properties" section in&#010;- * ./conf/nutch-default.xml or similar.&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010;+/**&#010;+ * This class is a protocol plugin used for ftp: scheme.&#010;+ * It creates {@link FtpResponse} object and gets the content of the 
url from it.&#010;+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},&#010;+ *                             {@code ftp.content.limit}, {@code ftp.timeout}, &#010;+ *                             {@code ftp.server.timeout}, {@code ftp.password}, &#010;+ *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.&#010;+ * For details see "FTP properties" section in {@code nutch-default.xml}.&#010;  * &#010;  * @author John Xing&#010;- ***********************************/&#010;+ */&#010; public class Ftp implements Protocol {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);&#010;@@ -109,6 +113,15 @@ public class Ftp implements Protocol {&#010;     this.keepConnection = keepConnection;&#010;   }&#010; &#010;+  /** &#010;+   * Creates a {@link FtpResponse} object corresponding to the url and &#010;+   * returns a {@link ProtocolOutput} object as per the content received&#010;+   * &#010;+   * @param url Text containing the ftp url&#010;+   * @param datum The CrawlDatum object corresponding to the url&#010;+   * &#010;+   * @return {@link ProtocolOutput} object for the url&#010;+   */&#010;   public ProtocolOutput getProtocolOutput(String url, WebPage page) {&#010;     try {&#010;       URL u = new URL(url);&#010;@@ -154,6 +167,9 @@ public class Ftp implements Protocol {&#010;     }&#010;   }&#010; &#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;   public void setConf(Configuration conf) {&#010;     this.conf = conf;&#010;     this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);&#010;@@ -165,14 +181,13 @@ public class Ftp implements Protocol {&#010;     this.followTalk = conf.getBoolean("ftp.follow.talk", false);&#010;   }&#010; &#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;   public Configuration getConf() {&#010;     return this.conf;&#010;   }&#010; &#010;-  public RobotRules getRobotRules(String url, WebPage 
page) {&#010;-    return EmptyRobotRules.RULES;&#010;-  }&#010;-&#010;   /** For debugging. */&#010;   public static void main(String[] args) throws Exception {&#010;     int timeout = Integer.MIN_VALUE;&#010;@@ -222,9 +237,6 @@ public class Ftp implements Protocol {&#010;     if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength&#010;       ftp.setMaxContentLength(maxContentLength);&#010; &#010;-    // set log level&#010;-    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));&#010;-&#010;     Content content = ftp.getProtocolOutput(urlString, new WebPage())&#010;         .getContent();&#010; &#010;@@ -244,4 +256,12 @@ public class Ftp implements Protocol {&#010;     return FIELDS;&#010;   }&#010; &#010;+  /** &#010;+   * Currently, no robots parsing is done for ftp protocol &#010;+   * and this returns a set of empty rules which will allow every url.&#010;+   * There a jira logged for the same NUTCH-1513&#010;+   */&#010;+  public BaseRobotRules getRobotRules(String url, WebPage page) {&#010;+    return RobotRulesParser.EMPTY_RULES;&#010;+  }&#010; }&#010;&#010;Modified: nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java?rev=1477319&amp;r1=1477318&amp;r2=1477319&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/protocol-sftp/src/java/org/apache/nutch/protocol/sftp/Sftp.java&#010;Mon Apr 29 20:26:52 2013&#010;@@ -38,9 +38,9 @@ import org.apache.nutch.net.protocols.Re&#010; import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010;-import 
org.apache.nutch.protocol.RobotRules;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.storage.WebPage.Field;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010; &#010; //JSCH imports&#010; import com.jcraft.jsch.ChannelSftp;&#010;@@ -50,6 +50,8 @@ import com.jcraft.jsch.Session;&#010; import com.jcraft.jsch.SftpException;&#010; import com.jcraft.jsch.ChannelSftp.LsEntry;&#010; &#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010; /**&#010;  * This class uses the Jsch package to fetch content using the Sftp protocol.&#010;  * &#010;@@ -224,10 +226,16 @@ public class Sftp implements Protocol {&#010;     }&#010;   }&#010; &#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;   public Configuration getConf() {&#010;     return configuration;&#010;   }&#010; &#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;   public void setConf(Configuration arg0) {&#010;     configuration = arg0;&#010; &#010;@@ -288,34 +296,9 @@ public class Sftp implements Protocol {&#010;     }&#010;   }&#010; &#010;-  /*&#010;-   * (non-Javadoc)&#010;-   * &#010;-   * @see org.apache.nutch.protocol.Protocol#getRobotRules(java.lang.String,&#010;-   * org.apache.nutch.storage.WebPage)&#010;-   */&#010;   @Override&#010;-  public RobotRules getRobotRules(String url, WebPage page) {&#010;-    return new RobotRules() {&#010;-&#010;-      @Override&#010;-      public boolean isAllowed(URL url) {&#010;-        // they're all allowed for now.&#010;-        return true;&#010;-      }&#010;-&#010;-      @Override&#010;-      public long getExpireTime() {&#010;-        // set to 0 for never expire&#010;-        return 0;&#010;-      }&#010;-&#010;-      @Override&#010;-      public long getCrawlDelay() {&#010;-        // no delay&#010;-        return 0;&#010;-      }&#010;-    };&#010;+  public BaseRobotRules getRobotRules(String url, WebPage page) {&#010;+    return RobotRulesParser.EMPTY_RULES;&#010;   
}&#010; &#010;   /*&#010;@@ -327,5 +310,4 @@ public class Sftp implements Protocol {&#010;   public Collection&lt;Field&gt; getFields() {&#010;     return Collections.emptySet();&#010;   }&#010;-&#010; }&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1476861 - in /nutch/branches/2.x: CHANGES.txt conf/log4j.properties</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130428233559.5C54A2388847@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130428233559-5C54A2388847@eris-apache-org%3e</id>
<updated>2013-04-28T23:35:59Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Sun Apr 28 23:35:58 2013&#010;New Revision: 1476861&#010;&#010;URL: http://svn.apache.org/r1476861&#010;Log:&#010;NUTCH-346 Improve readability of logs/hadoop.log&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/conf/log4j.properties&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1476861&amp;r1=1476860&amp;r2=1476861&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Sun Apr 28 23:35:58 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010;+&#010; * NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010; &#010; * NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)&#010;&#010;Modified: nutch/branches/2.x/conf/log4j.properties&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1476861&amp;r1=1476860&amp;r2=1476861&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/log4j.properties (original)&#010;+++ nutch/branches/2.x/conf/log4j.properties Sun Apr 28 23:35:58 2013&#010;@@ -40,6 +40,7 @@ log4j.logger.org.apache.nutch.crawl.WebT&#010; log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout&#010; log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout&#010; log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout&#010;+log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN&#010; &#010; log4j.logger.org.apache.nutch=INFO&#010; log4j.logger.org.apache.hadoop=WARN&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1476859 - in /nutch/trunk: CHANGES.txt conf/log4j.properties</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130428233259.A995E23888E4@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130428233259-A995E23888E4@eris-apache-org%3e</id>
<updated>2013-04-28T23:32:59Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Sun Apr 28 23:32:59 2013&#010;New Revision: 1476859&#010;&#010;URL: http://svn.apache.org/r1476859&#010;Log:&#010;NUTCH-346 Improve readability of logs/hadoop.log&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/conf/log4j.properties&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1476859&amp;r1=1476858&amp;r2=1476859&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Sun Apr 28 23:32:59 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-346 Improve readability of logs/hadoop.log (Renaud Richardet via tejasp)&#010;+&#010; * NUTCH-829 duplicate hadoop temp files (Mike Baranczak, lewismc, tejasp)&#010; &#010; * NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010;&#010;Modified: nutch/trunk/conf/log4j.properties&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/conf/log4j.properties?rev=1476859&amp;r1=1476858&amp;r2=1476859&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/conf/log4j.properties (original)&#010;+++ nutch/trunk/conf/log4j.properties Sun Apr 28 23:32:59 2013&#010;@@ -35,6 +35,7 @@ log4j.logger.org.apache.nutch.indexer.In&#010; log4j.logger.org.apache.nutch.tools.FreeGenerator=INFO,cmdstdout&#010; log4j.logger.org.apache.nutch.util.domain.DomainStatistics=INFO,cmdstdout&#010; log4j.logger.org.apache.nutch.tools.CrawlDBScanner=INFO,cmdstdout&#010;+log4j.logger.org.apache.nutch.plugin.PluginRepository=WARN&#010; &#010; log4j.logger.org.apache.nutch=INFO&#010; log4j.logger.org.apache.hadoop=WARN&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1476702 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/crawl/Generator.java</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130428011321.B42A223889E7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130428011321-B42A223889E7@eris-apache-org%3e</id>
<updated>2013-04-28T01:13:21Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Sun Apr 28 01:13:21 2013&#010;New Revision: 1476702&#010;&#010;URL: http://svn.apache.org/r1476702&#010;Log:&#010;NUTCH-829 duplicate hadoop temp files&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1476702&amp;r1=1476701&amp;r2=1476702&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Sun Apr 28 01:13:21 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-829 duplicate hadoop temp files (Mike Baranczak, lewismc, tejasp)&#010;+&#010; * NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010; &#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java?rev=1476702&amp;r1=1476701&amp;r2=1476702&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/crawl/Generator.java Sun Apr 28 01:13:21 2013&#010;@@ -492,7 +492,7 @@ public class Generator extends Configure&#010;       throws IOException {&#010; &#010;     Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"&#010;-        + System.currentTimeMillis());&#010;+        + java.util.UUID.randomUUID().toString());&#010; &#010;     Path lock = new Path(dbDir, CrawlDb.LOCK_NAME);&#010;     FileSystem fs = FileSystem.get(getConf());&#010;@@ -582,7 +582,7 @@ public class Generator extends Configure&#010;     if 
(getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {&#010;       // update the db from tempDir&#010;       Path tempDir2 = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-"&#010;-          + System.currentTimeMillis());&#010;+          + java.util.UUID.randomUUID().toString());&#010; &#010;       job = new NutchJob(getConf());&#010;       job.setJobName("generate: updatedb " + dbDir);&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1475631 - in /nutch/site: forrest/src/documentation/content/xdocs/ publish/ publish/skin/images/</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130425050558.0D6E523888E3@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130425050558-0D6E523888E3@eris-apache-org%3e</id>
<updated>2013-04-25T05:05:57Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Thu Apr 25 05:05:56 2013&#010;New Revision: 1475631&#010;&#010;URL: http://svn.apache.org/r1475631&#010;Log:&#010;NUTCH-1565 Proper downloads page for Nutch (lewismc and tejasp)&#010;&#010;Added:&#010;    nutch/site/forrest/src/documentation/content/xdocs/downloads.xml&#010;    nutch/site/publish/downloads.html&#010;    nutch/site/publish/downloads.pdf   (with props)&#010;    nutch/site/publish/skin/images/apache-thanks.png   (with props)&#010;    nutch/site/publish/skin/images/built-with-cocoon.gif   (with props)&#010;Modified:&#010;    nutch/site/forrest/src/documentation/content/xdocs/site.xml&#010;    nutch/site/publish/about.html&#010;    nutch/site/publish/about.pdf&#010;    nutch/site/publish/bot.html&#010;    nutch/site/publish/bot.pdf&#010;    nutch/site/publish/credits.html&#010;    nutch/site/publish/credits.pdf&#010;    nutch/site/publish/faq.html&#010;    nutch/site/publish/faq.pdf&#010;    nutch/site/publish/index.html&#010;    nutch/site/publish/index.pdf&#010;    nutch/site/publish/issue_tracking.html&#010;    nutch/site/publish/issue_tracking.pdf&#010;    nutch/site/publish/linkmap.html&#010;    nutch/site/publish/linkmap.pdf&#010;    nutch/site/publish/mailing_lists.html&#010;    nutch/site/publish/mailing_lists.pdf&#010;    nutch/site/publish/nightly.html&#010;    nutch/site/publish/nightly.pdf&#010;    nutch/site/publish/old_downloads.html&#010;    nutch/site/publish/old_downloads.pdf&#010;    nutch/site/publish/sonar.html&#010;    nutch/site/publish/sonar.pdf&#010;    nutch/site/publish/tutorial.html&#010;    nutch/site/publish/tutorial.pdf&#010;    nutch/site/publish/version_control.html&#010;    nutch/site/publish/version_control.pdf&#010;    nutch/site/publish/wiki.html&#010;    nutch/site/publish/wiki.pdf&#010;&#010;Added: nutch/site/forrest/src/documentation/content/xdocs/downloads.xml&#010;URL: 
http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/downloads.xml?rev=1475631&amp;view=auto&#010;==============================================================================&#010;--- nutch/site/forrest/src/documentation/content/xdocs/downloads.xml (added)&#010;+++ nutch/site/forrest/src/documentation/content/xdocs/downloads.xml Thu Apr 25 05:05:56 2013&#010;@@ -0,0 +1,111 @@&#010;+&lt;?xml version="1.0"?&gt;&#010;+&#010;+&lt;!DOCTYPE document PUBLIC "-//APACHE//DTD Documentation V2.0//EN" "http://forrest.apache.org/dtd/document-v20.dtd"&gt;&#010;+&#010;+&lt;!--&#010;+   Licensed to the Apache Software Foundation (ASF) under one or more&#010;+   contributor license agreements.  See the NOTICE file distributed with&#010;+   this work for additional information regarding copyright ownership.&#010;+   The ASF licenses this file to You under the Apache License, Version 2.0&#010;+   (the "License"); you may not use this file except in compliance with&#010;+   the License.  You may obtain a copy of the License at&#010;+&#010;+       http://www.apache.org/licenses/LICENSE-2.0&#010;+&#010;+   Unless required by applicable law or agreed to in writing, software&#010;+   distributed under the License is distributed on an "AS IS" BASIS,&#010;+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+   See the License for the specific language governing permissions and&#010;+   limitations under the License.&#010;+--&gt;&#010;+&#010;+&lt;document&gt; &#010;+&#010;+  &lt;header&gt; &#010;+    &lt;title&gt;Nutch Downloads&lt;/title&gt; &#010;+  &lt;/header&gt; &#010;+&#010;+  &lt;body&gt; &#010;+&#010;+    &lt;section&gt;&#010;+      &lt;title&gt;Download&lt;/title&gt;&#010;+      &#010;+      &lt;p&gt; Apache Nutch 2.1 (src-tar and src-zip only) and 1.6 (src-tar, src-zip, bin-tar&#010;and bin-zip) are now available. 
See &#010;+      the &#010;+      &lt;a href="http://apache.org/dist/nutch/2.1/CHANGES-2.1.txt"&gt;CHANGES-2.1.txt&lt;/a&gt;,&#010;and &#010;+      &lt;a href="http://apache.org/dist/nutch/1.6/CHANGES_1.6.txt"&gt;CHANGES_1.6.txt&lt;/a&gt;&#010;+      files for more information on the list of updates in these releases.&#010;+      &lt;/p&gt;&#010;+      &lt;p&gt; All Apache Nutch distributions is distributed under the &lt;a href="http://www.apache.org/licenses/LICENSE-2.0.html"&gt;Apache&#010;License, version 2.0&lt;/a&gt;.&#010;+      &lt;/p&gt;&#010;+      &lt;p&gt;The link in the Mirrors column below should display a list of available mirrors&#010;with a default selection based on your inferred location. If you do not see that page, try&#010;a different browser. The checksum and signature are links to the originals on the main distribution&#010;server.&#010;+      &lt;/p&gt;&#010;+&#010;+&lt;table&gt;&#010;+  &lt;caption&gt;Downloads&lt;/caption&gt;&#010;+  &lt;tr&gt;&lt;th&gt;Version&lt;/th&gt; &lt;th&gt;Mirrors&lt;/th&gt; &lt;th&gt;Checksum&lt;/th&gt;&#010;&lt;th&gt;Signature&lt;/th&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 2.1 (src.tar.gz)&lt;/td&gt;&lt;td&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/2.1/apache-nutch-2.1-src.tar.gz"&gt;&#010;+       apache-nutch-2.1-src.tar.gz &lt;/a&gt;&lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.tar.gz.md5"&gt;&#010;+       apache-nutch-2.1-src.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.tar.gz.asc"&gt;&#010;+       apache-nutch-2.1-src.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 2.1 (src.zip)&lt;/td&gt;&lt;td&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/2.1/apache-nutch-2.1-src.zip"&gt;&#010;+       apache-nutch-2.1-src.zip&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.zip.md5"&gt;&#010;+  
     apache-nutch-2.1-src.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.zip.asc"&gt;&#010;+       apache-nutch-2.1-src.zip.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 1.6 (src.tar.gz)&lt;/td&gt;&lt;td&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-src.tar.gz"&gt;&#010;+       apache-nutch-1.6-src.tar.gz&lt;/a&gt;&lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.tar.gz.md5"&gt;&#010;+       apache-nutch-1.6-src.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.tar.gz.asc"&gt;&#010;+       apache-nutch-1.6-src.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 1.6 (src.zip)&lt;/td&gt;&lt;td&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-src.zip"&gt;&#010;+       apache-nutch-1.6-src.zip&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.zip.md5"&gt;&#010;+       apache-nutch-1.6-src.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.zip.asc"&gt;&#010;+       apache-nutch-1.6-src.zip.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 1.6 (bin.tar.gz)&lt;/td&gt;&lt;td&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-bin.tar.gz"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz&lt;/a&gt;&lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.tar.gz.md5"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.tar.gz.asc"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+  &lt;tr&gt;&lt;td&gt;Apache Nutch 1.6 (bin.zip)&lt;/td&gt;&lt;td&gt;&lt;a 
href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-bin.zip"&gt;&#010;+       apache-nutch-1.6-bin.zip&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.zip.md5"&gt;&#010;+       apache-nutch-1.6-bin.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td&gt;&lt;a href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.zip.asc"&gt;&#010;+       apache-nutch-1.6-bin.zip.asc&lt;/a&gt; &lt;/td&gt;&lt;/tr&gt;&#010;+&lt;/table&gt;&#010;+    &lt;/section&gt;&#010;+    &#010;+    &lt;section&gt; &#010;+      &lt;title&gt;Verify Releases&lt;/title&gt;&#010;+      &lt;p&gt;It is essential that you verify the integrity of the downloaded files using&#010;the PGP or MD5 signatures. Please read &lt;a href="http://httpd.apache.org/dev/verification.html"&gt;Verifying&#010;Apache HTTP Server Releases&lt;/a&gt; for more information on why you should verify our releases.&#010;+      We strongly recommend you verify your downloads with both PGP and MD5.&lt;/p&gt;&#010;+      &#010;+      &lt;p&gt;&lt;strong&gt;PGP Signature&lt;/strong&gt;&lt;/p&gt;&#010;+      &lt;p&gt;The PGP signatures can be verified using PGP or GPG. First download the &lt;a&#010;href="http://www.apache.org/dist/nutch/KEYS"&gt;KEYS&lt;/a&gt; as well as the asc signature&#010;file for the relevant distribution. Make sure you get these files from the &lt;a href="http://www.apache.org/dist/nutch/"&gt;main&#010;distribution directory&lt;/a&gt;, rather than from a mirror. 
Then verify the signatures using&#010;&lt;/p&gt;&#010;+      &lt;p&gt;&lt;code&gt; $ gpg --import KEYS &lt;/code&gt;&lt;/p&gt;&#010;+      &lt;p&gt;&lt;code&gt; $ gpg --verify apache-nutch-X.Y.Z &lt;/code&gt;&lt;/p&gt;&#010;+      &lt;p&gt;The files in Apache Nurch 2.1 and 1.6 releases are signed by Lewis John McGibbney&#010;(lewismc) C601BCA7 &lt;/p&gt;&#010;+&#010;+      &lt;p&gt;&lt;strong&gt;MD5 Signature&lt;/strong&gt;&lt;/p&gt;&#010;+      &lt;p&gt;Alternatively, you can verify the MD5 signature on the files. A unix program&#010;called md5 or md5sum is included in many unix distributions.&lt;/p&gt;&#010;+      &lt;p&gt;&lt;code&gt; $ md5sum apache-nutch-X.Y.Z&lt;/code&gt;&lt;/p&gt;&#010;+      &lt;p&gt;&lt;code&gt; ... output should match the string in apache-nutch-X.Y.Z&lt;/code&gt;&lt;/p&gt;&#010;+      &#010;+    &lt;/section&gt;   &#010;+     &#010;+    &lt;section&gt;&#010;+      &lt;title&gt;Previous Releases&lt;/title&gt;   &#010;+      &lt;p&gt;If you are looking for previous releases of Apache Nutch, have a look in the&#010;&lt;a href="old_downloads.html"&gt;old downloads&lt;/a&gt; page, or alternatively for even&#010;older releases check out the  &#010;+         &lt;a href="http://archive.apache.org/dist/incubator/nutch/"&gt;Incubator archives&lt;/a&gt;.&#010;+      &lt;/p&gt;&#010;+&#010;+      &lt;p&gt; Subscribe to &#010;+      the &lt;code&gt;dev@&lt;/code&gt; &lt;a href="mailing_lists.html"&gt;mailing list&lt;/a&gt;&#010;if you want to &#010;+      get notified about future release candidates and subsequent Nutch official releases.&#010;+      &lt;/p&gt;&#010;+    &#010;+      &lt;p&gt;Apache Nutch releases are available under the Apache License, Version 2.0.&#010;See &#010;+         the NOTICE.txt file contained in each release artifact for applicable copyright&#010;&#010;+         attribution notices.&#010;+      &lt;/p&gt;&#010;+    &lt;/section&gt;&#010;+&#010;+  &lt;/body&gt;&#010;+&#010;+&lt;/document&gt;&#010;&#010;Modified: 
nutch/site/forrest/src/documentation/content/xdocs/site.xml&#010;URL: http://svn.apache.org/viewvc/nutch/site/forrest/src/documentation/content/xdocs/site.xml?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/forrest/src/documentation/content/xdocs/site.xml (original)&#010;+++ nutch/site/forrest/src/documentation/content/xdocs/site.xml Thu Apr 25 05:05:56 2013&#010;@@ -53,7 +53,7 @@ See http://forrest.apache.org/docs/linki&#010;   &lt;/docs&gt;&#010; &#010;   &lt;resources label="Resources"&gt;&#010;-    &lt;download    label="Download"         href="ext:release" /&gt;&#010;+    &lt;download    label="Download"         href="downloads.html" /&gt;&#010;     &lt;nightly     label="Nightly builds"   href="nightly.html" /&gt;&#010;     &lt;sonar       label="Sonar Analysis"   href="sonar.html" /&gt;&#010;     &lt;contact     label="Mailing Lists"    href="mailing_lists.html" /&gt;&#010;@@ -80,7 +80,6 @@ See http://forrest.apache.org/docs/linki&#010;     &lt;tika      href="http://tika.apache.org/"/&gt;&#010;     &lt;gora      href="http://gora.apache.org"/&gt;&#010;     &lt;store     href="http://www.cafepress.com/nutch/"/&gt;&#010;-    &lt;release   href="http://www.apache.org/dyn/closer.cgi/nutch/"/&gt;&#010;     &lt;license   href="http://www.apache.org/licenses/"/&gt;&#010;     &lt;sponsor   href="http://www.apache.org/foundation/sponsorship.html" /&gt;&#010;     &lt;thanks   href="http://www.apache.org/foundation/thanks.html" /&gt;&#010;&#010;Modified: nutch/site/publish/about.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/about.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/about.html (original)&#010;+++ nutch/site/publish/about.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " 
+ docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/about.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/about.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/bot.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/bot.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/bot.html (original)&#010;+++ nutch/site/publish/bot.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/bot.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/bot.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: 
nutch/site/publish/credits.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/credits.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/credits.html (original)&#010;+++ nutch/site/publish/credits.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/credits.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/credits.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Added: nutch/site/publish/downloads.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/downloads.html?rev=1475631&amp;view=auto&#010;==============================================================================&#010;--- nutch/site/publish/downloads.html (added)&#010;+++ nutch/site/publish/downloads.html Thu Apr 25 05:05:56 2013&#010;@@ -0,0 +1,414 @@&#010;+&lt;!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd"&gt;&#010;+&lt;html&gt;&#010;+&lt;head&gt;&#010;+&lt;META http-equiv="Content-Type" content="text/html; charset=UTF-8"&gt;&#010;+&lt;meta content="Apache Forrest" name="Generator"&gt;&#010;+&lt;meta name="Forrest-version" content="0.9"&gt;&#010;+&lt;meta name="Forrest-skin-name" 
content="nutch"&gt;&#010;+&lt;title&gt;Nutch Downloads&lt;/title&gt;&#010;+&lt;link type="text/css" href="skin/basic.css" rel="stylesheet"&gt;&#010;+&lt;link media="screen" type="text/css" href="skin/screen.css" rel="stylesheet"&gt;&#010;+&lt;link media="print" type="text/css" href="skin/print.css" rel="stylesheet"&gt;&#010;+&lt;link type="text/css" href="skin/profile.css" rel="stylesheet"&gt;&#010;+&lt;script src="skin/getBlank.js" language="javascript" type="text/javascript"&gt;&lt;/script&gt;&lt;script&#010;src="skin/getMenu.js" language="javascript" type="text/javascript"&gt;&lt;/script&gt;&lt;script&#010;src="skin/fontsize.js" language="javascript" type="text/javascript"&gt;&lt;/script&gt;&#010;+&lt;link rel="shortcut icon" href="images/favicon.ico"&gt;&#010;+&lt;/head&gt;&#010;+&lt;body onload="init()"&gt;&#010;+&lt;script type="text/javascript"&gt;ndeSetTextSize();&lt;/script&gt;&#010;+&lt;div id="top"&gt;&#010;+&lt;!--+&#010;+    |breadtrail&#010;+    +--&gt;&#010;+&lt;div class="breadtrail"&gt;&#010;+&lt;a href="http://www.apache.org/"&gt;Apache&lt;/a&gt; &amp;gt; &lt;a href="http://nutch.apache.org"&gt;Nutch&lt;/a&gt;&#010;&amp;gt; &lt;a href="http://nutch.apache.org"&gt;Home&lt;/a&gt;&lt;script src="skin/breadcrumbs.js"&#010;language="JavaScript" type="text/javascript"&gt;&lt;/script&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |header&#010;+    +--&gt;&#010;+&lt;div class="header"&gt;&#010;+&lt;!--+&#010;+    |start group logo&#010;+    +--&gt;&#010;+&lt;div class="grouplogo"&gt;&#010;+&lt;a href="http://www.apache.org/"&gt;&lt;img class="logoImage" alt="Apache" src="images/feather-small.gif"&#010;title="Apache Software Foundation "&gt;&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end group logo&#010;+    +--&gt;&#010;+&lt;!--+&#010;+    |start Project Logo&#010;+    +--&gt;&#010;+&lt;div class="projectlogo"&gt;&#010;+&lt;a href="http://nutch.apache.org/"&gt;&lt;img class="logoImage" alt="Nutch" 
src="images/nutch_logo_tm.gif"&#010;title="Open Source Web Search Software"&gt;&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end Project Logo&#010;+    +--&gt;&#010;+&lt;!--+&#010;+    |start Search&#010;+    +--&gt;&#010;+&lt;div class="searchbox"&gt;&#010;+&lt;script type="text/javascript"&gt;&#010;+                      function selectProvider(form) {&#010;+                        provider = form.elements['searchProvider'].value;&#010;+                        if (provider == "any") {&#010;+                          if (Math.random() &gt; 0.5) {&#010;+                            provider = "lucid";&#010;+                          } else {&#010;+                            provider = "sl";&#010;+                          }&#010;+                        }&#010;+&#010;+                        if (provider == "lucid") {&#010;+                          form.action = "http://search.lucidimagination.com/p:nutch";&#010;+                        } else if (provider == "sl") {&#010;+                          form.action = "http://search-lucene.com/nutch";&#010;+                        }&#010;+&#010;+                        days = 90; // cookie will be valid for 90 days&#010;+                        date = new Date();&#010;+                        date.setTime(date.getTime() + (days * 24 * 60 * 60 * 1000));&#010;+                        expires = "; expires=" + date.toGMTString();&#010;+                        document.cookie = "searchProvider=" + provider + expires + "; path=/";&#010;+                      }&#010;+                    &lt;/script&gt;&#010;+&lt;form id="searchform" action="http://search.lucidimagination.com/p:nutch" method="get"&#010;class="roundtopsmall"&gt;&#010;+&lt;input onFocus="getBlank (this, 'Search the site with Solr');" size="25" name="q" id="query"&#010;type="text" value="Search the site with Solr"&gt;&amp;nbsp; &#010;+                    &lt;input onclick="selectProvider(this.form)" name="Search" 
value="Search"&#010;type="submit"&gt;&#010;+                      @&#010;+                      &lt;select id="searchProvider" name="searchProvider"&gt;&lt;option&#010;value="any"&gt;select provider&lt;/option&gt;&lt;option value="lucid"&gt;Lucid Find&lt;/option&gt;&lt;option&#010;value="sl"&gt;Search-Lucene&lt;/option&gt;&lt;/select&gt;&lt;script type="text/javascript"&gt;&#010;+                        if (document.cookie.length&gt;0) {&#010;+                          cStart=document.cookie.indexOf("searchProvider=");&#010;+                          if (cStart!=-1) {&#010;+                            cStart=cStart + "searchProvider=".length;&#010;+                            cEnd=document.cookie.indexOf(";", cStart);&#010;+                            if (cEnd==-1) {&#010;+                              cEnd=document.cookie.length;&#010;+                            }&#010;+                            provider = unescape(document.cookie.substring(cStart,cEnd));&#010;+                            document.forms['searchform'].elements['searchProvider'].value&#010;= provider;&#010;+                          }&#010;+                        }&#010;+                      &lt;/script&gt;&#010;+&lt;/form&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end search&#010;+    +--&gt;&#010;+&lt;!--+&#010;+    |start Tabs&#010;+    +--&gt;&#010;+&lt;ul id="tabs"&gt;&#010;+&lt;li class="current"&gt;&#010;+&lt;a class="selected" href="index.html"&gt;Main&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;li&gt;&#010;+&lt;a class="unselected" href="wiki.html"&gt;Wiki&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;li&gt;&#010;+&lt;a class="unselected" href="http://issues.apache.org/jira/browse/NUTCH"&gt;Jira&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;/ul&gt;&#010;+&lt;!--+&#010;+    |end Tabs&#010;+    +--&gt;&#010;+&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div id="main"&gt;&#010;+&lt;div id="publishedStrip"&gt;&#010;+&lt;!--+&#010;+    |start Subtabs&#010;+    +--&gt;&#010;+&lt;div 
id="level2tabs"&gt;&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end Endtabs&#010;+    +--&gt;&#010;+&lt;script type="text/javascript"&gt;&lt;!--&#010;+document.write("Last Published: " + document.lastModified);&#010;+//  --&gt;&lt;/script&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |breadtrail&#010;+    +--&gt;&#010;+&lt;div class="breadtrail"&gt;&#010;+&#010;+             &amp;nbsp;&#010;+           &lt;/div&gt;&#010;+&lt;!--+&#010;+    |start Menu, mainarea&#010;+    +--&gt;&#010;+&lt;!--+&#010;+    |start Menu&#010;+    +--&gt;&#010;+&lt;div id="menu"&gt;&#010;+&lt;div onclick="SwitchMenu('menu_1.1', 'skin/')" id="menu_1.1Title" class="menutitle"&gt;Project&lt;/div&gt;&#010;+&lt;div id="menu_1.1" class="menuitemgroup"&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="index.html"&gt;News&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="about.html"&gt;About&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="credits.html"&gt;Credits&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://www.apache.org/foundation/thanks.html"&gt;Thanks&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://www.cafepress.com/nutch/"&gt;Buy Stuff&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://www.apache.org/foundation/sponsorship.html"&gt;Sponsorship&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://www.apache.org/licenses/"&gt;License&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://www.apache.org/security/"&gt;Security&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div onclick="SwitchMenu('menu_1.2', 'skin/')" id="menu_1.2Title" class="menutitle"&gt;Documentation&lt;/div&gt;&#010;+&lt;div id="menu_1.2" class="menuitemgroup"&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a 
href="faq.html"&gt;FAQ&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="wiki.html"&gt;Wiki&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="tutorial.html"&gt;Tutorial&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="bot.html"&gt;Robot     &lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="apidocs-2.1/index.html"&gt;API Docs (2.1)&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="apidocs-1.6/index.html"&gt;API Docs (1.6)&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="https://builds.apache.org/job/Nutch-trunk/javadoc/"&gt;API Docs (trunk nightly)&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="https://builds.apache.org/job/Nutch-nutchgora/javadoc/"&gt;API Docs (2.x nightly)&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010;+&lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010;+&lt;div class="menupage"&gt;&#010;+&lt;div class="menupagetitle"&gt;Download&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="sonar.html"&gt;Sonar Analysis&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="mailing_lists.html"&gt;Mailing Lists&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="issue_tracking.html"&gt;Issue Tracking&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="version_control.html"&gt;Version Control&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div 
class="menuitem"&gt;&#010;+&lt;a href="old_downloads.html"&gt;Older Downloads&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div onclick="SwitchMenu('menu_1.4', 'skin/')" id="menu_1.4Title" class="menutitle"&gt;Related&#010;Projects&lt;/div&gt;&#010;+&lt;div id="menu_1.4" class="menuitemgroup"&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://lucene.apache.org/java/"&gt;Lucene&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://hadoop.apache.org/"&gt;Hadoop&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://lucene.apache.org/solr/"&gt;Solr&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://tika.apache.org/"&gt;Tika&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="menuitem"&gt;&#010;+&lt;a href="http://gora.apache.org"&gt;Gora&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div id="credit"&gt;&lt;/div&gt;&#010;+&lt;div id="roundbottom"&gt;&#010;+&lt;img style="display: none" class="corner" height="15" width="15" alt="" src="skin/images/rc-b-l-15-1body-2menu-3menu.png"&gt;&lt;/div&gt;&#010;+&lt;!--+&#010;+  |alternative credits&#010;+  +--&gt;&#010;+&lt;div id="credit2"&gt;&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end Menu&#010;+    +--&gt;&#010;+&lt;!--+&#010;+    |start content&#010;+    +--&gt;&#010;+&lt;div id="content"&gt;&#010;+&lt;div title="Portable Document Format" class="pdflink"&gt;&#010;+&lt;a class="dida" href="downloads.pdf"&gt;&lt;img alt="PDF -icon" src="skin/images/pdfdoc.gif"&#010;class="skin"&gt;&lt;br&gt;&#010;+        PDF&lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;h1&gt;Nutch Downloads&lt;/h1&gt;&#010;+&lt;div id="minitoc-area"&gt;&#010;+&lt;ul class="minitoc"&gt;&#010;+&lt;li&gt;&#010;+&lt;a href="#Download"&gt;Download&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;li&gt;&#010;+&lt;a href="#Verify+Releases"&gt;Verify Releases&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;li&gt;&#010;+&lt;a 
href="#Previous+Releases"&gt;Previous Releases&lt;/a&gt;&#010;+&lt;/li&gt;&#010;+&lt;/ul&gt;&#010;+&lt;/div&gt; &#010;+&#010;+    &#010;+&lt;a name="N1000E"&gt;&lt;/a&gt;&lt;a name="Download"&gt;&lt;/a&gt;&#010;+&lt;h2 class="h3"&gt;Download&lt;/h2&gt;&#010;+&lt;div class="section"&gt;&#010;+&lt;p&gt; Apache Nutch 2.1 (src-tar and src-zip only) and 1.6 (src-tar, src-zip, bin-tar&#010;and bin-zip) are now available. See &#010;+      the &#010;+      &lt;a href="http://apache.org/dist/nutch/2.1/CHANGES-2.1.txt"&gt;CHANGES-2.1.txt&lt;/a&gt;,&#010;and &#010;+      &lt;a href="http://apache.org/dist/nutch/1.6/CHANGES_1.6.txt"&gt;CHANGES_1.6.txt&lt;/a&gt;&#010;+      files for more information on the list of updates in these releases.&#010;+      &lt;/p&gt;&#010;+&lt;p&gt; All Apache Nutch distributions is distributed under the &lt;a href="http://www.apache.org/licenses/LICENSE-2.0.html"&gt;Apache&#010;License, version 2.0&lt;/a&gt;.&#010;+      &lt;/p&gt;&#010;+&lt;p&gt;The link in the Mirrors column below should display a list of available mirrors&#010;with a default selection based on your inferred location. If you do not see that page, try&#010;a different browser. 
The checksum and signature are links to the originals on the main distribution&#010;server.&#010;+      &lt;/p&gt;&#010;+&lt;table class="ForrestTable" cellspacing="1" cellpadding="4"&gt;&#010;+  &#010;+&lt;caption&gt;Downloads&lt;/caption&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;th colspan="1" rowspan="1"&gt;Version&lt;/th&gt; &lt;th colspan="1" rowspan="1"&gt;Mirrors&lt;/th&gt;&#010;&lt;th colspan="1" rowspan="1"&gt;Checksum&lt;/th&gt; &lt;th colspan="1" rowspan="1"&gt;Signature&lt;/th&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" rowspan="1"&gt;Apache Nutch 2.1 (src.tar.gz)&lt;/td&gt;&lt;td colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/2.1/apache-nutch-2.1-src.tar.gz"&gt;&#010;+       apache-nutch-2.1-src.tar.gz &lt;/a&gt;&lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.tar.gz.md5"&gt;&#010;+       apache-nutch-2.1-src.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.tar.gz.asc"&gt;&#010;+       apache-nutch-2.1-src.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" rowspan="1"&gt;Apache Nutch 2.1 (src.zip)&lt;/td&gt;&lt;td colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/2.1/apache-nutch-2.1-src.zip"&gt;&#010;+       apache-nutch-2.1-src.zip&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.zip.md5"&gt;&#010;+       apache-nutch-2.1-src.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/2.1/apache-nutch-2.1-src.zip.asc"&gt;&#010;+       apache-nutch-2.1-src.zip.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" rowspan="1"&gt;Apache Nutch 1.6 (src.tar.gz)&lt;/td&gt;&lt;td 
colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-src.tar.gz"&gt;&#010;+       apache-nutch-1.6-src.tar.gz&lt;/a&gt;&lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.tar.gz.md5"&gt;&#010;+       apache-nutch-1.6-src.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.tar.gz.asc"&gt;&#010;+       apache-nutch-1.6-src.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" rowspan="1"&gt;Apache Nutch 1.6 (src.zip)&lt;/td&gt;&lt;td colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-src.zip"&gt;&#010;+       apache-nutch-1.6-src.zip&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.zip.md5"&gt;&#010;+       apache-nutch-1.6-src.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-src.zip.asc"&gt;&#010;+       apache-nutch-1.6-src.zip.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" rowspan="1"&gt;Apache Nutch 1.6 (bin.tar.gz)&lt;/td&gt;&lt;td colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-bin.tar.gz"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz&lt;/a&gt;&lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.tar.gz.md5"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz.md5&lt;/a&gt; &lt;/td&gt; &lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.tar.gz.asc"&gt;&#010;+       apache-nutch-1.6-bin.tar.gz.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+  &#010;+&lt;tr&gt;&#010;+&lt;td colspan="1" 
rowspan="1"&gt;Apache Nutch 1.6 (bin.zip)&lt;/td&gt;&lt;td colspan="1"&#010;rowspan="1"&gt;&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/1.6/apache-nutch-1.6-bin.zip"&gt;&#010;+       apache-nutch-1.6-bin.zip&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.zip.md5"&gt;&#010;+       apache-nutch-1.6-bin.zip.md5&lt;/a&gt;&lt;/td&gt;&lt;td colspan="1" rowspan="1"&gt;&lt;a&#010;href="http://apache.org/dist/nutch/1.6/apache-nutch-1.6-bin.zip.asc"&gt;&#010;+       apache-nutch-1.6-bin.zip.asc&lt;/a&gt; &lt;/td&gt;&#010;+&lt;/tr&gt;&#010;+&#010;+&lt;/table&gt;&#010;+&lt;/div&gt;&#010;+    &#010;+    &#010;+&lt;a name="N100E4"&gt;&lt;/a&gt;&lt;a name="Verify+Releases"&gt;&lt;/a&gt;&#010;+&lt;h2 class="h3"&gt;Verify Releases&lt;/h2&gt;&#010;+&lt;div class="section"&gt;&#010;+&lt;p&gt;It is essential that you verify the integrity of the downloaded files using the&#010;PGP or MD5 signatures. Please read &lt;a href="http://httpd.apache.org/dev/verification.html"&gt;Verifying&#010;Apache HTTP Server Releases&lt;/a&gt; for more information on why you should verify our releases.&#010;+      We strongly recommend you verify your downloads with both PGP and MD5.&lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;strong&gt;PGP Signature&lt;/strong&gt;&#010;+&lt;/p&gt;&#010;+&lt;p&gt;The PGP signatures can be verified using PGP or GPG. First download the &lt;a href="http://www.apache.org/dist/nutch/KEYS"&gt;KEYS&lt;/a&gt;&#010;as well as the asc signature file for the relevant distribution. Make sure you get these files&#010;from the &lt;a href="http://www.apache.org/dist/nutch/"&gt;main distribution directory&lt;/a&gt;,&#010;rather than from a mirror. 
Then verify the signatures using &lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;span class="codefrag"&gt; $ gpg --import KEYS &lt;/span&gt;&#010;+&lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;span class="codefrag"&gt; $ gpg --verify apache-nutch-X.Y.Z &lt;/span&gt;&#010;+&lt;/p&gt;&#010;+&lt;p&gt;The files in Apache Nurch 2.1 and 1.6 releases are signed by Lewis John McGibbney&#010;(lewismc) C601BCA7 &lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;strong&gt;MD5 Signature&lt;/strong&gt;&#010;+&lt;/p&gt;&#010;+&lt;p&gt;Alternatively, you can verify the MD5 signature on the files. A unix program called&#010;md5 or md5sum is included in many unix distributions.&lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;span class="codefrag"&gt; $ md5sum apache-nutch-X.Y.Z&lt;/span&gt;&#010;+&lt;/p&gt;&#010;+&lt;p&gt;&#010;+&lt;span class="codefrag"&gt; ... output should match the string in apache-nutch-X.Y.Z&lt;/span&gt;&#010;+&lt;/p&gt;&#010;+&lt;/div&gt;   &#010;+     &#010;+    &#010;+&lt;a name="N1011B"&gt;&lt;/a&gt;&lt;a name="Previous+Releases"&gt;&lt;/a&gt;&#010;+&lt;h2 class="h3"&gt;Previous Releases&lt;/h2&gt;&#010;+&lt;div class="section"&gt;&#010;+&lt;p&gt;If you are looking for previous releases of Apache Nutch, have a look in the &lt;a&#010;href="old_downloads.html"&gt;old downloads&lt;/a&gt; page, or alternatively for even older&#010;releases check out the  &#010;+         &lt;a href="http://archive.apache.org/dist/incubator/nutch/"&gt;Incubator archives&lt;/a&gt;.&#010;+      &lt;/p&gt;&#010;+&lt;p&gt; Subscribe to &#010;+      the &lt;span class="codefrag"&gt;dev@&lt;/span&gt; &lt;a href="mailing_lists.html"&gt;mailing&#010;list&lt;/a&gt; if you want to &#010;+      get notified about future release candidates and subsequent Nutch official releases.&#010;+      &lt;/p&gt;&#010;+&lt;p&gt;Apache Nutch releases are available under the Apache License, Version 2.0. 
See &#010;+         the NOTICE.txt file contained in each release artifact for applicable copyright&#010;&#010;+         attribution notices.&#010;+      &lt;/p&gt;&#010;+&lt;/div&gt;&#010;+&#010;+  &#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end content&#010;+    +--&gt;&#010;+&lt;div class="clearboth"&gt;&amp;nbsp;&lt;/div&gt;&#010;+&lt;/div&gt;&#010;+&lt;div id="footer"&gt;&#010;+&lt;!--+&#010;+    |start bottomstrip&#010;+    +--&gt;&#010;+&lt;div class="lastmodified"&gt;&#010;+&lt;script type="text/javascript"&gt;&lt;!--&#010;+document.write("Last Published: " + document.lastModified);&#010;+//  --&gt;&lt;/script&gt;&#010;+&lt;/div&gt;&#010;+&lt;div class="copyright"&gt;&#010;+        Copyright &amp;copy;&#010;+         2005-2011 &lt;a href="http://www.apache.org/licenses/"&gt;The Apache Software Foundation.&#010; &#010;+Apache Nutch, Nutch, Apache, the Apache feather logo, and the Apache Nutch project logo are&#010;trademarks of The Apache Software Foundation.&#010;+  &lt;/a&gt;&#010;+&lt;/div&gt;&#010;+&lt;!--+&#010;+    |end bottomstrip&#010;+    +--&gt;&#010;+&lt;/div&gt;&#010;+&lt;/body&gt;&#010;+&lt;/html&gt;&#010;&#010;Added: nutch/site/publish/downloads.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/downloads.pdf?rev=1475631&amp;view=auto&#010;==============================================================================&#010;Binary file - no diff available.&#010;&#010;Propchange: nutch/site/publish/downloads.pdf&#010;------------------------------------------------------------------------------&#010;    svn:mime-type = application/octet-stream&#010;&#010;Modified: nutch/site/publish/faq.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/faq.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/faq.html (original)&#010;+++ nutch/site/publish/faq.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ 
document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/faq.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/faq.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/index.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/index.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/index.html (original)&#010;+++ nutch/site/publish/index.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/index.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/index.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff 
available.&#010;&#010;Modified: nutch/site/publish/issue_tracking.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/issue_tracking.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/issue_tracking.html (original)&#010;+++ nutch/site/publish/issue_tracking.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/issue_tracking.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/issue_tracking.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/linkmap.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/linkmap.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/linkmap.html (original)&#010;+++ nutch/site/publish/linkmap.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" 
class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;@@ -391,7 +391,7 @@ document.write("Last Published: " + docu&#010;     &#010; &lt;ul&gt;&#010; &lt;li&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&amp;nbsp;&amp;nbsp;___________________&amp;nbsp;&amp;nbsp;&lt;em&gt;download&lt;/em&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&amp;nbsp;&amp;nbsp;___________________&amp;nbsp;&amp;nbsp;&lt;em&gt;download&lt;/em&gt;&#010; &lt;/li&gt;&#010; &lt;/ul&gt;&#010;     &#010;&#010;Modified: nutch/site/publish/linkmap.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/linkmap.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/mailing_lists.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/mailing_lists.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/mailing_lists.html (original)&#010;+++ nutch/site/publish/mailing_lists.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a 
href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/mailing_lists.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/mailing_lists.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/nightly.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/nightly.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/nightly.html (original)&#010;+++ nutch/site/publish/nightly.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menupage"&gt;&#010; &lt;div class="menupagetitle"&gt;Nightly builds&lt;/div&gt;&#010;&#010;Modified: nutch/site/publish/nightly.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/nightly.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/old_downloads.html&#010;URL: 
http://svn.apache.org/viewvc/nutch/site/publish/old_downloads.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/old_downloads.html (original)&#010;+++ nutch/site/publish/old_downloads.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/old_downloads.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/old_downloads.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Added: nutch/site/publish/skin/images/apache-thanks.png&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/skin/images/apache-thanks.png?rev=1475631&amp;view=auto&#010;==============================================================================&#010;Binary file - no diff available.&#010;&#010;Propchange: nutch/site/publish/skin/images/apache-thanks.png&#010;------------------------------------------------------------------------------&#010;    svn:mime-type = application/octet-stream&#010;&#010;Added: nutch/site/publish/skin/images/built-with-cocoon.gif&#010;URL: 
http://svn.apache.org/viewvc/nutch/site/publish/skin/images/built-with-cocoon.gif?rev=1475631&amp;view=auto&#010;==============================================================================&#010;Binary file - no diff available.&#010;&#010;Propchange: nutch/site/publish/skin/images/built-with-cocoon.gif&#010;------------------------------------------------------------------------------&#010;    svn:mime-type = application/octet-stream&#010;&#010;Modified: nutch/site/publish/sonar.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/sonar.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/sonar.html (original)&#010;+++ nutch/site/publish/sonar.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/sonar.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/sonar.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/tutorial.html&#010;URL: 
http://svn.apache.org/viewvc/nutch/site/publish/tutorial.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/tutorial.html (original)&#010;+++ nutch/site/publish/tutorial.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/tutorial.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/tutorial.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/version_control.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/version_control.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/version_control.html (original)&#010;+++ nutch/site/publish/version_control.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_selected_1.3', 'skin/')" id="menu_selected_1.3Title" class="menutitle"&#010;style="background-image: url('skin/images/chapter_open.gif');"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_selected_1.3" class="selectedmenuitemgroup" style="display: block;"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a 
href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/version_control.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/version_control.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;Modified: nutch/site/publish/wiki.html&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/wiki.html?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;--- nutch/site/publish/wiki.html (original)&#010;+++ nutch/site/publish/wiki.html Thu Apr 25 05:05:56 2013&#010;@@ -198,7 +198,7 @@ document.write("Last Published: " + docu&#010; &lt;div onclick="SwitchMenu('menu_1.3', 'skin/')" id="menu_1.3Title" class="menutitle"&gt;Resources&lt;/div&gt;&#010; &lt;div id="menu_1.3" class="menuitemgroup"&gt;&#010; &lt;div class="menuitem"&gt;&#010;-&lt;a href="http://www.apache.org/dyn/closer.cgi/nutch/"&gt;Download&lt;/a&gt;&#010;+&lt;a href="downloads.html"&gt;Download&lt;/a&gt;&#010; &lt;/div&gt;&#010; &lt;div class="menuitem"&gt;&#010; &lt;a href="nightly.html"&gt;Nightly builds&lt;/a&gt;&#010;&#010;Modified: nutch/site/publish/wiki.pdf&#010;URL: http://svn.apache.org/viewvc/nutch/site/publish/wiki.pdf?rev=1475631&amp;r1=1475630&amp;r2=1475631&amp;view=diff&#010;==============================================================================&#010;Binary files - no diff available.&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<!-- Commit notification for svn revision r1469241 by jnioche, dated 2013-04-18.
     The mail reports a one-line change to nutch/branches/2.x/src/bin/crawl: the
     job-file test that switches mode to "distributed" goes from
     ../nutch-*.job to ../*nutch-*.job. -->
<title>svn commit: r1469241 - /nutch/branches/2.x/src/bin/crawl</title>
<author><name>jnioche@apache.org</name></author>
<!-- The alternate link points at the archived message; the id below carries the
     same percent-encoded message id with the dots replaced by hyphens. -->
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130418092607.8C4812388A29@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130418092607-8C4812388A29@eris-apache-org%3e</id>
<updated>2013-04-18T09:26:07Z</updated>
<!-- The pre element holds the escaped mail body on a single physical line; line
     breaks are written as decimal character references for LF (code point 10).
     Do not pretty-print this element: its whitespace is part of the payload. -->
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: jnioche&#010;Date: Thu Apr 18 09:26:07 2013&#010;New Revision: 1469241&#010;&#010;URL: http://svn.apache.org/r1469241&#010;Log:&#010;Fixed detection of job file for distributed mode&#010;&#010;Modified:&#010;    nutch/branches/2.x/src/bin/crawl&#010;&#010;Modified: nutch/branches/2.x/src/bin/crawl&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/bin/crawl?rev=1469241&amp;r1=1469240&amp;r2=1469241&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/bin/crawl (original)&#010;+++ nutch/branches/2.x/src/bin/crawl Thu Apr 18 09:26:07 2013&#010;@@ -73,7 +73,7 @@ addDays=0&#010; # determines whether mode based on presence of job file&#010; &#010; mode=local&#010;-if [ -f ../nutch-*.job ]; then&#010;+if [ -f ../*nutch-*.job ]; then&#010;     mode=distributed&#010; fi&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<!-- Commit notification for svn revision r1469231 by jnioche, dated 2013-04-18.
     The mail reports a one-line change to nutch/trunk/src/bin/crawl: the
     job-file test that switches mode to "distributed" goes from
     ../nutch-*.job to ../*nutch-*.job. -->
<title>svn commit: r1469231 - /nutch/trunk/src/bin/crawl</title>
<author><name>jnioche@apache.org</name></author>
<!-- The alternate link points at the archived message; the id below carries the
     same percent-encoded message id with the dots replaced by hyphens. -->
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130418090817.067E023889F7@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130418090817-067E023889F7@eris-apache-org%3e</id>
<updated>2013-04-18T09:08:16Z</updated>
<!-- The pre element holds the escaped mail body on a single physical line; line
     breaks are written as decimal character references for LF (code point 10).
     Do not pretty-print this element: its whitespace is part of the payload. -->
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: jnioche&#010;Date: Thu Apr 18 09:08:16 2013&#010;New Revision: 1469231&#010;&#010;URL: http://svn.apache.org/r1469231&#010;Log:&#010;Fixed detection of job file for distributed mode&#010;&#010;Modified:&#010;    nutch/trunk/src/bin/crawl&#010;&#010;Modified: nutch/trunk/src/bin/crawl&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/bin/crawl?rev=1469231&amp;r1=1469230&amp;r2=1469231&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/bin/crawl (original)&#010;+++ nutch/trunk/src/bin/crawl Thu Apr 18 09:08:16 2013&#010;@@ -72,7 +72,7 @@ numThreads=50&#010; # determines whether mode based on presence of job file&#010; &#010; mode=local&#010;-if [ -f ../nutch-*.job ]; then&#010;+if [ -f ../*nutch-*.job ]; then&#010;     mode=distributed&#010; fi&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1469100 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130417232259.C19EE23889FD@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130417232259-C19EE23889FD@eris-apache-org%3e</id>
<updated>2013-04-17T23:22:59Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Wed Apr 17 23:22:59 2013&#010;New Revision: 1469100&#010;&#010;URL: http://svn.apache.org/r1469100&#010;Log:&#010;NUTCH-1501 Harmonize behavior of parsechecker and indexchecker&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java&#010;    nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1469100&amp;r1=1469099&amp;r2=1469100&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Wed Apr 17 23:22:59 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010;+&#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010; &#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469100&amp;r1=1469099&amp;r2=1469100&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Apr 17 23:22:59&#010;2013&#010;@@ -68,7 +68,7 @@ public class IndexingFiltersChecker exte&#010; &#010;     if (args.length != 1) {&#010;       System.err.println(usage);&#010;-      System.exit(-1);&#010;+      return -1;&#010;     }&#010; &#010;     url = URLUtil.toASCII(args[0]);&#010;&#010;Modified: 
nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469100&amp;r1=1469099&amp;r2=1469100&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/parse/ParserChecker.java Wed Apr 17 23:22:59 2013&#010;@@ -35,19 +35,38 @@ import org.apache.nutch.util.StringUtil;&#010; &#010; /**&#010;  * Parser checker, useful for testing parser.&#010;- * &#010;+ * It also accurately reports possible fetching and &#010;+ * parsing failures and presents protocol status signals to aid &#010;+ * debugging. The tool enables us to retrieve the following data from &#010;+ * any url:&#010;+ * &lt;ol&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;contentType&lt;/tt&gt;: The URL {@link org.apache.nutch.protocol.Content}&#010;type.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;signature&lt;/tt&gt;: Digest is used to identify pages (like unique&#010;ID) and is used to remove&#010;+ * duplicates during the dedup procedure. 
&#010;+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or&#010;+ * {@link org.apache.nutch.crawl.TextProfileSignature}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Version&lt;/tt&gt;: From {@link org.apache.nutch.parse.ParseData}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Status&lt;/tt&gt;: From {@link org.apache.nutch.parse.ParseData}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Title&lt;/tt&gt;: of the URL&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Outlinks&lt;/tt&gt;: associated with the URL&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Content Metadata&lt;/tt&gt;: such as &lt;i&gt;X-AspNet-Version&lt;/i&gt;,&#010;&lt;i&gt;Date&lt;/i&gt;,&#010;+ * &lt;i&gt;Content-length&lt;/i&gt;, &lt;i&gt;servedBy&lt;/i&gt;, &lt;i&gt;Content-Type&lt;/i&gt;,&#010;&lt;i&gt;Cache-Control&lt;/&gt;, etc.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Parse Metadata&lt;/tt&gt;: such as &lt;i&gt;CharEncodingForConversion&lt;/i&gt;,&#010;+ * &lt;i&gt;OriginalCharEncoding&lt;/i&gt;, &lt;i&gt;language&lt;/i&gt;, etc.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;ParseText&lt;/tt&gt;: The page parse text which varies in length depdnecing&#010;on &#010;+ * &lt;code&gt;content.length&lt;/code&gt; configuration.&lt;/li&gt;&#010;+ * &lt;/ol&gt;&#010;  * @author John Xing&#010;  */&#010; &#010; public class ParserChecker implements Tool {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(ParserChecker.class);&#010;+  private Configuration conf;&#010; &#010;   public ParserChecker() {&#010;   }&#010; &#010;-  Configuration conf = null;&#010;-&#010;   public int run(String[] args) throws Exception {&#010;     boolean dumpText = false;&#010;     boolean force = false;&#010;@@ -57,8 +76,8 @@ public class ParserChecker implements To&#010;     String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";&#010; &#010;     if (args.length == 0) {&#010;-      System.err.println(usage);&#010;-      System.exit(-1);&#010;+      LOG.error(usage);&#010;+      return (-1);&#010;    
 }&#010; &#010;     for (int i = 0; i &lt; args.length; i++) {&#010;@@ -68,7 +87,7 @@ public class ParserChecker implements To&#010;       } else if (args[i].equals("-dumpText")) {&#010;         dumpText = true;&#010;       } else if (i != args.length - 1) {&#010;-        System.err.println(usage);&#010;+        LOG.error(usage);&#010;         System.exit(-1);&#010;       } else {&#010;         url = URLUtil.toASCII(args[i]);&#010;@@ -102,7 +121,7 @@ public class ParserChecker implements To&#010;     }&#010; &#010;     if (contentType == null) {&#010;-      System.err.println("");&#010;+      LOG.error("Failed to determine content type!");&#010;       return (-1);&#010;     }&#010; &#010;@@ -112,9 +131,14 @@ public class ParserChecker implements To&#010; &#010;     ParseResult parseResult = new ParseUtil(conf).parse(content);&#010; &#010;+    if (parseResult == null) {&#010;+      LOG.error("Problem with parse - check log");&#010;+      return (-1);&#010;+    }&#010;+&#010;     // Calculate the signature&#010;     byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parseResult.get(new&#010;Text(url)));&#010;-&#010;+    &#010;     if (LOG.isInfoEnabled()) {&#010;       LOG.info("parsing: " + url);&#010;       LOG.info("contentType: " + contentType);&#010;@@ -123,12 +147,12 @@ public class ParserChecker implements To&#010; &#010;     for (java.util.Map.Entry&lt;Text, Parse&gt; entry : parseResult) {&#010;       Parse parse = entry.getValue();&#010;-      System.out.print("---------\nUrl\n---------------\n");&#010;+      LOG.info("---------\nUrl\n---------------\n");&#010;       System.out.print(entry.getKey());&#010;-      System.out.print("\n---------\nParseData\n---------\n");&#010;+      LOG.info("\n---------\nParseData\n---------\n");&#010;       System.out.print(parse.getData().toString());&#010;       if (dumpText) {&#010;-        System.out.print("---------\nParseText\n---------\n");&#010;+        
LOG.info("---------\nParseText\n---------\n");&#010;         System.out.print(parse.getText());&#010;       }&#010;     }&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1469099 - in /nutch/branches/2.x: CHANGES.txt conf/log4j.properties src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java src/java/org/apache/nutch/parse/ParserChecker.java src/java/org/apache/nutch/util/URLUtil.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130417232036.CE4272388847@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130417232036-CE4272388847@eris-apache-org%3e</id>
<updated>2013-04-17T23:20:36Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Wed Apr 17 23:20:36 2013&#010;New Revision: 1469099&#010;&#010;URL: http://svn.apache.org/r1469099&#010;Log:&#010;NUTCH-1501 Harmonize behavior of parsechecker and indexchecker&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/conf/log4j.properties&#010;    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java&#010;    nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1469099&amp;r1=1469098&amp;r2=1469099&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Wed Apr 17 23:20:36 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1501 Harmonize behavior of parsechecker and indexchecker (snagel + lewismc)&#010;+&#010; * NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)&#010; &#010; * NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc)&#010;&#010;Modified: nutch/branches/2.x/conf/log4j.properties&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/log4j.properties?rev=1469099&amp;r1=1469098&amp;r2=1469099&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/log4j.properties (original)&#010;+++ nutch/branches/2.x/conf/log4j.properties Wed Apr 17 23:20:36 2013&#010;@@ -38,6 +38,8 @@ log4j.logger.org.apache.nutch.indexer.so&#010; log4j.logger.org.apache.nutch.indexer.DeleteDuplicates=INFO,cmdstdout&#010; log4j.logger.org.apache.nutch.crawl.WebTableReader=INFO,cmdstdout&#010; 
log4j.logger.org.apache.nutch.host.HostDbReader=INFO,cmdstdout&#010;+log4j.logger.org.apache.nutch.parse.ParserChecker=INFO,cmdstdout&#010;+log4j.logger.org.apache.nutch.indexer.IndexingFiltersChecker=INFO,cmdstdout&#010; &#010; log4j.logger.org.apache.nutch=INFO&#010; log4j.logger.org.apache.hadoop=WARN&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java?rev=1469099&amp;r1=1469098&amp;r2=1469099&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexingFiltersChecker.java Wed Apr&#010;17 23:20:36 2013&#010;@@ -37,6 +37,7 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.protocol.ProtocolStatusUtils;&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;+import org.apache.nutch.util.URLUtil;&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; &#010;@@ -67,7 +68,7 @@ public class IndexingFiltersChecker exte&#010;       return -1;&#010;     }&#010; &#010;-    url = args[0];&#010;+    url = URLUtil.toASCII(args[0]);&#010; &#010;     if (LOG.isInfoEnabled()) {&#010;       LOG.info("fetching: " + url);&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java?rev=1469099&amp;r1=1469098&amp;r2=1469099&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java (original)&#010;+++ 
nutch/branches/2.x/src/java/org/apache/nutch/parse/ParserChecker.java Wed Apr 17 23:20:36&#010;2013&#010;@@ -28,6 +28,7 @@ import org.slf4j.LoggerFactory;&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.util.Tool;&#010; import org.apache.hadoop.util.ToolRunner;&#010;+import org.apache.nutch.crawl.SignatureFactory;&#010; import org.apache.nutch.protocol.Content;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolFactory;&#010;@@ -36,10 +37,32 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.storage.WebPage;&#010; import org.apache.nutch.util.Bytes;&#010; import org.apache.nutch.util.NutchConfiguration;&#010;+import org.apache.nutch.util.StringUtil;&#010;+import org.apache.nutch.util.URLUtil;&#010; &#010; /**&#010;  * Parser checker, useful for testing parser.&#010;- * &#010;+ * It also accurately reports possible fetching and &#010;+ * parsing failures and presents protocol status signals to aid &#010;+ * debugging. The tool enables us to retrieve the following data from &#010;+ * any url:&#010;+ * &lt;ol&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;contentType&lt;/tt&gt;: The URL {@link org.apache.nutch.protocol.Content}&#010;type.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;signature&lt;/tt&gt;: Digest is used to identify pages (like unique&#010;ID) and is used to remove&#010;+ * duplicates during the dedup procedure. 
&#010;+ * It is calculated using {@link org.apache.nutch.crawl.MD5Signature} or&#010;+ * {@link org.apache.nutch.crawl.TextProfileSignature}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Version&lt;/tt&gt;: From {@link org.apache.nutch.parse.ParseData}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Status&lt;/tt&gt;: From {@link org.apache.nutch.parse.ParseData}.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Title&lt;/tt&gt;: of the URL&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Outlinks&lt;/tt&gt;: associated with the URL&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Content Metadata&lt;/tt&gt;: such as &lt;i&gt;X-AspNet-Version&lt;/i&gt;,&#010;&lt;i&gt;Date&lt;/i&gt;,&#010;+ * &lt;i&gt;Content-length&lt;/i&gt;, &lt;i&gt;servedBy&lt;/i&gt;, &lt;i&gt;Content-Type&lt;/i&gt;,&#010;&lt;i&gt;Cache-Control&lt;/&gt;, etc.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;Parse Metadata&lt;/tt&gt;: such as &lt;i&gt;CharEncodingForConversion&lt;/i&gt;,&#010;+ * &lt;i&gt;OriginalCharEncoding&lt;/i&gt;, &lt;i&gt;language&lt;/i&gt;, etc.&lt;/li&gt;&#010;+ * &lt;li&gt;&lt;tt&gt;ParseText&lt;/tt&gt;: The page parse text which varies in length depdnecing&#010;on &#010;+ * &lt;code&gt;content.length&lt;/code&gt; configuration.&lt;/li&gt;&#010;+ * &lt;/ol&gt;&#010;  * @author John Xing&#010;  */&#010; &#010;@@ -60,7 +83,7 @@ public class ParserChecker implements To&#010;     String usage = "Usage: ParserChecker [-dumpText] [-forceAs mimeType] url";&#010; &#010;     if (args.length == 0) {&#010;-      System.err.println(usage);&#010;+      LOG.error(usage);&#010;       return (-1);&#010;     }&#010; &#010;@@ -71,10 +94,10 @@ public class ParserChecker implements To&#010;       } else if (args[i].equals("-dumpText")) {&#010;         dumpText = true;&#010;       } else if (i != args.length - 1) {&#010;-        System.err.println(usage);&#010;+        LOG.error(usage);&#010;         System.exit(-1);&#010;       } else {&#010;-        url = args[i];&#010;+        url = URLUtil.toASCII(args[i]);&#010;       }&#010;     
}&#010; &#010;@@ -110,15 +133,10 @@ public class ParserChecker implements To&#010;     }&#010; &#010;     if (contentType == null) {&#010;-      System.err.println("");&#010;+      LOG.error("Failed to determine content type!");&#010;       return (-1);&#010;     }&#010; &#010;-    if (LOG.isInfoEnabled()) {&#010;-      LOG.info("parsing: " + url);&#010;-      LOG.info("contentType: " + contentType);&#010;-    }&#010;-&#010;     page.setContentType(new Utf8(contentType));&#010; &#010;     if (ParserJob.isTruncated(url, page)) {&#010;@@ -128,13 +146,23 @@ public class ParserChecker implements To&#010;     Parse parse = new ParseUtil(conf).parse(url, page);&#010; &#010;     if (parse == null) {&#010;-      System.err.println("Problem with parse - check log");&#010;+      LOG.error("Problem with parse - check log");&#010;       return (-1);&#010;     }&#010;+    &#010;+    // Calculate the signature&#010;+    byte[] signature = SignatureFactory.getSignature(getConf()).calculate(page);&#010;+    &#010;+    if (LOG.isInfoEnabled()) {&#010;+      LOG.info("parsing: " + url);&#010;+      LOG.info("contentType: " + contentType);&#010;+      LOG.info("signature: " + StringUtil.toHexString(signature));&#010;+    }&#010;+&#010; &#010;-    System.out.print("---------\nUrl\n---------------\n");&#010;+    LOG.info("---------\nUrl\n---------------\n");&#010;     System.out.print(url + "\n");&#010;-    System.out.print("---------\nMetadata\n---------\n");&#010;+    LOG.info("---------\nMetadata\n---------\n");&#010;     Map&lt;Utf8, ByteBuffer&gt; metadata = page.getMetadata();&#010;     StringBuffer sb = new StringBuffer();&#010;     if (metadata != null) {&#010;@@ -148,7 +176,7 @@ public class ParserChecker implements To&#010;       System.out.print(sb.toString());&#010;     }&#010;     if (dumpText) {&#010;-      System.out.print("---------\nParseText\n---------\n");&#010;+      LOG.info("---------\nParseText\n---------\n");&#010;       System.out.print(parse.getText());&#010;  
   }&#010; &#010;@@ -170,4 +198,5 @@ public class ParserChecker implements To&#010;         args);&#010;     System.exit(res);&#010;   }&#010;+&#010; }&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java?rev=1469099&amp;r1=1469098&amp;r2=1469099&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/util/URLUtil.java Wed Apr 17 23:20:36 2013&#010;@@ -18,7 +18,7 @@&#010; package org.apache.nutch.util;&#010; &#010; import java.net.MalformedURLException;&#010;-import java.net.URL;&#010;+import java.net.*;&#010; import java.util.regex.Pattern;&#010; &#010; import org.apache.nutch.util.domain.DomainSuffix;&#010;@@ -333,6 +333,43 @@ public class URLUtil {&#010;     }&#010;   }&#010;   &#010;+  public static String toASCII(String url) {&#010;+    try {&#010;+      URL u = new URL(url);&#010;+      URI p = new URI(u.getProtocol(),&#010;+        null,&#010;+        IDN.toASCII(u.getHost()),&#010;+        u.getPort(),&#010;+        u.getPath(),&#010;+        u.getQuery(),&#010;+        u.getRef());&#010;+&#010;+      return p.toString();&#010;+    }&#010;+    catch (Exception e) {&#010;+      return null;&#010;+    }&#010;+  }&#010;+&#010;+  public static String toUNICODE(String url) {&#010;+    try {&#010;+      URL u = new URL(url);&#010;+      URI p = new URI(u.getProtocol(),&#010;+        null,&#010;+        IDN.toUnicode(u.getHost()),&#010;+        u.getPort(),&#010;+        u.getPath(),&#010;+        u.getQuery(),&#010;+        u.getRef());&#010;+&#010;+      return p.toString();&#010;+    }&#010;+    catch (Exception e) {&#010;+      return null;&#010;+    }&#010;+  }&#010;+&#010;+&#010;   /** For testing */&#010;   public static void main(String[] 
args){&#010;     &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465834 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/net/protocols/HttpDateFormat.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408235353.A130523889FD@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408235353-A130523889FD@eris-apache-org%3e</id>
<updated>2013-04-08T23:53:53Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 23:53:53 2013&#010;New Revision: 1465834&#010;&#010;URL: http://svn.apache.org/r1465834&#010;Log:&#010;revert NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1465834&amp;r1=1465833&amp;r2=1465834&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon Apr  8 23:53:53 2013&#010;@@ -2,8 +2,6 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;-* NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;(lewismc)&#010;-&#010; * NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)&#010; &#010; * NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1465834&amp;r1=1465833&amp;r2=1465834&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Mon Apr&#010; 8 23:53:53 2013&#010;@@ -34,7 +34,7 @@ import java.text.ParseException;&#010; public class HttpDateFormat {&#010; &#010;   protected static SimpleDateFormat format = &#010;-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz");&#010;+    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss 
zzz", Locale.US);&#010; &#010;   /**&#010;    * HTTP date uses TimeZone GMT&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465831 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/net/protocols/HttpDateFormat.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408235225.610DF238899C@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408235225-610DF238899C@eris-apache-org%3e</id>
<updated>2013-04-08T23:52:25Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 23:52:24 2013&#010;New Revision: 1465831&#010;&#010;URL: http://svn.apache.org/r1465831&#010;Log:&#010;revert NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1465831&amp;r1=1465830&amp;r2=1465831&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Mon Apr  8 23:52:24 2013&#010;@@ -2,8 +2,6 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;-* NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;(lewismc)&#010;-&#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010; &#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1465831&amp;r1=1465830&amp;r2=1465831&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Mon Apr  8 23:52:24&#010;2013&#010;@@ -34,7 +34,7 @@ import java.text.ParseException;&#010; public class HttpDateFormat {&#010; &#010;   protected static SimpleDateFormat format = &#010;-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz");&#010;+    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);&#010; &#010;   /**&#010;    * HTTP date uses TimeZone 
GMT&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465741 - in /nutch/trunk: CHANGES.txt src/java/org/apache/nutch/net/protocols/HttpDateFormat.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408194629.DFE8E23889FA@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408194629-DFE8E23889FA@eris-apache-org%3e</id>
<updated>2013-04-08T19:46:29Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 19:46:29 2013&#010;New Revision: 1465741&#010;&#010;URL: http://svn.apache.org/r1465741&#010;Log:&#010;NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1465741&amp;r1=1465740&amp;r2=1465741&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Mon Apr  8 19:46:29 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;(lewismc)&#010;+&#010; * NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010; &#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1465741&amp;r1=1465740&amp;r2=1465741&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Mon Apr  8 19:46:29&#010;2013&#010;@@ -34,7 +34,7 @@ import java.text.ParseException;&#010; public class HttpDateFormat {&#010; &#010;   protected static SimpleDateFormat format = &#010;-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);&#010;+    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz");&#010; &#010;   /**&#010;    * HTTP date uses TimeZone GMT&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465742 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/net/protocols/HttpDateFormat.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408194630.066212388AA6@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408194630-066212388AA6@eris-apache-org%3e</id>
<updated>2013-04-08T19:46:29Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 19:46:29 2013&#010;New Revision: 1465742&#010;&#010;URL: http://svn.apache.org/r1465742&#010;Log:&#010;NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1465742&amp;r1=1465741&amp;r2=1465742&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon Apr  8 19:46:29 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1554 org.apache.nutch.net.protocols.HttpDateFormat should NOT be Locale.US aware&#010;(lewismc)&#010;+&#010; * NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)&#010; &#010; * NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java?rev=1465742&amp;r1=1465741&amp;r2=1465742&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/net/protocols/HttpDateFormat.java Mon Apr&#010; 8 19:46:29 2013&#010;@@ -34,7 +34,7 @@ import java.text.ParseException;&#010; public class HttpDateFormat {&#010; &#010;   protected static SimpleDateFormat format = &#010;-    new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss zzz", Locale.US);&#010;+    new SimpleDateFormat("EEE, dd MMM yyyy 
HH:mm:ss zzz");&#010; &#010;   /**&#010;    * HTTP date uses TimeZone GMT&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465522 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/indexer/IndexUtil.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408003659.B878323888EA@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408003659-B878323888EA@eris-apache-org%3e</id>
<updated>2013-04-08T00:36:59Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 00:36:59 2013&#010;New Revision: 1465522&#010;&#010;URL: http://svn.apache.org/r1465522&#010;Log:&#010;NUTCH-1532 Replace 'segment' mapping field with batchId&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1465522&amp;r1=1465521&amp;r2=1465522&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon Apr  8 00:36:59 2013&#010;@@ -14,7 +14,7 @@ Release 2.2 - Current Development&#010; &#010; * NUTCH-1038 Port IndexingFiltersChecker to 2.0 (snagel via lewismc)&#010; &#010;-* NUTCH-1532 Replace 'segment' mapping field with batchId (Feng +via lewismc)&#010;+* NUTCH-1532 Replace 'segment' mapping field with batchId (patches v2 + v3) (Feng +via lewismc)&#010; &#010; * NUTCH-1533 Implement getPrevModifiedTime(), setPrevModifiedTime(), getBatchId() and setBatchId()&#010;accessors in o.a.n.storage.WebPage (Feng via lewismc)&#010; &#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java?rev=1465522&amp;r1=1465521&amp;r2=1465522&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/indexer/IndexUtil.java Mon Apr  8 00:36:59&#010;2013&#010;@@ -63,7 +63,9 @@ public class IndexUtil {&#010;     NutchDocument doc = new NutchDocument();&#010;     doc.add("id", key);&#010;     doc.add("digest", StringUtil.toHexString(page.getSignature().array()));&#010;-    doc.add("batchId", 
page.getBatchId().toString());&#010;+    if (page.getBatchId() != null) {&#010;+      doc.add("batchId", page.getBatchId().toString());&#010;+    }&#010;     &#010;     String url = TableUtil.unreverseUrl(key);&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465521 - in /nutch/branches/2.x: CHANGES.txt src/java/org/apache/nutch/crawl/WebTableReader.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130408003323.D786923889E0@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130408003323-D786923889E0@eris-apache-org%3e</id>
<updated>2013-04-08T00:33:23Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Mon Apr  8 00:33:23 2013&#010;New Revision: 1465521&#010;&#010;URL: http://svn.apache.org/r1465521&#010;Log:&#010;NUTCH-1551 Improve WebTableReader field order and display batchId&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1465521&amp;r1=1465520&amp;r2=1465521&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Mon Apr  8 00:33:23 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1551 Improve WebTableReader field order and display batchId (lewismc)&#010;+&#010; * NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc)&#010; &#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;&#010;Modified: nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java?rev=1465521&amp;r1=1465520&amp;r2=1465521&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java (original)&#010;+++ nutch/branches/2.x/src/java/org/apache/nutch/crawl/WebTableReader.java Mon Apr  8 00:33:23&#010;2013&#010;@@ -341,24 +341,34 @@ public class WebTableReader extends Nutc&#010;     sb.append("baseUrl:\t" + page.getBaseUrl()).append("\n");&#010;     sb.append("status:\t").append(page.getStatus()).append(" (").append(&#010;         CrawlStatus.getName((byte) page.getStatus())).append(")\n");&#010;-    sb.append("fetchInterval:\t" + page.getFetchInterval()).append("\n");&#010; 
    sb.append("fetchTime:\t" + page.getFetchTime()).append("\n");&#010;     sb.append("prevFetchTime:\t" + page.getPrevFetchTime()).append("\n");&#010;-    sb.append("retries:\t" + page.getRetriesSinceFetch()).append("\n");&#010;+    sb.append("fetchInterval:\t" + page.getFetchInterval()).append("\n"); &#010;+    sb.append("retriesSinceFetch:\t" + page.getRetriesSinceFetch()).append("\n");&#010;     sb.append("modifiedTime:\t" + page.getModifiedTime()).append("\n");&#010;+    sb.append("prevModifiedTime:\t" + page.getPrevModifiedTime()).append("\n");&#010;     sb.append("protocolStatus:\t" +&#010;         ProtocolStatusUtils.toString(page.getProtocolStatus())).append("\n");&#010;-    sb.append("parseStatus:\t" +&#010;-        ParseStatusUtils.toString(page.getParseStatus())).append("\n");&#010;-    sb.append("title:\t" + page.getTitle()).append("\n");&#010;-    sb.append("score:\t" + page.getScore()).append("\n");&#010;+    ByteBuffer prevSig = page.getPrevSignature();&#010;+        if (prevSig != null) {&#010;+      sb.append("prevSignature:\t" + StringUtil.toHexString(prevSig.array())).append("\n");&#010;+    }&#010;     ByteBuffer sig = page.getSignature();&#010;     if (sig != null) {&#010;       sb.append("signature:\t" + StringUtil.toHexString(sig.array())).append("\n");&#010;     }&#010;+    sb.append("parseStatus:\t" +&#010;+        ParseStatusUtils.toString(page.getParseStatus())).append("\n");&#010;+    sb.append("title:\t" + page.getTitle()).append("\n");&#010;+    sb.append("score:\t" + page.getScore()).append("\n");&#010;+&#010;     Map&lt;Utf8, Utf8&gt; markers = page.getMarkers();&#010;     sb.append("markers:\t" + markers).append("\n");&#010;-&#010;+    sb.append("reprUrl:\t" + page.getReprUrl()).append("\n");&#010;+    Utf8 batchId = page.getBatchId();&#010;+    if (batchId != null) {&#010;+      sb.append("batchId:\t" + batchId.toString()).append("\n");&#010;+    }&#010;     Map&lt;Utf8, ByteBuffer&gt; metadata = page.getMetadata();&#010;     if 
(metadata != null) {&#010;       Iterator&lt;Entry&lt;Utf8, ByteBuffer&gt;&gt; iterator = metadata.entrySet()&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1465159 - in /nutch/trunk: ./ ivy/ src/java/org/apache/nutch/fetcher/ src/java/org/apache/nutch/protocol/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/ src...</title>
<author><name>tejasp@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130405235057.9DE762388847@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130405235057-9DE762388847@eris-apache-org%3e</id>
<updated>2013-04-05T23:50:57Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: tejasp&#010;Date: Fri Apr  5 23:50:56 2013&#010;New Revision: 1465159&#010;&#010;URL: http://svn.apache.org/r1465159&#010;Log:&#010;NUTCH-1031 Delegate parsing of robots.txt to crawler-commons&#010;&#010;Added:&#010;    nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java&#010;    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;Removed:&#010;    nutch/trunk/src/java/org/apache/nutch/protocol/EmptyRobotRules.java&#010;    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/ivy/ivy.xml&#010;    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&#010;    nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java&#010;    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;    nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;    nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Fri Apr  5 23:50:56 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1031 Delegate parsing of robots.txt to crawler-commons (tejasp)&#010;+&#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010; &#010; * NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)&#010;&#010;Modified: nutch/trunk/ivy/ivy.xml&#010;URL: 
http://svn.apache.org/viewvc/nutch/trunk/ivy/ivy.xml?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/ivy/ivy.xml (original)&#010;+++ nutch/trunk/ivy/ivy.xml Fri Apr  5 23:50:56 2013&#010;@@ -74,6 +74,7 @@&#010; &#009;&#009;&lt;dependency org="oro" name="oro" rev="2.0.8" /&gt;&#010; &#010; &#009;&#009;&lt;dependency org="com.google.guava" name="guava" rev="11.0.2" /&gt;&#010;+                &lt;dependency org="com.google.code.crawler-commons" name="crawler-commons"&#010;rev="0.2" /&gt;&#010; &#010; &#009;&#009;&lt;!--Configuration: test --&gt;&#010; &#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Apr  5 23:50:56 2013&#010;@@ -51,6 +51,7 @@ import org.apache.nutch.scoring.ScoringF&#010; import org.apache.nutch.scoring.ScoringFilters;&#010; import org.apache.nutch.util.*;&#010; &#010;+import crawlercommons.robots.BaseRobotRules;&#010; &#010; /**&#010;  * A queue-based fetcher.&#010;@@ -671,8 +672,8 @@ public class Fetcher extends Configured &#010;               }&#010;               redirecting = false;&#010;               Protocol protocol = this.protocolFactory.getProtocol(fit.url.toString());&#010;-              RobotRules rules = protocol.getRobotRules(fit.url, fit.datum);&#010;-              if (!rules.isAllowed(fit.u)) {&#010;+              BaseRobotRules rules = protocol.getRobotRules(fit.url, fit.datum);&#010;+              if (!rules.isAllowed(fit.u.toString())) {&#010;                 // unblock&#010;                 
fetchQueues.finishFetchItem(fit, true);&#010;                 if (LOG.isDebugEnabled()) {&#010;&#010;Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java (original)&#010;+++ nutch/trunk/src/java/org/apache/nutch/protocol/Protocol.java Fri Apr  5 23:50:56 2013&#010;@@ -25,6 +25,8 @@ import org.apache.hadoop.io.Text;&#010; import org.apache.nutch.crawl.CrawlDatum;&#010; import org.apache.nutch.plugin.Pluggable;&#010; &#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010; &#010; /** A retriever of url content.  Implemented by protocol extensions. */&#010; public interface Protocol extends Pluggable, Configurable {&#010;@@ -59,5 +61,6 @@ public interface Protocol extends Plugga&#010;    * @param datum page datum&#010;    * @return robot rules (specific for this url or default), never null&#010;    */&#010;-  RobotRules getRobotRules(Text url, CrawlDatum datum);&#010;+  BaseRobotRules getRobotRules(Text url, CrawlDatum datum);&#010; }&#010;+&#010;&#010;Added: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1465159&amp;view=auto&#010;==============================================================================&#010;--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (added)&#010;+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Apr  5 23:50:56&#010;2013&#010;@@ -0,0 +1,196 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements. 
See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol;&#010;+&#010;+// JDK imports&#010;+import java.io.File;&#010;+import java.io.FileReader;&#010;+import java.io.LineNumberReader;&#010;+import java.net.URL;&#010;+import java.util.ArrayList;&#010;+import java.util.Hashtable;&#010;+import java.util.StringTokenizer;&#010;+&#010;+// Commons Logging imports&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+// Nutch imports&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.hadoop.conf.Configurable;&#010;+import org.apache.hadoop.io.Text;&#010;+&#010;+import com.google.common.io.Files;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules.RobotRulesMode;&#010;+import crawlercommons.robots.SimpleRobotRulesParser;&#010;+&#010;+/**&#010;+ * This class uses crawler-commons for handling the parsing of {@code robots.txt} files.&#010;+ * It emits SimpleRobotRules objects, which describe the download permissions&#010;+ * as described in SimpleRobotRulesParser.&#010;+ */&#010;+public abstract class RobotRulesParser implements Configurable {&#010;+&#010;+  public 
static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);&#010;+&#010;+  protected static final Hashtable&lt;String, BaseRobotRules&gt; CACHE = new Hashtable&lt;String,&#010;BaseRobotRules&gt; ();&#010;+&#010;+  /**&#010;+   *  A {@link BaseRobotRules} object appropriate for use&#010;+   *  when the {@code robots.txt} file is empty or missing;&#010;+   *  all requests are allowed.&#010;+   */&#010;+  public static final BaseRobotRules EMPTY_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_ALL);&#010;+&#010;+  /**&#010;+   *  A {@link BaseRobotRules} object appropriate for use when the &#010;+   *  {@code robots.txt} file is not fetched due to a {@code 403/Forbidden}&#010;+   *  response; all requests are disallowed. &#010;+   */&#010;+  public static BaseRobotRules FORBID_ALL_RULES = new SimpleRobotRules(RobotRulesMode.ALLOW_NONE);&#010;+&#010;+  private static SimpleRobotRulesParser robotParser = new SimpleRobotRulesParser();&#010;+  private Configuration conf;&#010;+  protected String agentNames;&#010;+&#010;+  public RobotRulesParser() { }&#010;+&#010;+  public RobotRulesParser(Configuration conf) {&#010;+    setConf(conf);&#010;+  }&#010;+&#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;+  public void setConf(Configuration conf) {&#010;+    this.conf = conf;&#010;+&#010;+    // Grab the agent names we advertise to robots files.&#010;+    String agentName = conf.get("http.agent.name");&#010;+    if (null == agentName) {&#010;+      throw new RuntimeException("Agent name not configured!");&#010;+    }&#010;+&#010;+    String agentNames = conf.get("http.robots.agents");&#010;+    StringTokenizer tok = new StringTokenizer(agentNames, ",");&#010;+    ArrayList&lt;String&gt; agents = new ArrayList&lt;String&gt;();&#010;+    while (tok.hasMoreTokens()) {&#010;+      agents.add(tok.nextToken().trim());&#010;+    }&#010;+&#010;+    /**&#010;+     * If there are no agents for robots-parsing, use the&#010;+     * default 
agent-string. If both are present, our agent-string&#010;+     * should be the first one we advertise to robots-parsing.&#010;+     */&#010;+    if (agents.size() == 0) {&#010;+      if (LOG.isErrorEnabled()) {&#010;+        LOG.error("No agents listed in 'http.robots.agents' property!");&#010;+      }&#010;+    } else { &#010;+      StringBuffer combinedAgentsString = new StringBuffer(agentName);&#010;+      int index = 0;&#010;+&#010;+      if ((agents.get(0)).equalsIgnoreCase(agentName))&#010;+        index++;&#010;+      else if (LOG.isErrorEnabled()) {&#010;+        LOG.error("Agent we advertise (" + agentName&#010;+            + ") not listed first in 'http.robots.agents' property!");&#010;+      }&#010;+&#010;+      // append all the agents from the http.robots.agents property&#010;+      for(; index &lt; agents.size(); index++) {&#010;+        combinedAgentsString.append(", " + agents.get(index));&#010;+      }&#010;+&#010;+      // always make sure "*" is included in the end&#010;+      combinedAgentsString.append(", *");&#010;+      this.agentNames = combinedAgentsString.toString();&#010;+    }&#010;+  }&#010;+&#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;+  public Configuration getConf() {&#010;+    return conf;&#010;+  }&#010;+&#010;+  /**&#010;+   * Parses the robots content using the {@link SimpleRobotRulesParser} from crawler commons&#010;+   *    &#010;+   * @param url A string containing url&#010;+   * @param content Contents of the robots file in a byte array &#010;+   * @param contentType The &#010;+   * @param robotName A string containing value of  &#010;+   * @return BaseRobotRules object &#010;+   */&#010;+  public BaseRobotRules parseRules (String url, byte[] content, String contentType, String&#010;robotName) {&#010;+    return robotParser.parseContent(url, content, contentType, robotName); &#010;+  }&#010;+&#010;+  public BaseRobotRules getRobotRulesSet(Protocol protocol, Text url) {&#010;+    URL u = 
null;&#010;+    try {&#010;+      u = new URL(url.toString());&#010;+    } catch (Exception e) {&#010;+      return EMPTY_RULES;&#010;+    }&#010;+    return getRobotRulesSet(protocol, u);&#010;+  }&#010;+&#010;+  public abstract BaseRobotRules getRobotRulesSet(Protocol protocol, URL url);&#010;+&#010;+  /** command-line main for testing */&#010;+  public static void main(String[] argv) {&#010;+&#010;+    if (argv.length &lt; 3) {&#010;+      System.err.println("Usage: RobotRulesParser &lt;robots-file&gt; &lt;url-file&gt; &lt;agent-names&gt;\n");&#010;+      System.err.println("\tThe &lt;robots-file&gt; will be parsed as a robots.txt file,");&#010;+      System.err.println("\tusing the given &lt;agent-name&gt; to select rules.  URLs ");&#010;+      System.err.println("\twill be read (one per line) from &lt;url-file&gt;, and tested");&#010;+      System.err.println("\tagainst the rules. Multiple agent names can be specified using&#010;spaces.");&#010;+      System.exit(-1);&#010;+    }&#010;+&#010;+    try {&#010;+      StringBuilder agentNames = new StringBuilder();&#010;+      for(int counter = 2; counter &lt; argv.length; counter++) &#010;+        agentNames.append(argv[counter]).append(",");&#010;+&#010;+      agentNames.deleteCharAt(agentNames.length()-1);&#010;+&#010;+      byte[] robotsBytes = Files.toByteArray(new File(argv[0]));&#010;+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain",&#010;agentNames.toString());&#010;+&#010;+      LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));&#010;+      String testPath = testsIn.readLine().trim();&#010;+      while (testPath != null) {&#010;+        System.out.println( (rules.isAllowed(testPath) ? 
"allowed" : "not allowed") +&#010;+            ":\t" + testPath);&#010;+        testPath = testsIn.readLine();&#010;+      }&#010;+      testsIn.close();&#010;+    } catch (Exception e) {&#010;+      e.printStackTrace();&#010;+    }&#010;+  }&#010;+}&#010;&#010;Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java&#010;Fri Apr  5 23:50:56 2013&#010;@@ -32,15 +32,16 @@ import org.apache.nutch.protocol.Protoco&#010; import org.apache.nutch.protocol.ProtocolException;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatus;&#010;-import org.apache.nutch.protocol.RobotRules;&#010; import org.apache.nutch.util.GZIPUtils;&#010; import org.apache.nutch.util.DeflateUtils;&#010; &#010;-&#010; // Hadoop imports&#010; import org.apache.hadoop.conf.Configuration;&#010; import org.apache.hadoop.io.Text;&#010; &#010;+// crawler-commons imports&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+&#010; /**&#010;  * @author J&amp;eacute;r&amp;ocirc;me Charron&#010;  */&#010;@@ -51,7 +52,7 @@ public abstract class HttpBase implement&#010;   &#010;   private static final byte[] EMPTY_CONTENT = new byte[0];&#010; &#010;-  private RobotRulesParser robots = null;&#010;+  private HttpRobotRulesParser robots = null;&#010;  &#010;   /** The proxy hostname. 
*/ &#010;   protected String proxyHost = null;&#010;@@ -105,7 +106,7 @@ public abstract class HttpBase implement&#010;     if (logger != null) {&#010;       this.logger = logger;&#010;     }&#010;-    robots = new RobotRulesParser();&#010;+    robots = new HttpRobotRulesParser();&#010;   }&#010;   &#010;   // Inherited Javadoc&#010;@@ -138,7 +139,6 @@ public abstract class HttpBase implement&#010;     String urlString = url.toString();&#010;     try {&#010;       URL u = new URL(urlString);&#010;-      String host = null;&#010;       Response response = getResponse(u, datum, false); // make a request&#010;       &#010;       int code = response.getCode();&#010;@@ -381,18 +381,16 @@ public abstract class HttpBase implement&#010;       System.out.println("Content:");&#010;       String text = new String(content.getContent());&#010;       System.out.println(text);&#010;-    }&#010;-    &#010;+    }  &#010;   }&#010;   &#010;-  &#010;   protected abstract Response getResponse(URL url,&#010;                                           CrawlDatum datum,&#010;                                           boolean followRedirects)&#010;     throws ProtocolException, IOException;&#010; &#010;-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;     return robots.getRobotRulesSet(this, url);&#010;   }&#010;-&#010; }&#010;+&#010;&#010;Added: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java?rev=1465159&amp;view=auto&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;(added)&#010;+++ 
nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpRobotRulesParser.java&#010;Fri Apr  5 23:50:56 2013&#010;@@ -0,0 +1,126 @@&#010;+/**&#010;+ * Licensed to the Apache Software Foundation (ASF) under one or more&#010;+ * contributor license agreements.  See the NOTICE file distributed with&#010;+ * this work for additional information regarding copyright ownership.&#010;+ * The ASF licenses this file to You under the Apache License, Version 2.0&#010;+ * (the "License"); you may not use this file except in compliance with&#010;+ * the License.  You may obtain a copy of the License at&#010;+ *&#010;+ *     http://www.apache.org/licenses/LICENSE-2.0&#010;+ *&#010;+ * Unless required by applicable law or agreed to in writing, software&#010;+ * distributed under the License is distributed on an "AS IS" BASIS,&#010;+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.&#010;+ * See the License for the specific language governing permissions and&#010;+ * limitations under the License.&#010;+ */&#010;+&#010;+package org.apache.nutch.protocol.http.api;&#010;+&#010;+import java.net.URL;&#010;+&#010;+import org.slf4j.Logger;&#010;+import org.slf4j.LoggerFactory;&#010;+&#010;+import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.nutch.crawl.CrawlDatum;&#010;+import org.apache.nutch.net.protocols.Response;&#010;+import org.apache.nutch.protocol.Protocol;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010;+import crawlercommons.robots.SimpleRobotRules;&#010;+&#010;+/**&#010;+ * This class is used for parsing robots for urls belonging to HTTP protocol.&#010;+ * It extends the generic {@link RobotRulesParser} class and contains &#010;+ * Http protocol specific implementation for obtaining the robots file.&#010;+ */&#010;+public class HttpRobotRulesParser extends RobotRulesParser {&#010;+  &#010;+  public static final Logger LOG = 
LoggerFactory.getLogger(HttpRobotRulesParser.class);&#010;+  protected boolean allowForbidden = false;&#010;+&#010;+  HttpRobotRulesParser() { }&#010;+&#010;+  public HttpRobotRulesParser(Configuration conf) {&#010;+    super(conf);&#010;+    allowForbidden = conf.getBoolean("http.robots.403.allow", false);&#010;+  }&#010;+&#010;+  /**&#010;+   * The hosts for which the caching of robots rules is yet to be done,&#010;+   * it sends a Http request to the host corresponding to the {@link URL} &#010;+   * passed, gets robots file, parses the rules and caches the rules object&#010;+   * to avoid re-work in future.&#010;+   * &#010;+   *  @param http The {@link Protocol} object&#010;+   *  @param url URL &#010;+   *  &#010;+   *  @return robotRules A {@link BaseRobotRules} object for the rules&#010;+   */&#010;+  public BaseRobotRules getRobotRulesSet(Protocol http, URL url) {&#010;+&#010;+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case&#010;+    String host = url.getHost().toLowerCase();          // normalize to lower case&#010;+&#010;+    BaseRobotRules robotRules = (SimpleRobotRules)CACHE.get(protocol + ":" + host);&#010;+&#010;+    boolean cacheRule = true;&#010;+    &#010;+    if (robotRules == null) {                     // cache miss&#010;+      URL redir = null;&#010;+      if (LOG.isTraceEnabled()) { LOG.trace("cache miss " + url); }&#010;+      try {&#010;+        Response response = ((HttpBase)http).getResponse(new URL(url, "/robots.txt"),&#010;+                                             new CrawlDatum(), true);&#010;+        // try one level of redirection ?&#010;+        if (response.getCode() == 301 || response.getCode() == 302) {&#010;+          String redirection = response.getHeader("Location");&#010;+          if (redirection == null) {&#010;+            // some versions of MS IIS are known to mangle this header&#010;+            redirection = response.getHeader("location");&#010;+          }&#010;+          if 
(redirection != null) {&#010;+            if (!redirection.startsWith("http")) {&#010;+              // RFC says it should be absolute, but apparently it isn't&#010;+              redir = new URL(url, redirection);&#010;+            } else {&#010;+              redir = new URL(redirection);&#010;+            }&#010;+            &#010;+            response = ((HttpBase)http).getResponse(redir, new CrawlDatum(), true);&#010;+          }&#010;+        }&#010;+&#010;+        if (response.getCode() == 200)               // found rules: parse them&#010;+          robotRules =  parseRules(url.toString(), response.getContent(), &#010;+                                   response.getHeader("Content-Type"), &#010;+                                   agentNames);&#010;+&#010;+        else if ( (response.getCode() == 403) &amp;&amp; (!allowForbidden) )&#010;+          robotRules = FORBID_ALL_RULES;            // use forbid all&#010;+        else if (response.getCode() &gt;= 500) {&#010;+          cacheRule = false;&#010;+          robotRules = EMPTY_RULES;&#010;+        }else                                        &#010;+          robotRules = EMPTY_RULES;                 // use default rules&#010;+      } catch (Throwable t) {&#010;+        if (LOG.isInfoEnabled()) {&#010;+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());&#010;+        }&#010;+        cacheRule = false;&#010;+        robotRules = EMPTY_RULES;&#010;+      }&#010;+&#010;+      if (cacheRule) {&#010;+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host&#010;+        if (redir != null &amp;&amp; !redir.getHost().equals(host)) {&#010;+          // cache also for the redirected host&#010;+          CACHE.put(protocol + ":" + redir.getHost(), robotRules);&#010;+        }&#010;+      }&#010;+    }&#010;+    return robotRules;&#010;+  }&#010;+}&#010;&#010;Modified: 
nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java&#010;Fri Apr  5 23:50:56 2013&#010;@@ -17,292 +17,100 @@&#010; &#010; package org.apache.nutch.protocol.http.api;&#010; &#010;-import org.apache.nutch.protocol.http.api.RobotRulesParser.RobotRuleSet;&#010;-&#010;+import crawlercommons.robots.BaseRobotRules;&#010; import junit.framework.TestCase;&#010; &#010;+/**&#010;+ * JUnit test case which tests&#010;+ * 1. that robots filtering is performed correctly as per the agent name&#010;+ * 2. 
that crawl delay is extracted correctly from the robots file&#010;+ *&#010;+ */&#010; public class TestRobotRulesParser extends TestCase {&#010;-  private static final String LF= "\n";&#010;-  private static final String CR= "\r";&#010;-  private static final String CRLF= "\r\n";&#010;+&#010;+  private static final String CONTENT_TYPE = "text/plain";&#010;+  private static final String SINGLE_AGENT = "Agent1";&#010;+  private static final String MULTIPLE_AGENTS = "Agent2, Agent1";&#010;+  private static final String UNKNOWN_AGENT = "AgentABC";&#010;+  private static final String CR = "\r";&#010;   &#010;-  private static final boolean[] ACCEPT_ALL = {&#010;-    true,   // "/a",&#009;      &#010;-    true,   // "/a/",&#009;      &#010;-    true,   // "/a/bloh/foo.html"&#010;-    true,   // "/b",&#009;      &#010;-    true,   // "/b/a",&#009;      &#010;-    true,   // "/b/a/index.html",&#010;-    true,   // "/b/b/foo.html",  &#010;-    true,   // "/c",&#009;      &#010;-    true,   // "/c/a",&#009;      &#010;-    true,   // "/c/a/index.html",&#010;-    true,   // "/c/b/foo.html",  &#010;-    true,   // "/d",&#009;      &#010;-    true,   // "/d/a",&#009;      &#010;-    true,   // "/e/a/index.html",&#010;-    true,   // "/e/d",&#009;      &#010;-    true,   // "/e/d/foo.html",  &#010;-    true,   // "/e/doh.html",    &#010;-    true,   // "/f/index.html",  &#010;-    true,   // "/foo/bar.html",  &#010;-    true,   // "/f/",&#010;-  };&#010;+  private static final String ROBOTS_STRING = &#010;+      "User-Agent: Agent1 #foo" + CR &#010;+      + "Disallow: /a" + CR &#010;+      + "Disallow: /b/a" + CR &#010;+      + "#Disallow: /c" + CR &#010;+      + "Crawl-delay: 10" + CR  // set crawl delay for Agent1 as 10 sec&#010;+      + "" + CR &#010;+      + "" + CR &#010;+      + "User-Agent: Agent2" + CR &#010;+      + "Disallow: /a/bloh" + CR &#010;+      + "Disallow: /c" + CR&#010;+      + "Disallow: /foo" + CR&#010;+      + "Crawl-delay: 20" + CR&#010;+      + "" + CR 
&#010;+      + "User-Agent: *" + CR &#010;+      + "Disallow: /foo/bar/" + CR;   // no crawl delay for other agents&#010;   &#010;-  private static final String[] ROBOTS_STRINGS= new String[] {&#010;-    "User-Agent: Agent1 #foo" + CR &#010;-    + "Disallow: /a" + CR &#010;-    + "Disallow: /b/a" + CR &#010;-    + "#Disallow: /c" + CR &#010;-    + "" + CR &#010;-    + "" + CR &#010;-    + "User-Agent: Agent2 Agent3#foo" + CR &#010;-    + "User-Agent: Agent4" + CR &#010;-    + "Disallow: /d" + CR &#010;-    + "Disallow: /e/d/" + CR&#010;-    + "" + CR &#010;-    + "User-Agent: *" + CR &#010;-    + "Disallow: /foo/bar/" + CR,&#010;-    null  // Used to test EMPTY_RULES&#010;+  private static final String[] TEST_PATHS = new String[] {&#010;+    "http://example.com/a",&#010;+    "http://example.com/a/bloh/foo.html",&#010;+    "http://example.com/b",&#010;+    "http://example.com/c",&#010;+    "http://example.com/b/a/index.html",&#010;+    "http://example.com/foo/bar/baz.html"&#010;+  };&#010;+&#010;+  private static final boolean[] RESULTS = new boolean[] {&#010;+    false,  //  /a&#010;+    false,  //  /a/bloh/foo.html&#010;+    true,   //  /b&#010;+    true,   //  /c&#010;+    false,  //  /b/a/index.html&#010;+    true    //  /foo/bar/baz.html&#010;   };&#010; &#010;-  private static final String[] AGENT_STRINGS= new String[] {&#010;-    "Agent1",&#010;-    "Agent2",&#010;-    "Agent3",&#010;-    "Agent4",&#010;-    "Agent5",&#010;-  };&#010;-&#010;-  private static final boolean[][] NOT_IN_ROBOTS_STRING= new boolean[][] {&#010;-    { &#010;-      false, &#010;-      false,&#010;-      false,&#010;-      false,&#010;-      true,&#010;-    },&#010;-    { &#010;-      false, &#010;-      false,&#010;-      false,&#010;-      false,&#010;-      true,&#010;-    }    &#010;-  };&#010;+  private HttpRobotRulesParser parser;&#010;+  private BaseRobotRules rules;&#010; &#010;-  private static final String[] TEST_PATHS= new String[] {&#010;-    "/a",&#010;-    "/a/",&#010;-   
 "/a/bloh/foo.html",&#010;-    "/b",&#010;-    "/b/a",&#010;-    "/b/a/index.html",&#010;-    "/b/b/foo.html",&#010;-    "/c",&#010;-    "/c/a",&#010;-    "/c/a/index.html",&#010;-    "/c/b/foo.html",&#010;-    "/d",&#010;-    "/d/a",&#010;-    "/e/a/index.html",&#010;-    "/e/d",&#010;-    "/e/d/foo.html",&#010;-    "/e/doh.html",&#010;-    "/f/index.html",&#010;-    "/foo/bar/baz.html",  &#010;-    "/f/",&#010;-  };&#010;-&#010;-  private static final boolean[][][] ALLOWED= new boolean[][][] {&#010;-    { // ROBOTS_STRINGS[0]&#010;-      { // Agent1&#010;-&#009;false,  // "/a",&#009;      &#010;-&#009;false,  // "/a/",&#009;      &#010;-&#009;false,  // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;false,  // "/b/a",&#009;      &#010;-&#009;false,  // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;true,   // "/d",&#009;      &#010;-&#009;true,   // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;true,   // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      }, &#010;-      { // Agent2&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // 
"/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      },&#010;-      { // Agent3&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      },&#010;-      { // Agent4&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;false,  // "/d",&#009;      &#010;-&#009;false,  // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;false,  // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;true,   // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  
&#010;-      },&#010;-      { // Agent5/"*"&#010;-&#009;true,   // "/a",&#009;      &#010;-&#009;true,   // "/a/",&#009;      &#010;-&#009;true,   // "/a/bloh/foo.html"&#010;-&#009;true,   // "/b",&#009;      &#010;-&#009;true,   // "/b/a",&#009;      &#010;-&#009;true,   // "/b/a/index.html",&#010;-&#009;true,   // "/b/b/foo.html",  &#010;-&#009;true,   // "/c",&#009;      &#010;-&#009;true,   // "/c/a",&#009;      &#010;-&#009;true,   // "/c/a/index.html",&#010;-&#009;true,   // "/c/b/foo.html",  &#010;-&#009;true,   // "/d",&#009;      &#010;-&#009;true,   // "/d/a",&#009;      &#010;-&#009;true,   // "/e/a/index.html",&#010;-&#009;true,   // "/e/d",&#009;      &#010;-&#009;true,   // "/e/d/foo.html",  &#010;-&#009;true,   // "/e/doh.html",    &#010;-&#009;true,   // "/f/index.html",  &#010;-&#009;false,  // "/foo/bar.html",  &#010;-&#009;true,   // "/f/",  &#010;-      }&#010;-    },&#010;-    { // ROBOTS_STRINGS[1]&#010;-      ACCEPT_ALL, // Agent 1&#010;-      ACCEPT_ALL, // Agent 2&#010;-      ACCEPT_ALL, // Agent 3&#010;-      ACCEPT_ALL, // Agent 4&#010;-      ACCEPT_ALL, // Agent 5&#010;-    }&#010;-  };&#010;- &#010;   public TestRobotRulesParser(String name) {&#010;     super(name);&#010;+    parser = new HttpRobotRulesParser();&#010;   }&#010; &#010;-  public void testRobotsOneAgent() {&#010;-    for (int i= 0; i &lt; ROBOTS_STRINGS.length; i++) {&#010;-      for (int j= 0; j &lt; AGENT_STRINGS.length; j++) {&#010;-&#009;testRobots(i, new String[] { AGENT_STRINGS[j] },&#010;-&#009;&#009;   TEST_PATHS, ALLOWED[i][j]);&#010;-      }&#010;+  /**&#010;+  * Test that the robots rules are interpreted correctly by the robots rules parser. 
&#010;+  */&#010;+  public void testRobotsAgent() {&#010;+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE,&#010;SINGLE_AGENT);&#010;+&#010;+    for(int counter = 0; counter &lt; TEST_PATHS.length; counter++) {&#010;+      assertTrue("testing on agent (" + SINGLE_AGENT + "), and " &#010;+              + "path " + TEST_PATHS[counter] &#010;+              + " got " + rules.isAllowed(TEST_PATHS[counter]),&#010;+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);&#010;     }&#010;-  }&#010; &#010;-  public void testRobotsTwoAgents() {&#010;-    for (int i= 0; i &lt; ROBOTS_STRINGS.length; i++) {&#010;-      for (int j= 0; j &lt; AGENT_STRINGS.length; j++) {&#010;-&#009;for (int k= 0; k &lt; AGENT_STRINGS.length; k++) {&#010;-&#009;  int key= j;&#010;-&#009;  if (NOT_IN_ROBOTS_STRING[i][j])&#010;-&#009;    key= k;&#010;-&#009;  testRobots(i, new String[] { AGENT_STRINGS[j], AGENT_STRINGS[k] },&#010;-&#009;&#009;     TEST_PATHS, ALLOWED[i][key]);&#010;-&#009;}&#010;-      }&#010;-    }&#010;-  }&#010;-  &#010;-  public void testCrawlDelay() {&#010;-    RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });&#010;-    String delayRule1 = "User-agent: nutchbot" + CR +&#010;-                        "Crawl-delay: 10" + CR +&#010;-                        "User-agent: foobot" + CR +&#010;-                        "Crawl-delay: 20" + CR +&#010;-                        "User-agent: *" + CR + &#010;-                        "Disallow:/baz" + CR;&#010;-    String delayRule2 = "User-agent: foobot" + CR +&#010;-                        "Crawl-delay: 20" + CR +&#010;-                        "User-agent: *" + CR + &#010;-                        "Disallow:/baz" + CR;&#010;-    RobotRuleSet rules = p.parseRules(delayRule1.getBytes());&#010;-    long crawlDelay = rules.getCrawlDelay();&#010;-    assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));&#010;-    rules = 
p.parseRules(delayRule2.getBytes());&#010;-    crawlDelay = rules.getCrawlDelay();&#010;-    assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));&#010;-  }&#010;+    rules = parser.parseRules("testRobotsAgent", ROBOTS_STRING.getBytes(), CONTENT_TYPE,&#010;MULTIPLE_AGENTS);&#010; &#010;-  // helper&#010;-&#010;-  public void testRobots(int robotsString, String[] agents, String[] paths, &#010;-&#009;&#009;&#009; boolean[] allowed) {&#010;-    String agentsString= agents[0];&#010;-    for (int i= 1; i &lt; agents.length; i++)&#010;-      agentsString= agentsString + "," + agents[i];&#010;-    RobotRulesParser p= new RobotRulesParser(agents);&#010;-    RobotRuleSet rules= p.parseRules(ROBOTS_STRINGS[robotsString] != null&#010;-                                     ? ROBOTS_STRINGS[robotsString].getBytes()&#010;-                                     : null);&#010;-    for (int i= 0; i &lt; paths.length; i++) {&#010;-      assertTrue("testing robots file "+robotsString+", on agents ("&#010;-&#009;&#009; + agentsString + "), and path " + TEST_PATHS[i] + "; got " &#010;-&#009;&#009; + rules.isAllowed(TEST_PATHS[i]) + ", rules are: " + LF&#010;-&#009;&#009;&#009;&#009;   + rules,&#010;-&#009;&#009; rules.isAllowed(TEST_PATHS[i]) == allowed[i]);&#010;+    for(int counter = 0; counter &lt; TEST_PATHS.length; counter++) {&#010;+      assertTrue("testing on agents (" + MULTIPLE_AGENTS + "), and " &#010;+              + "path " + TEST_PATHS[counter] &#010;+              + " got " + rules.isAllowed(TEST_PATHS[counter]),&#010;+              rules.isAllowed(TEST_PATHS[counter]) == RESULTS[counter]);&#010;     }&#010;   }&#010; &#010;-&#010;-  &#010;+  /**&#010;+  * Test that the crawl delay is extracted from the robots file for respective agent. &#010;+  * If its not specified for a given agent, default value must be returned.&#010;+  */&#010;+  public void testCrawlDelay() {&#010;+    // for SINGLE_AGENT, the crawl delay of 10 sec ie. 
10000 msec must be returned by the&#010;parser&#010;+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, SINGLE_AGENT);&#010;+    assertTrue("testing crawl delay for agent "+ SINGLE_AGENT +" : ", (rules.getCrawlDelay()&#010;== 10000));&#010;+    &#010;+    // for UNKNOWN_AGENT, the default crawl delay must be returned.&#010;+    rules = parser.parseRules("testCrawlDelay", ROBOTS_STRING.getBytes(), CONTENT_TYPE, UNKNOWN_AGENT);&#010;+    assertTrue("testing crawl delay for agent "+ UNKNOWN_AGENT +" : ", (rules.getCrawlDelay()&#010;== Long.MIN_VALUE));&#010;+  }&#010; }&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/File.java&#010;Fri Apr  5 23:50:56 2013&#010;@@ -17,35 +17,33 @@&#010; &#010; package org.apache.nutch.protocol.file;&#010; &#010;+import java.net.URL;&#010;+&#010; import org.slf4j.Logger;&#010; import org.slf4j.LoggerFactory;&#010; &#010;-import org.apache.nutch.crawl.CrawlDatum;&#010;-import org.apache.hadoop.io.Text;&#010;-import org.apache.nutch.metadata.Metadata;&#010;-import org.apache.nutch.net.protocols.HttpDateFormat;&#010;-import org.apache.nutch.net.protocols.Response;&#010;-&#010; import org.apache.hadoop.conf.Configuration;&#010;+import org.apache.hadoop.io.Text;&#010; &#010;+import org.apache.nutch.crawl.CrawlDatum;&#010;+import org.apache.nutch.net.protocols.Response;&#010; import org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.EmptyRobotRules;&#010; import 
org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatus;&#010;-import org.apache.nutch.protocol.RobotRules;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.util.NutchConfiguration;&#010; &#010;-import java.net.URL;&#010;+import crawlercommons.robots.BaseRobotRules;&#010; &#010;-/************************************&#010;- * File.java deals with file: scheme.&#010;- *&#010;- * Configurable parameters are defined under "FILE properties" section&#010;- * in ./conf/nutch-default.xml or similar.&#010;+/**&#010;+ * This class is a protocol plugin used for file: scheme.&#010;+ * It creates {@link FileResponse} object and gets the content of the url from it.&#010;+ * Configurable parameters are {@code file.content.limit} and {@code file.crawl.parent} &#010;+ * in nutch-default.xml defined under "file properties" section.&#010;  *&#010;  * @author John Xing&#010;- ***********************************/&#010;+ */&#010; public class File implements Protocol {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(File.class);&#010;@@ -57,13 +55,40 @@ public class File implements Protocol {&#010; &#010;   private Configuration conf;&#010; &#010;-  // constructor&#010;-  public File() {&#010;-  }&#010;+  public File() {}&#010; &#010;-  /** Set the point at which content is truncated. 
*/&#010;-  public void setMaxContentLength(int length) {maxContentLength = length;}&#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;+  public void setConf(Configuration conf) {&#010;+    this.conf = conf;&#010;+    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);&#010;+    this.crawlParents = conf.getBoolean("file.crawl.parent", true);&#010;+  }&#010; &#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;+  public Configuration getConf() {&#010;+    return this.conf;&#010;+  }&#010;+  &#010;+  /** &#010;+   * Set the length after at which content is truncated. &#010;+   */&#010;+  public void setMaxContentLength(int maxContentLength) {&#010;+    this.maxContentLength = maxContentLength;&#010;+  }&#010;+&#010;+  /** &#010;+   * Creates a {@link FileResponse} object corresponding to the url and &#010;+   * return a {@link ProtocolOutput} object as per the content received&#010;+   * &#010;+   * @param url Text containing the url&#010;+   * @param datum The CrawlDatum object corresponding to the url&#010;+   * &#010;+   * @return {@link ProtocolOutput} object for the content of the file indicated by url&#010;+   */&#010;   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {&#010;     String urlString = url.toString();&#010;     try {&#010;@@ -99,11 +124,9 @@ public class File implements Protocol {&#010;     }&#010;   }&#010; &#010;-//  protected void finalize () {&#010;-//    // nothing here&#010;-//  }&#010;-&#010;-  /** For debugging. */&#010;+  /** &#010;+   * Quick way for running this class. 
Useful for debugging.&#010;+   */&#010;   public static void main(String[] args) throws Exception {&#010;     int maxContentLength = Integer.MIN_VALUE;&#010;     String logLevel = "info";&#010;@@ -154,17 +177,12 @@ public class File implements Protocol {&#010;     file = null;&#010;   }&#010; &#010;-  public void setConf(Configuration conf) {&#010;-    this.conf = conf;&#010;-    this.maxContentLength = conf.getInt("file.content.limit", 64 * 1024);&#010;-    this.crawlParents = conf.getBoolean("file.crawl.parent", true);&#010;-  }&#010;-&#010;-  public Configuration getConf() {&#010;-    return this.conf;&#010;-  }&#010;-&#010;-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;-    return EmptyRobotRules.RULES;&#010;+  /** &#010;+   * No robots parsing is done for file protocol. &#010;+   * So this returns a set of empty rules which will allow every url.&#010;+   */&#010;+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;+    return RobotRulesParser.EMPTY_RULES;&#010;   }&#010; }&#010;+&#010;&#010;Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1465159&amp;r1=1465158&amp;r2=1465159&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)&#010;+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Fri&#010;Apr  5 23:50:56 2013&#010;@@ -24,30 +24,33 @@ import org.apache.commons.net.ftp.FTPFil&#010; &#010; import org.apache.nutch.crawl.CrawlDatum;&#010; import org.apache.hadoop.io.Text;&#010;-import org.apache.nutch.net.protocols.HttpDateFormat;&#010; import org.apache.nutch.net.protocols.Response;&#010; &#010; import org.apache.hadoop.conf.Configuration;&#010; &#010; import 
org.apache.nutch.protocol.Content;&#010;-import org.apache.nutch.protocol.EmptyRobotRules;&#010;+import org.apache.nutch.protocol.RobotRulesParser;&#010; import org.apache.nutch.protocol.Protocol;&#010; import org.apache.nutch.protocol.ProtocolOutput;&#010; import org.apache.nutch.protocol.ProtocolStatus;&#010;-import org.apache.nutch.protocol.RobotRules;&#010;+&#010;+import crawlercommons.robots.BaseRobotRules;&#010; &#010; import java.net.URL;&#010; &#010; import java.io.IOException;&#010; &#010;-/************************************&#010;- * Ftp.java deals with ftp: scheme.&#010;- *&#010;- * Configurable parameters are defined under "FTP properties" section&#010;- * in ./conf/nutch-default.xml or similar.&#010;+/**&#010;+ * This class is a protocol plugin used for ftp: scheme.&#010;+ * It creates {@link FtpResponse} object and gets the content of the url from it.&#010;+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},&#010;+ *                             {@code ftp.content.limit}, {@code ftp.timeout}, &#010;+ *                             {@code ftp.server.timeout}, {@code ftp.password}, &#010;+ *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.&#010;+ * For details see "FTP properties" section in {@code nutch-default.xml}.&#010;  *&#010;  * @author John Xing&#010;- ***********************************/&#010;+ */&#010; public class Ftp implements Protocol {&#010; &#010;   public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);&#010;@@ -106,6 +109,15 @@ public class Ftp implements Protocol {&#010;     this.keepConnection = keepConnection;&#010;   }&#010; &#010;+  /** &#010;+   * Creates a {@link FtpResponse} object corresponding to the url and &#010;+   * returns a {@link ProtocolOutput} object as per the content received&#010;+   * &#010;+   * @param url Text containing the ftp url&#010;+   * @param datum The CrawlDatum object corresponding to the url&#010;+   * &#010;+   * @return {@link 
ProtocolOutput} object for the url&#010;+   */&#010;   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {&#010;     String urlString = url.toString();&#010;     try {&#010;@@ -216,7 +228,9 @@ public class Ftp implements Protocol {&#010;     ftp = null;&#010;   }&#010; &#010;-  &#010;+  /**&#010;+   * Set the {@link Configuration} object&#010;+   */&#010;   public void setConf(Configuration conf) {&#010;     this.conf = conf;&#010;     this.maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);&#010;@@ -228,12 +242,20 @@ public class Ftp implements Protocol {&#010;     this.followTalk = conf.getBoolean("ftp.follow.talk", false);&#010;   }&#010; &#010;+  /**&#010;+   * Get the {@link Configuration} object&#010;+   */&#010;   public Configuration getConf() {&#010;     return this.conf;&#010;   }&#010; &#010;-  public RobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;-    return EmptyRobotRules.RULES;&#010;+  /** &#010;+   * Currently, no robots parsing is done for ftp protocol &#010;+   * and this returns a set of empty rules which will allow every url.&#010;+   * There a jira logged for the same NUTCH-1513&#010;+   */&#010;+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {&#010;+    return RobotRulesParser.EMPTY_RULES;&#010;   }&#010;-&#010; }&#010;+&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1464207 - in /nutch/branches/2.x: CHANGES.txt src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java</title>
<author><name>lewismc@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201304.mbox/%3c20130403211507.F370C2388847@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130403211507-F370C2388847@eris-apache-org%3e</id>
<updated>2013-04-03T21:15:07Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: lewismc&#010;Date: Wed Apr  3 21:15:07 2013&#010;New Revision: 1464207&#010;&#010;URL: http://svn.apache.org/r1464207&#010;Log:&#010;NUTCH-1552 possibility of a NPE in index-more plugin&#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1464207&amp;r1=1464206&amp;r2=1464207&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Wed Apr  3 21:15:07 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1552 possibility of a NPE in index-more plugin (kaveh minooie via lewismc)&#010;+&#010; * NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010; &#010; * NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1464207&amp;r1=1464206&amp;r2=1464207&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java&#010;Wed Apr  3 21:15:07 2013&#010;@@ -199,7 +199,7 @@ public class MoreIndexingFilter implemen&#010;     doc.add("type", mimeType);&#010; &#010;     // Check if we need to split the content type in sub parts&#010;-    if 
(conf.getBoolean("moreIndexingFilter.indexMimeTypeParts", true)) {&#010;+    if ( null != contentType &amp;&amp; conf.getBoolean("moreIndexingFilter.indexMimeTypeParts",&#010;true)) {&#010;       String[] parts = getParts(contentType.toString());&#010; &#010;       for(String part: parts) {&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1462079 - in /nutch/branches/2.x: CHANGES.txt conf/nutch-default.xml src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java</title>
<author><name>fenglu@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201303.mbox/%3c20130328130910.10C3623888FE@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130328130910-10C3623888FE@eris-apache-org%3e</id>
<updated>2013-03-28T13:09:09Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: fenglu&#010;Date: Thu Mar 28 13:09:09 2013&#010;New Revision: 1462079&#010;&#010;URL: http://svn.apache.org/r1462079&#010;Log:&#010;NUTCH-1547 BasicIndexingFilter - Problem to index full title &#010;&#010;Modified:&#010;    nutch/branches/2.x/CHANGES.txt&#010;    nutch/branches/2.x/conf/nutch-default.xml&#010;    nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;&#010;Modified: nutch/branches/2.x/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1462079&amp;r1=1462078&amp;r2=1462079&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/CHANGES.txt (original)&#010;+++ nutch/branches/2.x/CHANGES.txt Thu Mar 28 13:09:09 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; Release 2.2 - Current Development&#010; &#010;+* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;+&#010; * NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)&#010; &#010; * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel via lewismc)&#010;&#010;Modified: nutch/branches/2.x/conf/nutch-default.xml&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1462079&amp;r1=1462078&amp;r2=1462079&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/conf/nutch-default.xml (original)&#010;+++ nutch/branches/2.x/conf/nutch-default.xml Thu Mar 28 13:09:09 2013&#010;@@ -752,7 +752,7 @@&#010; &lt;property&gt;&#010;   &lt;name&gt;indexer.max.title.length&lt;/name&gt;&#010;   &lt;value&gt;100&lt;/value&gt;&#010;-  &lt;description&gt;The maximum number of characters of a title that are indexed.&#010;+  &lt;description&gt;The maximum number of characters of a title that are indexed. 
A value&#010;of -1 disables this check.&#010;   Used by index-basic.&#010;   &lt;/description&gt;&#010; &lt;/property&gt;&#010;&#010;Modified: nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1462079&amp;r1=1462078&amp;r2=1462079&amp;view=diff&#010;==============================================================================&#010;--- nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;(original)&#010;+++ nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;Thu Mar 28 13:09:09 2013&#010;@@ -109,7 +109,7 @@ public class BasicIndexingFilter impleme&#010; &#010;     // title&#010;     String title = TableUtil.toString(page.getTitle());&#010;-    if (title.length() &gt; MAX_TITLE_LENGTH) { // truncate title if needed&#010;+    if (MAX_TITLE_LENGTH &gt; -1 &amp;&amp; title.length() &gt; MAX_TITLE_LENGTH) { // truncate&#010;title if needed&#010;       title = title.substring(0, MAX_TITLE_LENGTH);&#010;     }&#010;     if (title.length() &gt; 0) {&#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
<entry>
<title>svn commit: r1462078 - in /nutch/trunk: CHANGES.txt conf/nutch-default.xml src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java</title>
<author><name>fenglu@apache.org</name></author>
<link rel="alternate" href="http://mail-archives.apache.org/mod_mbox/nutch-commits/201303.mbox/%3c20130328130428.DDCFF2388978@eris.apache.org%3e"/>
<id>urn:uuid:%3c20130328130428-DDCFF2388978@eris-apache-org%3e</id>
<updated>2013-03-28T13:04:28Z</updated>
<content type="xhtml">
<div xmlns="http://www.w3.org/1999/xhtml">
<pre>
Author: fenglu&#010;Date: Thu Mar 28 13:04:28 2013&#010;New Revision: 1462078&#010;&#010;URL: http://svn.apache.org/r1462078&#010;Log:&#010;NUTCH-1547 BasicIndexingFilter - Problem to index full title &#010;&#010;Modified:&#010;    nutch/trunk/CHANGES.txt&#010;    nutch/trunk/conf/nutch-default.xml&#010;    nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;&#010;Modified: nutch/trunk/CHANGES.txt&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1462078&amp;r1=1462077&amp;r2=1462078&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/CHANGES.txt (original)&#010;+++ nutch/trunk/CHANGES.txt Thu Mar 28 13:04:28 2013&#010;@@ -2,6 +2,8 @@ Nutch Change Log&#010; &#010; (trunk): Current Development&#010; &#010;+* NUTCH-1547 BasicIndexingFilter - Problem to index full title (Feng)&#010;+&#010; * NUTCH-1389 parsechecker and indexchecker to report truncated content (snagel)&#010; &#010; * NUTCH-1419 parsechecker and indexchecker to report protocol status (snagel + lewismc)&#010;&#010;Modified: nutch/trunk/conf/nutch-default.xml&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1462078&amp;r1=1462077&amp;r2=1462078&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/conf/nutch-default.xml (original)&#010;+++ nutch/trunk/conf/nutch-default.xml Thu Mar 28 13:04:28 2013&#010;@@ -897,7 +897,7 @@&#010; &lt;property&gt;&#010;   &lt;name&gt;indexer.max.title.length&lt;/name&gt;&#010;   &lt;value&gt;100&lt;/value&gt;&#010;-  &lt;description&gt;The maximum number of characters of a title that are indexed.&#010;+  &lt;description&gt;The maximum number of characters of a title that are indexed. 
A value&#010;of -1 disables this check.&#010;   &lt;/description&gt;&#010; &lt;/property&gt;&#010; &#010;&#010;Modified: nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1462078&amp;r1=1462077&amp;r2=1462078&amp;view=diff&#010;==============================================================================&#010;--- nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;(original)&#010;+++ nutch/trunk/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java&#010;Thu Mar 28 13:04:28 2013&#010;@@ -108,7 +108,7 @@ public class BasicIndexingFilter impleme&#010; &#010;     // title&#010;     String title = parse.getData().getTitle();&#010;-    if (title.length() &gt; MAX_TITLE_LENGTH) {      // truncate title if needed&#010;+    if (MAX_TITLE_LENGTH &gt; -1 &amp;&amp; title.length() &gt; MAX_TITLE_LENGTH) {     &#010;// truncate title if needed&#010;       title = title.substring(0, MAX_TITLE_LENGTH);&#010;     }&#010; &#010;&#010;&#010;&#010;
</pre>
</div>
</content>
</entry>
</feed>
