nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thammego...@apache.org
Subject [25/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build
Date Tue, 05 Jul 2016 22:49:09 GMT
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
new file mode 100644
index 0000000..2988114
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/RegexURLFilter.java
@@ -0,0 +1,111 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Filters URLs based on a file of regular expressions using the
+ * {@link java.util.regex Java Regex implementation}.
+ */
+public class RegexURLFilter extends RegexURLFilterBase {
+
+  /** Name of the configuration property that names the rules resource file. */
+  public static final String URLFILTER_REGEX_FILE = "urlfilter.regex.file";
+  /** Name of the configuration property that holds the rules inline. */
+  public static final String URLFILTER_REGEX_RULES = "urlfilter.regex.rules";
+
+  /**
+   * Creates a filter through the base class' no-argument constructor. As done
+   * in {@link #main(String[])}, a configuration is expected to be supplied
+   * afterwards via <code>setConf</code>.
+   */
+  public RegexURLFilter() {
+    super();
+  }
+
+  /**
+   * Creates a filter from a rules file; the file name is handed to the base
+   * class unchanged.
+   * 
+   * @param filename
+   *          name of the rules file
+   * @throws IOException
+   *           if the base class fails to read the rules
+   * @throws PatternSyntaxException
+   *           if a rule is not a valid Java regular expression
+   */
+  public RegexURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  /**
+   * Creates a filter from rules supplied by a reader (package-private, used
+   * by the unit tests of this package).
+   * 
+   * @param reader
+   *          source of the rules
+   * @throws IOException
+   *           if the base class fails to read the rules
+   * @throws IllegalArgumentException
+   *           if a rule cannot be turned into a {@link Rule}
+   */
+  RegexURLFilter(Reader reader) throws IOException, IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * ----------------------------------- * <implementation:RegexURLFilterBase> *
+   * -----------------------------------
+   */
+
+  /**
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   * 
+   * @param conf
+   *          configuration to read {@link #URLFILTER_REGEX_RULES} and
+   *          {@link #URLFILTER_REGEX_FILE} from
+   * @return a reader over the inline rules if that property is set, otherwise
+   *         whatever <code>conf.getConfResourceAsReader</code> returns for the
+   *         configured file name
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_REGEX_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_REGEX_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+
+  // Same as above, with the additional host or domain passed on to the rule.
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /*
+   * ------------------------------------ * </implementation:RegexURLFilterBase>
+   * * ------------------------------------
+   */
+
+  /**
+   * Command-line entry point: builds a filter configured with
+   * {@link NutchConfiguration#create()} and delegates to the
+   * <code>main(filter, args)</code> helper of the base class.
+   */
+  public static void main(String args[]) throws IOException {
+    RegexURLFilter filter = new RegexURLFilter();
+    filter.setConf(NutchConfiguration.create());
+    main(filter, args);
+  }
+
+  /**
+   * A rule backed by a {@link Pattern} that is compiled once at construction.
+   * Declared static: a rule uses no state of the enclosing filter, so it should
+   * not keep a hidden reference to it.
+   */
+  private static class Rule extends RegexRule {
+
+    private final Pattern pattern;
+
+    Rule(boolean sign, String regex) {
+      this(sign, regex, null);
+    }
+
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      pattern = Pattern.compile(regex);
+    }
+
+    /**
+     * @return true if the pattern is found anywhere in the URL (a partial
+     *         match is sufficient, see {@link java.util.regex.Matcher#find()})
+     */
+    protected boolean match(String url) {
+      return pattern.matcher(url).find();
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
new file mode 100644
index 0000000..7acf73b
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/main/java/org/apache/nutch/urlfilter/regex/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>URL filter plugin to include and/or exclude URLs matching Java regular expressions.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
new file mode 100644
index 0000000..b86181e
--- /dev/null
+++ b/nutch-plugins/urlfilter-regex/src/test/org/apache/nutch/urlfilter/regex/TestRegexURLFilter.java
@@ -0,0 +1,61 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.regex;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>RegexURLFilter</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestRegexURLFilter extends RegexURLFilterBaseTest {
+
+  /**
+   * Builds the filter under test from the given rules; an I/O problem while
+   * reading the rules is reported as a test failure.
+   */
+  protected URLFilter getURLFilter(Reader rules) {
+    URLFilter created = null;
+    try {
+      created = new RegexURLFilter(rules);
+    } catch (IOException ioe) {
+      // Assert.fail throws, so the null below is never actually returned
+      Assert.fail(ioe.toString());
+    }
+    return created;
+  }
+
+  /** Runs the two crawling rule sets, then the benchmark at growing sizes. */
+  @Test
+  public void test() {
+    String[] ruleSets = { "WholeWebCrawling", "IntranetCrawling" };
+    for (String ruleSet : ruleSets) {
+      test(ruleSet);
+    }
+    int[] loops = { 50, 100, 200, 400, 800 };
+    for (int count : loops) {
+      bench(count, "Benchmarks");
+    }
+  }
+
+  /** Runs the rule set named after issue NUTCH-1838. */
+  @Test
+  public void test1838() {
+    String ruleSet = "nutch1838";
+    test(ruleSet);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/build.xml b/nutch-plugins/urlfilter-suffix/build.xml
new file mode 100644
index 0000000..e5382c6
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-suffix" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/ivy.xml b/nutch-plugins/urlfilter-suffix/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/plugin.xml b/nutch-plugins/urlfilter-suffix/plugin.xml
new file mode 100644
index 0000000..f326d15
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-suffix"
+   name="Suffix URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-suffix.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.suffix"
+              name="Nutch Suffix URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="SuffixURLFilter"
+                      class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter"/>
+      <!-- by default, attribute "file" is undefined, to keep classic behavior.
+      <implementation id="SuffixURLFilter"
+                      class="org.apache.nutch.urlfilter.suffix.SuffixURLFilter">
+        <parameter name="file" value="urlfilter-suffix.txt"/>
+      </implementation>
+      -->
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/pom.xml b/nutch-plugins/urlfilter-suffix/pom.xml
new file mode 100644
index 0000000..82023c6
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-suffix</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-suffix</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
new file mode 100644
index 0000000..39c541f
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/SuffixURLFilter.java
@@ -0,0 +1,331 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.urlfilter.suffix;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.*;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.apache.nutch.util.SuffixStringMatcher;
+
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.io.Reader;
+import java.io.FileReader;
+import java.io.BufferedReader;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.io.StringReader;
+
+import java.util.List;
+import java.util.ArrayList;
+
+import java.net.URL;
+import java.net.MalformedURLException;
+
+/**
+ * Filters URLs based on a file of URL suffixes. The file is named by
+ * <ol>
+ * <li>property "urlfilter.suffix.file" in ./conf/nutch-default.xml, and</li>
+ * <li>attribute "file" in plugin.xml of this plugin</li>
+ * </ol>
+ * Attribute "file" has higher precedence if defined. If the config file is
+ * missing, all URLs will be rejected.
+ * 
+ * <p>
+ * This filter can be configured to work in one of two modes:
+ * <ul>
+ * <li><b>default to reject</b> ('-'): in this mode, only URLs that match
+ * suffixes specified in the config file will be accepted, all other URLs will
+ * be rejected.</li>
+ * <li><b>default to accept</b> ('+'): in this mode, only URLs that match
+ * suffixes specified in the config file will be rejected, all other URLs will
+ * be accepted.</li>
+ * </ul>
+ * <p>
+ * The format of this config file is one URL suffix per line, with no preceding
+ * whitespace. Order, in which suffixes are specified, doesn't matter. Blank
+ * lines and comments (#) are allowed.
+ * </p>
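+ * <p>
+ * A single '+' or '-' sign not followed by any suffix must be used once, to
+ * signify the mode this plugin operates in. An optional single 'I' can be
+ * appended, to signify that suffix matches should be case-insensitive. The
+ * default, if not specified, is to use case-sensitive matches, i.e. suffix
+ * '.JPG' does not match '.jpg'. An optional single 'P' can be appended as
+ * well, to signify that suffixes should be matched against the path component
+ * of the URL only (i.e. without query string and fragment) instead of against
+ * the whole URL string.
+ * </p>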
+ * <p>
+ * NOTE: the format of this file is different from urlfilter-prefix, because
+ * that plugin doesn't support allowed/prohibited prefixes (only supports
+ * allowed prefixes). Please note that this plugin does not support regular
+ * expressions, it only accepts literal suffixes. I.e. a suffix "+*.jpg" is most
+ * probably wrong, you should use "+.jpg" instead.
+ * </p>
+ * <h4>Example 1</h4>
+ * <p>
+ * The configuration shown below will accept all URLs with '.html' or '.htm'
+ * suffixes (case-sensitive - '.HTML' or '.HTM' will be rejected), and prohibit
+ * all other suffixes.
+ * <p>
+ * 
+ * <pre>
+ *  # this is a comment
+ *  
+ *  # prohibit all unknown, case-sensitive matching
+ *  -
+ * 
+ *  # collect only HTML files.
+ *  .html
+ *  .htm
+ * </pre>
+ * 
+ * </p>
+ * <h4>Example 2</h4>
+ * <p>
+ * The configuration shown below will accept all URLs except common graphical
+ * formats.
+ * <p>
+ * 
+ * <pre>
+ *  # this is a comment
+ *  
+ *  # allow all unknown, case-insensitive matching
+ *  +I
+ *  
+ *  # prohibited suffixes
+ *  .gif
+ *  .png
+ *  .jpg
+ *  .jpeg
+ *  .bmp
+ * </pre>
+ * 
+ * </p>
+ * 
+ * @author Andrzej Bialecki
+ */
+public class SuffixURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(SuffixURLFilter.class);
+
+  // value of attribute "file" of this plugin, read in setConf()
+  private String attributeFile = null;
+
+  // matcher over the configured suffixes; built by readConfiguration()
+  private SuffixStringMatcher suffixes;
+  // true: default to accept ('+'), listed suffixes are rejected
+  private boolean modeAccept = false;
+  // true: match suffixes against the URL path only ('P' flag)
+  private boolean filterFromPath = false;
+  // true: URLs are lower-cased before matching ('I' flag)
+  private boolean ignoreCase = false;
+
+  private Configuration conf;
+
+  /**
+   * Creates a filter without any rules. {@link #setConf(Configuration)} or
+   * {@link #readConfiguration(Reader)} has to be called before the filter is
+   * used.
+   */
+  public SuffixURLFilter() throws IOException {
+
+  }
+
+  /**
+   * Creates a filter and reads its rules from the given reader.
+   * 
+   * @param reader
+   *          source of the rules, see {@link #readConfiguration(Reader)}
+   * @throws IOException
+   *           if reading the rules fails
+   */
+  public SuffixURLFilter(Reader reader) throws IOException {
+    readConfiguration(reader);
+  }
+
+  /**
+   * Applies the suffix rules to a URL.
+   * 
+   * @param url
+   *          the URL to check, may be null
+   * @return the unmodified <code>url</code> if it is accepted, or null if it
+   *         is rejected or was null
+   */
+  public String filter(String url) {
+    if (url == null)
+      return null;
+
+    // Locale.ROOT keeps lower-casing independent of the platform locale
+    // (e.g. under a Turkish locale "GIF" would not become "gif").
+    String candidate = ignoreCase ? url.toLowerCase(java.util.Locale.ROOT)
+        : url;
+    if (filterFromPath) {
+      try {
+        candidate = new URL(candidate).getPath();
+      } catch (MalformedURLException e) {
+        // not parseable as URL: deliberately fall back to matching against
+        // the whole string
+      }
+    }
+
+    // a null result of the matcher is treated as "no suffix matched"
+    boolean matched = suffixes.shortestMatch(candidate) != null;
+    // accept mode: listed suffixes are rejected, everything else passes;
+    // reject mode: only listed suffixes pass.
+    return (matched == modeAccept) ? null : url;
+  }
+
+  /**
+   * Reads mode flags and suffixes from the given reader and replaces the
+   * current rules with them. Lines are trimmed; empty lines and lines starting
+   * with '#' are skipped. A line starting with '+' or '-' selects the mode and
+   * may contain the flags 'I' (ignore case) and 'P' (match against the URL
+   * path only). Every other line is taken as a suffix. The reader is not
+   * closed by this method.
+   * 
+   * @param reader
+   *          source of the rules; if null, a warning is logged and the filter
+   *          is set up to reject all URLs
+   * @throws IOException
+   *           if reading from the reader fails
+   */
+  public void readConfiguration(Reader reader) throws IOException {
+
+    // handle missing config file
+    if (reader == null) {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Missing urlfilter.suffix.file, all URLs will be rejected!");
+      }
+      suffixes = new SuffixStringMatcher(new String[0]);
+      modeAccept = false;
+      ignoreCase = false;
+      filterFromPath = false;
+      return;
+    }
+    BufferedReader in = new BufferedReader(reader);
+    List<String> aSuffixes = new ArrayList<String>();
+    boolean allow = false;
+    boolean ignore = false;
+    // collected locally like the other flags, so that a flag from an earlier
+    // configuration does not leak into this one
+    boolean fromPath = false;
+    String line;
+
+    while ((line = in.readLine()) != null) {
+      line = line.trim();
+      // skip blank & comment lines
+      if (line.length() == 0 || line.charAt(0) == '#')
+        continue;
+
+      char first = line.charAt(0);
+      if (first == '+' || first == '-') {
+        // mode line; a later mode line overrides the mode of an earlier one,
+        // while flags, once seen, stay set
+        allow = (first == '+');
+        if (line.contains("P"))
+          fromPath = true;
+        if (line.contains("I"))
+          ignore = true;
+      } else {
+        aSuffixes.add(line);
+      }
+    }
+    if (ignore) {
+      // URLs are lower-cased in filter(), so the suffixes have to be as well
+      for (int i = 0; i < aSuffixes.size(); i++) {
+        aSuffixes.set(i, aSuffixes.get(i).toLowerCase(java.util.Locale.ROOT));
+      }
+    }
+    suffixes = new SuffixStringMatcher(aSuffixes);
+    modeAccept = allow;
+    ignoreCase = ignore;
+    filterFromPath = fromPath;
+  }
+
+  /**
+   * Command-line check: reads URLs from standard input, one per line, and
+   * prints for each whether it is accepted or rejected. The rules are read
+   * from the file given as first argument, or from the Nutch configuration if
+   * no argument is given.
+   */
+  public static void main(String args[]) throws IOException {
+
+    SuffixURLFilter filter;
+    if (args.length >= 1) {
+      Reader rules = new FileReader(args[0]);
+      try {
+        filter = new SuffixURLFilter(rules);
+      } finally {
+        rules.close();
+      }
+    } else {
+      filter = new SuffixURLFilter();
+      filter.setConf(NutchConfiguration.create());
+    }
+
+    BufferedReader in = new BufferedReader(new InputStreamReader(System.in));
+    String line;
+    while ((line = in.readLine()) != null) {
+      String out = filter.filter(line);
+      if (out != null) {
+        System.out.println("ACCEPTED " + out);
+      } else {
+        // out is null here, so report the input that was rejected
+        System.out.println("REJECTED " + line);
+      }
+    }
+  }
+
+  /**
+   * Stores the configuration and (re)loads the rules. The rules are taken
+   * from property "urlfilter.suffix.rules" if it is set; otherwise from the
+   * file named by attribute "file" of this plugin or, if that is not defined,
+   * by property "urlfilter.suffix.file".
+   * 
+   * @param conf
+   *          the configuration to use
+   * @throws RuntimeException
+   *           wrapping the {@link IOException} if reading the rules fails
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    String pluginName = "urlfilter-suffix";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+    // a blank attribute counts as undefined
+    if (attributeFile != null && attributeFile.trim().equals(""))
+      attributeFile = null;
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    }
+
+    String file = conf.get("urlfilter.suffix.file");
+    String stringRules = conf.get("urlfilter.suffix.rules");
+    // attribute "file" takes precedence if defined
+    if (attributeFile != null)
+      file = attributeFile;
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+
+    try {
+      readConfiguration(reader);
+    } catch (IOException e) {
+      if (LOG.isErrorEnabled()) {
+        LOG.error(e.getMessage());
+      }
+      throw new RuntimeException(e.getMessage(), e);
+    } finally {
+      // the reader was opened here, so it is released here
+      if (reader != null) {
+        try {
+          reader.close();
+        } catch (IOException e) {
+          // the rules have been processed already; a failure to close is
+          // only worth a warning
+          LOG.warn("Failed to close reader of suffix rules: "
+              + e.getMessage());
+        }
+      }
+    }
+  }
+
+  /** @return the configuration passed to {@link #setConf(Configuration)} */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /** @return true if URLs are accepted unless they match a suffix */
+  public boolean isModeAccept() {
+    return modeAccept;
+  }
+
+  /**
+   * @param modeAccept
+   *          true to accept URLs unless they match a suffix, false to reject
+   *          URLs unless they match a suffix
+   */
+  public void setModeAccept(boolean modeAccept) {
+    this.modeAccept = modeAccept;
+  }
+
+  /** @return true if URLs are lower-cased before matching */
+  public boolean isIgnoreCase() {
+    return ignoreCase;
+  }
+
+  /**
+   * Note that this only changes how URLs are treated in
+   * {@link #filter(String)}; suffixes that were already read are not
+   * lower-cased retroactively.
+   * 
+   * @param ignoreCase
+   *          true to lower-case URLs before matching
+   */
+  public void setIgnoreCase(boolean ignoreCase) {
+    this.ignoreCase = ignoreCase;
+  }
+
+  /**
+   * @param filterFromPath
+   *          true to match suffixes against the path component of the URL
+   *          only, false to match against the whole URL string
+   */
+  public void setFilterFromPath(boolean filterFromPath) {
+    this.filterFromPath = filterFromPath;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
new file mode 100644
index 0000000..0449acc
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/main/java/org/apache/nutch/urlfilter/suffix/package-info.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to either exclude or include only URLs which match
+ * one of the given (path) suffixes.
+ */
+package org.apache.nutch.urlfilter.suffix;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
new file mode 100644
index 0000000..b09ca2f
--- /dev/null
+++ b/nutch-plugins/urlfilter-suffix/src/test/org/apache/nutch/urlfilter/suffix/TestSuffixURLFilter.java
@@ -0,0 +1,123 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.suffix;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+/**
+ * JUnit test for <code>SuffixURLFilter</code>.
+ * 
+ * @author Andrzej Bialecki
+ */
+public class TestSuffixURLFilter {
+  // rules used by all tests: three lower-case suffixes, no mode line
+  private static final String suffixes = "# this is a comment\n" + "\n"
+      + ".gif\n" + ".jpg\n" + ".js\n";
+
+  // inputs; each expectation array below is index-aligned with this one and
+  // holds the URL itself where it is expected to pass, null where it is
+  // expected to be rejected
+  private static final String[] urls = new String[] {
+      "http://www.example.com/test.gif", "http://www.example.com/TEST.GIF",
+      "http://www.example.com/test.jpg", "http://www.example.com/test.JPG",
+      "http://www.example.com/test.html", "http://www.example.com/test.HTML",
+      "http://www.example.com/test.html?q=abc.js",
+      "http://www.example.com/test.js?foo=bar&baz=bar#12333", };
+
+  private static final String[] urlsModeAccept = new String[] { null,
+      urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
+  private static final String[] urlsModeReject = new String[] { urls[0],
+      null, urls[2], null, null, null, urls[6], null };
+
+  private static final String[] urlsModeAcceptIgnoreCase = new String[] {
+      null, null, null, null, urls[4], urls[5], null, urls[7] };
+
+  private static final String[] urlsModeRejectIgnoreCase = new String[] {
+      urls[0], urls[1], urls[2], urls[3], null, null, urls[6], null };
+
+  private static final String[] urlsModeAcceptAndPathFilter = new String[] {
+      null, urls[1], null, urls[3], urls[4], urls[5], urls[6], null };
+
+  private static final String[] urlsModeAcceptAndNonPathFilter = new String[] {
+      null, urls[1], null, urls[3], urls[4], urls[5], null, urls[7] };
+
+  private SuffixURLFilter filter = null;
+
+  @Before
+  public void setUp() throws IOException {
+    filter = new SuffixURLFilter(new StringReader(suffixes));
+  }
+
+  /**
+   * Runs every test URL through the filter and compares the result with the
+   * expectation at the same index. The comparison is by value, and the
+   * failure message names the offending URL.
+   */
+  private void assertFiltered(String[] expected) {
+    for (int i = 0; i < urls.length; i++) {
+      Assert.assertEquals("Unexpected filter result for " + urls[i],
+          expected[i], filter.filter(urls[i]));
+    }
+  }
+
+  @Test
+  public void testModeAccept() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAccept);
+  }
+
+  @Test
+  public void testModeReject() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeReject);
+  }
+
+  @Test
+  public void testModeAcceptIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(true);
+    assertFiltered(urlsModeAcceptIgnoreCase);
+  }
+
+  @Test
+  public void testModeRejectIgnoreCase() {
+    filter.setIgnoreCase(true);
+    filter.setModeAccept(false);
+    assertFiltered(urlsModeRejectIgnoreCase);
+  }
+
+  @Test
+  public void testModeAcceptAndNonPathFilter() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(false);
+    assertFiltered(urlsModeAcceptAndNonPathFilter);
+  }
+
+  @Test
+  public void testModeAcceptAndPathFilter() {
+    filter.setIgnoreCase(false);
+    filter.setModeAccept(true);
+    filter.setFilterFromPath(true);
+    assertFiltered(urlsModeAcceptAndPathFilter);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/build.xml b/nutch-plugins/urlfilter-validator/build.xml
new file mode 100644
index 0000000..4de9292
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-validator" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/ivy.xml b/nutch-plugins/urlfilter-validator/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/plugin.xml b/nutch-plugins/urlfilter-validator/plugin.xml
new file mode 100644
index 0000000..413b288
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-validator"
+   name="URL Validator"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-validator.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.validator"
+              name="Nutch URL Validatorr"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="URLValidator"
+                      class="org.apache.nutch.urlfilter.validator.UrlValidator"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/pom.xml b/nutch-plugins/urlfilter-validator/pom.xml
new file mode 100644
index 0000000..9eaf641
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-validator</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-validator</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
new file mode 100644
index 0000000..03fca97
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/UrlValidator.java
@@ -0,0 +1,386 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+
+/**
+ * <p>
+ * Validates URLs.
+ * </p>
+ * 
+ * <p>
+ * Originally based on a PHP script by Debbie Dyer, validation.php v1.2b, Date:
+ * 03/07/02, http://javascript.internet.com. However, this validation now bears
+ * little resemblance to the PHP original.
+ * </p>
+ * 
+ * <pre>
+ *   Example of usage:
+ *    UrlValidator urlValidator = new UrlValidator();
+ *    if (urlValidator.filter("ftp://foo.bar.com/") != null) {
+ *       System.out.println("url is valid");
+ *    } else {
+ *       System.out.println("url is invalid");
+ *    }
+ * 
+ *   prints out "url is valid"
+ * </pre>
+ * 
+ * <p>
+ * Based on UrlValidator code from Apache commons-validator.
+ * </p>
+ * 
+ * @see <a href='http://www.ietf.org/rfc/rfc2396.txt' > Uniform Resource
+ *      Identifiers (URI): Generic Syntax </a>
+ * 
+ */
+public class UrlValidator implements URLFilter {
+
+  private static final String ALPHA_CHARS = "a-zA-Z";
+
+  private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d";
+
+  private static final String SPECIAL_CHARS = ";/@&=,.?:+$";
+
+  private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]";
+
+  private static final String SCHEME_CHARS = ALPHA_CHARS;
+
+  // Unlike SCHEME_CHARS (alpha only for now), also allows digits, "-" and "."
+  private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\.";
+
+  private static final String ATOM = VALID_CHARS + '+';
+
+  /**
+   * This expression derived/taken from the BNF for URI (RFC2396).
+   */
+  private static final Pattern URL_PATTERN = Pattern
+      .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)"
+          + "(\\?([^#]*))?(#(.*))?");
+
+  /**
+   * Scheme/protocol (e.g. http:, ftp:, file:, etc).
+   */
+  private static final int PARSE_URL_SCHEME = 2;
+
+  /**
+   * Includes hostname/ip and port number.
+   */
+  private static final int PARSE_URL_AUTHORITY = 4;
+
+  private static final int PARSE_URL_PATH = 5;
+
+  private static final int PARSE_URL_QUERY = 7;
+
+  /**
+   * Protocol (e.g. http:, ftp:, https:).
+   */
+  private static final Pattern SCHEME_PATTERN = Pattern.compile("^["
+      + SCHEME_CHARS + "]+");
+
+  private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^(["
+      + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?");
+
+  private static final int PARSE_AUTHORITY_HOST_IP = 1;
+
+  private static final int PARSE_AUTHORITY_PORT = 2;
+
+  /**
+   * Should always be empty.
+   */
+  private static final int PARSE_AUTHORITY_EXTRA = 3;
+
+  private static final Pattern PATH_PATTERN = Pattern
+      .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$");
+
+  private static final Pattern QUERY_PATTERN = Pattern.compile("^(.*)$");
+
+  private static final Pattern LEGAL_ASCII_PATTERN = Pattern
+      .compile("^[\\x21-\\x7E]+$");
+
+  private static final Pattern IP_V4_DOMAIN_PATTERN = Pattern
+      .compile("^(\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})[.](\\d{1,3})$");
+
+  private static final Pattern DOMAIN_PATTERN = Pattern.compile("^" + ATOM
+      + "(\\." + ATOM + ")*$");
+
+  private static final Pattern PORT_PATTERN = Pattern.compile("^:(\\d{1,5})$");
+
+  private static final Pattern ATOM_PATTERN = Pattern.compile("(" + ATOM + ")");
+
+  private static final Pattern ALPHA_PATTERN = Pattern.compile("^["
+      + ALPHA_CHARS + "]");
+
+  private Configuration conf;
+
+  public String filter(String urlString) {
+    return isValid(urlString) ? urlString : null;
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * <p>
+   * Checks if a field has a valid url address.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on. A <code>null</code>
+   *          value is considered invalid.
+   * @return true if the url is valid.
+   */
+  private boolean isValid(String value) {
+    if (value == null) {
+      return false;
+    }
+
+    Matcher matchUrlPat = URL_PATTERN.matcher(value);
+    if (!LEGAL_ASCII_PATTERN.matcher(value).matches()) {
+      return false;
+    }
+
+    // Check the whole url address structure
+    if (!matchUrlPat.matches()) {
+      return false;
+    }
+
+    if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) {
+      return false;
+    }
+
+    if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) {
+      return false;
+    }
+
+    if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) {
+      return false;
+    }
+
+    if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) {
+      return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Validate scheme. A scheme is accepted when it consists solely of ASCII
+   * letters; digits and the characters "+", "-" and "." are rejected. No list
+   * of allowed schemes is consulted.
+   * 
+   * @param scheme
+   *          The scheme to validate. A <code>null</code> value is considered
+   *          invalid.
+   * @return true if valid.
+   */
+  private boolean isValidScheme(String scheme) {
+    if (scheme == null) {
+      return false;
+    }
+
+    return SCHEME_PATTERN.matcher(scheme).matches();
+  }
+
+  /**
+   * Returns true if the authority is properly formatted. An authority is the
+   * combination of hostname and port. A <code>null</code> authority value is
+   * considered invalid.
+   * 
+   * @param authority
+   *          Authority value to validate.
+   * @return true if authority (hostname and port) is valid.
+   */
+  private boolean isValidAuthority(String authority) {
+    if (authority == null) {
+      return false;
+    }
+
+    Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority);
+    if (!authorityMatcher.matches()) {
+      return false;
+    }
+
+    boolean ipV4Address = false;
+    boolean hostname = false;
+    // check if authority is IP address or hostname
+    String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP);
+    Matcher matchIPV4Pat = IP_V4_DOMAIN_PATTERN.matcher(hostIP);
+    ipV4Address = matchIPV4Pat.matches();
+
+    if (ipV4Address) {
+      // this is an IP address so check components
+      for (int i = 1; i <= 4; i++) {
+        String ipSegment = matchIPV4Pat.group(i);
+        if (ipSegment == null || ipSegment.length() <= 0) {
+          return false;
+        }
+
+        try {
+          if (Integer.parseInt(ipSegment) > 255) {
+            return false;
+          }
+        } catch (NumberFormatException e) {
+          return false;
+        }
+
+      }
+    } else {
+      // Not an IPv4 address, so check whether it is a hostname
+      hostname = DOMAIN_PATTERN.matcher(hostIP).matches();
+    }
+
+    // rightmost hostname will never start with a digit.
+    if (hostname) {
+      // LOW-TECH FIX FOR VALIDATOR-202
+      // TODO: Rewrite to use ArrayList and .add semantics: see VALIDATOR-203
+      char[] chars = hostIP.toCharArray();
+      int size = 1;
+      for (int i = 0; i < chars.length; i++) {
+        if (chars[i] == '.') {
+          size++;
+        }
+      }
+      String[] domainSegment = new String[size];
+      int segCount = 0;
+      int segLen = 0;
+      Matcher atomMatcher = ATOM_PATTERN.matcher(hostIP);
+
+      while (atomMatcher.find()) {
+        domainSegment[segCount] = atomMatcher.group();
+        segLen = domainSegment[segCount].length() + 1;
+        hostIP = (segLen >= hostIP.length()) ? "" : hostIP.substring(segLen);
+        segCount++;
+      }
+      String topLevel = domainSegment[segCount - 1];
+      if (topLevel.length() < 2 || topLevel.length() > 4) {
+        return false;
+      }
+
+      // First character of the top level must be an alpha character
+      if (!ALPHA_PATTERN.matcher(topLevel.substring(0, 1)).matches()) {
+        return false;
+      }
+
+      // Make sure there's a host name preceding the top-level domain.
+      if (segCount < 2) {
+        return false;
+      }
+    }
+
+    if (!hostname && !ipV4Address) {
+      return false;
+    }
+
+    String port = authorityMatcher.group(PARSE_AUTHORITY_PORT);
+    if (port != null) {
+      if (!PORT_PATTERN.matcher(port).matches()) {
+        return false;
+      }
+    }
+
+    String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA);
+    return isBlankOrNull(extra);
+  }
+
+  /**
+   * <p>
+   * Checks if the field is null or has zero length once leading and trailing
+   * whitespace has been trimmed.
+   * </p>
+   * 
+   * @param value
+   *          The value validation is being performed on.
+   * @return true if blank or null.
+   */
+  private boolean isBlankOrNull(String value) {
+    return ((value == null) || (value.trim().length() == 0));
+  }
+
+  /**
+   * Returns true if the path is valid. A <code>null</code> value is considered
+   * invalid.
+   * 
+   * @param path
+   *          Path value to validate.
+   * @return true if path is valid.
+   */
+  private boolean isValidPath(String path) {
+    if (path == null) {
+      return false;
+    }
+
+    if (!PATH_PATTERN.matcher(path).matches()) {
+      return false;
+    }
+
+    int slash2Count = countToken("//", path);
+    int slashCount = countToken("/", path);
+    int dot2Count = countToken("..", path);
+
+    return (dot2Count <= 0) || ((slashCount - slash2Count - 1) > dot2Count);
+  }
+
+  /**
+   * Returns true if the query is null or it's a properly formatted query
+   * string.
+   * 
+   * @param query
+   *          Query value to validate.
+   * @return true if query is valid.
+   */
+  private boolean isValidQuery(String query) {
+    if (query == null) {
+      return true;
+    }
+
+    return QUERY_PATTERN.matcher(query).matches();
+  }
+
+  /**
+   * Returns the number of times the token appears in the target.
+   * 
+   * @param token
+   *          Token value to be counted.
+   * @param target
+   *          Target value to count tokens in.
+   * @return the number of tokens.
+   */
+  private int countToken(String token, String target) {
+    int tokenIndex = 0;
+    int count = 0;
+    while (tokenIndex != -1) {
+      tokenIndex = target.indexOf(token, tokenIndex);
+      if (tokenIndex > -1) {
+        tokenIndex++;
+        count++;
+      }
+    }
+    return count;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
new file mode 100644
index 0000000..b5ec8a1
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/main/java/org/apache/nutch/urlfilter/validator/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>URL filter plugin that validates given urls.</p>
+<p>This plugin runs a series of tests for the given url to make sure that given
+url is valid and 'fetchable'.</p>
+<p>Note: This plugin should <b>only</b> be used for web-related protocols such
+as http, https and ftp.</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
new file mode 100644
index 0000000..2e6d695
--- /dev/null
+++ b/nutch-plugins/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java
@@ -0,0 +1,79 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.validator;
+
+import org.apache.nutch.urlfilter.validator.UrlValidator;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit test case which tests 1. that valid urls are not filtered while invalid
+ * ones are filtered. 2. that Urls' scheme, authority, path and query are
+ * validated.
+ * 
+ * @author tejasp
+ * 
+ */
+
+public class TestUrlValidator {
+
+  /**
+   * Test method for
+   * {@link org.apache.nutch.urlfilter.validator.UrlValidator#filter(java.lang.String)}
+   * .
+   */
+  @Test
+  public void testFilter() {
+    UrlValidator url_validator = new UrlValidator();
+    Assert.assertNotNull(url_validator);
+
+    Assert.assertNull("Filtering on a null object should return null",
+        url_validator.filter(null));
+    Assert.assertNull("Invalid url: example.com/file[/].html",
+        url_validator.filter("example.com/file[/].html"));
+    Assert.assertNull("Invalid url: http://www.example.com/space here.html",
+        url_validator.filter("http://www.example.com/space here.html"));
+    Assert.assertNull("Invalid url: /main.html",
+        url_validator.filter("/main.html"));
+    Assert.assertNull("Invalid url: www.example.com/main.html",
+        url_validator.filter("www.example.com/main.html"));
+    Assert.assertNull("Invalid url: ftp:www.example.com/main.html",
+        url_validator.filter("ftp:www.example.com/main.html"));
+    Assert.assertNull(
+        "Inalid url: http://999.000.456.32/nutch/trunk/README.txt",
+        url_validator.filter("http://999.000.456.32/nutch/trunk/README.txt"));
+    Assert.assertNull("Invalid url: http://www.example.com/ma|in\\toc.html",
+        url_validator.filter(" http://www.example.com/ma|in\\toc.html"));
+
+    Assert.assertNotNull(
+        "Valid url: https://issues.apache.org/jira/NUTCH-1127",
+        url_validator.filter("https://issues.apache.org/jira/NUTCH-1127"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather",
+            url_validator
+                .filter("http://domain.tld/function.cgi?url=http://fonzi.com/&amp;name=Fonzi&amp;mood=happy&amp;coat=leather"));
+    Assert
+        .assertNotNull(
+            "Valid url: http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress",
+            url_validator
+                .filter("http://validator.w3.org/feed/check.cgi?url=http%3A%2F%2Ffeeds.feedburner.com%2Fperishablepress"));
+    Assert.assertNotNull("Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf",
+        url_validator.filter("ftp://alfa.bravo.pi/mike/check/plan.pdf"));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/build.xml b/nutch-plugins/urlmeta/build.xml
new file mode 100644
index 0000000..ed8d9c9
--- /dev/null
+++ b/nutch-plugins/urlmeta/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlmeta" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/ivy.xml b/nutch-plugins/urlmeta/ivy.xml
new file mode 100644
index 0000000..24d7606
--- /dev/null
+++ b/nutch-plugins/urlmeta/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../../ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/plugin.xml b/nutch-plugins/urlmeta/plugin.xml
new file mode 100644
index 0000000..c31adf6
--- /dev/null
+++ b/nutch-plugins/urlmeta/plugin.xml
@@ -0,0 +1,47 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlmeta"
+   name="URL Meta Indexing Filter"
+   version="1.0.0"
+   provider-name="sgonyea">
+
+
+   <runtime>
+      <library name="urlmeta.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension      id="org.apache.nutch.indexer.urlmeta"
+                    name="URL Meta Indexing Filter"
+                    point="org.apache.nutch.indexer.IndexingFilter">
+   <implementation id="indexer-urlmeta"
+                    class="org.apache.nutch.indexer.urlmeta.URLMetaIndexingFilter"/>
+   </extension>
+   <extension      id="org.apache.nutch.scoring.urlmeta"
+                    name="URL Meta Scoring Filter"
+                    point="org.apache.nutch.scoring.ScoringFilter">
+   <implementation id="scoring-urlmeta"
+                    class="org.apache.nutch.scoring.urlmeta.URLMetaScoringFilter" />
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/pom.xml b/nutch-plugins/urlmeta/pom.xml
new file mode 100644
index 0000000..cba0b62
--- /dev/null
+++ b/nutch-plugins/urlmeta/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlmeta</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlmeta</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
new file mode 100644
index 0000000..dc673a2
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/URLMetaIndexingFilter.java
@@ -0,0 +1,118 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.urlmeta;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+
+/**
+ * This is part of the URL Meta plugin. It is designed to enhance the NUTCH-655
+ * patch, by doing two things: 1. Meta Tags that are supplied with your Crawl
+ * URLs, during injection, will be propagated throughout the outlinks of those
+ * Crawl URLs. 2. When you index your URLs, the meta tags that you specified
+ * with your URLs will be indexed alongside those URLs--and can be directly
+ * queried, assuming you have done everything else correctly.
+ * 
+ * The flat-file of URLs you are injecting should, per NUTCH-655, be
+ * tab-delimited in the form of:
+ * 
+ * [www.url.com]\t[key1]=[value1]\t[key2]=[value2]...[keyN]=[valueN]
+ * 
+ * Be aware that if you collide with keywords that are already in use (such as
+ * nutch.score/nutch.fetchInterval) then you are in for some unpredictable
+ * behavior.
+ * 
+ * Furthermore, in your nutch-site.xml config, you must specify that this plugin
+ * is to be used (1), as well as what (2) Meta Tags it should actively look for.
+ * This does not mean that you must use these tags for every URL, but it does
+ * mean that you must list _all_ of the meta tags that you have specified. If you
+ * want them to be propagated and indexed, that is.
+ * 
+ * 1. As of Nutch 1.2, the property "plugin.includes" looks as follows:
+ * <value>protocol-http|urlfilter-regex|parse-(text|html|js|tika|rss)|index
+ * -(basic|anchor)|query-(basic|site|url)|response-(json|xml)|summary-basic
+ * |scoring-opic|urlnormalizer-(pass|regex|basic)</value> You must change
+ * "index-(basic|anchor)" to "index-(basic|anchor|urlmeta)", in order to call
+ * this plugin.
+ * 
+ * 2. You must also specify the property "urlmeta.tags", whose values are
+ * comma-delimited: <value>key1, key2, key3</value>
+ * 
+ * TODO: It may be ideal to offer two separate properties, to specify what gets
+ * indexed versus merely propagated.
+ * 
+ */
+public class URLMetaIndexingFilter implements IndexingFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLMetaIndexingFilter.class);
+  private static final String CONF_PROPERTY = "urlmeta.tags";
+
+  /**
+   * Tag names read from the "urlmeta.tags" property by {@link #setConf}. Kept
+   * per instance so that one filter's configuration cannot overwrite
+   * another's; stays null until a non-null configuration has been supplied.
+   */
+  private String[] urlMetaTags;
+  private Configuration conf;
+
+  /**
+   * This will take the metatags that you have listed in your "urlmeta.tags"
+   * property, and looks for them inside the CrawlDatum object. If they exist,
+   * this will add it as an attribute inside the NutchDocument.
+   * 
+   * @param doc
+   *          the document being indexed; returned unchanged when null
+   * @param parse
+   *          not used by this filter
+   * @param url
+   *          not used by this filter
+   * @param datum
+   *          the crawl datum whose metadata is searched for the configured
+   *          tags; when null the document is returned unchanged
+   * @param inlinks
+   *          not used by this filter
+   * @return the same document instance, with one field added for every
+   *         configured tag that is present in the datum's metadata
+   * @see IndexingFilter#filter
+   */
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    // Nothing to copy without configured tags, a document or a datum.
+    if (urlMetaTags == null || doc == null || datum == null) {
+      return doc;
+    }
+
+    for (String metatag : urlMetaTags) {
+      Text metadata = (Text) datum.getMetaData().get(new Text(metatag));
+
+      if (metadata != null) {
+        doc.add(metatag, metadata.toString());
+      }
+    }
+
+    return doc;
+  }
+
+  /** Returns the configuration most recently passed to {@link #setConf}. */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Stores the configuration and reads the tag list from the "urlmeta.tags"
+   * property. A null configuration is stored but leaves any previously read
+   * tag list untouched.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    if (conf == null) {
+      return;
+    }
+
+    urlMetaTags = conf.getStrings(CONF_PROPERTY);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
new file mode 100644
index 0000000..5da5d56
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/indexer/urlmeta/package.html
@@ -0,0 +1,12 @@
+<html>
+  <body>
+    <p>
+      URL Meta Tag Indexing Plugin
+    </p>
+    <p>
+      Takes Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, 
+      and inserts them into the document--which is then sent to the Indexer.  If you specify these fields in
+      Nutch's schema (as well as the Indexer's), you can reasonably assume that they will be indexed.
+    </p>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
new file mode 100644
index 0000000..3965e42
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/URLMetaScoringFilter.java
@@ -0,0 +1,175 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.urlmeta;
+
+import java.util.Collection;
+import java.util.Map.Entry;
+import java.util.Iterator;
+import java.util.List;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+
+/**
+ * For documentation:
+ * 
+ * @see URLMetaIndexingFilter
+ */
+public class URLMetaScoringFilter extends Configured implements ScoringFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(URLMetaScoringFilter.class);
+  private static final String CONF_PROPERTY = "urlmeta.tags";
+  private static String[] urlMetaTags;
+  private Configuration conf;
+
+  /**
+   * This will take the metatags that you have listed in your "urlmeta.tags"
+   * property, and looks for them inside the parseData object. If they exist,
+   * this will be propagated into your 'targets' Collection's ["outlinks"]
+   * attributes.
+   * 
+   * @see ScoringFilter#distributeScoreToOutlinks
+   */
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    if (urlMetaTags == null || targets == null || parseData == null)
+      return adjust;
+
+    Iterator<Entry<Text, CrawlDatum>> targetIterator = targets.iterator();
+
+    while (targetIterator.hasNext()) {
+      Entry<Text, CrawlDatum> nextTarget = targetIterator.next();
+
+      for (String metatag : urlMetaTags) {
+        String metaFromParse = parseData.getMeta(metatag);
+
+        if (metaFromParse == null)
+          continue;
+
+        nextTarget.getValue().getMetaData()
+            .put(new Text(metatag), new Text(metaFromParse));
+      }
+    }
+    return adjust;
+  }
+
+  /**
+   * Takes the metadata, specified in your "urlmeta.tags" property, from the
+   * datum object and injects it into the content. This is transfered to the
+   * parseData object.
+   * 
+   * @see ScoringFilter#passScoreBeforeParsing
+   * @see URLMetaScoringFilter#passScoreAfterParsing
+   */
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) {
+    if (urlMetaTags == null || content == null || datum == null)
+      return;
+
+    for (String metatag : urlMetaTags) {
+      Text metaFromDatum = (Text) datum.getMetaData().get(new Text(metatag));
+
+      if (metaFromDatum == null)
+        continue;
+
+      content.getMetadata().set(metatag, metaFromDatum.toString());
+    }
+  }
+
+  /**
+   * Takes the metadata, which was lumped inside the content, and replicates it
+   * within your parse data.
+   * 
+   * @see URLMetaScoringFilter#passScoreBeforeParsing
+   * @see ScoringFilter#passScoreAfterParsing
+   */
+  public void passScoreAfterParsing(Text url, Content content, Parse parse) {
+    if (urlMetaTags == null || content == null || parse == null)
+      return;
+
+    for (String metatag : urlMetaTags) {
+      String metaFromContent = content.getMetadata().get(metatag);
+
+      if (metaFromContent == null)
+        continue;
+
+      parse.getData().getParseMeta().set(metatag, metaFromContent);
+    }
+  }
+
+  /** Boilerplate */
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  /** Boilerplate */
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+    return initScore;
+  }
+
+  /** Boilerplate */
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    return;
+  }
+
+  /** Boilerplate */
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+    return;
+  }
+
+  /** Boilerplate */
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+    return;
+  }
+
+  /**
+   * handles conf assignment and pulls the value assignment from the
+   * "urlmeta.tags" property
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+
+    if (conf == null)
+      return;
+
+    urlMetaTags = conf.getStrings(CONF_PROPERTY);
+  }
+
+  /** Boilerplate */
+  public Configuration getConf() {
+    return conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
new file mode 100644
index 0000000..5bba7a8
--- /dev/null
+++ b/nutch-plugins/urlmeta/src/main/java/org/apache/nutch/scoring/urlmeta/package.html
@@ -0,0 +1,11 @@
+<html>
+  <body>
+    <p>
+      URL Meta Tag Scoring Plugin
+    </p>
+    <p>
+      Propagates Meta Tags, injected alongside a URL (see NUTCH-655) and specified in the "urlmeta.tags" property, 
+      along to their outlinks.  This does not actually perform scoring.
+    </p>
+  </body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/build.xml b/nutch-plugins/urlnormalizer-ajax/build.xml
new file mode 100644
index 0000000..e100f8a
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlnormalizer-ajax" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/ivy.xml b/nutch-plugins/urlnormalizer-ajax/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/plugin.xml b/nutch-plugins/urlnormalizer-ajax/plugin.xml
new file mode 100644
index 0000000..ad8c72c
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/plugin.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlnormalizer-ajax"
+   name="AJAX URL Normalizer"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlnormalizer-ajax.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlnormalizer.ajax"
+              name="Nutch AJAX URL Normalizer"
+              point="org.apache.nutch.net.URLNormalizer">
+      <implementation id="AjaxURLNormalizer"
+                      class="org.apache.nutch.net.urlnormalizer.ajax.AjaxURLNormalizer"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/pom.xml b/nutch-plugins/urlnormalizer-ajax/pom.xml
new file mode 100644
index 0000000..e32d952
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlnormalizer-ajax</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlnormalizer-ajax</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
new file mode 100644
index 0000000..5286f6f
--- /dev/null
+++ b/nutch-plugins/urlnormalizer-ajax/src/main/java/org/apache/nutch/net/urlnormalizer/ajax/AjaxURLNormalizer.java
@@ -0,0 +1,236 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.net.urlnormalizer.ajax;
+
+import java.net.URL;
+import java.net.URI;
+import java.net.URLEncoder;
+import java.net.URLDecoder;
+import java.net.MalformedURLException;
+import java.nio.charset.Charset;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.net.URLNormalizer;
+import org.apache.nutch.net.URLNormalizers;
+import org.apache.hadoop.conf.Configuration;
+
+/**
+ * URLNormalizer capable of dealing with AJAX URL's.
+ *
+ * Use the following regex filter to prevent escaped fragments from being fetched.
+ * ^(.*)\?.*_escaped_fragment_
+ */
+public class AjaxURLNormalizer implements URLNormalizer {
+  public static final Logger LOG = LoggerFactory.getLogger(AjaxURLNormalizer.class);
+
+  public static String AJAX_URL_PART = "#!";
+  public static String ESCAPED_URL_PART = "_escaped_fragment_=";
+
+  private Configuration conf;
+  private Charset utf8;
+
+  /**
+   * Default constructor.
+   */
+  public AjaxURLNormalizer() {
+    utf8 = Charset.forName("UTF-8");
+  }
+
+  /**
+   * Attempts to normalize the input URL string.
+   *
+   * @param urlString the URL to normalize
+   * @param scope the normalization scope; only the indexer scope triggers the
+   *          _escaped_fragment_ to #! conversion
+   * @return the normalized URL, or the input verbatim when it contains
+   *         neither form
+   * @throws MalformedURLException if a URL that needs rewriting cannot be parsed
+   */
+  public String normalize(String urlString, String scope) throws MalformedURLException {
+    LOG.info("{} // {}", scope, urlString);
+
+    // When indexing, transform _escaped_fragment_ URL's to their #! counterpart.
+    // The constant is the receiver so that a null scope cannot throw.
+    if (URLNormalizers.SCOPE_INDEXER.equals(scope) && urlString.contains(ESCAPED_URL_PART)) {
+      return normalizeEscapedFragment(urlString);
+    }
+
+    // Otherwise transform #! URL's to their _escaped_fragment_ counterpart
+    if (urlString.contains(AJAX_URL_PART)) {
+      // Normalize once and reuse the result for both logging and returning.
+      String normalized = normalizeHashedFragment(urlString);
+      LOG.info("{} // {}", scope, normalized);
+      return normalized;
+    }
+
+    // Nothing to normalize here, return verbatim
+    return urlString;
+  }
+
+  /**
+   * Returns a normalized input URL. #! querystrings are transformed
+   * to a _escaped_fragment_ form.
+   *
+   * @param urlString a URL containing the #! marker
+   * @return the URL up to the marker, followed by the escaped fragment as an
+   *         _escaped_fragment_ query parameter
+   * @throws MalformedURLException if the URL cannot be parsed
+   */
+  protected String normalizeHashedFragment(String urlString) throws MalformedURLException {
+    URL u = new URL(urlString);
+    int pos = urlString.indexOf(AJAX_URL_PART);
+    StringBuilder sb = new StringBuilder(urlString.substring(0, pos));
+
+    // Get the escaped fragment
+    String escapedFragment = escape(urlString.substring(pos + AJAX_URL_PART.length()));
+
+    // Start a query string, or extend the one that is already there
+    sb.append(u.getQuery() == null ? "?" : "&");
+
+    // Append the escaped fragment key and the value
+    sb.append(ESCAPED_URL_PART);
+    sb.append(escapedFragment);
+
+    return sb.toString();
+  }
+
+  /**
+   * Returns a normalized input URL. _escaped_fragment_ querystrings are
+   * transformed to a #! form.
+   *
+   * @param urlString a URL containing the _escaped_fragment_ marker
+   * @return the URL with the _escaped_fragment_ parameter removed from the
+   *         query string and appended, unescaped, as a #! fragment; the input
+   *         verbatim when its query string holds no such parameter
+   * @throws MalformedURLException if the URL cannot be parsed
+   */
+  protected String normalizeEscapedFragment(String urlString) throws MalformedURLException {
+    URL u = new URL(urlString);
+
+    // The marker may sit in the path or the fragment rather than in the
+    // query string; in that case there is nothing to rewrite.
+    String queryString = u.getQuery();
+    if (queryString == null) {
+      return urlString;
+    }
+
+    // Walk the key/value pairs: the first _escaped_fragment_ pair is taken
+    // out, every other pair is kept in its original order.
+    String escapedFragment = null;
+    StringBuilder remainingQuery = new StringBuilder();
+    boolean firstKept = true;
+    for (String pair : queryString.split("&")) {
+      if (escapedFragment == null && pair.startsWith(ESCAPED_URL_PART)) {
+        escapedFragment = pair.substring(ESCAPED_URL_PART.length());
+        continue;
+      }
+      if (!firstKept) {
+        remainingQuery.append('&');
+      }
+      remainingQuery.append(pair);
+      firstKept = false;
+    }
+
+    if (escapedFragment == null) {
+      return urlString;
+    }
+
+    // Write the URL without query string, we'll handle that later
+    StringBuilder sb = new StringBuilder();
+    sb.append(u.getProtocol());
+    sb.append("://");
+    sb.append(u.getHost());
+    if (u.getPort() != -1) {
+      sb.append(":");
+      sb.append(u.getPort());
+    }
+    sb.append(u.getPath());
+
+    // Append a possible query string, without original escaped fragment
+    if (remainingQuery.length() > 0) {
+      sb.append("?");
+      sb.append(remainingQuery);
+    }
+
+    // Append the fragment delimiter and the unescaped fragment
+    sb.append(AJAX_URL_PART);
+    sb.append(unescape(escapedFragment));
+
+    return sb.toString();
+  }
+
+  /**
+   * Unescape some exotic characters in the fragment part
+   *
+   * @param fragmentPart the percent-encoded fragment
+   * @return the decoded fragment, or the input unchanged when it cannot be
+   *         decoded
+   */
+  protected String unescape(String fragmentPart) {
+    try {
+      return URLDecoder.decode(fragmentPart, "UTF-8");
+    } catch (java.io.UnsupportedEncodingException e) {
+      LOG.warn("UTF-8 is not supported, leaving fragment as is: " + fragmentPart, e);
+    } catch (IllegalArgumentException e) {
+      // Malformed percent-encoding: keep the best-effort behaviour of
+      // returning the fragment untouched, but leave a trace in the log.
+      LOG.warn("Cannot unescape fragment, leaving it as is: " + fragmentPart, e);
+    }
+
+    return fragmentPart;
+  }
+
+  /**
+   * Escape some exotic characters in the fragment part
+   *
+   * @param fragmentPart the raw fragment
+   * @return the fragment with '#', '%', '&amp;', '+' and every byte below 33
+   *         replaced by its %XX form
+   */
+  protected String escape(String fragmentPart) {
+    StringBuilder sb = new StringBuilder(fragmentPart.length());
+
+    for (byte b : fragmentPart.getBytes(utf8)) {
+      // Java bytes are signed, so "b < 33" matches control characters, the
+      // space, and also the (negative) bytes of non-ASCII UTF-8 sequences.
+      if (b < 33 || b == '#' || b == '%' || b == '&' || b == '+') {
+        String hex = Integer.toHexString(b & 0xFF).toUpperCase();
+
+        sb.append('%');
+        // Always emit two hex digits
+        if (hex.length() < 2) {
+          sb.append('0');
+        }
+        sb.append(hex);
+      } else {
+        sb.append((char) b);
+      }
+    }
+
+    return sb.toString();
+  }
+
+  /**
+   * @param conf the configuration to store
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  /**
+   * @return the configuration last passed to {@link #setConf}
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+}
\ No newline at end of file


Mime
View raw message