nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thammego...@apache.org
Subject [27/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build
Date Tue, 05 Jul 2016 22:49:11 GMT
http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
new file mode 100644
index 0000000..13064eb
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/Subcollection.java
@@ -0,0 +1,259 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.StringTokenizer;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.nutch.net.URLFilter;
+import org.apache.xerces.util.DOMUtil;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * SubCollection represents a subset of index, you can define url patterns that
+ * will indicate that particular page (url) is part of SubCollection.
+ */
+public class Subcollection extends Configured implements URLFilter {
+
+  public static final String TAG_COLLECTIONS = "subcollections";
+  public static final String TAG_COLLECTION = "subcollection";
+  public static final String TAG_WHITELIST = "whitelist";
+  public static final String TAG_BLACKLIST = "blacklist";
+  public static final String TAG_NAME = "name";
+  public static final String TAG_KEY = "key";
+  public static final String TAG_ID = "id";
+
+  List<String> blackList = new ArrayList<String>();
+  List<String> whiteList = new ArrayList<String>();
+
+  /**
+   * SubCollection identifier
+   */
+  String id;
+
+  /**
+   * SubCollection key
+   */
+  String key;
+
+  /**
+   * SubCollection name
+   */
+  String name;
+
+  /**
+   * SubCollection whitelist as String
+   */
+  String wlString;
+
+  /**
+   * SubCollection blacklist as String
+   */
+  String blString;
+
+  /**
+   * public Constructor
+   * 
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
+   */
+  public Subcollection(String id, String name, Configuration conf) {
+    this(id, name, null, conf);
+  }
+
+  /**
+   * public Constructor
+   * 
+   * @param id
+   *          id of SubCollection
+   * @param name
+   *          name of SubCollection
+   */
+  public Subcollection(String id, String name, String key, Configuration conf) {
+    this(conf);
+    this.id = id;
+    this.key = key;
+    this.name = name;
+  }
+
+  public Subcollection(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @return Returns the name
+   */
+  public String getName() {
+    return name;
+  }
+
+  /**
+   * @return Returns the key
+   */
+  public String getKey() {
+    return key;
+  }
+
+  /**
+   * @return Returns the id
+   */
+  public String getId() {
+    return id;
+  }
+
+  /**
+   * Returns whitelist
+   * 
+   * @return Whitelist entries
+   */
+  public List<String> getWhiteList() {
+    return whiteList;
+  }
+
+  /**
+   * Returns whitelist String
+   * 
+   * @return Whitelist String
+   */
+  public String getWhiteListString() {
+    return wlString;
+  }
+
+  /**
+   * Returns blacklist String
+   * 
+   * @return Blacklist String
+   */
+  public String getBlackListString() {
+    return blString;
+  }
+
+  /**
+   * @param whiteList
+   *          The whiteList to set.
+   */
+  public void setWhiteList(ArrayList<String> whiteList) {
+    this.whiteList = whiteList;
+  }
+
+  /**
+   * Simple "indexOf" currentFilter for matching patterns.
+   * 
+   * <pre>
+   *  rules for evaluation are as follows:
+   *  1. if pattern matches in blacklist then url is rejected
+   *  2. if pattern matches in whitelist then url is allowed
+   *  3. url is rejected
+   * </pre>
+   * 
+   * @see org.apache.nutch.net.URLFilter#filter(java.lang.String)
+   */
+  public String filter(String urlString) {
+    // first the blacklist
+    Iterator<String> i = blackList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.contains(row))
+        return null;
+    }
+
+    // then whitelist
+    i = whiteList.iterator();
+    while (i.hasNext()) {
+      String row = (String) i.next();
+      if (urlString.contains(row))
+        return urlString;
+    }
+    return null;
+  }
+
+  /**
+   * Initialize Subcollection from dom element
+   * 
+   * @param collection
+   */
+  public void initialize(Element collection) {
+    this.id = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_ID).item(0)).trim();
+    this.name = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_NAME).item(0)).trim();
+    this.wlString = DOMUtil.getChildText(
+        collection.getElementsByTagName(TAG_WHITELIST).item(0)).trim();
+
+    parseList(this.whiteList, wlString);
+
+    // Check if there's a blacklist we need to parse
+    NodeList nodeList = collection.getElementsByTagName(TAG_BLACKLIST);
+    if (nodeList.getLength() > 0) {
+      this.blString = DOMUtil.getChildText(nodeList.item(0)).trim();
+      parseList(this.blackList, blString);
+    }
+
+    // Check if there's a key element or set default name
+    nodeList = collection.getElementsByTagName(TAG_KEY);
+    if (nodeList.getLength() == 1) {
+      this.key = DOMUtil.getChildText(nodeList.item(0)).trim();
+    }
+  }
+
+  /**
+   * Create a list of patterns from chunk of text, patterns are separated with
+   * newline
+   * 
+   * @param list
+   * @param text
+   */
+  protected void parseList(List<String> list, String text) {
+    list.clear();
+
+    StringTokenizer st = new StringTokenizer(text, "\n\r");
+
+    while (st.hasMoreElements()) {
+      String line = (String) st.nextElement();
+      list.add(line.trim());
+    }
+  }
+
+  /**
+   * Set contents of blacklist from String
+   * 
+   * @param list
+   *          the blacklist contents
+   */
+  public void setBlackList(String list) {
+    this.blString = list;
+    parseList(blackList, list);
+  }
+
+  /**
+   * Set contents of whitelist from String
+   * 
+   * @param list
+   *          the whitelist contents
+   */
+  public void setWhiteList(String list) {
+    this.wlString = list;
+    parseList(whiteList, list);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
new file mode 100644
index 0000000..be08d1c
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/collection/package.html
@@ -0,0 +1,36 @@
+<html>
+<body>
+<p>
+Subcollection is a subset of an index. Subcollections are defined
+by urlpatterns in form of white/blacklist. So to get the page into
+subcollection it must match the whitelist and not the blacklist.
+</p>
+<p>
+Subcollection definitions are read from a file subcollections.xml
+and the format is as follows (imagine here that you are crawling all
+the virtualhosts from apache.org and you wan't to tag pages with
+url pattern "http://lucene.apache.org/nutch" and http://wiki.apache.org/nutch/
+to be part of subcollection "nutch", this allows you to later search
+specifically from this subcollection)
+</p>
+<p/>
+<p/>
+<pre>
+&lt;?xml version="1.0" encoding="UTF-8"?>
+&lt;subcollections>
+	&lt;subcollection>
+		&lt;name>nutch&lt;/name>
+		&lt;id>lucene&lt;/id>
+		&lt;whitelist>http://lucene.apache.org/nutch&lt;/whitelist>
+		&lt;whitelist>http://wiki.apache.org/nutch/&lt;/whitelist>
+		&lt;blacklist />
+	&lt;/subcollection>
+&lt;/subcollections>
+</pre>
+</p>
+<p>Despite of this configuration you still can crawl any urls
+as long as they pass through your global url filters. (note that
+you must also seed your urls in normal nutch way)
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
new file mode 100644
index 0000000..2946d9e
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/SubcollectionIndexingFilter.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.subcollection;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.conf.Configured;
+import org.apache.hadoop.io.Text;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.NutchDocument;
+
+import org.apache.nutch.collection.CollectionManager;
+import org.apache.nutch.collection.Subcollection;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+
+public class SubcollectionIndexingFilter extends Configured implements
+    IndexingFilter {
+
+  private Configuration conf;
+
+  public SubcollectionIndexingFilter() {
+    super(NutchConfiguration.create());
+  }
+
+  public SubcollectionIndexingFilter(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * @param Configuration
+   *          conf
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    fieldName = conf.get("subcollection.default.fieldname", "subcollection");
+  }
+
+  /**
+   * @return Configuration
+   */
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  /**
+   * Doc field name
+   */
+  public static String fieldName = "subcollection";
+
+  /**
+   * Logger
+   */
+  public static final Logger LOG = LoggerFactory
+      .getLogger(SubcollectionIndexingFilter.class);
+
+  /**
+   * "Mark" document to be a part of subcollection
+   * 
+   * @param doc
+   * @param url
+   */
+  private void addSubCollectionField(NutchDocument doc, String url) {
+    for (Subcollection coll : CollectionManager.getCollectionManager(getConf())
+        .getSubCollections(url)) {
+      if (coll.getKey() == null) {
+        doc.add(fieldName, coll.getName());
+      } else {
+        doc.add(coll.getKey(), coll.getName());
+      }
+    }
+  }
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+    String sUrl = url.toString();
+    addSubCollectionField(doc, sUrl);
+    return doc;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
new file mode 100644
index 0000000..1c6ba72
--- /dev/null
+++ b/nutch-plugins/subcollection/src/main/java/org/apache/nutch/indexer/subcollection/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Indexing filter to assign documents to subcollections.
+ * The field "subcollection" is added and filled with a collection name
+ * defined in a configuration file and selected by pattern, see
+ * {@link org.apache.nutch.collection}.
+ */
+package org.apache.nutch.indexer.subcollection;
+

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java b/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
new file mode 100644
index 0000000..a2d2772
--- /dev/null
+++ b/nutch-plugins/subcollection/src/test/org/apache/nutch/collection/TestSubcollection.java
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.collection;
+
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Collection;
+
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestSubcollection {
+
+  /**
+   * Test filtering logic
+   * 
+   * @throws Exception
+   */
+  @Test
+  public void testFilter() throws Exception {
+    Subcollection sc = new Subcollection(NutchConfiguration.create());
+    sc.setWhiteList("www.nutch.org\nwww.apache.org");
+    sc.setBlackList("jpg\nwww.apache.org/zecret/");
+
+    // matches whitelist
+    Assert.assertEquals("http://www.apache.org/index.html",
+        sc.filter("http://www.apache.org/index.html"));
+
+    // matches blacklist
+    Assert.assertEquals(null,
+        sc.filter("http://www.apache.org/zecret/index.html"));
+    Assert.assertEquals(null, sc.filter("http://www.apache.org/img/image.jpg"));
+
+    // no match
+    Assert.assertEquals(null, sc.filter("http://www.google.com/"));
+  }
+
+  @Test
+  public void testInput() {
+    StringBuffer xml = new StringBuffer();
+    xml.append("<?xml version=\"1.0\" encoding=\"UTF-8\"?>");
+    xml.append("<!-- just a comment -->");
+    xml.append("<subcollections>");
+    xml.append("<subcollection>");
+    xml.append("<name>nutch collection</name>");
+    xml.append("<id>nutch</id>");
+    xml.append("<whitelist>");
+    xml.append("http://lucene.apache.org/nutch/\n");
+    xml.append("http://wiki.apache.org/nutch/\n");
+    xml.append("</whitelist>");
+    xml.append("<blacklist>");
+    xml.append("http://www.xxx.yyy\n");
+    xml.append("</blacklist>");
+    xml.append("</subcollection>");
+    xml.append("</subcollections>");
+
+    InputStream is = new ByteArrayInputStream(xml.toString().getBytes());
+
+    CollectionManager cm = new CollectionManager();
+    cm.parse(is);
+
+    Collection<?> c = cm.getAll();
+
+    // test that size matches
+    Assert.assertEquals(1, c.size());
+
+    Subcollection collection = (Subcollection) c.toArray()[0];
+
+    // test collection id
+    Assert.assertEquals("nutch", collection.getId());
+
+    // test collection name
+    Assert.assertEquals("nutch collection", collection.getName());
+
+    // test whitelist
+    Assert.assertEquals(2, collection.whiteList.size());
+
+    String wlUrl = (String) collection.whiteList.get(0);
+    Assert.assertEquals("http://lucene.apache.org/nutch/", wlUrl);
+
+    wlUrl = (String) collection.whiteList.get(1);
+    Assert.assertEquals("http://wiki.apache.org/nutch/", wlUrl);
+
+    // matches whitelist
+    Assert.assertEquals("http://lucene.apache.org/nutch/",
+        collection.filter("http://lucene.apache.org/nutch/"));
+
+    // test blacklist
+    Assert.assertEquals(1, collection.blackList.size());
+
+    String blUrl = (String) collection.blackList.get(0);
+    Assert.assertEquals("http://www.xxx.yyy", blUrl);
+
+    // no match
+    Assert.assertEquals(null, collection.filter("http://www.google.com/"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/build.xml b/nutch-plugins/tld/build.xml
new file mode 100644
index 0000000..f46c8e6
--- /dev/null
+++ b/nutch-plugins/tld/build.xml
@@ -0,0 +1,22 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="tld" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/ivy.xml b/nutch-plugins/tld/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/tld/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/plugin.xml b/nutch-plugins/tld/plugin.xml
new file mode 100644
index 0000000..712a34a
--- /dev/null
+++ b/nutch-plugins/tld/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="tld"
+   name="Top Level Domain Plugin"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+
+   <runtime>
+      <library name="tld.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.indexer.tld"
+              name="Top Level Domain Indexing Filter"
+              point="org.apache.nutch.indexer.IndexingFilter">
+      <implementation id="TLDIndexingFilter"
+                      class="org.apache.nutch.indexer.tld.TLDIndexingFilter"/>
+   </extension>
+
+   <extension id="org.apache.nutch.scoring.tld"
+              name="Top Level Domain Scoring Filter"
+              point="org.apache.nutch.scoring.ScoringFilter">
+
+      <implementation id="org.apache.nutch.scoring.tld.TLDScoringFilter"
+                      class="org.apache.nutch.scoring.tld.TLDScoringFilter" />
+   </extension>
+
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/pom.xml b/nutch-plugins/tld/pom.xml
new file mode 100644
index 0000000..95039bd
--- /dev/null
+++ b/nutch-plugins/tld/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>tld</artifactId>
+    <packaging>jar</packaging>
+
+    <name>tld</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
new file mode 100644
index 0000000..cd7e194
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/TLDIndexingFilter.java
@@ -0,0 +1,69 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.indexer.tld;
+
+import java.net.URL;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.IndexingException;
+import org.apache.nutch.indexer.IndexingFilter;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * Adds the Top level domain extensions to the index
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDIndexingFilter implements IndexingFilter {
+  public static final Logger LOG = LoggerFactory
+      .getLogger(TLDIndexingFilter.class);
+
+  private Configuration conf;
+
+  public NutchDocument filter(NutchDocument doc, Parse parse, Text urlText,
+      CrawlDatum datum, Inlinks inlinks) throws IndexingException {
+
+    try {
+      URL url = new URL(urlText.toString());
+      DomainSuffix d = URLUtil.getDomainSuffix(url);
+
+      doc.add("tld", d.getDomain());
+
+    } catch (Exception ex) {
+      LOG.warn(ex.toString());
+    }
+
+    return doc;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
new file mode 100644
index 0000000..75841d9
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/indexer/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Indexing plugin.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
new file mode 100644
index 0000000..b7f4963
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/TLDScoringFilter.java
@@ -0,0 +1,114 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.scoring.tld;
+
+import java.util.List;
+import java.util.Collection;
+import java.util.Map.Entry;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.crawl.Inlinks;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.indexer.NutchField;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.scoring.ScoringFilter;
+import org.apache.nutch.scoring.ScoringFilterException;
+import org.apache.nutch.util.domain.DomainSuffix;
+import org.apache.nutch.util.domain.DomainSuffixes;
+
+/**
+ * Scoring filter to boost tlds.
+ * 
+ * @author Enis Soztutar &lt;enis.soz.nutch@gmail.com&gt;
+ */
+public class TLDScoringFilter implements ScoringFilter {
+
+  private Configuration conf;
+  private DomainSuffixes tldEntries;
+
+  public TLDScoringFilter() {
+    tldEntries = DomainSuffixes.getInstance();
+  }
+
+  public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum,
+      CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore)
+      throws ScoringFilterException {
+
+    NutchField tlds = doc.getField("tld");
+    float boost = 1.0f;
+
+    if (tlds != null) {
+      for (Object tld : tlds.getValues()) {
+        DomainSuffix entry = tldEntries.get(tld.toString());
+        if (entry != null)
+          boost *= entry.getBoost();
+      }
+    }
+    return initScore * boost;
+  }
+
+  public CrawlDatum distributeScoreToOutlink(Text fromUrl, Text toUrl,
+      ParseData parseData, CrawlDatum target, CrawlDatum adjust, int allCount,
+      int validCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+  public float generatorSortValue(Text url, CrawlDatum datum, float initSort)
+      throws ScoringFilterException {
+    return initSort;
+  }
+
+  public void initialScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void injectedScore(Text url, CrawlDatum datum)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreAfterParsing(Text url, Content content, Parse parse)
+      throws ScoringFilterException {
+  }
+
+  public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content)
+      throws ScoringFilterException {
+  }
+
+  public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum,
+      List<CrawlDatum> inlinked) throws ScoringFilterException {
+  }
+
+  public Configuration getConf() {
+    return conf;
+  }
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+  }
+
+  public CrawlDatum distributeScoreToOutlinks(Text fromUrl,
+      ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets,
+      CrawlDatum adjust, int allCount) throws ScoringFilterException {
+    return adjust;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
new file mode 100644
index 0000000..d05e4b8
--- /dev/null
+++ b/nutch-plugins/tld/src/main/java/org/apache/nutch/scoring/tld/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Top Level Domain Scoring plugin.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/build.xml b/nutch-plugins/urlfilter-automaton/build.xml
new file mode 100644
index 0000000..78557fc
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/build.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-automaton" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-regex-filter/*.jar" />
+    </fileset>
+    <pathelement location="${nutch.root}/build/lib-regex-filter/test"/>
+  </path>
+
+  <!-- Compile test classes for dependencies -->
+  <target name="deps-test-compile">
+    <ant target="compile-test" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- Deploy Unit test dependencies -->
+  <target name="deps-test">
+    <ant target="deploy" inheritall="false" dir="../lib-regex-filter"/>
+  </target>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="sample" includes="**/*.rules, **/*.urls"/>
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/ivy.xml b/nutch-plugins/urlfilter-automaton/ivy.xml
new file mode 100644
index 0000000..7c1968f
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/ivy.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+    <dependency org="dk.brics.automaton" name="automaton" rev="1.11-8" conf="*->default" />
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/plugin.xml b/nutch-plugins/urlfilter-automaton/plugin.xml
new file mode 100644
index 0000000..d0cc1ef
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-automaton"
+   name="Automaton URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-automaton.jar">
+         <export name="*"/>
+      </library>
+      <library name="automaton-1.11-8.jar"/>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+      <import plugin="lib-regex-filter"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.automaton"
+              name="Nutch Automaton URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="AutomatonURLFilter"
+                      class="org.apache.nutch.urlfilter.automaton.AutomatonURLFilter"/>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/pom.xml b/nutch-plugins/urlfilter-automaton/pom.xml
new file mode 100644
index 0000000..14a2d07
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/pom.xml
@@ -0,0 +1,50 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-automaton</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-automaton</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+    <dependencies>
+        <dependency>
+            <groupId>dk.brics.automaton</groupId>
+            <artifactId>automaton</artifactId>
+            <version>1.11-8</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-regex-filter</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
new file mode 100644
index 0000000..a2f6da0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.rules
@@ -0,0 +1,26 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# skip .fr .org and .net domains
+-.*//.*\.fr/.*
+-.*//.*\.org/.*
+-.*//.*\.net/.*
+
+# skip everything else
++.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
new file mode 100644
index 0000000..40bf4ee
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/Benchmarks.urls
@@ -0,0 +1,297 @@
++http://www.hostip.info/
+-http://www.elanceur.org/Articles/OntologieSurfaite.html
++http://www.opensymphony.com/quartz/
+-http://www.portletbridge.org/saxbenchmark/index.html
++http://www.lesmotsdelinfo.com/
++http://usefulinc.com/doap/
++http://www.codezoo.com/
++http://search.infocious.com/
+-http://pedagogie.ac-montpellier.fr/disciplines/anglais/tice/sms.html
++http://www.brics.dk/%7Eamoeller/automaton/
++http://jazzz.com/wp.html
++http://www.maxkiesler.com/index.php
++http://adscriptum.blogspot.com/2006/03/google-et-la-prsentation-deric-schmidt.html
++http://www.alias-i.com/lingpipe/
+-http://johnny.ihackstuff.com/index.php?module=prodreviews
+-http://www.spurl.net/
++http://www.dropload.com/
++http://vivisimo.com/
++http://www.marumushi.com/apps/newsmap/newsmap.cfm
++http://www.ixquick.com/
+-http://today.java.net/pub/a/today/2003/07/30/LuceneIntro.html
++http://www.mail-archive.com/
++http://www.spymac.com/
+-http://browsers.evolt.org/
+-http://www.oswd.org/
++http://www.stayinvisible.com/index.pl
++http://java.sun.com/j2se/1.4.2/docs/api/index.html
++http://www.microsoft.com/resources/documentation/windows/xp/all/proddocs/en-us/ntcmds.mspx
++http://www.bloglines.com/
+-http://www.fckeditor.net/
++http://search.msn.com/
+-http://www.grub.org/
++http://www.xml.com/pub/a/2000/11/29/schemas/part1.html
+-http://www.mnot.net/cache_docs/
+-http://www.furl.net/
++http://www.blogpulse.com/
++http://www.googlefight.com/
++http://www.rokulabs.com/
+-http://mightylegends.zapto.org/dvd/dvdauthor_howto.php
+-http://www.batbox.org/wrt54g-linux.html
+-http://en.wikipedia.org/wiki/%s
++http://www.sipcenter.com/
++http://www.merriampark.com/ld.htm
++http://anon.inf.tu-dresden.de/index_en.html
++http://www.pluck.com/
++http://www.tiddlywiki.com/
++http://www.jux2.com/
++http://clusty.com/
+-http://findability.org/
++http://www.searchengineshowdown.com/
++http://www.nhacks.com/email/index.php
++http://www.koders.com/
++http://www.cs.rochester.edu/sosp2003/papers/p125-ghemawat.pdf
++http://www.gmailwiki.com/index.php/Main_Page
++http://www.tadalist.com/
++http://www.net2ftp.com/
++http://www.streamload.com/
++http://www.lucazappa.com/brilliantMaker/buttonImage.php
++http://www.hybernaut.com/bdv/delicious-import.html
++http://www.gtmcknight.com/buttons/
++http://amb.vis.ne.jp/mozilla/scrapbook/
++http://g-metrics.com/index.php
+-http://tor.eff.org/
++http://www.search-this.com/search_engine_decoder.asp
++http://www.onjava.com/pub/a/onjava/2005/01/26/classloading.html
++http://www.adaptivepath.com/publications/essays/archives/000385.php
+-http://isnoop.net/gmail/
+-http://openweb.eu.org/
++http://www.mistergooddeal.com/
++http://javatoolbox.com/
+-http://www.freenews.fr/
++http://www.wikiwax.com/
+-http://today.java.net/pub/a/today/2005/04/21/farm.html
++http://users.skynet.be/J.Beever/pave.htm
++http://www.lundi8h.com/
++http://www.snap.com/
++http://www.goosee.com/puppy/index.shtml
+-http://www.softwarefreedom.org/index.html
+-http://y.20q.net/
++http://www.bitty.com/
++http://www.lafraise.com/
+-http://www.liquidinformation.org/
++http://www.searchtools.com/
++http://www.martinfowler.com/articles/injection.html
++http://pdos.csail.mit.edu/scigen/
+-http://developer.yahoo.net/blog/
++http://blogger-templates.blogspot.com/
++http://phpadsnew.com/two/
++http://www.langreiter.com/exec/yahoo-vs-google.html
+-http://www.dataparksearch.org/
+-http://www.yubnub.org/
+-http://www.fing.org/
+-http://www.swish-e.org/
+-http://www.openajax.net/wordpress/
++http://crypto.stanford.edu/PwdHash/
++http://www.html-kit.com/favicon/
+-http://today.java.net/pub/a/today/2005/08/09/didyoumean.html?page=1
++http://www.durhamtownship.com/
++http://jiwire.com/
++http://www.insilmaril.de/vym/
+-http://www.spreadshirt.net/
++http://www.goffice.com/
++http://www.writely.com/
++http://www.milindparikh.com/
++http://www.onjava.com/pub/a/onjava/2005/02/02/bitsets.html
++http://www.wikyblog.com/Map/Guest/Home
+-http://www.kottke.org/05/08/googleos-webos
++http://www.rollyo.com/
++http://www.meebo.com/
++http://www.factbites.com/
++http://www.placeopedia.com/
++http://swoogle.umbc.edu/
++http://www.viaduc.com/
+-http://demo.wikiwyg.net/wikiwyg/demo/standalone/
++http://podcasts.yahoo.com/
+-http://beaglewiki.org/Main_Page
++http://yq.search.yahoo.com/
+-http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html?page=1
++http://www.onlamp.com/pub/a/onlamp/2005/10/13/what_is_rails.html
++http://socialight.com/
++http://www.lexxe.com/
++http://www.xom.nu/
++http://www.turboprint.de/
++http://www.whatdoesthatmean.com/index.php/Welcome_to_%27Whatdoesthatmean%3F%27
++http://www.wi-fiplanet.com/tutorials/article.php/3562391
++http://particletree.com/features/10-tips-to-a-better-form/
++http://www.songbirdnest.com/
+-http://www.w3.org/Talks/Tools/Slidy/
+-http://www.compassframework.org/display/SITE/Home
++http://motrech.blogspot.com/
++http://www.moteurzine.com/
++http://www.mex-search.com/
+-http://beta.previewseek.com/?mdc=y&amp;twin=n&amp;ilang=french
++http://www.goshme.com/
++http://rialto.application-servers.com/
++http://www.multe-pass.com/
++http://www.tailrank.com/
++http://www.vandertramp.com/INTERNETDOWN/
++http://www.letterjames.de/index.html
++http://code.google.com/index.html
++http://www.kritx.com/
++http://performancing.com/firefox
++http://www.mywebsearch.com/
+-http://en.wikibooks.org/w/index.php?title=Wikimania05/IM1
++http://www.lukew.com/resources/articles/blogs2.asp
+-http://www.hyperwords.net/
++http://ajax.parish.ath.cx/translator/
++http://www.maplandia.com/
+-http://www.tbray.org/ongoing/When/200x/2006/01/08/No-New-XML-Languages
++http://onefeed.com/index.php
++http://www.file-swap.com/
+-http://opennlp.org/
++http://mindprod.com/jgloss/encoding.html
++http://code.google.com/webstats/index.html
++http://www.freeweb-hosting.com/google_pagerank_pr_checker/
+-http://www.framakey.org/
+-http://microformats.org/wiki/hreview
+-http://www.ashesandsnow.org/index2.html
+-http://uima-framework.sourceforge.net/
++http://sethgodin.typepad.com/seths_blog/2006/01/flipping_the_fu.html
+-http://www.anandtech.com/IT/showdoc.aspx?i=2523&amp;p=2
++http://fr.techcrunch.com/
+-http://developer.yahoo.net/yui/
++http://www.fredrikodman.com/
++http://www.mpirical.com/companion/mpirical_companion.html
++http://www.onjava.com/pub/a/onjava/2005/08/03/drools.html
+-http://k9copy.free.fr/
+-http://lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
+-http://www.tbray.org/ongoing/When/200x/2006/01/09/On-XML-Language-Design
+-http://lespetitescases.net/structurer-decrire-et-organiser-l-information-2
++http://blogokat.canalblog.com/archives/2005/11/02/882454.html
++http://robur.slu.se/jensl/xmlclitools/
+-http://www.internetactu.net/?p=6291
+-http://www.xml.com/pub/a/2005/10/19/microformats-and-web-2.0.html?page=1
++http://www.memodata.com/2004/fr/alexandria/
+-http://presse-citron.net/?2006/01/23/654-joomla-pete-grave
++http://www.randomerror.com/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/techniques-for-determining-the-location-on-umts-networks/
+-http://fr.newsgator.com/ngs/subscriber/WebEd2.aspx?fid=368395
+-http://interstices.info/display.jsp?id=c_15918
++http://www.tech-invite.com/
++http://www.croczilla.com/zap
+-http://www.libervis.com/modules/wordpress/?p=13
++http://www.searchmorph.com/wp/2005/07/19/recent-discovery-clickfraud-tools/
+-http://savoirscdi.cndp.fr/CulturePro/actualisation/Serres/Serres.htm
++http://www.influo.com/
++http://www.dsi-info.ca/chroniques/chroniques-recherche-web.html
+-http://www.addnb.org/fr/docs/webinvisible.htm
+-http://manhack.net/
+-http://www.jibaku.net/
++http://www.pipologie.com/
++http://christophenoel.blogspot.com/
+-http://www.seekport.fr/seekbot/
++http://beta.exalead.com/
+-http://www.boolgum.fr/index.html
++http://www.kesako.canalblog.com/
++http://loran.blogspot.com/
++http://outils-recherche.blogspot.com/
++http://www.art-dept.com/artists/giacobbe/
++http://www.meggould.netfirms.com/site_seeingIII.htm
++http://www.freedpi.com/
++http://www.frenchfred.com/
++http://www.photoways.com/
+-http://freco.free.fr/index.htm
+-http://triturages.free.fr/index.htm
+-http://www.qsos.org/
++http://www.alvis.info/alvis/
++http://www.i-cherubini.it/mauro/blog/2005/12/16/open-source-information-retrieval-systems/
+-http://www.shinux.org/
++http://www.linuxlinks.com/Distributions/Mini_Distributions/index.shtml
++http://www.kurobox.com/online/tiki-index.php
+-http://news.gmane.org/gmane.comp.misc.linkstation.linux
++http://www.imsbook.com/SIP-IMS-Standards-List.html
+-http://incubator.apache.org/directory/subprojects/snickers/
+-http://www.mozilla.org/projects/security/pki/jss/javadoc/org/mozilla/jss/asn1/package-summary.html
+-http://sourceforge.net/projects/cryptix-asn1/
+-http://sourceforge.net/projects/basn/
+-http://asn1.elibel.tm.fr/fr/index.htm
+-http://sourceforge.net/projects/a2j/
++http://www.degrouptest.com/
++http://interstices.info/
++http://louvre-boite.viabloga.com/news/18.shtml
+-http://tel.ccsd.cnrs.fr/documents/archives0/00/00/62/60/index_fr.html
++http://poiplace.oabsoftware.nl/
+-http://www.gpspassion.com/forumsen/topic.asp?TOPIC_ID=7759
+-http://www.yoono.com/favorites.jsp?user-id=lquerel
+-http://www.librecours.org/cgi-bin/main
+-http://www.onjava.com/pub/a/onjava/2006/01/18/using-lucene-to-search-java-source.html?page=1
+-http://limo.sourceforge.net/
++http://www-scf.usc.edu/%7Emattmann/
++http://spaces.msn.com/members/famillezen/
+-http://photos.joune.org/
+-http://www.canon.fr/paperart/
++http://flash.eastweb.ru/files/20051024092150.swf
++http://www.xsltwiki.com/index.php/Main_Page
++http://www.i-cherubini.it/mauro/blog/2005/12/08/software-that-goes-on-a-stick/
+-http://www.webrankinfo.com/forums/forum_15.htm?sid=307384cdbce813aa19ba017513cbbc31
++http://www.loiclemeur.com/france/2006/01/eric_tenin_se_f.html
+-http://member.openmobilealliance.org/ftp/Public_documents/MCC/2005/
++http://www.aeliosfinance.com/
++http://www.capital-it.com/
+-http://www.tradedoubler.fr/pan/public/solutions/publisher
+-http://www.recherche.gouv.fr/technologie/concours/2006/index.htm
++http://www.techcrunch.com/2005/12/21/gravee-takes-a-new-approach-to-search/
++http://wanabo.com/
+-http://www.lespetitescases.net/structurer-decrire-et-organiser-l-information-1
+-http://presse-citron.net/?2006/02/07/705-joue-la-comme-stickam
++http://aeliosfinance.com/
++http://www.centreincubation.com/
++http://www.franceincubation.com/
+-http://www.oseo.fr/
++http://www.i18nfaq.com/chardet.html
+-http://cpdetector.sourceforge.net/
++http://www.jeremi.info/index.php/2005/07/21/7-introduction-aux-methodes-agiles
++http://chezlorry.ca/Accueil.htm
++http://cetnia.blogs.com/d_lires/
+-http://www.directwine.fr/
++http://www.new-phenix.com/
+-http://upnp.sourceforge.net/
+-http://www.pixmania.fr/
+-http://www.lespetitescases.net/comment-organiser-l-information-pour-y-naviguer-efficacement-3
++http://www.i-cherubini.it/mauro/blog/2006/01/25/kwmap-a-keyword-search-visualization-tool/
++http://www.stepnewz.com/sn/default.asp
++http://opquast.com/
+-http://www.freeplayer.org/
+-http://www.cafe-clope.net/orangeamere/index.php/2005/08/24/5-le-modele-contributif-une-utopie
+-http://atomcomputer.free.fr/fbox/
+-http://www.internetactu.net/index.php?p=6100
+-http://mammouthland.free.fr/cours/css/genecss.php
+-http://www.xml.com/pub/a/2006/02/01/doing-http-caching-right-introducing-httplib2.html?page=1
++http://www-106.ibm.com/developerworks/xml/library/x-xapi.html
+-http://xml.apache.org/xalan-j/extensions.html
++http://developers.sun.com/foryourbusiness/jcc/
++http://blogs.sun.com/roller/page/roumen/Weblog
+-http://www.onjava.com/pub/a/onjava/2005/10/12/diagnostic-tests-with-ant.html?page=1
+-http://blog.developpez.com/index.php?blog=51&amp;p=1389&amp;more=1&amp;c=1&amp;tb=1&amp;pb=1
++http://dcabasson.developpez.com/articles/javascript/ajax/ajax-autocompletion-pas-a-pas/
++http://odur.let.rug.nl/%7Evannoord/
+-http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
+-http://artist.inist.fr/
++http://www.elra.info/
+-http://beinecke.library.yale.edu/dl_crosscollex/SearchExecXC.asp?srchtype=CNO
++http://www.i-cherubini.it/mauro/blog/2005/12/13/information-retrieval-system-evaluation-effort-sensitivity-and-reliability
++http://www.i-cherubini.it/mauro/blog/2005/12/13/trec-datasets-text-retrieval-conference-datasets-for-information-retrieval
++http://www.i-cherubini.it/mauro/blog/2005/12/12/focused-crawling-using-context-graphs/
++http://www.i-cherubini.it/mauro/blog/2005/12/08/spam-filtering-using-contextual-network-graphs/
++http://www.cs.northwestern.edu/%7Evidya/semanticons/IconsWebPage/
++http://www.i-cherubini.it/mauro/blog/2006/01/05/social-information-retrieval/
++http://www.i-cherubini.it/mauro/blog/2006/01/04/an-introduction-to-random-indexing/
++http://dossierdoc.typepad.com/descripteurs/2006/01/liste_de_thsaur.html
+-http://www.lexique.org/
++http://www.i-cherubini.it/mauro/blog/2006/01/22/montylingua-a-commonsense-enriched-part-of-speech-tagger/
++http://www.streamium.com/products/mx6000i/
+-http://www.p4c.philips.com/cgi-bin/dcbint/cpindex.pl?ctn=MX6000I/22S&amp;scy=FR&amp;slg=fr
+-http://store.interact-tv.com/store/product_info.php?cPath=9&amp;products_id=73
++http://www.tversity.com/
+-http://www.aspseek.org/index.php
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
new file mode 100644
index 0000000..8966183
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.rules
@@ -0,0 +1,24 @@
+# The url filter file used by the crawl command.
+
+# Better for intranet crawling.
+# Be sure to change MY.DOMAIN.NAME to your domain name.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file:, ftp:, & mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe|png)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept hosts in MY.DOMAIN.NAME
++http://([a-z0-9]*\.)*MY.DOMAIN.NAME/.*
+
+# skip everything else
+-.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
new file mode 100644
index 0000000..b1ad9b7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/IntranetCrawling.urls
@@ -0,0 +1,8 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
+-news://any.news.server/comp.lang.java
+-whois:/nutch.org
++http://MY.DOMAIN.NAME/
++http://MY.DOMAIN.NAME/nutch
++http://www.MY.DOMAIN.NAME/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
new file mode 100644
index 0000000..dfae8b0
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.rules
@@ -0,0 +1,19 @@
+# The default url filter.
+# Better for whole-internet crawling.
+
+# Each non-comment, non-blank line contains a regular expression
+# prefixed by '+' or '-'.  The first matching pattern in the file
+# determines whether a URL is included or ignored.  If no pattern
+# matches, the URL is ignored.
+
+# skip file: ftp: and mailto: urls
+-(file|ftp|mailto):.*
+
+# skip image and other suffixes we can't yet parse
+-.*\.(gif|GIF|jpg|JPG|ico|ICO|css|sit|eps|wmf|zip|ppt|mpg|xls|gz|rpm|tgz|mov|MOV|exe)
+
+# skip URLs containing certain characters as probable queries, etc.
+-.*[?*!@=].*
+
+# accept anything else
++.*

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
new file mode 100644
index 0000000..d3b1bf3
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/sample/WholeWebCrawling.urls
@@ -0,0 +1,11 @@
+-file://home/jc/nutch/index.html
+-ftp://ftp.apache.org/nutch.html
+-mailto:jerome.charron@gmail.com
++news://any.news.server/comp.lang.java
++whois:/nutch.org
+-http://www.nutch.org/nutch.gif
+-http://www.nutch.org/nutch.eps
+-http://www.nutch.org/nutch?q=nutch
++http://www.nutch.org/
++http://www.nutch.org/abcd/foo/bar/foo/bar/foo/
++http://www.nutch.org/abcd/foo/bar/xyz/foo/bar/foo/

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
new file mode 100644
index 0000000..ae4896d
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/AutomatonURLFilter.java
@@ -0,0 +1,116 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.Reader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.regex.PatternSyntaxException;
+
+// Hadoop imports
+import org.apache.hadoop.conf.Configuration;
+
+// Automaton imports
+import dk.brics.automaton.RegExp;
+import dk.brics.automaton.RunAutomaton;
+import org.apache.nutch.net.*;
+import org.apache.nutch.urlfilter.api.RegexRule;
+import org.apache.nutch.urlfilter.api.RegexURLFilterBase;
+
+/**
+ * RegexURLFilterBase implementation based on the <a
+ * href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+ * Automata for Java<sup>TM</sup>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ * @see <a href="http://www.brics.dk/automaton/">dk.brics.automaton</a>
+ */
+public class AutomatonURLFilter extends RegexURLFilterBase {
+  public static final String URLFILTER_AUTOMATON_FILE = "urlfilter.automaton.file";
+  public static final String URLFILTER_AUTOMATON_RULES = "urlfilter.automaton.rules";
+
+  public AutomatonURLFilter() {
+    super();
+  }
+
+  public AutomatonURLFilter(String filename) throws IOException,
+      PatternSyntaxException {
+    super(filename);
+  }
+
+  AutomatonURLFilter(Reader reader) throws IOException,
+      IllegalArgumentException {
+    super(reader);
+  }
+
+  /*
+   * ----------------------------------- * <implementation:RegexURLFilterBase> *
+   * -----------------------------------
+   */
+
+  /**
+   * Rules specified as a config property will override rules specified as a
+   * config file.
+   */
+  protected Reader getRulesReader(Configuration conf) throws IOException {
+    String stringRules = conf.get(URLFILTER_AUTOMATON_RULES);
+    if (stringRules != null) {
+      return new StringReader(stringRules);
+    }
+    String fileRules = conf.get(URLFILTER_AUTOMATON_FILE);
+    return conf.getConfResourceAsReader(fileRules);
+  }
+
+  // Inherited Javadoc
+  protected RegexRule createRule(boolean sign, String regex) {
+    return new Rule(sign, regex);
+  }
+  
+  protected RegexRule createRule(boolean sign, String regex, String hostOrDomain) {
+    return new Rule(sign, regex, hostOrDomain);
+  }
+
+  /*
+   * ------------------------------------ * </implementation:RegexURLFilterBase>
+   * * ------------------------------------
+   */
+
+  public static void main(String args[]) throws IOException {
+    main(new AutomatonURLFilter(), args);
+  }
+
+  private class Rule extends RegexRule {
+
+    private RunAutomaton automaton;
+
+    Rule(boolean sign, String regex) {
+      super(sign, regex);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+    
+    Rule(boolean sign, String regex, String hostOrDomain) {
+      super(sign, regex, hostOrDomain);
+      automaton = new RunAutomaton(new RegExp(regex, RegExp.ALL).toAutomaton());
+    }
+
+    protected boolean match(String url) {
+      return automaton.run(url);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
new file mode 100644
index 0000000..42533f7
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/main/java/org/apache/nutch/urlfilter/automaton/package.html
@@ -0,0 +1,9 @@
+<html>
+<body>
+<p>
+URL filter plugin based on
+<a href="http://www.brics.dk/automaton/">dk.brics.automaton</a> Finite-State
+Automata for Java<sup>TM</sup>.
+</p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java b/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
new file mode 100644
index 0000000..a70a6b6
--- /dev/null
+++ b/nutch-plugins/urlfilter-automaton/src/test/org/apache/nutch/urlfilter/automaton/TestAutomatonURLFilter.java
@@ -0,0 +1,56 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.automaton;
+
+// JDK imports
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.nutch.net.*;
+// Nutch imports
+import org.apache.nutch.urlfilter.api.RegexURLFilterBaseTest;
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * JUnit based test of class <code>AutomatonURLFilter</code>.
+ * 
+ * @author J&eacute;r&ocirc;me Charron
+ */
+public class TestAutomatonURLFilter extends RegexURLFilterBaseTest {
+
+  protected URLFilter getURLFilter(Reader rules) {
+    try {
+      return new AutomatonURLFilter(rules);
+    } catch (IOException e) {
+      Assert.fail(e.toString());
+      return null;
+    }
+  }
+
+  @Test
+  public void test() {
+    test("WholeWebCrawling");
+    test("IntranetCrawling");
+    bench(50, "Benchmarks");
+    bench(100, "Benchmarks");
+    bench(200, "Benchmarks");
+    bench(400, "Benchmarks");
+    bench(800, "Benchmarks");
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/build.xml b/nutch-plugins/urlfilter-domain/build.xml
new file mode 100644
index 0000000..4af55ac
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/build.xml
@@ -0,0 +1,28 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="urlfilter-domain" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- for junit test -->
+  <mkdir dir="${build.test}/data"/>
+  <copy todir="${build.test}/data">
+    <fileset dir="data" />
+  </copy>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/data/hosts.txt
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/data/hosts.txt b/nutch-plugins/urlfilter-domain/data/hosts.txt
new file mode 100644
index 0000000..2b88c3b
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/data/hosts.txt
@@ -0,0 +1,5 @@
+# comments start with the pound sign
+net
+apache.org
+be
+www.yahoo.com
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/ivy.xml b/nutch-plugins/urlfilter-domain/ivy.xml
new file mode 100644
index 0000000..1a86d68
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/ivy.xml
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/plugin.xml b/nutch-plugins/urlfilter-domain/plugin.xml
new file mode 100644
index 0000000..1452d58
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/plugin.xml
@@ -0,0 +1,43 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="urlfilter-domain"
+   name="Domain URL Filter"
+   version="1.0.0"
+   provider-name="nutch.org">
+
+   <runtime>
+      <library name="urlfilter-domain.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <requires>
+      <import plugin="nutch-extensionpoints"/>
+   </requires>
+
+   <extension id="org.apache.nutch.net.urlfilter.domain"
+              name="Nutch Domain URL Filter"
+              point="org.apache.nutch.net.URLFilter">
+      <implementation id="DomainURLFilter"
+        class="org.apache.nutch.urlfilter.domain.DomainURLFilter">
+        <parameter name="file" value="domain-urlfilter.txt"/>
+      </implementation>
+   </extension>
+
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/pom.xml b/nutch-plugins/urlfilter-domain/pom.xml
new file mode 100644
index 0000000..0c9dddd
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/pom.xml
@@ -0,0 +1,38 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>urlfilter-domain</artifactId>
+    <packaging>jar</packaging>
+
+    <name>urlfilter-domain</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
new file mode 100644
index 0000000..821d944
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/DomainURLFilter.java
@@ -0,0 +1,212 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.urlfilter.domain;
+
+import java.io.BufferedReader;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.apache.commons.lang.StringUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.net.URLFilter;
+import org.apache.nutch.plugin.Extension;
+import org.apache.nutch.plugin.PluginRepository;
+import org.apache.nutch.util.URLUtil;
+import org.apache.nutch.util.domain.DomainSuffix;
+
+/**
+ * <p>
+ * Filters URLs based on a file containing domain suffixes, domain names, and
+ * hostnames. Only a url that matches one of the suffixes, domains, or hosts
+ * present in the file is allowed.
+ * </p>
+ * 
+ * <p>
+ * Urls are checked in order of domain suffix, domain name, and hostname against
+ * entries in the domain file. The domain file would be setup as follows with
+ * one entry per line:
+ * 
+ * <pre>
+ * com apache.org www.apache.org
+ * </pre>
+ * 
+ * <p>
+ * The first line is an example of a filter that would allow all .com domains.
+ * The second line allows all urls from apache.org and all of its subdomains
+ * such as lucene.apache.org and hadoop.apache.org. The third line would allow
+ * only urls from www.apache.org. There is no specific ordering to entries. The
+ * entries are from more general to more specific with the more general
+ * overridding the more specific.
+ * </p>
+ * 
+ * The domain file defaults to domain-urlfilter.txt in the classpath but can be
+ * overridden using the:
+ * 
+ * <ul>
+ * <ol>
+ * property "urlfilter.domain.file" in ./conf/nutch-*.xml, and
+ * </ol>
+ * <ol>
+ * attribute "file" in plugin.xml of this plugin
+ * </ol>
+ * </ul>
+ * 
+ * the attribute "file" has higher precedence if defined.
+ */
+public class DomainURLFilter implements URLFilter {
+
+  private static final Logger LOG = LoggerFactory
+      .getLogger(DomainURLFilter.class);
+
+  // read in attribute "file" of this plugin.
+  private static String attributeFile = null;
+  private Configuration conf;
+  private String domainFile = null;
+  private Set<String> domainSet = new LinkedHashSet<String>();
+
+  private void readConfiguration(Reader configReader) throws IOException {
+
+    // read the configuration file, line by line
+    BufferedReader reader = new BufferedReader(configReader);
+    String line = null;
+    while ((line = reader.readLine()) != null) {
+      if (StringUtils.isNotBlank(line) && !line.startsWith("#")) {
+        // add non-blank lines and non-commented lines
+        domainSet.add(StringUtils.lowerCase(line.trim()));
+      }
+    }
+  }
+
+  /**
+   * Default constructor.
+   */
+  public DomainURLFilter() {
+
+  }
+
+  /**
+   * Constructor that specifies the domain file to use.
+   * 
+   * @param domainFile
+   *          The domain file, overrides domain-urlfilter.text default.
+   * 
+   * @throws IOException
+   */
+  public DomainURLFilter(String domainFile) {
+    this.domainFile = domainFile;
+  }
+
+  /**
+   * Sets the configuration.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // get the extensions for domain urlfilter
+    String pluginName = "urlfilter-domain";
+    Extension[] extensions = PluginRepository.get(conf)
+        .getExtensionPoint(URLFilter.class.getName()).getExtensions();
+    for (int i = 0; i < extensions.length; i++) {
+      Extension extension = extensions[i];
+      if (extension.getDescriptor().getPluginId().equals(pluginName)) {
+        attributeFile = extension.getAttribute("file");
+        break;
+      }
+    }
+
+    // handle blank non empty input
+    if (attributeFile != null && attributeFile.trim().equals("")) {
+      attributeFile = null;
+    }
+
+    if (attributeFile != null) {
+      if (LOG.isInfoEnabled()) {
+        LOG.info("Attribute \"file\" is defined for plugin " + pluginName
+            + " as " + attributeFile);
+      }
+    } else {
+      if (LOG.isWarnEnabled()) {
+        LOG.warn("Attribute \"file\" is not defined in plugin.xml for plugin "
+            + pluginName);
+      }
+    }
+
+    // domain file and attribute "file" take precedence if defined
+    String file = conf.get("urlfilter.domain.file");
+    String stringRules = conf.get("urlfilter.domain.rules");
+    if (domainFile != null) {
+      file = domainFile;
+    } else if (attributeFile != null) {
+      file = attributeFile;
+    }
+    Reader reader = null;
+    if (stringRules != null) { // takes precedence over files
+      reader = new StringReader(stringRules);
+    } else {
+      reader = conf.getConfResourceAsReader(file);
+    }
+    try {
+      if (reader == null) {
+        reader = new FileReader(file);
+      }
+      readConfiguration(reader);
+    } catch (IOException e) {
+      LOG.error(org.apache.hadoop.util.StringUtils.stringifyException(e));
+    }
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public String filter(String url) {
+    // https://issues.apache.org/jira/browse/NUTCH-2189
+    if (domainSet.size() == 0) return url;
+    
+    try {
+      // match for suffix, domain, and host in that order. more general will
+      // override more specific
+      String domain = URLUtil.getDomainName(url).toLowerCase().trim();
+      String host = URLUtil.getHost(url);
+      String suffix = null;
+      DomainSuffix domainSuffix = URLUtil.getDomainSuffix(url);
+      if (domainSuffix != null) {
+        suffix = domainSuffix.getDomain();
+      }
+
+      if (domainSet.contains(suffix) || domainSet.contains(domain)
+          || domainSet.contains(host)) {
+        return url;
+      }
+
+      // doesn't match, don't allow
+      return null;
+    } catch (Exception e) {
+
+      // if an error happens, allow the url to pass
+      LOG.error("Could not apply filter on url: " + url + "\n"
+          + org.apache.hadoop.util.StringUtils.stringifyException(e));
+      return null;
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
new file mode 100644
index 0000000..d2eba1f
--- /dev/null
+++ b/nutch-plugins/urlfilter-domain/src/main/java/org/apache/nutch/urlfilter/domain/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * URL filter plugin to include only URLs which match an element in a given list of
+ * domain suffixes, domain names, and/or host names.
+ * See {@link org.apache.nutch.urlfilter.domainblacklist} for the counterpart
+ * (exclude URLs by host or domain).
+ */
+package org.apache.nutch.urlfilter.domain;
+


Mime
View raw message