nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jnio...@apache.org
Subject svn commit: r1099483 - in /nutch/branches/branch-1.3: ./ conf/ src/plugin/ src/plugin/parse-rss/ src/plugin/parse-tika/ src/plugin/parse-tika/sample/ src/plugin/parse-tika/src/test/org/apache/nutch/tika/
Date Wed, 04 May 2011 15:20:01 GMT
Author: jnioche
Date: Wed May  4 15:20:00 2011
New Revision: 1099483

URL: http://svn.apache.org/viewvc?rev=1099483&view=rev
Log:
NUTCH-888 : Remove parse-rss

Added:
    nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss
    nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
Removed:
    nutch/branches/branch-1.3/src/plugin/parse-rss/
Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/conf/parse-plugins.xml
    nutch/branches/branch-1.3/src/plugin/build.xml
    nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1099483&r1=1099482&r2=1099483&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Wed May  4 15:20:00 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - 4/21/2011
 
+* NUTCH-888 Remove parse-rss and add tests for rss to parse-tika (jnioche)
+
 * NUTCH-991 SolrDedup must issue a commit (markus)
 
 * NUTCH 986 SolrDedup fails due to date incorrect format (markus)

Modified: nutch/branches/branch-1.3/conf/parse-plugins.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/parse-plugins.xml?rev=1099483&r1=1099482&r2=1099483&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/parse-plugins.xml (original)
+++ nutch/branches/branch-1.3/conf/parse-plugins.xml Wed May  4 15:20:00 2011
@@ -27,9 +27,9 @@
 	<mimeType name="*">
 	  <plugin id="parse-tika" />
 	</mimeType>
-
+ 
 	<mimeType name="application/rss+xml">
-	    <plugin id="parse-rss" />
+	    <plugin id="parse-tika" />
 	    <plugin id="feed" />
 	</mimeType>
 
@@ -65,7 +65,6 @@
 
 	<mimeType name="text/xml">
 		<plugin id="parse-tika" />
-		<plugin id="parse-rss" />
 		<plugin id="feed" />
 	</mimeType>
 
@@ -88,8 +87,6 @@
 		<alias name="parse-html"
 			extension-id="org.apache.nutch.parse.html.HtmlParser" />
 		<alias name="parse-js" extension-id="JSParser" />
-		<alias name="parse-rss"
-			extension-id="org.apache.nutch.parse.rss.RSSParser" />
 		<alias name="feed"
 			extension-id="org.apache.nutch.parse.feed.FeedParser" />
 		<alias name="parse-swf"

Modified: nutch/branches/branch-1.3/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/build.xml?rev=1099483&r1=1099482&r2=1099483&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/build.xml Wed May  4 15:20:00 2011
@@ -45,7 +45,6 @@
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
-     <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-swf" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
      <ant dir="parse-zip" target="deploy"/>
@@ -77,7 +76,6 @@
      <ant dir="protocol-file" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
      <!--ant dir="parse-ext" target="test"/-->
-     <ant dir="parse-rss" target="test"/>
      <ant dir="feed" target="test"/>
      <ant dir="parse-html" target="test"/>
      <ant dir="parse-swf" target="test"/>
@@ -119,7 +117,6 @@
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>
-    <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>

Modified: nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml?rev=1099483&r1=1099482&r2=1099483&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml (original)
+++ nutch/branches/branch-1.3/src/plugin/parse-tika/build.xml Wed May  4 15:20:00 2011
@@ -29,6 +29,7 @@
   <mkdir dir="${build.test}/data"/>
   <copy todir="${build.test}/data">
     <fileset dir="sample">
+      <include name="*.rss"/>
       <include name="*.rtf"/>
       <include name="*.pdf"/>
       <include name="ootest.*"/>

Added: nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss?rev=1099483&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss (added)
+++ nutch/branches/branch-1.3/src/plugin/parse-tika/sample/rsstest.rss Wed May  4 15:20:00 2011
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

Added: nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java?rev=1099483&view=auto
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java (added)
+++ nutch/branches/branch-1.3/src/plugin/parse-tika/src/test/org/apache/nutch/tika/TestFeedParser.java Wed May  4 15:20:00 2011
@@ -0,0 +1,130 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.tika;
+
+import junit.framework.TestCase;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.parse.Outlink;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseData;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.tika.TikaParser;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+/**
+ * Test suite for parsing RSS feeds with the {@link TikaParser}: each sample
+ * feed is parsed through the "parse-tika" extension and its outlinks are
+ * checked.
+ * 
+ * @author mattmann / jnioche
+ */
+public class TestFeedParser extends TestCase {
+
+	private String fileSeparator = System.getProperty("file.separator");
+
+	// "test.data" is defined in ./src/plugin/build-plugin.xml; falls back to "."
+	private String sampleDir = System.getProperty("test.data", ".");
+
+	private String[] sampleFiles = { "rsstest.rss" };
+
+	public static final Log LOG = LogFactory.getLog(TestFeedParser.class
+			.getName());
+
+	/**
+	 * Default Constructor.
+	 * 
+	 * @param name
+	 *            The name of this {@link TestCase}.
+	 */
+	public TestFeedParser(String name) {
+		super(name);
+	}
+
+	/**
+	 * <p>
+	 * Parses every sample feed with parse-tika and makes 2 checks:
+	 * </p>
+	 * 
+	 * <ul>
+	 * <li>Exactly 2 outlinks are read from the sample rss file</li>
+	 * <li>Those 2 outlinks are the links of the two items in the sample
+	 * file</li>
+	 * </ul>
+	 */
+	public void testIt() throws ProtocolException, ParseException {
+		String urlString;
+		Protocol protocol;
+		Content content;
+		Parse parse;
+
+		Configuration conf = NutchConfiguration.create();
+		for (int i = 0; i < sampleFiles.length; i++) {
+			urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+			protocol = new ProtocolFactory(conf).getProtocol(urlString);
+			content = protocol.getProtocolOutput(new Text(urlString),
+					new CrawlDatum()).getContent();
+			parse = new ParseUtil(conf).parseByExtensionId("parse-tika",
+					content).get(content.getUrl());
+
+			// check that there are 2 outlinks: unlike the original
+			// parse-rss, tika ignores the URL and description of the
+			// channel, so http://test.channel.com/ is not expected
+
+			// expected outlinks (the two item links):
+			// http://www-scf.usc.edu/~mattmann/
+			// http://www.nutch.org/
+
+			ParseData theParseData = parse.getData();
+
+			Outlink[] theOutlinks = theParseData.getOutlinks();
+
+			assertTrue("There aren't 2 outlinks read!",
+					theOutlinks.length == 2);
+
+			// now check to make sure that those are the two outlinks
+			boolean hasLink1 = false, hasLink2 = false;
+
+			for (int j = 0; j < theOutlinks.length; j++) {
+				if (theOutlinks[j].getToUrl().equals(
+						"http://www-scf.usc.edu/~mattmann/")) {
+					hasLink1 = true;
+				}
+
+				if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+					hasLink2 = true;
+				}
+			}
+
+			if (!hasLink1 || !hasLink2) {
+				fail("Outlinks read from sample rss file are not correct!");
+			}
+		}
+	}
+
+}



Mime
View raw message