nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jnio...@apache.org
Subject svn commit: r1099585 - in /nutch/trunk: ./ conf/ src/plugin/ src/plugin/parse-rss/ src/plugin/parse-tika/ src/plugin/parse-tika/sample/ src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/
Date Wed, 04 May 2011 20:16:06 GMT
Author: jnioche
Date: Wed May  4 20:16:06 2011
New Revision: 1099585

URL: http://svn.apache.org/viewvc?rev=1099585&view=rev
Log:
NUTCH-888 : Remove parse-rss

Added:
    nutch/trunk/src/plugin/parse-tika/sample/rsstest.rss
    nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
Removed:
    nutch/trunk/src/plugin/parse-rss/
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/parse-plugins.xml
    nutch/trunk/src/plugin/build.xml
    nutch/trunk/src/plugin/parse-tika/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1099585&r1=1099584&r2=1099585&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed May  4 20:16:06 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 2.0 - Current Development
 
+* NUTCH-888 Remove parse-rss and add tests for rss to parse-tika (jnioche)
+
 * NUTCH-991 SolrDedup must issue a commit (markus)
 
 * NUTCH 986 SolrDedup fails due to date incorrect format (markus)

Modified: nutch/trunk/conf/parse-plugins.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/parse-plugins.xml?rev=1099585&r1=1099584&r2=1099585&view=diff
==============================================================================
--- nutch/trunk/conf/parse-plugins.xml (original)
+++ nutch/trunk/conf/parse-plugins.xml Wed May  4 20:16:06 2011
@@ -37,7 +37,7 @@
 	</mimeType>
 
 	<mimeType name="application/rss+xml">
-	    <plugin id="parse-rss" />
+	    <plugin id="parse-tika" />
 	    <plugin id="feed" />
 	</mimeType>
 
@@ -65,7 +65,6 @@
 
 	<mimeType name="text/xml">
 		<plugin id="parse-tika" />
-		<plugin id="parse-rss" />
 		<plugin id="feed" />
 	</mimeType>
 
@@ -88,8 +87,6 @@
 			extension-id="org.apache.nutch.parse.tika.TikaParser" />
 		<alias name="parse-ext" extension-id="ExtParser" />
 		<alias name="parse-js" extension-id="JSParser" />
-		<alias name="parse-rss"
-			extension-id="org.apache.nutch.parse.rss.RSSParser" />
 		<alias name="feed"
 			extension-id="org.apache.nutch.parse.feed.FeedParser" />
 		<alias name="parse-swf"

Modified: nutch/trunk/src/plugin/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1099585&r1=1099584&r2=1099585&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Wed May  4 20:16:06 2011
@@ -44,7 +44,6 @@
   	 <ant dir="protocol-sftp" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
-     <ant dir="parse-rss" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
      <ant dir="scoring-link" target="deploy"/>
      <ant dir="scoring-opic" target="deploy"/>
@@ -72,11 +71,9 @@
   <!-- ====================================================== -->
   <target name="test">
      <ant dir="creativecommons" target="test"/>
-     <ant dir="parse-rss" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="protocol-file" target="test"/>
      <ant dir="parse-html" target="test"/>
-     <ant dir="parse-rss" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="languageidentifier" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>
@@ -117,7 +114,6 @@
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>
-    <ant dir="parse-rss" target="clean"/>
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>

Modified: nutch/trunk/src/plugin/parse-tika/build.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/build.xml?rev=1099585&r1=1099584&r2=1099585&view=diff
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/build.xml (original)
+++ nutch/trunk/src/plugin/parse-tika/build.xml Wed May  4 20:16:06 2011
@@ -29,6 +29,7 @@
   <mkdir dir="${build.test}/data"/>
   <copy todir="${build.test}/data">
     <fileset dir="sample">
+      <include name="*.rss"/>
       <include name="*.rtf"/>
       <include name="*.pdf"/>
       <include name="ootest.*"/>

Added: nutch/trunk/src/plugin/parse-tika/sample/rsstest.rss
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/sample/rsstest.rss?rev=1099585&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/sample/rsstest.rss (added)
+++ nutch/trunk/src/plugin/parse-tika/sample/rsstest.rss Wed May  4 20:16:06 2011
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="ISO-8859-1" ?>
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one or more
+	contributor license agreements.  See the NOTICE file distributed with
+	this work for additional information regarding copyright ownership.
+	The ASF licenses this file to You under the Apache License, Version 2.0
+	(the "License"); you may not use this file except in compliance with
+	the License.  You may obtain a copy of the License at
+	
+	http://www.apache.org/licenses/LICENSE-2.0
+	
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+-->
+<rss version="0.91">
+    <channel>
+      <title>TestChannel</title>
+      <link>http://test.channel.com/</link> 
+      <description>Sample RSS File for Junit test</description> 
+      <language>en-us</language>
+      
+      <item>
+        <title>Home Page of Chris Mattmann</title>
+        <link>http://www-scf.usc.edu/~mattmann/</link>
+        <description>Chris Mattmann's home page</description>
+      </item>
+
+      <item>
+        <title>Awesome Open Source Search Engine</title> 
+        <link>http://www.nutch.org/</link> 
+        <description>Yup, that's what it is</description> 
+      </item>
+   </channel>
+</rss>

Added: nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java?rev=1099585&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
(added)
+++ nutch/trunk/src/plugin/parse-tika/src/test/org/apache/nutch/parse/tika/TestRSSParser.java
Wed May  4 20:16:06 2011
@@ -0,0 +1,140 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.tika;
+
+import java.io.DataInputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.ProtocolException;
+
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.parse.ParseException;
+import org.apache.nutch.parse.Outlink;
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.MimeUtil;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.apache.tika.mime.MimeType;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for the RSS Parser based on John Xing's TestPdfParser class.
+ * 
+ * @author mattmann
+ * @version 1.0
+ */
+public class TestRSSParser extends TestCase {
+
+  private String fileSeparator = System.getProperty("file.separator");
+
+  // This system property is defined in ./src/plugin/build-plugin.xml
+  private String sampleDir = System.getProperty("test.data", ".");
+
+  // Make sure sample files are copied to "test.data" as specified in
+  // ./src/plugin/parse-tika/build.xml during plugin compilation.
+
+  private String[] sampleFiles = { "rsstest.rss" };
+
+  /**
+   * <p>
+   * Default constructor
+   * </p>
+   * 
+   * @param name
+   *          The name of this TestRSSParser test case
+   */
+  public TestRSSParser(String name) {
+    super(name);
+  }
+
+  /**
+   * <p>
+   * The test method: tests out the following 2 asserts:
+   * </p>
+   * 
+   * <ul>
+   * <li>There are 2 outlinks read from the sample rss file</li>
+   * <li>The 2 outlinks read are in fact the correct outlinks from the sample
+   * file</li>
+   * </ul>
+   */
+  public void testIt()throws ProtocolException, ParseException, IOException {
+    String urlString;
+    Protocol protocol;
+    Parse parse;
+
+    Configuration conf = NutchConfiguration.create();
+    MimeUtil mimeutil = new MimeUtil(conf);
+    for (int i = 0; i < sampleFiles.length; i++) {
+      urlString = "file:" + sampleDir + fileSeparator + sampleFiles[i];
+
+      File file = new File(sampleDir + fileSeparator + sampleFiles[i]);
+      byte[] bytes = new byte[(int) file.length()];
+      DataInputStream in = new DataInputStream(new FileInputStream(file));
+      in.readFully(bytes);
+      in.close();
+
+      WebPage page = new WebPage();
+      page.setBaseUrl(new Utf8(urlString));
+      page.setContent(ByteBuffer.wrap(bytes));
+      MimeType mtype = mimeutil.getMimeType(file);
+      page.setContentType(new Utf8(mtype.getName()));
+
+      parse = new ParseUtil(conf).parse(urlString, page);
+
+      // check that there are 2 outlinks:
+
+      // http://www-scf.usc.edu/~mattmann/
+      // http://www.nutch.org/
+
+      Outlink[] theOutlinks = parse.getOutlinks();
+
+      assertTrue("There aren't 2 outlinks read!", theOutlinks.length == 2);
+
+      // now check to make sure that those are the two outlinks
+      boolean hasLink1 = false, hasLink2 = false;
+
+      for (int j = 0; j < theOutlinks.length; j++) {
+        // System.out.println("reading "+theOutlinks[j].getToUrl());
+        if (theOutlinks[j].getToUrl().equals(
+            "http://www-scf.usc.edu/~mattmann/")) {
+          hasLink1 = true;
+        }
+
+        if (theOutlinks[j].getToUrl().equals("http://www.nutch.org/")) {
+          hasLink2 = true;
+        }
+      }
+
+      if (!hasLink1 || !hasLink2) {
+        fail("Outlinks read from sample rss file are not correct!");
+      }
+    }
+  }
+
+}



Mime
View raw message