nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From jnio...@apache.org
Subject svn commit: r1124202 - in /nutch/branches/branch-1.3: ./ conf/ src/java/org/apache/nutch/indexer/solr/ src/plugin/feed/src/java/org/apache/nutch/indexer/feed/ src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/ src/plugin/index-more/src/jav...
Date Wed, 18 May 2011 11:50:02 GMT
Author: jnioche
Date: Wed May 18 11:50:02 2011
New Revision: 1124202

URL: http://svn.apache.org/viewvc?rev=1124202&view=rev
Log:
NUTCH-997 IndexingFilters to store Date objects instead of Strings

Modified:
    nutch/branches/branch-1.3/CHANGES.txt
    nutch/branches/branch-1.3/conf/schema.xml
    nutch/branches/branch-1.3/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
    nutch/branches/branch-1.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
    nutch/branches/branch-1.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
    nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java

Modified: nutch/branches/branch-1.3/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/CHANGES.txt?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/CHANGES.txt (original)
+++ nutch/branches/branch-1.3/CHANGES.txt Wed May 18 11:50:02 2011
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Release 1.3 - 4/21/2011
 
+* NUTCH-997 IndexingFitlers to store Date objects instead of Strings (jnioche)
+
 * NUTCH-996 Indexer adds solr.commit.size+1 docs (markus)
 
 * NUTCH-983 Upgrade SolrJ to 3.1 (markus, jnioche)

Modified: nutch/branches/branch-1.3/conf/schema.xml
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/conf/schema.xml?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/conf/schema.xml (original)
+++ nutch/branches/branch-1.3/conf/schema.xml Wed May 18 11:50:02 2011
@@ -84,9 +84,9 @@
             multiValued="true"/>
         <field name="contentLength" type="long" stored="true"
             indexed="false"/>
-        <field name="lastModified" type="long" stored="true"
+        <field name="lastModified" type="date" stored="true"
             indexed="false"/>
-        <field name="date" type="string" stored="true" indexed="true"/>
+        <field name="date" type="date" stored="true" indexed="true"/>
 
         <!-- fields for languageidentifier plugin -->
         <field name="lang" type="string" stored="true" indexed="true"/>
@@ -99,9 +99,9 @@
         <field name="author" type="string" stored="true" indexed="true"/>
         <field name="tag" type="string" stored="true" indexed="true"/>
         <field name="feed" type="string" stored="true" indexed="true"/>
-        <field name="publishedDate" type="string" stored="true"
+        <field name="publishedDate" type="date" stored="true"
             indexed="true"/>
-        <field name="updatedDate" type="string" stored="true"
+        <field name="updatedDate" type="date" stored="true"
             indexed="true"/>
     </fields>
     <uniqueKey>id</uniqueKey>

Modified: nutch/branches/branch-1.3/src/java/org/apache/nutch/indexer/solr/SolrWriter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/java/org/apache/nutch/indexer/solr/SolrWriter.java?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/java/org/apache/nutch/indexer/solr/SolrWriter.java (original)
+++ nutch/branches/branch-1.3/src/java/org/apache/nutch/indexer/solr/SolrWriter.java Wed May 18 11:50:02 2011
@@ -18,6 +18,7 @@ package org.apache.nutch.indexer.solr;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Date;
 import java.util.List;
 import java.util.Map.Entry;
 
@@ -29,6 +30,7 @@ import org.apache.solr.client.solrj.Solr
 import org.apache.solr.client.solrj.SolrServerException;
 import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
 import org.apache.solr.common.SolrInputDocument;
+import org.apache.solr.common.util.DateUtil;
 
 public class SolrWriter implements NutchIndexWriter {
 
@@ -50,7 +52,12 @@ public class SolrWriter implements Nutch
     final SolrInputDocument inputDoc = new SolrInputDocument();
     for(final Entry<String, NutchField> e : doc) {
       for (final Object val : e.getValue().getValues()) {
-        inputDoc.addField(solrMapping.mapKey(e.getKey()), val, e.getValue().getWeight());
+        // normalise the string representation for a Date
+        Object val2 = val;
+        if (val instanceof Date){
+          val2 = DateUtil.getThreadLocalDateFormat().format(val);
+        }
+        inputDoc.addField(solrMapping.mapKey(e.getKey()), val2, e.getValue().getWeight());
         String sCopy = solrMapping.mapCopyKey(e.getKey());
         if (sCopy != e.getKey()) {
         	inputDoc.addField(sCopy, val);	

Modified: nutch/branches/branch-1.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/feed/src/java/org/apache/nutch/indexer/feed/FeedIndexingFilter.java Wed May 18 11:50:02 2011
@@ -18,9 +18,7 @@
 package org.apache.nutch.indexer.feed;
 
 //JDK imports
-import java.text.SimpleDateFormat;
 import java.util.Date;
-import java.util.TimeZone;
 
 //APACHE imports
 import org.apache.hadoop.conf.Configuration;
@@ -96,18 +94,14 @@ public class FeedIndexingFilter implemen
     if (feed != null)
       doc.add(Feed.FEED, feed);
     
-    SimpleDateFormat sdf = new SimpleDateFormat(dateFormatStr);
-    sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
     if (published != null) {
       Date date = new Date(Long.parseLong(published));
-      String dateString = sdf.format(date);
-      doc.add(PUBLISHED_DATE, dateString);
+      doc.add(PUBLISHED_DATE, date);
     }
     
     if (updated != null) {
       Date date = new Date(Long.parseLong(updated));
-      String dateString = sdf.format(date);
-      doc.add(UPDATED_DATE, dateString);
+      doc.add(UPDATED_DATE, date);
     }
         
     return doc;

Modified: nutch/branches/branch-1.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java Wed May 18 11:50:02 2011
@@ -30,7 +30,6 @@ import org.apache.hadoop.io.Text;
 
 import org.apache.nutch.crawl.CrawlDatum;
 import org.apache.nutch.crawl.Inlinks;
-import org.apache.solr.common.util.DateUtil;
 
 import java.net.MalformedURLException;
 import java.net.URL;
@@ -87,8 +86,7 @@ public class BasicIndexingFilter impleme
     }
     
     // add timestamp when fetched, for deduplication
-    String tstamp = DateUtil.getThreadLocalDateFormat().format(new Date(datum.getFetchTime()));
-    doc.add("tstamp", tstamp);
+    doc.add("tstamp", new Date(datum.getFetchTime()));
 
     return doc;
   }

Modified: nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java
URL: http://svn.apache.org/viewvc/nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java?rev=1124202&r1=1124201&r2=1124202&view=diff
==============================================================================
--- nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java (original)
+++ nutch/branches/branch-1.3/src/plugin/index-more/src/java/org/apache/nutch/indexer/more/MoreIndexingFilter.java Wed May 18 11:50:02 2011
@@ -51,7 +51,6 @@ import java.text.ParseException;
 import java.text.SimpleDateFormat;
 
 import java.util.Date;
-import java.util.TimeZone;
 
 import org.apache.commons.lang.time.DateUtils;
 
@@ -101,21 +100,15 @@ public class MoreIndexingFilter implemen
     if (lastModified != null) {                   // try parse last-modified
       time = getTime(lastModified,url);           // use as time
                                                   // store as string
-      doc.add("lastModified", Long.toString(time));
+      doc.add("lastModified", new Date(time));
     }
 
     if (time == -1) {                             // if no last-modified
       time = datum.getFetchTime();                // use fetch time
     }
 
-    // add support for query syntax date:
-    // query filter is implemented in DateQueryFilter.java
-    SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");
-    sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
-    String dateString = sdf.format(new Date(time));
-
     // un-stored, indexed and un-tokenized
-    doc.add("date", dateString);
+    doc.add("date", new Date(time));
 
     return doc;
   }



Mime
View raw message