nutch-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Apache Wiki <wikidi...@apache.org>
Subject [Nutch Wiki] Update of "IntranetRecrawl" by MatthewHolt
Date Wed, 02 Aug 2006 21:16:05 GMT
Dear Wiki user,

You have subscribed to a wiki page or wiki category on "Nutch Wiki" for change notification.

The following page has been changed by MatthewHolt:
http://wiki.apache.org/nutch/IntranetRecrawl

------------------------------------------------------------------------------
  linkdb_dir=$crawl_dir/linkdb
  index_dir=$crawl_dir/index
  
+ # Record the number of segments that exist before this run, for later cleanup
+ seg_num=`ls $segments_dir | wc -l`
+ 
  # The generate/fetch/update cycle
  for ((i=1; i <= depth ; i++))
  do
@@ -147, +150 @@

  # Update segments
  $nutch_dir/nutch invertlinks $linkdb_dir -dir $segments_dir
  
+ # Merge segments
+ mergesegs_dir=$crawl_dir/mergesegs_dir
+ $nutch_dir/nutch mergesegs $mergesegs_dir -dir $segments_dir
+ cp -R $mergesegs_dir/* $segments_dir
+ rm -rf $mergesegs_dir
+ 
  # Index segments
  new_indexes=$crawl_dir/newindexes
+ segment=`ls -d $segments_dir/* | tail -1`
- $nutch_dir/nutch index $new_indexes $webdb_dir $linkdb_dir $segments_dir/*
+ $nutch_dir/nutch index $new_indexes $webdb_dir $linkdb_dir $segment
  
  # De-duplicate indexes
  $nutch_dir/nutch dedup $new_indexes
@@ -163, +173 @@

  # Clean up
  rm -rf $new_indexes
  
+ # Sleep for 1 minute to make sure Tomcat has released its locks on the
+ # directories before removing them
+ sleep 1m
+ 
+ echo "***Removing old segment directories that are no longer in use. If any of these error out it is not a problem, just used for clean up."
+ 
+ seg_num=`expr $seg_num + $depth`
+ for segment in `ls -dr $segments_dir/* | tail -$seg_num`
+ do
+   echo "Removing Segment: $segment"
+   rm -rf $segment
+ done
  }}}
  

Mime
View raw message