httpd-cvs mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pque...@apache.org
Subject svn commit: r729612 - /httpd/mod_mbox/trunk/scripts/site-sitemap.py
Date Sat, 27 Dec 2008 07:22:20 GMT
Author: pquerna
Date: Fri Dec 26 23:22:20 2008
New Revision: 729612

URL: http://svn.apache.org/viewvc?rev=729612&view=rev
Log:
Generate partitioned sitemaps for mailing lists with over 100mb of messages.

Modified:
    httpd/mod_mbox/trunk/scripts/site-sitemap.py

Modified: httpd/mod_mbox/trunk/scripts/site-sitemap.py
URL: http://svn.apache.org/viewvc/httpd/mod_mbox/trunk/scripts/site-sitemap.py?rev=729612&r1=729611&r2=729612&view=diff
==============================================================================
--- httpd/mod_mbox/trunk/scripts/site-sitemap.py (original)
+++ httpd/mod_mbox/trunk/scripts/site-sitemap.py Fri Dec 26 23:22:20 2008
@@ -1,10 +1,25 @@
 #!/usr/local/bin/python
 
 import os
+from os.path import join as pjoin
 import sys
+import subprocess
+
+def get_output(cmd):
+    s = subprocess.Popen(cmd, stdout=subprocess.PIPE)
+    out = s.communicate()[0]
+    s.wait()
+    return out.strip()
+
+# you could use os.path.walk to calculate this... or you could use du(1).
+def duhack(path):
+    cmd = ['du', '-k', path]
+    out = get_output(cmd).split()
+    return int(out[0]) * 1024
 
 ROOT="/x1/mail-archives/mod_mbox"
 HOSTNAME="http://mail-archives.apache.org/mod_mbox/"
+PARITION_SIZE=100 * 1024 * 1024
 tlps={}
 for files in os.listdir(ROOT):
     path = files
@@ -17,7 +32,7 @@
        tlp = "asf"
     if not tlps.has_key(tlp):
         tlps[tlp] = {}
-    tlps[tlp][list] = path
+    tlps[tlp][list] = [path, duhack(pjoin(ROOT, path))]
 
 keys = tlps.keys()
 keys.sort()
@@ -36,7 +51,14 @@
     klist = tlps[tlp].keys()
     klist.sort()
     for list in klist:
-        print "   <sitemap><loc>%s%s/?format=sitemap</loc></sitemap>" % (HOSTNAME, tlps[tlp][list])
+        name = tlps[tlp][list][0]
+        size = tlps[tlp][list][1]
+        if (size > PARITION_SIZE):
+            print "   <sitemap><loc>%s%s/?format=sitemap</loc></sitemap>" % (HOSTNAME, name)
+        else:
+            part = int(size / PARITION_SIZE) + 1
+            for i in range(0, part):
+                print "   <sitemap><loc>%s%s/?format=sitemap&pmax=%d&part=%d</loc></sitemap>" % (HOSTNAME, name, part, i)
 
 print """
 </sitemapindex>



Mime
View raw message