incubator-cvs mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From br...@apache.org
Subject svn commit: r1438531 - in /incubator/public/trunk/incuvoter: incuvoter.py mboxparser.py votestatus.py
Date Fri, 25 Jan 2013 14:19:29 GMT
Author: brane
Date: Fri Jan 25 14:19:29 2013
New Revision: 1438531

URL: http://svn.apache.org/viewvc?rev=1438531&view=rev
Log:
Incuvoter: Remove all traces of parsing Atom feeds, rely on mbox
mail archives instead. To work around the fact that our archive
mbox files only contain mails that are addressed To: the list and
do not show Cc:, parse all incubator archives instead of just
the general@ one to find interesting votes.

Removed:
    incubator/public/trunk/incuvoter/mboxparser.py
Modified:
    incubator/public/trunk/incuvoter/incuvoter.py
    incubator/public/trunk/incuvoter/votestatus.py

Modified: incubator/public/trunk/incuvoter/incuvoter.py
URL: http://svn.apache.org/viewvc/incubator/public/trunk/incuvoter/incuvoter.py?rev=1438531&r1=1438530&r2=1438531&view=diff
==============================================================================
--- incubator/public/trunk/incuvoter/incuvoter.py (original)
+++ incubator/public/trunk/incuvoter/incuvoter.py Fri Jan 25 14:19:29 2013
@@ -19,14 +19,16 @@
 Purpose: Incuvoter maintaines a list of currently open and recently closed
          votes on the general@incubator mailing list.
 
-It does so by parsing either the Atom feed (see FeedParser.__feed_url, below)
- or an mbox file of the list archives and updates information about votes,
-based on tags in from the subject lines, in a SQLite database.
+It does so by parsing the mbox files of all Incubator list archives
+and updating the vote information in a SQLite database, based on tags
+found in the subject lines.
 
-Usage: incuvoter.py [mbox_path]
+Usage: incuvoter.py <mbox_archive_basedir>
 
-   If mbox_path is not provided, retreives and parses an Atom feed.
-   Otherwise, parses the mbox file.
+   where mbox_archive_basedir is the root directory of all Incubator
+   mailing list archives in mbox format; e.g., on minotaur, it's
+
+       ~apmail/public-arch/incubator.apache.org
 
 Status: Pre-Alpha, under construction.
 '''
@@ -36,56 +38,95 @@ from __future__ import absolute_import
 import os, re, sys
 import collections
 import datetime
+import email
+import email.utils
 import sqlite3
-import time
-
-# The feedparser module is an optional dependency
-try:
-    import feedparser
-except ImportError:
-    feedparser = None
 
 
-class __ZULU(datetime.tzinfo):
+class __UTC(datetime.tzinfo):
     def utcoffset(self, dt):
         return 0;
     def dst(self, dt):
         return timedelta(0)
     def tzname(self,dt):
-        return "Z"
+        return 'UTC'
 
     def adjust(self, dateobject):
-        if isinstance(dateobject, datetime.datetime):
-            if dateobject.tzinfo:
-                dateobject = dateobject.astimezone(self)
-            return dateobject
+        if dateobject.tzinfo:
+            dateobject = dateobject.astimezone(self)
+        return dateobject
+
+    def timestring(self, dateobject):
+        if dateobject is not None:
+            return self.adjust(dateobject).strftime('%Y-%m-%d %H:%M:%S')
+        return None
+
+    def timedate(self, dateobject):
+        if dateobject is not None:
+            return self.adjust(dateobject).strftime('%Y-%m-%d')
+        return None
+
+    def timeparse(self, datestring):
+        if datestring is not None:
+            return datetime.datetime.strptime(datestring, '%Y-%m-%d %H:%M:%S')
         return None
-ZULU = __ZULU()
+UTC = __UTC()
+
+
+class MBoxParser(object):
+    __general_rx = re.compile(r'general@incubator\.apache\.org')
+
+    MBox = collections.namedtuple('MBox', ('mtime', 'entries'))
+    Entry = collections.namedtuple('Entry', ('updated', 'title'))
+
+    @classmethod
+    def __append_message(cls, text, mbox):
+        if not text:
+            return
+
+        message = email.message_from_string(text)
+
+        def decode(header):
+            header = message[header]
+            if header:
+                decoded = email.utils.decode_rfc2231(header)
+                try:
+                    return email.utils.collapse_rfc2231_value(decoded)
+                except:
+                    return header
+            return None
+
+        to = decode('To')
+        cc = decode('Cc')
+        date = message['Date']
+        subject = decode('Subject')
+        if not subject or not date:
+            return
+
+        if not (to and cls.__general_rx.search(to)
+                or cc and cls.__general_rx.search(cc)):
+            # The message was not sent to general@incubator
+            return
 
-def timestring(dateobject):
-    adjusted = ZULU.adjust(dateobject)
-    if adjusted:
-        return adjusted.strftime('%Y-%m-%d %H:%M:%S')
-    return str(dateobject)
-
-def timedate(dateobject):
-    adjusted = ZULU.adjust(dateobject)
-    if adjusted:
-        return adjusted.strftime('%Y-%m-%d')
-    return str(dateobject)
-
-def timeparse(datestring):
-    return datetime.datetime.strptime(datestring, '%Y-%m-%d %H:%M:%S')
-
-
-class FeedParser(object):
-    # FIXME: Our own feed seems to be woefully incomplete.
-    #        Use Markmail's for now.
-    #__feed_url = 'http://mail-archives.apache.org/mod_mbox/incubator-general/?format=atom'
-    __feed_url = ('http://markmail.org/atom/list:org.apache.incubator.general+date:'
-                   + datetime.datetime.utcnow().strftime('%Y%m')  # current month
-                   + '+order:date-forward')
+        timestamp = email.utils.mktime_tz(email.utils.parsedate_tz(date))
+        updated = datetime.datetime.fromtimestamp(timestamp)
+        mbox.entries.append(cls.Entry(updated, subject))
 
+    @classmethod
+    def parse(cls, mbox_path, mtime):
+        with open(mbox_path, 'rt') as mbox_file:
+            mbox = cls.MBox(mtime, [])
+            text = None
+            for line in mbox_file:
+                if line.startswith('From '):
+                    cls.__append_message(text, mbox)
+                    text = ''
+                text += line
+            cls.__append_message(text, mbox)
+        return mbox
+
+
+class VoteUpdater(object):
     __subject_rx = re.compile(
         # Skip anything before the first tag
         r'^[^[]*'
@@ -100,66 +141,60 @@ class FeedParser(object):
         r'(?P<subject>.*?)\s*$',
         re.IGNORECASE)
 
-    def __init__(self, mbox_path = None):
-        if not mbox_path:
-            if not feedparser:
-                sys.stderr.write('ERROR: feedparser module is not available\n')
-                sys.exit(1)
-            self.feed = feedparser.parse(self.__feed_url)
-        else:
-            sys_path_saved = sys.path
-            sys.path.insert(0, os.path.dirname(__file__))
-            try:
-                import mboxparser
-            finally:
-                sys.path = sys_path_saved
-            self.feed  = mboxparser.parse_mbox(mbox_path)
-            if not self.feed:
-                sys.stderr.write('ERROR: mbox not found: %s\n' % mbox_path)
-                sys.exit(1)
+    def __init__(self, mbox_archive_basedir):
+        self.mbox_basedir = mbox_archive_basedir
+        self.mbox_relpaths = []
+        month = datetime.datetime.utcnow().strftime('%Y%m')
+        for topdir, dirnames, filenames in os.walk(mbox_archive_basedir):
+            if month not in filenames:
+                continue
+            relpath = os.path.relpath(os.path.join(topdir, month),
+                                      mbox_archive_basedir)
+            self.mbox_relpaths.append(relpath)
 
     ParsedVote = collections.namedtuple(
         'ParsedVote',
         ('sortkey', 'updated', 'subject', 'closed', 'cancelled'))
 
-    def record(self, database):
-        feed_updated = self.feed.get('updated', None)
-        if not feed_updated:
-            feed_updated = timestring(datetime.datetime.utcnow())
-        if database.updated == feed_updated:
-            # Nothing changed, do not modify the database.
-            return
-
-        votes = []
-        for e in self.feed.entries:
-            title = e.get('title', None)
-            if title is None:
-                continue
+    ParsedMBox = collections.namedtuple('ParsedMBox', ('relpath', 'mtime'))
 
-            parsed = self.__subject_rx.match(title)
+    def __parse_mbox(self, mbox_path, mtime, votes):
+        mbox = MBoxParser.parse(mbox_path, mtime)
+        for e in mbox.entries:
+            parsed = self.__subject_rx.match(e.title)
             if parsed is None:
                 continue
 
             subject = parsed.group('subject')
-            created = e.get('created_parsed', None)
-            if created:
-                created = datetime.datetime.fromtimestamp(time.mktime(created))
-            updated = e.get('updated_parsed', None)
-            if updated:
-                updated = datetime.datetime.fromtimestamp(time.mktime(updated))
-            if updated and created and created > updated:
-                updated = created
-            updated = timestring(updated)
             cancelled = int(parsed.group('cancel1') is not None
                             or parsed.group('cancel2') is not None)
             if cancelled or parsed.group('result1') or parsed.group('result2'):
-                closed = updated
+                closed = e.updated
             else:
                 closed = None
 
-            votes.append(self.ParsedVote(subject.upper(), updated,
+            votes.append(self.ParsedVote(subject.upper(), e.updated,
                                          subject, closed, cancelled))
 
+    def record(self, database):
+        votes = []
+        mboxes = []
+        for relpath in self.mbox_relpaths:
+            mbox_path = os.path.join(self.mbox_basedir, relpath)
+            mtime = os.stat(mbox_path).st_mtime
+            if mtime != database.mbox_mtime(relpath):
+                mboxes.append((mbox_path, relpath, mtime))
+
+        if not mboxes:
+            # Nothing to do
+            return
+
+        for mbox_path, relpath, mtime in mboxes:
+            self.__parse_mbox(mbox_path, mtime, votes)
+
+        timestamp = max(m for p, r, m in mboxes)
+        feed_updated = datetime.datetime.utcfromtimestamp(timestamp)
+
         # TODO: Filter duplicate votes, but return the first and
         #       last in every duplicate list
         database.record_votes(feed_updated,
@@ -167,10 +202,11 @@ class FeedParser(object):
                                              updated = v.updated,
                                              closed = v.closed,
                                              cancelled = v.cancelled)
-                               for v in sorted(votes)))
+                               for v in sorted(votes)),
+                               (self.ParsedMBox(r, m) for p, r, m in mboxes))
 
 
-class FeedDatabase(object):
+class VoteDatabase(object):
     __schema = """
         DROP TABLE IF EXISTS feedinfo;
         CREATE TABLE feedinfo (
@@ -191,6 +227,12 @@ class FeedDatabase(object):
         );
         CREATE INDEX updated_index ON vote(updated DESC);
         CREATE INDEX closed_index ON vote(closed DESC);
+
+        DROP TABLE IF EXISTS mbox;
+        CREATE TABLE mbox (
+          relpath TEXT NOT NULL PRIMARY KEY,
+          mtime FLOAT NOT NULL
+        );
         """
 
     @classmethod
@@ -217,6 +259,26 @@ class FeedDatabase(object):
         self.con = self.__connect(path)
         self.__updated = None
 
+    class __Transaction(object):
+        __slots__ = ['__db']
+        def __init__(self, database):
+            self.__db = database
+        def __enter__(self):
+            self.__db.con.execute("BEGIN")
+            return self.__db
+        def __exit__(self, exc_type, exc_value, traceback):
+            if exc_type is None:
+                self.__db.con.commit()
+            else:
+                try:
+                    self.__db.con.rollback()
+                except:
+                    pass
+            return None
+
+    def transaction(self):
+        return self.__Transaction(self)
+
     def close(self):
         self.con.close()
 
@@ -225,9 +287,20 @@ class FeedDatabase(object):
         if self.__updated is None:
             cursor = self.con.cursor()
             cursor.execute("SELECT updated FROM feedinfo WHERE rowid = 1")
-            self.__updated = cursor.fetchone()['updated']
+            self.__updated = UTC.timeparse(cursor.fetchone()['updated'])
         return self.__updated
 
+    def mbox_mtime(self, relpath):
+        cursor = self.con.cursor()
+        cursor.execute("SELECT mtime FROM mbox WHERE relpath = ?", (relpath,))
+        row = cursor.fetchone()
+        return row and row['mtime'] or None
+
+    def record_mbox(self, relpath, mtime):
+        self.con.execute("INSERT OR REPLACE INTO mbox"
+                         " (relpath, mtime) VALUES (?, ?)",
+                         (relpath, mtime))
+
     class Vote(object):
         __slots__ = ('sortkey', 'subject',
                      'noticed', 'updated', 'closed', 'cancelled')
@@ -261,7 +334,11 @@ class FeedDatabase(object):
             row = cursor.fetchone()
             if row is None:
                 return None
-            return cls(sortkey=sortkey, **row)
+            vote = cls(sortkey=sortkey, **row)
+            vote.noticed = UTC.timeparse(vote.noticed)
+            vote.updated = UTC.timeparse(vote.updated)
+            vote.closed = UTC.timeparse(vote.closed)
+            return vote
 
         def insert(self, con):
             assert self.sortkey is None and self.subject is not None
@@ -272,7 +349,10 @@ class FeedDatabase(object):
                         " (sortkey, subject, noticed, updated, closed, cancelled)"
                         " VALUES (?, ?, ?, ?, ?, ?)",
                         (self.sortkey, self.subject,
-                         self.noticed, self.updated, self.closed, self.cancelled))
+                         UTC.timestring(self.noticed),
+                         UTC.timestring(self.updated),
+                         UTC.timestring(self.closed),
+                         self.cancelled))
 
         def update(self, con):
             assert self.sortkey is not None
@@ -280,31 +360,27 @@ class FeedDatabase(object):
             con.execute("UPDATE vote SET"
                         " noticed = ?, updated = ?, closed = ?, cancelled = ?"
                         " WHERE sortkey = ?",
-                        (self.noticed, self.updated, self.closed,
+                        (UTC.timestring(self.noticed),
+                         UTC.timestring(self.updated),
+                         UTC.timestring(self.closed),
                          self.cancelled, self.sortkey))
 
-    def record_votes(self, updated, votes):
-        self.con.rollback()
-        self.con.execute("BEGIN")
-        try:
+    def record_votes(self, updated, votes, mboxes):
+        with self.transaction() as txn:
             for v in votes:
-                vote = self.Vote.find(self.con, v.subject)
+                vote = txn.Vote.find(txn.con, v.subject)
                 if vote:
                     vote.merge(v)
-                    vote.update(self.con)
+                    vote.update(txn.con)
                 else:
-                    v.insert(self.con)
+                    v.insert(txn.con)
 
-            self.con.execute("UPDATE feedinfo SET updated = ?", (updated,))
-            self.__updated = None
-        except:
-            try:
-                self.con.rollback()
-            except:
-                pass
-            raise
-        else:
-            self.con.commit()
+            for m in mboxes:
+                txn.record_mbox(m.relpath, m.mtime)
+
+            txn.con.execute("UPDATE feedinfo SET updated = ?",
+                            (UTC.timestring(updated),))
+            txn.__updated = None
 
     def __list_votes(self, active):
         if active:
@@ -316,7 +392,11 @@ class FeedDatabase(object):
         cursor = self.con.cursor()
         cursor.execute(sql)
         for row in cursor.fetchall():
-            yield self.Vote(**row)
+            vote = self.Vote(**row)
+            vote.noticed = UTC.timeparse(vote.noticed)
+            vote.updated = UTC.timeparse(vote.updated)
+            vote.closed = UTC.timeparse(vote.closed)
+            yield vote
 
     def list_open_votes(self):
         return self.__list_votes(True)
@@ -331,24 +411,22 @@ class FeedDatabase(object):
                        " WHERE closed IS NOT NULL ORDER BY closed DESC")
         obsolete = []
         for row in cursor.fetchall():
-            updated = timeparse(row['updated'])
+            updated = UTC.timeparse(row['updated'])
             if now - updated > datetime.timedelta(days = 30):
                 obsolete.append(row['sortkey'])
         if obsolete:
-            cursor.execute("DELETE FROM vote WHERE sortkey IN ?", (obsolete,))
-            self.con.commit()
+            with self.transaction() as txn:
+                txn.con.execute("DELETE FROM vote WHERE sortkey IN ?",
+                                (obsolete,))
 
 
 def main():
     votes_path = os.path.join(os.path.dirname(__file__), 'votes.sqlite')
     if not os.path.isfile(votes_path):
-        FeedDatabase.create(votes_path)
-    if len(sys.argv) > 1:
-        parser = FeedParser(sys.argv[1])
-    else:
-        parser = FeedParser()
-    database = FeedDatabase(votes_path)
-    parser.record(database)
+        VoteDatabase.create(votes_path)
+    updater = VoteUpdater(sys.argv[1])
+    database = VoteDatabase(votes_path)
+    updater.record(database)
     database.prune_old_votes()
     database.close()
 

Modified: incubator/public/trunk/incuvoter/votestatus.py
URL: http://svn.apache.org/viewvc/incubator/public/trunk/incuvoter/votestatus.py?rev=1438531&r1=1438530&r2=1438531&view=diff
==============================================================================
--- incubator/public/trunk/incuvoter/votestatus.py (original)
+++ incubator/public/trunk/incuvoter/votestatus.py Fri Jan 25 14:19:29 2013
@@ -32,7 +32,7 @@ import cgi
 import datetime
 
 sys.path.insert(0, os.path.dirname(__file__))
-from incuvoter import FeedDatabase, timedate, timeparse
+from incuvoter import VoteDatabase, UTC
 
 
 __page_template = """\
@@ -66,10 +66,6 @@ __page_template = """\
   .nag    { background-color: #e69f00; }
   .yell   { background-color: #d55e00; font-weight: bold; }
 
-  .warn   { color: red;
-            background-color: yellow;
-            font-weight: bold; }
-
   body    { color: black;
             background-color: white;
             font-family: Helvetica, Arial, sans-serif; }
@@ -84,6 +80,7 @@ __page_template = """\
 </head>
 <body>
   <h1>Apache Incubator Voting Status</h1>
+  <p>Last recorded change: %s</p>
 %s
 %s
 </body>
@@ -97,6 +94,7 @@ __current_table = """\
      <th>Subject</th>
      <th>Activity</th>
      <th>Started</th>
+     <th>Age</th>
     </tr>
 %s
   </table>"""
@@ -106,6 +104,7 @@ __current_row = """\
       <td>%(subject)s</td>
       <td>%(updated)s</td>
       <td>%(noticed)s</td>
+      <td>%(duration)s</td>
     </tr>"""
 
 __closed_table = """\
@@ -130,32 +129,39 @@ __closed_row = """\
 
 
 def __datescape(dateobject):
-  return cgi.escape(timedate(dateobject)).replace('-', '&ndash;')
+  return cgi.escape(UTC.timedate(dateobject)).replace('-', '&ndash;')
 
 def refresh_page(target, database):
     current = []
     now = datetime.datetime.utcnow()
     for vote in database.list_open_votes():
-        updated = timeparse(vote.updated)
-        noticed = timeparse(vote.noticed)
-        if (not isinstance(updated, datetime.datetime)
-            or not isinstance(noticed, datetime.datetime)):
-            klass = 'warn'
+        age = now - vote.noticed
+        if age < datetime.timedelta(hours = 49):
+            klass = 'normal'
+        if age < datetime.timedelta(hours = 73):
+            klass = 'nudge'
+        elif age < datetime.timedelta(days = 7):
+            klass = 'nag'
+        else:
+            klass = 'yell'
+
+        if age < datetime.timedelta(hours = 72):
+           hours = int((age.total_seconds() + 1800) // 3600)
+           if hours == 1:
+               duration = 'one hour'
+           else:
+               duration = '%d hours' % hours
         else:
-            age = now - noticed
-            if age < datetime.timedelta(hours = 49):
-                klass = 'normal'
-            if age < datetime.timedelta(hours = 73):
-                klass = 'nudge'
-            elif age < datetime.timedelta(days = 7):
-                klass = 'nag'
-            else:
-                klass = 'yell'
+           days = int((age.total_seconds() + 43200) // 86400)
+           duration = '%d days' % days
+
+
         current.append(__current_row
                        % dict(klass = klass,
+                              duration = duration,
                               subject = cgi.escape(vote.subject),
-                              updated = __datescape(updated),
-                              noticed = __datescape(noticed)))
+                              updated = __datescape(vote.updated),
+                              noticed = __datescape(vote.noticed)))
     if current:
         current = __current_table % '\n'.join(current)
     else:
@@ -163,31 +169,27 @@ def refresh_page(target, database):
 
     resolved = []
     for vote in database.list_resolved_votes():
-        noticed = timeparse(vote.noticed)
-        closed = timeparse(vote.closed)
-        if (not isinstance(noticed, datetime.datetime)
-            or not isinstance(closed, datetime.datetime)):
-            klass = 'warn'
-        else:
-            klass = vote.cancelled and 'nudge' or 'normal'
+        klass = vote.cancelled and 'nudge' or 'normal'
+        status = vote.cancelled and 'Cancelled' or 'Resolved'
         resolved.append(__closed_row
                         % dict(klass = klass,
-                               status = vote.cancelled and 'Cancelled' or 'Resolved',
+                               status = status,
                                subject = cgi.escape(vote.subject),
-                               noticed = __datescape(noticed),
-                               closed = __datescape(closed)))
+                               noticed = __datescape(vote.noticed),
+                               closed = __datescape(vote.closed)))
     if resolved:
         resolved = __closed_table % '\n'.join(resolved)
     else:
         resolved = ''
 
+    updated = cgi.escape(UTC.timestring(database.updated)).replace('-', '&ndash;')
     temp = target + '.temp'
     with open(temp, 'wt') as page:
-        page.write(__page_template % (current, resolved))
+        page.write(__page_template % (updated, current, resolved))
     os.rename(temp, target)
 
 
 if __name__ == '__main__':
     status_page = os.path.join(os.path.dirname(__file__), 'votes.html')
     votes_path = os.path.join(os.path.dirname(__file__), 'votes.sqlite')
-    refresh_page(status_page, FeedDatabase(votes_path))
+    refresh_page(status_page, VoteDatabase(votes_path))



---------------------------------------------------------------------
To unsubscribe, e-mail: cvs-unsubscribe@incubator.apache.org
For additional commands, e-mail: cvs-help@incubator.apache.org


Mime
View raw message