ponymail-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From humbed...@apache.org
Subject [06/11] incubator-ponymail git commit: eliminate some redundant code
Date Wed, 01 Jun 2016 12:41:08 GMT
eliminate some redundant code


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/1ed519e5
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/1ed519e5
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/1ed519e5

Branch: refs/heads/master
Commit: 1ed519e5b152e3c92d07722fbc2329ba13e48697
Parents: 6b42d4d
Author: Sam Ruby <rubys@intertwingly.net>
Authored: Wed May 4 20:58:28 2016 -0400
Committer: Sam Ruby <rubys@intertwingly.net>
Committed: Wed May 4 20:58:28 2016 -0400

----------------------------------------------------------------------
 tools/archiver.py    |  14 ++-
 tools/import-mbox.py | 311 +++++++---------------------------------------
 2 files changed, 51 insertions(+), 274 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/1ed519e5/tools/archiver.py
----------------------------------------------------------------------
diff --git a/tools/archiver.py b/tools/archiver.py
index 31f790a..991b6d6 100644
--- a/tools/archiver.py
+++ b/tools/archiver.py
@@ -129,12 +129,15 @@ class Archiver(object):
         "x-mailman-rule-misses",
     ]
 
-    def __init__(self):
+    def __init__(self, parseHTML=False):
         """ Just initialize ES. """
-        global config, auth, parseHTML
+        global config, auth
         ssl = False
         self.cropout = None
         self.html = parseHTML
+        if parseHTML:
+           import html2text
+           self.html2text = html2text.html2text
         self.dbname = config.get("elasticsearch", "dbname")
         self.consistency = 'quorum'
         if config.has_option("elasticsearch", "ssl") and config.get("elasticsearch", "ssl").lower() == 'true':
@@ -209,7 +212,7 @@ class Archiver(object):
             
         # this requires a GPL lib, user will have to install it themselves
         if firstHTML and (not body or len(body) <= 1):
-            body = html2text.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
+            body = self.html2text(firstHTML.decode("utf-8", 'ignore') if type(firstHTML) is bytes else firstHTML)
     
         for charset in pm_charsets(msg):
             try:
@@ -287,6 +290,7 @@ class Archiver(object):
                         body = None
 
         attachments, contents = self.msgfiles(msg)
+        irt = ""
         if body or attachments:
             pmid = mid
             try:
@@ -295,7 +299,6 @@ class Archiver(object):
                 if logger:
                     logger.warn("Could not generate MID: %s" % err)
                 mid = pmid
-            irt = ""
             if 'in-reply-to' in msg_metadata:
                 try:
                     try:
@@ -511,14 +514,13 @@ if __name__ == '__main__':
     args = parser.parse_args()
     
     if args.html2text:
-        import html2text
         parseHTML = True
 
     if args.verbose:
         import logging
         logging.basicConfig(stream=sys.stdout, level=logging.INFO)
         
-    foo = Archiver()
+    foo = Archiver(parseHTML = parseHTML)
     input_stream = io.TextIOWrapper(sys.stdin.buffer, encoding='utf-8', errors="ignore")
     
     try:

http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/1ed519e5/tools/import-mbox.py
----------------------------------------------------------------------
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 3d24036..399afe6 100644
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -1,4 +1,5 @@
 #!/usr/bin/env python3.4
+ 
 # -*- coding: utf-8 -*-
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
@@ -35,6 +36,7 @@ import codecs
 import multiprocessing
 import tempfile
 import gzip
+import archiver
 
 try:
     from elasticsearch import Elasticsearch, helpers
@@ -49,7 +51,6 @@ block = Lock()
 lists = []
 start = time.time()
 quickmode = False
-lid_override = None
 private = False
 appender = "apache.org"
 
@@ -105,100 +106,6 @@ es = Elasticsearch([
 
 rootURL = ""
 
-def parse_attachment(part):
-    cd = part.get("Content-Disposition", None)
-    if cd:
-        dispositions = cd.strip().split(";")
-        if dispositions[0].lower() == "attachment":
-            try:
-                fd = part.get_payload(decode=True)
-                if fd:
-                    attachment = {}
-                    attachment['content_type'] = part.get_content_type()
-                    attachment['size'] = len(fd)
-                    attachment['filename'] = None
-                    h = hashlib.sha256(fd).hexdigest()
-                    b64 = codecs.encode(fd, "base64").decode('ascii')
-                    attachment['hash'] = h
-                    for param in dispositions[1:]:
-                        key,val = param.split("=", 1)
-                        if key.lower().strip() == "filename":
-                            val = val.strip(' "')
-                            print("Found attachment: %s" % val)
-                            attachment['filename'] = val
-                    if attachment['filename']:
-                        return attachment, b64 # Return meta data and contents separately
-            except:
-                pass
-    return None, None
-
-def msgfiles(msg):
-        attachments = []
-        contents = {}
-        if msg.is_multipart():    
-            for part in msg.walk():
-                part_meta, part_file = parse_attachment(part)
-                if part_meta:
-                    attachments.append(part_meta)
-                    contents[part_meta['hash']] = part_file
-        return attachments, contents
-    
-def pm_charsets(msg):
-    charsets = set({})
-    for c in msg.get_charsets():
-        if c is not None:
-            charsets.update([c])
-    return charsets
-
-def msgbody(msg):
-    global parseHTML, iBody
-    firstHTML = None
-    body = None
-    if msg.is_multipart():
-        for part in msg.walk():
-            try:
-                if part.is_multipart(): 
-                    for subpart in part.walk():
-                        if subpart.get_content_type() == 'text/plain' and not body:
-                                body = subpart.get_payload(decode=True)
-                        elif subpart.get_content_type() == 'text/html' and parseHTML and not firstHTML:
-                             firstHTML = subpart.get_payload(decode=True)
-        
-                elif part.get_content_type() == 'text/plain' and not body:
-                    body = part.get_payload(decode=True)
-                elif part.get_content_type() == 'text/html' and parseHTML and not firstHTML:
-                    firstHTML = part.get_payload(decode=True)
-            except Exception as err:
-                print("Body parser error: %s" % err)
-    elif msg.get_content_type() == 'text/plain':
-        body = msg.get_payload(decode=True)
-    elif msg.get_content_type() == 'text/html' and parseHTML and not firstHTML:
-        firstHTML = msg.get_payload(decode=True)
-        
-    # this requires a GPL lib, user will have to install it themselves
-    if firstHTML and (not body or len(body) <= 1 or (iBody and str(body).find(str(iBody)) != -1)):
-        print("No body found as text/plain, converting HTML to text")
-        body = html2text.html2text(firstHTML.decode("utf-8", errors='replace') if type(firstHTML) is bytes else firstHTML)
-
-    for charset in pm_charsets(msg):
-        try:
-            body = body.decode(charset) if type(body) is bytes else body
-        except:
-            body = body.decode('utf-8', errors='replace') if type(body) is bytes else body
-            
-    return body  
-
-
-def msgfactory(fp):
-    try:
-        return email.message_from_file(fp)
-    except Exception as err:
-        # Don't return None since that will
-        # stop the mailbox iterator
-        print("hmm: %s" % err)
-        return None
-
-
 class BulkThread(Thread):
     def assign(self, json, xes, dtype = 'mbox', wc = 'quorum'):
         self.json = json
@@ -247,6 +154,8 @@ class SlurpThread(Thread):
         mboxfile = ""
         filename = ""
         xlist_override = None
+
+        foo = archiver.Archiver(parseHTML = parseHTML)
     
         while len(lists) > 0:
             print("%u elements left to slurp" % len(lists))
@@ -334,180 +243,46 @@ class SlurpThread(Thread):
                 if (time.time() - stime > timeout): # break out after N seconds, it shouldn't take this long..!
                     print("Whoa, this is taking way too long, ignoring %s for now" % tmpname)
                     break
-                if 'subject' in message:
-                    subject = message['subject']       # Could possibly be None.
-                    mid = message['message-id']
-
-                    lid = message['list-id']
-                    if args.requirelid and (not lid or lid == ""):
-                        continue
-                    if not lid or lid == "": # Guess list name in absence
-                        lid = '.'.join(reversed(ml.split("-"))) + "." + appender
-                    
-                    # Compact LID to <foo@domain>, discard rest
-                    m = re.search(r"(<.+>)", lid)
-                    if m:
-                        lid = m.group(1)
-                    if xlist_override and len(xlist_override) > 3:
-                        lid = xlist_override
-                    lid = lid.replace("@",".") # we want foo.bar.org, not foo@bar.org
-                    lid = "<%s>" % lid.strip("<>") # We need <> around it!
-                    if cropout:
-                        crops = cropout.split(" ")
-                        # Regex replace?
-                        if len(crops) == 2:
-                            lid = re.sub(crops[0], crops[1], lid)
-                        # Standard crop out?
-                        else:
-                            lid = lid.replace(cropout, "")
-                    
-                    date = message['date']
-                    fro = message['from']
-                    to = message['to']
-                    body = msgbody(message)
-                    try:
-                        if 'content-type' in message and message['content-type'].find("flowed") != -1:
-                            body = convertToWrapped(body, character_set="utf-8")
-                        if isinstance(body, str):
-                            body = body.encode('utf-8')
-                    except Exception as err:
-                        try:
-                            body = body.decode(chardet.detect(body)['encoding'])
-                        except Exception as err:
-                            try:
-                                body = body.decode('latin-1')
-                            except:
-                                try:
-                                    if isinstance(body, str):
-                                        body = body.encode('utf-8')
-                                except:
-                                    body = None
-
-                    okay = True
-                    dheader = {}
-                    for key in ['to','from','subject','message-id']:
-                        try:
-                            hval = ""
-                            if message.get(key):
-                                for t in email.header.decode_header(message[key]):
-                                    if t[1] == None or t[1].find("8bit") != -1:
-                                        hval += t[0].decode('utf-8', errors='replace') if type(t[0]) is bytes else t[0]
-                                    else:
-                                        hval += t[0].decode(t[1],errors='ignore')
-                                dheader[key] = hval
-                            else:
-                                dheader[key] = "(Unknown)"
-                        except Exception as err:
-                            print("Could not decode headers, ignoring..: %s" % err)
-                            okay = False
-                    mdt = ""
-                    if not 'date' in message and 'received' in message:
-                        print("No Date header found, resorting to Received")
-                        m = re.search(r"(\d+ \S+ \d{4} \d\d:\d\d:\d\d ([-+]\d{4})?)", message['received'])
-                        if m:
-                            mdt = m.group(1)
-                    else:
-                        mdt = message['date']
-                    mdate = None
-                    uid_mdate = 0
-                    try:
-                        mdate = email.utils.parsedate_tz(mdt)
-                        uid_mdate = email.utils.mktime_tz(mdate)
-                    except:
-                        pass
-                    if not mdate or mdate[0] < (LEY-1):
-                        print("Date is wrong or missing here, setting to %s" % ( LEY))
-                        mdate = datetime.datetime(LEY, EM, 1).timetuple()
-                    else:
-                        LEY = mdate[0] # Gather evidence 'n'stuff!
-                    mdatestring = ""
-                    try:
-                        mdatestring = time.strftime("%Y/%m/%d %H:%M:%S", time.localtime(email.utils.mktime_tz(mdate)))
-                    except:
-                        okay = False
-                    if body and okay and mdate and {'from','subject'} <= set(dheader):
-                        # Pipermail transforms from: to something weird - reset that!
-                        if piperWeirdness:
-                            m = re.match(r"(.+) at ([^(]+) \((.+)\)$", dheader['from'])
-                            # Try just 'foo at bar.tld' if 'foo at bar.tld (foo bar)' isn't working
-                            if not m:
-                                m = re.match(r"(.+) at ([^(]+)$", dheader['from'])
-                            if m:
-                                dheader['from'] = "%s <%s@%s>" % (m.group(3), m.group(1), m.group(2))
-                                
-                        attachments, contents = msgfiles(message)
-                        if mid == None or not mid:
-                            try:
-                                mid = hashlib.sha256(body if type(body) is bytes else body.encode('ascii',
errors='ignore')).hexdigest() + "@" + lid + "@" + appender
-                            except:
-                                if filebased:
-                                    mid = hashlib.sha256("%f-%f-%s" % (random.random(), time.time(),
filename) ).hexdigest()+ "@" + appender
-                                else:
-                                    mid = hashlib.sha256("%f-%f-%s-%s" % (random.random(),
time.time(), ml, mboxfile) ).hexdigest()+ "@" + appender
-                            print("No MID found, setting to %s" % mid)
-                        mid2 = "%s@%s@%s" % (hashlib.sha224(body if type(body) is bytes else
body.encode('ascii', 'ignore')).hexdigest(), uid_mdate, lid)
-                        count += 1
-                        mr = ""
-                        if 'references' in message:
-                            mr = message['references']
-                        irt = ""
-                        if 'in-reply-to' in message:
-                            try:
-                                irt = "".join(message['in-reply-to'])
-                            except:
-                                irt = message.get('in-reply-to').__str__()
-
-                        json = {
-                            'from_raw': dheader['from'],
-                            'from': dheader['from'],
-                            'to': dheader['to'],
-                            'subject': dheader['subject'],
-                            'cc': message.get('cc'),
-                            'message-id': mid,
-                            'mid': mid2,
-                            'epoch': email.utils.mktime_tz(mdate),
-                            'list': lid,
-                            'list_raw': lid,
-                            'date': mdatestring,
-                            'private': private,
-                            'references': mr,
-                            'in-reply-to': irt,
-                            'body': body.decode('utf-8', errors='replace') if type(body) is bytes else body,
-                            'attachments': attachments
-                        }
-                        json_source = {
-                            'mid': mid2,
-                            'message-id': mid,
-                            'source': message.as_bytes().decode('utf-8', errors='replace')
-                        }
-                        ja.append(json)
-                        jas.append(json_source)
-                        if contents:
-                            iname = config.get("elasticsearch", "dbname")
-                            if not args.dry:
-                                for key in contents:
-                                    es.index(
-                                        index=iname,
-                                        doc_type="attachment",
-                                        id=key,
-                                        body = {
-                                            'source': contents[key]
-                                        }
-                                    )
-                        if len(ja) >= 40:
-                            if not args.dry:
-                                bulk = BulkThread()
-                                bulk.assign(ja, es, 'mbox')
-                                bulk.insert()
-                            ja = []
-                            
-                            if not args.dry:
-                                bulks = BulkThread()
-                                bulks.assign(jas, es, 'mbox_source')
-                                bulks.insert()
-                            jas = []
+
+                json, contents = foo.compute_updates(list_override, private, message)
+
+                if json:
+                    json_source = {
+                        'mid': json['mid'],
+                        'message-id': json['message-id'],
+                        'source': message.as_bytes().decode('utf-8', errors='replace')
+                    }
+
+                    count += 1
+                    ja.append(json)
+                    jas.append(json_source)
+                    if contents:
+                        iname = config.get("elasticsearch", "dbname")
+                        if not args.dry:
+                            for key in contents:
+                                es.index(
+                                    index=iname,
+                                    doc_type="attachment",
+                                    id=key,
+                                    body = {
+                                        'source': contents[key]
+                                    }
+                                )
+                    if len(ja) >= 40:
+                        if not args.dry:
+                            bulk = BulkThread()
+                            bulk.assign(ja, es, 'mbox')
+                            bulk.insert()
+                        ja = []
+                        
+                        if not args.dry:
+                            bulks = BulkThread()
+                            bulks.assign(jas, es, 'mbox_source')
+                            bulks.insert()
+                        jas = []
                 else:
                     baddies += 1
+
             if filebased:
                 print("Parsed %u records from %s" % (count, filename))
                 if dFile:
@@ -585,7 +360,7 @@ if args.source:
 if args.dir:
     maildir = args.dir
 if args.listid:
-    list_override = args.listid[0]
+    list_override = '<%s>' % args.listid[0].strip('<>')
 if args.project:
     project = args.project[0]
 if args.domain:


Mime
View raw message