ponymail-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From humbed...@apache.org
Subject [2/2] incubator-ponymail git commit: start work on a dedup feature
Date Sun, 17 Jul 2016 08:43:27 GMT
start work on a dedup feature

With --dedup, the importer will scan the DB for existing entries
with the same message-id and will not insert them again if found.
This could be used for re-importing after the ID generator has
changed, or in case of unicode bugs.


Project: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/commit/88c83da0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/tree/88c83da0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-ponymail/diff/88c83da0

Branch: refs/heads/master
Commit: 88c83da0616d91277baafd98c87e22f774b188ac
Parents: feb191e
Author: Daniel Gruno <humbedooh@apache.org>
Authored: Sun Jul 17 10:42:13 2016 +0200
Committer: Daniel Gruno <humbedooh@apache.org>
Committed: Sun Jul 17 10:42:13 2016 +0200

----------------------------------------------------------------------
 tools/import-mbox.py | 36 +++++++++++++++++++++++++++++++++++-
 1 file changed, 35 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-ponymail/blob/88c83da0/tools/import-mbox.py
----------------------------------------------------------------------
diff --git a/tools/import-mbox.py b/tools/import-mbox.py
index 5c62d20..59dbf95 100644
--- a/tools/import-mbox.py
+++ b/tools/import-mbox.py
@@ -74,6 +74,8 @@ iBody = None
 resendTo = None
 timeout = 600
 fromFilter = None
+dedup = False
+dedupped = 0
 
 # Fetch config
 config = configparser.RawConfigParser()
@@ -238,7 +240,8 @@ class SlurpThread(Thread):
 
             count = 0
             LEY = EY
-
+            
+            
             for message in messages:
                 # If --filter is set, discard any messages not matching by continuing to next email
                 if fromFilter and 'from' in message and message['from'].find(fromFilter) == -1:
@@ -261,6 +264,31 @@ class SlurpThread(Thread):
                     break
 
                 json, contents = foo.compute_updates(list_override, private, message)
+                
+                # If --dedup is active, try to filter out any messages that already exist
+                if json and dedup and message.get('message-id', None):
+                    res = es.search(
+                        index=iname,
+                        doc_type="mbox",
+                        size = 1,
+                        body = {
+                            'query': {
+                                'bool': {
+                                    'must': [
+                                        {
+                                            'term': {
+                                                'message-id': message.get('message-id', None)
+                                            }
+                                        }
+                                    ]
+                                }
+                            }
+                        }
+                    )
+                    if res and len(res['hits']['hits']) > 0:
+                        print("Dedupping %s" % json['message-id'])
+                        dedupped += 1
+                        continue
 
                 if json:
                     json_source = {
@@ -354,6 +382,8 @@ parser.add_argument('--html2text', dest='html2text', action='store_true',
                    help='If no text/plain is found, try to parse HTML using html2text')
 parser.add_argument('--requirelid', dest='requirelid', action='store_true',
                    help='Require a List ID to be present, ignore otherwise')
+parser.add_argument('--dedup', dest='dedup', action='store_true',
+                   help='Try to dedup messages based on ID before importing')
 parser.add_argument('--ignorebody', dest='ibody', type=str, nargs=1,
                    help='Optional email bodies to treat as empty (in conjunction with --html2text)')
 parser.add_argument('--resend', dest='resend', type=str, nargs=1,
@@ -387,6 +417,8 @@ if args.quick:
     quickmode = args.quick
 if args.private:
     private = args.private
+if args.dedup:
+    dedup = args.dedup
 if args.ext:
     extension = args.ext[0]
 if args.html2text:
@@ -593,3 +625,5 @@ for t in threads:
     t.join()
 
 print("All done! %u records inserted/updated after %u seconds. %u records were bad and ignored" % (y, int(time.time() - start), baddies))
+if dedupped > 0:
+    print("%u records were not inserted due to deduplication" % dedupped)


Mime
View raw message