incubator-allura-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From john...@apache.org
Subject git commit: [#6078] Chunk processing of commits when cleaning data during repo refresh to avoid BSON doc size limits
Date Thu, 11 Apr 2013 16:18:26 GMT
Updated Branches:
  refs/heads/cj/6078 [created] 674e5f5c8


[#6078] Chunk processing of commits when cleaning data during repo refresh to avoid BSON doc size limits

Signed-off-by: Cory Johns <cjohns@slashdotmedia.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/674e5f5c
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/674e5f5c
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/674e5f5c

Branch: refs/heads/cj/6078
Commit: 674e5f5c819e29884f5514bb3e8c8b37afb85788
Parents: e62d190
Author: Cory Johns <cjohns@slashdotmedia.com>
Authored: Thu Apr 11 16:18:12 2013 +0000
Committer: Cory Johns <cjohns@slashdotmedia.com>
Committed: Thu Apr 11 16:18:12 2013 +0000

----------------------------------------------------------------------
 Allura/allura/scripts/refreshrepo.py |   66 +++++++++++++++++------------
 1 files changed, 39 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/674e5f5c/Allura/allura/scripts/refreshrepo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/scripts/refreshrepo.py b/Allura/allura/scripts/refreshrepo.py
index 94e7425..6166c76 100644
--- a/Allura/allura/scripts/refreshrepo.py
+++ b/Allura/allura/scripts/refreshrepo.py
@@ -51,42 +51,54 @@ class RefreshRepo(ScriptTask):
                     if options.clean:
                         ci_ids = list(c.app.repo.all_commit_ids())
                         log.info("Deleting mongo data for %i commits...", len(ci_ids))
-                        tree_ids = [
-                                tree_id for doc in
-                                M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}},
-                                                       {"tree_ids": 1})
-                                for tree_id in doc.get("tree_ids", [])]
-
-                        i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i CommitDoc docs...", i)
-                        M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids}})
+                        # like the tree_ids themselves below, we need to process these in
+                        # chunks to avoid hitting the BSON max size limit
+                        tree_ids = []
+                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
+                            tree_ids.extend([
+                                    tree_id for doc in
+                                    M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}},
+                                                           {"tree_ids": 1})
+                                    for tree_id in doc.get("tree_ids", [])])
+
+                            i = M.repo.CommitDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i CommitDoc docs...", i)
+                                M.repo.CommitDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
 
                         # delete these in chunks, otherwise the query doc can
                         # exceed the max BSON size limit (16MB at the moment)
                         for tree_ids_chunk in chunked_list(tree_ids, 300000):
                             i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count()
-                            log.info("Deleting %i TreeDoc docs...", i)
-                            M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
+                            if i:
+                                log.info("Deleting %i TreeDoc docs...", i)
+                                M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
                         del tree_ids
 
                         # delete these after TreeDoc and LastCommitDoc so that if
                         # we crash, we don't lose the ability to delete those
-                        i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i TreesDoc docs...", i)
-                        M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
-
-                        # delete LastCommitDocs
-                        i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
-                        log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
-                        M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
-
-                        i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i DiffInfoDoc docs...", i)
-                        M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids}})
-
-                        i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids}}).count()
-                        log.info("Deleting %i CommitRunDoc docs...", i)
-                        M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids}})
+                        for ci_ids_chunk in chunked_list(ci_ids, 3000):
+                            # delete TreesDocs
+                            i = M.repo.TreesDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i TreesDoc docs...", i)
+                                M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+                            # delete LastCommitDocs
+                            i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids_chunk})).count()
+                            if i:
+                                log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
+                                M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids_chunk}))
+
+                            i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i DiffInfoDoc docs...", i)
+                                M.repo.DiffInfoDoc.m.remove({"_id": {"$in": ci_ids_chunk}})
+
+                            i = M.repo.CommitRunDoc.m.find({"commit_ids": {"$in": ci_ids_chunk}}).count()
+                            if i:
+                                log.info("Deleting %i CommitRunDoc docs...", i)
+                                M.repo.CommitRunDoc.m.remove({"commit_ids": {"$in": ci_ids_chunk}})
                         del ci_ids
 
                     try:


Mime
View raw message