allura-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From john...@apache.org
Subject git commit: Partial commit, WIP
Date Thu, 29 Nov 2012 02:54:45 GMT
Updated Branches:
  refs/heads/cj/4691 80c85e8e3 -> 465b05ee3


Partial commit, WIP


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/465b05ee
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/465b05ee
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/465b05ee

Branch: refs/heads/cj/4691
Commit: 465b05ee36e90704f55ff6b8fb5ddead81858ecd
Parents: 80c85e8
Author: Cory Johns <johnsca@geek.net>
Authored: Thu Nov 29 02:54:12 2012 +0000
Committer: Cory Johns <johnsca@geek.net>
Committed: Thu Nov 29 02:54:12 2012 +0000

----------------------------------------------------------------------
 Allura/allura/model/repo.py            |  100 ++++++++++---
 Allura/allura/model/repo_refresh.py    |  218 +++++++-------------------
 Allura/allura/tests/model/test_repo.py |   56 +++++++
 scripts/refresh-all-repos.py           |   11 +-
 scripts/refresh-last-commits.py        |  165 ++++++++++++++++++++
 5 files changed, 361 insertions(+), 189 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/465b05ee/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index bb28423..cfe39e1 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -405,19 +405,21 @@ class Tree(RepoObject):
             sha_obj.update(line)
         return sha_obj.hexdigest()
 
-    def __getitem__(self, name):
+    def __getitem__(self, name, cache=None):
+        if cache is None:
+            cache = ModelCache()
         obj = self.by_name[name]
         if obj['type'] == 'blob':
             return Blob(self, name, obj['id'])
-        obj = self.query.get(_id=obj['id'])
+        obj = cache.get(Tree, dict(_id=obj['id']))
         if obj is None:
             oid = self.repo.compute_tree_new(self.commit, self.path() + name + '/')
-            obj = self.query.get(_id=oid)
+            obj = cache.get(Tree, dict(_id=oid))
         if obj is None: raise KeyError, name
         obj.set_context(self, name)
         return obj
 
-    def get_obj_by_path(self, path):
+    def get_obj_by_path(self, path, cache=None):
         if hasattr(path, 'get'):
             path = path['new']
         if path.startswith('/'):
@@ -426,7 +428,7 @@ class Tree(RepoObject):
         obj = self
         for p in path:
             try:
-                obj = obj[p]
+                obj = obj.__getitem__(p, cache)
             except KeyError:
                 return None
         return obj
@@ -470,6 +472,7 @@ class Tree(RepoObject):
             )
         if last_commit:
             return sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name))
+        return []
         # otherwise, try old format
         old_style_results = self.ls_old()
         if old_style_results:
@@ -485,14 +488,14 @@ class Tree(RepoObject):
             re.escape(h.really_unicode(self.path()).encode('utf-8'))))
         lc_index = dict(
             (lc.name, lc.commit_info)
-            for lc in LastCommitDoc.m.find(dict(_id=id_re)))
+            for lc in LastCommitDoc_old.m.find(dict(_id=id_re)))
 
         # FIXME: Temporarily fall back to old, semi-broken lookup behavior until refresh is done
         oids = [ x.id for x in chain(self.tree_ids, self.blob_ids, self.other_ids) ]
         id_re = re.compile("^{0}:".format(self.repo._id))
         lc_index.update(dict(
             (lc.object_id, lc.commit_info)
-            for lc in LastCommitDoc.m.find(dict(_id=id_re, object_id={'$in': oids}))))
+            for lc in LastCommitDoc_old.m.find(dict(_id=id_re, object_id={'$in': oids}))))
         # /FIXME
 
         results = []
@@ -663,8 +666,11 @@ class Blob(object):
         return differ.get_opcodes()
 
 class LastCommit(RepoObject):
+    def __repr__(self):
+        return '<LastCommit /%s [%s]>' % (self.path, ',\n    '.join(self.commit_ids))
+
     @classmethod
-    def get(cls, tree):
+    def get(cls, tree, cache=None):
         '''Find the LastCommitDoc for the given tree.
 
         Climbs the commit tree until either:
@@ -677,29 +683,31 @@ class LastCommit(RepoObject):
            In this case, we know that the LCD hasn't been constructed for this
            (chain of) commit(s), and it will have to be built.
         '''
+        if cache is None:
+            cache = ModelCache()
         path = tree.path().strip('/')
         commit_ids = []
         for commit in tree.commit.climb_commit_tree():
-            last_commit = cls.query.get(
+            last_commit = cache.get(LastCommit, dict(
                     commit_ids=commit._id,
                     path=path,
-                )
+                ))
             if last_commit:
                 # found our LCD; add any traversed commits to it
                 if commit_ids:
                     last_commit.commit_ids.extend(commit_ids)
-                    session(last_commit).flush(last_commit)
                 return last_commit
             commit_ids.append(commit._id)
             if path in commit.changed_paths:
                 # tree was changed but no LCD found; have to build
                 tree = commit.tree
                 if path != '':
-                    tree = tree.get_obj_by_path(path)
-                return cls.build(tree, commit_ids)
+                    import ipdb; ipdb.set_trace()
+                    tree = tree.get_obj_by_path(path, cache)
+                return cls.build(tree, commit_ids, cache)
 
     @classmethod
-    def build(cls, tree, commit_ids=[]):
+    def build(cls, tree, commit_ids=[], cache=None):
         '''
           Build the LCD record, presuming that this tree is where it was most
           recently changed.
@@ -721,6 +729,8 @@ class LastCommit(RepoObject):
           the LCD info from a single call and if that turns out to be more efficient
           than walking up the tree.  It is unclear if those hold without testing.)
         '''
+        if cache is None:
+            cache = ModelCache()
         unfilled = set([n.name for n in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
         tree_nodes = set([n.name for n in tree.tree_ids])
         path = tree.path().strip('/')
@@ -729,28 +739,31 @@ class LastCommit(RepoObject):
                     path=path,
                     entries=[],
                 )
+        for commit_id in commit_ids:
+            cache.set(lcd, LastCommit, dict(commit_ids=commit_id, path=path))
         for commit in tree.commit.climb_commit_tree():
-            partial_lcd = cls.query.get(
+            partial_lcd = cache.get(LastCommit, dict(
                     commit_ids=commit._id,
                     path=path,
-                )
+                ))
             for name in list(unfilled):
-                if partial_lcd:
-                    # the partial LCD should contain anything we're missing
-                    lcd.entries.append(partial_lcd.entry_by_name(name))
-                    unfilled.remove(name)
-                elif os.path.join(path, name) in commit.changed_paths:
-                    # no partial LCD to finish us, but changed in this commit, so gather the data
+                if os.path.join(path, name) in commit.changed_paths:
+                    # changed in this commit, so gather the data
                     lcd.entries.append(dict(
                             type=name in tree_nodes and 'DIR' or 'BLOB',
                             name=name,
                             commit_info=commit.info,
                         ))
                     unfilled.remove(name)
+                elif partial_lcd:
+                    # the partial LCD should contain anything we're missing
+                    entry = partial_lcd.entry_by_name(name)
+                    assert entry
+                    lcd.entries.append(entry)
+                    unfilled.remove(name)
 
             if not unfilled:
                 break
-        session(lcd).flush()
         return lcd
 
     def entry_by_name(self, name):
@@ -762,3 +775,44 @@ class LastCommit(RepoObject):
 mapper(Commit, CommitDoc, repository_orm_session)
 mapper(Tree, TreeDoc, repository_orm_session)
 mapper(LastCommit, LastCommitDoc, repository_orm_session)
+
+
+class ModelCache(object):
+    def __init__(self):
+        self._cache = defaultdict(dict)
+
+    def _normalize_key(self, key):
+        _key = tuple(sorted(key.items(), key=lambda k: k[0]))
+        return _key
+
+    def get(self, cls, key):
+        _key = self._normalize_key(key)
+        if _key not in self._cache:
+            self._cache[cls][_key] = cls.query.get(**key)
+        return self._cache[cls][_key]
+
+    def set(self, cls, key, val):
+        self._cache[cls][self._normalize_key(key)] = val
+
+    def keys(self, cls):
+        '''
+        Returns all the cache keys for a given class.  Each
+        cache key will be a dict.
+        '''
+        if self._cache[cls]:
+            return [dict(k) for k in self._cache[cls].keys()]
+        return []
+
+    def batch_load(self, cls, query, attrs=None):
+        '''
+        Load multiple results given a query.
+
+        Optionally takes a list of attribute names to use
+        as the cache key.  If not given, uses the keys of
+        the given query.
+        '''
+        if attrs is None:
+            attrs = query.keys()
+        for result in cls.query.find(query):
+            keys = {a: result[a] for a in attrs}
+            self.set(cls, keys, result)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/465b05ee/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index 3fa41fc..7413a40 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -17,7 +17,7 @@ from allura.lib import utils
 from allura.lib import helpers as h
 from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
 from allura.model.repo import LastCommitDoc, CommitRunDoc
-from allura.model.repo import Commit
+from allura.model.repo import Commit, Tree, LastCommit
 from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
 
 log = logging.getLogger(__name__)
@@ -89,17 +89,24 @@ def refresh_repo(repo, all_commits=False, notify=True):
 
     # Compute diffs
     cache = {}
-    # Have to compute_diffs() for all commits to ensure that LastCommitDocs
-    # are set properly for forked repos. For some SCMs, compute_diffs()
-    # we don't want to pre-compute the diffs because that would be too
-    # expensive, so we skip them here and do them on-demand with caching.
+    # For some SCMs, we don't want to pre-compute the diffs because that
+    # would be too expensive, so we skip them here and do them on-demand
+    # with caching.
     if repo._refresh_precompute:
-        for i, oid in enumerate(reversed(all_commit_ids)):
+        for i, oid in enumerate(commit_ids):
             ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
             compute_diffs(repo._id, cache, ci)
             if (i+1) % 100 == 0:
                 log.info('Compute diffs %d: %s', (i+1), ci._id)
 
+    if repo._refresh_precompute:
+        for i, oid in enumerate(reversed(commit_ids)):
+            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+            compute_lcds(ci, cache)
+            if (i+1) % 100 == 0:
+                log.info('Compute last commit info %d: %s', (i+1), ci._id)
+
+
     log.info('Refresh complete for %s', repo.full_fs_path)
     g.post_event(
             'repo_refreshed',
@@ -349,7 +356,6 @@ def compute_diffs(repo_id, tree_cache, rhs_ci):
             dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
     # Set last commit data
     rhs_tree = tree_index[rhs_ci.tree_id]
-    #refresh_last_commit(repo_id, '/', rhs_tree, lhs_tree, None, commit_info)
     # Build the diffinfo
     di = DiffInfoDoc(dict(
             _id=rhs_ci._id,
@@ -421,18 +427,21 @@ def _diff_trees(lhs, rhs, index, *path):
         (o.name, o.id)
         for o in rhs.tree_ids)
     for o in lhs.tree_ids:
-        rhs_id = rhs_tree_ids.pop(o.name, None)
-        if rhs_id == o.id:
-            continue # no change
-        elif rhs_id is None:
+        rhs_id = rhs_tree_ids.pop(o.name, None)  # remove so won't be picked up as added, below
+        if rhs_id == o.id:  # no change
+            continue
+        elif rhs_id is None:  # removed
             yield (_fq(o.name), o.id, None)
-        else:
-            for difference in _diff_trees(
-                index[o.id], index[rhs_id], index,
-                o.name, *path):
-                yield difference
-    for name, id in rhs_tree_ids.items():
+            rhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
+        else:  # changed
+            rhs_tree = index[rhs_id]
+        for difference in _diff_trees(index[o.id], rhs_tree, index, o.name, *path):
+            yield difference
+    for name, id in rhs_tree_ids.items():  # added
         yield (_fq(name), None, id)
+        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
+        for difference in _diff_trees(lhs_tree, index[id], index, name, *path):
+            yield difference
     # DIff the blobs
     rhs_blob_ids = dict(
         (o.name, o.id)
@@ -463,53 +472,6 @@ def get_commit_info(commit):
         summary=commit.summary
         )
 
-def refresh_last_commit(repo_id, path, tree, lhs_tree, parent_tree, commit_info):
-    '''Build the LastCommit info.
-
-    We only need to create LastCommit info for objects that are in the
-    RHS but not in the LHS, because only those objects are only ones
-    who have had anything changed in them.  (If file x/y/z.txt changes,
-    then it's hash will change, which also forces the hash for tree x/y
-    to change, as well as the hash for tree x.  So as long as an object's
-    hash isn't in the LHS, it means it's new or modified in this commit.)
-
-    In order to uniquely identify the tree or blob that a LastCommitDoc is
-    for, the tree or blob hash is not sufficient; we also need to know
-    either it's full path name, or it's parent tree and name.  Because of
-    this, we have to walk down the commit tree.'''
-    if lhs_tree is not None and tree._id == lhs_tree._id:
-        # tree was not changed in this commit (nor was anything under it)
-        return
-
-    # map LHS entries for easy lookup
-    lhs_map = {}
-    if lhs_tree:
-        for lhs_child in chain(lhs_tree.tree_ids, lhs_tree.blob_ids, lhs_tree.other_ids):
-            lhs_map[lhs_child.name] = lhs_child.id
-
-    # update our children
-    for child in chain(tree.tree_ids, tree.blob_ids, tree.other_ids):
-        if child.id != lhs_map.get(child.name, None):  # check if changed in this commit
-            lc = set_last_commit(repo_id, path, child.name, child.id, commit_info)
-
-    # (re)curse at our child trees
-    for child_tree in tree.tree_ids:
-        child_name = child_tree.name
-        child_tree = TreeDoc.m.get(_id=child_tree.id)
-        lhs_child = None
-        if child_name in lhs_map:
-            lhs_child = TreeDoc.m.get(_id=lhs_map[child_name])
-        refresh_last_commit(repo_id, path + child_name + '/', child_tree, lhs_child, tree, commit_info)
-
-def set_last_commit(repo_id, path, name, oid, commit_info):
-    lc = LastCommitDoc(dict(
-            _id='%s:%s:%s' % (repo_id, path, name),
-            object_id=oid,
-            name=name,
-            commit_info=commit_info))
-    lc.m.save(safe=False, upsert=True)
-    return lc
-
 def last_known_commit_id(all_commit_ids, new_commit_ids):
     """
     Return the newest "known" (cached in mongo) commit id.
@@ -524,100 +486,38 @@ def last_known_commit_id(all_commit_ids, new_commit_ids):
     if not new_commit_ids: return all_commit_ids[-1]
     return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
 
-def build_last_commit_doc(tree):
+
+def compute_lcds(commit, cache):
     '''
-    We need a LCD for this tree, for which there are two possibilities:
-
-      1) This tree was modified in an ancestor commit but the commit ID chain
-         was not filled in.  In this case, as we walk back up the tree, we'll
-         find a LCD record when or before we find the commit in which this tree
-         was changed.  If we find this, we save the new commit IDs in the LCD
-         record for faster access in the future.
-
-      2) The LCD record for the commit in which this tree was changed does not
-         exist.  We'll find the commit and still not have a LCD record, which
-         means we have to construct it.  The LCD record will only contain the
-         commit IDs up to the commit where the tree was most recently changed.
-
-         Constructing it differs for SVN and Git / Hg.  SVN can pull all the info
-         from a single SVN call.  Git / Hg have to walk up the tree.  (SVN could
-         walk up the tree as well, except that the TreesDoc and DiffInfoDoc
-         records are not correctly populated, making it hard to tell when a tree
-         was changed in a given commit, plus it's unnecessary.)
-
-         To walk up the tree, we have to keep track of which entries we still
-         need info about.  At each step of the walk, we check the following:
-
-           1) If the current tree has a LCD record, we can pull all the remaining
-              info we need from it, and we're done.
-
-           2) If the tree was modified in this commit, then we pull the info for
-              all changed entries, then making the parent tree the new active
-              tree and continuing the walk.  Once we have data for all entries,
-              we're done.
+    Compute LastCommit data for every Tree node under this tree.
     '''
-    unfilled = set([n.name for n in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
-    tree_nodes = set([n.name for n in tree.tree_ids])
-    entires = []
-    commit_ids = []
-    commit = tree.commit
-    path = tree.path().strip('/')
-    lcd = None
-    has_changes = False
-    while unfilled:
-        last_commit = LastCommitDoc.m.get(
-                commit_ids=commit._id,
-                path=path,
-            )
-        if not has_changes:
-            # no changes found yet, so look to see if we have a matching LCD
-            # that just doesn't have all the commit_ids filled in
-            if last_commit:
-                # found a complete LCD for our tree
-                last_commit.commit_ids.extend(commit_ids)
-                last_commit.m.save()
-                return last_commit
-            # LCD is only valid for the most recent commit
-            # that changed the tree, so once we have changes,
-            # stop recording commit_ids
-            commit_ids.append(commit._id)
-
-        # look for changes to the tree in this commit,
-        # meaning our LCD is missing and must be built
-        diff_info = DiffInfoDoc.m.get(_id=commit._id)
-        diffs = set()
-        for d in diff_info.differences:
-            diffs.add(d.name)
-            node_path = os.path.dirname(d.name)
-            while node_path:
-                diffs.add(node_path)
-                node_path = os.path.dirname(node_path)
-        import ipdb; ipdb.set_trace()
-        for name in list(unfilled):
-            full_name = path + '/' + name
-            if full_name in diffs:
-                has_changes = True
-                if lcd is None:
-                    lcd = LastCommitDoc(dict(
-                                commit_ids=commit_ids,
-                                path=path,
-                                entries=dict(),
-                            ))
-                lcd.entries[name] = dict(
-                        type=name in tree_nodes and 'DIR' or 'BLOB',
-                        name=name,
-                        commit_info=get_commit_info(commit),
-                    )
-                unfilled.remove(name)
-
-        # if we have changes but this commit has an LCD for our
-        # path, it should have all the remaining info we need
-        if last_commit:
-            for name in list(unfilled):
-                lcd.entries[name] = last_commit.entries[name]
-                unfilled.remove(name)
-
-        # walk up the tree
-        commit = commit.get_parent()
-    lcd.m.save()
-    return lcd
+    trees = TreesDoc.m.get(_id=commit._id)
+    assert trees, 'Missing TreesDoc for %s' % commit
+    _update_tree_cache(trees.tree_ids, cache)
+    for tree in _walk_commit_tree(commit, cache):
+        lcd = LastCommit.get(tree, cache)  # auto-vivify LCD
+
+def _walk_commit_tree(commit, cache):
+    def _walk_tree(tree):
+        yield tree
+        for x in tree.tree_ids:
+            sub_tree = cache.get(Tree, dict(_id=x.id))
+            sub_tree.set_context(tree, x.name)
+            for xx in _walk_tree(sub_tree):
+                yield xx
+    top_tree = cache.get(Tree, dict(_id=commit.tree_id))
+    top_tree.set_context(commit)
+    return _walk_tree(top_tree)
+
+def _update_tree_cache(tree_ids, cache):
+    current_ids = set(tree_ids)
+    cached_ids = set([k['_id'] for k in cache.keys(Tree)])
+    new_ids = current_ids - all_ids
+    cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})
+    return
+
+    unused_ids = all_ids - current_ids
+    for _id in unused_ids:
+        tree_cache.pop(_id)
+    new_trees = Tree.query.find({'_id': {'$in': list(new_ids)}})
+    tree_cache.update({t._id: t for t in new_trees})

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/465b05ee/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 7450632..b1c7091 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -516,3 +516,59 @@ class TestLastCommit(unittest.TestCase):
                 id=commit4._id,
                 shortlink=self.repo.shorthand_for_commit(commit4._id),
             )))
+
+
+class TestModelCache(unittest.TestCase):
+    def setUp(self):
+        self.cache = M.repo.ModelCache()
+
+    def test_normalize_key(self):
+        self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+
+    @mock.patch.object(M.repo.Tree.query, 'get')
+    @mock.patch.object(M.repo.LastCommit.query, 'get')
+    def test_get(self, lc_get, tr_get):
+        tr_get.return_value = 'bar'
+        lc_get.return_value = 'qux'
+
+        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
+        tr_get.assert_called_with(_id='foo')
+        self.assertEqual(val, 'bar')
+
+        val = self.cache.get(M.repo.LastCommit, {'_id': 'foo'})
+        lc_get.assert_called_with(_id='foo')
+        self.assertEqual(val, 'qux')
+
+    def test_set(self):
+        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set')
+        self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}})
+
+    def test_keys(self):
+        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
+        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
+        self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
+        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+
+    @mock.patch.object(M.repo.Tree.query, 'find')
+    def test_batch_load(self, tr_find):
+        # cls, query, attrs
+        tr_find.return_value = [{'foo': 1, 'qux': 3}, {'foo': 2, 'qux': 5}]
+
+        self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}})
+        tr_find.assert_called_with({'foo': {'$in': 'bar'}})
+        self.assertEqual(self.cache._cache[M.repo.Tree], {
+                (('foo', 1),): {'foo': 1, 'qux': 3},
+                (('foo', 2),): {'foo': 2, 'qux': 5},
+            })
+
+    @mock.patch.object(M.repo.Tree.query, 'find')
+    def test_batch_load_attrs(self, tr_find):
+        # cls, query, attrs
+        tr_find.return_value = [{'foo': 1, 'qux': 3}, {'foo': 2, 'qux': 5}]
+
+        self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux'])
+        tr_find.assert_called_with({'foo': {'$in': 'bar'}})
+        self.assertEqual(self.cache._cache[M.repo.Tree], {
+                (('qux', 3),): {'foo': 1, 'qux': 3},
+                (('qux', 5),): {'foo': 2, 'qux': 5},
+            })

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/465b05ee/scripts/refresh-all-repos.py
----------------------------------------------------------------------
diff --git a/scripts/refresh-all-repos.py b/scripts/refresh-all-repos.py
index 822148f..1cf7e3d 100644
--- a/scripts/refresh-all-repos.py
+++ b/scripts/refresh-all-repos.py
@@ -32,6 +32,7 @@ def main(options):
         M.repo.TreesDoc.m.remove({})
         M.repo.DiffInfoDoc.m.remove({})
         M.repo.CommitRunDoc.m.remove({})
+        M.repo.LastCommitDoc.m.remove({})
 
     for chunk in chunked_find(M.Project, q_project):
         for p in chunk:
@@ -72,9 +73,6 @@ def main(options):
                         i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count()
                         log.info("Deleting %i TreeDoc docs...", i)
                         M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
-                        i = M.repo.LastCommitDoc.m.find({"object_id": {"$in": tree_ids_chunk}}).count()
-                        log.info("Deleting %i LastCommitDoc docs...", i)
-                        M.repo.LastCommitDoc.m.remove({"object_id": {"$in": tree_ids_chunk}})
                     del tree_ids
 
                     # delete these after TreeDoc and LastCommitDoc so that if
@@ -83,11 +81,10 @@ def main(options):
                     log.info("Deleting %i TreesDoc docs...", i)
                     M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
 
-                    # delete LastCommitDocs for non-trees
-                    repo_lastcommit_re = re.compile("^{}:".format(c.app.repo._id))
-                    i = M.repo.LastCommitDoc.m.find(dict(_id=repo_lastcommit_re)).count()
+                    # delete LastCommitDocs
+                    i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
                     log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
-                    M.repo.LastCommitDoc.m.remove(dict(_id=repo_lastcommit_re))
+                    M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
 
                     i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
                     log.info("Deleting %i DiffInfoDoc docs...", i)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/465b05ee/scripts/refresh-last-commits.py
----------------------------------------------------------------------
diff --git a/scripts/refresh-last-commits.py b/scripts/refresh-last-commits.py
new file mode 100644
index 0000000..38dbb58
--- /dev/null
+++ b/scripts/refresh-last-commits.py
@@ -0,0 +1,165 @@
+import argparse
+import logging
+import re
+from datetime import datetime
+from contextlib import contextmanager
+
+import faulthandler
+from pylons import c
+from ming.orm import ThreadLocalORMSession
+
+from allura import model as M
+from allura.lib.utils import chunked_find, chunked_list
+
+log = logging.getLogger(__name__)
+
+
+def main(options):
+    q_project = {}
+    if options.nbhd:
+        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
+        if not nbhd:
+            return "Invalid neighborhood url prefix."
+        q_project['neighborhood_id'] = nbhd._id
+    if options.project:
+        q_project['shortname'] = options.project
+    elif options.project_regex:
+        q_project['shortname'] = {'$regex': options.project_regex}
+
+    log.info('Refreshing last commit data')
+
+    for chunk in chunked_find(M.Project, q_project):
+        for p in chunk:
+            log.info("Refreshing last commit data for project '%s'." % p.shortname)
+            if options.dry_run:
+                continue
+            c.project = p
+            if options.mount_point:
+                mount_points = [options.mount_point]
+            else:
+                mount_points = [ac.options.mount_point for ac in
+                                M.AppConfig.query.find(dict(project_id=p._id))]
+            for app in (p.app_instance(mp) for mp in mount_points):
+                c.app = app
+                if not hasattr(app, 'repo'):
+                    continue
+                if c.app.repo.tool.lower() not in options.repo_types:
+                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
+                            c.app.repo.tool.lower())
+                    continue
+
+                ci_ids = list(reversed(list(c.app.repo.all_commit_ids())))
+                #ci_ids = list(c.app.repo.all_commit_ids())
+                if options.clean:
+                    if options.diffs:
+                        # delete DiffInfoDocs
+                        i = M.repo.DiffInfoDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
+                        log.info("Deleting %i DiffInfoDoc docs, by repo id...", i)
+                        M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
+
+                    # delete LastCommitDocs
+                    i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
+                    log.info("Deleting %i LastCommitDoc docs, by repo id...", i)
+                    M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
+
+                try:
+                    log.info('Refreshing all last commits in %r', c.app.repo)
+                    if options.profile:
+                        import cProfile
+                        cProfile.runctx('refresh_repo_lcds(ci_ids, options)',
+                                globals(), locals(), '/tmp/refresh_lcds.profile')
+                    else:
+                        refresh_repo_lcds(ci_ids, options)
+                except:
+                    log.exception('Error refreshing %r', c.app.repo)
+                    raise
+        ThreadLocalORMSession.flush_all()
+        ThreadLocalORMSession.close_all()
+
+
+def refresh_repo_lcds(commit_ids, options):
+    tree_cache = {}
+    timings = []
+    if options.diffs:
+        print 'Processing diffs'
+        for commit_id in commit_ids:
+            commit = M.repo.Commit.query.get(_id=commit_id)
+            with time(timings):
+                M.repo_refresh.compute_diffs(c.app.repo._id, tree_cache, commit)
+            if len(timings) % 1000 == 0:
+                mt = max(timings)
+                tt = sum(timings)
+                at = tt / len(timings)
+                print '  Processed %d commits (max: %f, avg: %f, tot: %f, cl: %d)' % (
+                        len(timings), mt, at, tt, len(tree_cache))
+    lcd_cache = {}
+    timings = []
+    print 'Processing last commits'
+    for commit_id in commit_ids:
+        commit = M.repo.Commit.query.get(_id=commit_id)
+        with time(timings):
+            M.repo_refresh.compute_lcds(commit, tree_cache, lcd_cache)
+        if len(timings) % 100 == 0:
+            mt = max(timings)
+            tt = sum(timings)
+            at = tt / len(timings)
+            print '  Processed %d commits (max: %f, avg: %f, tot: %f, tcl: %d, lcl: %d)' % (
+                    len(timings), mt, at, tt, len(tree_cache), len(lcd_cache))
+        if len(timings) == 1100:
+            break
+
+
+@contextmanager
+def time(timings):
+    s = datetime.now()
+    yield
+    timings.append((datetime.now() - s).total_seconds())
+
+
+def repo_type_list(s):
+    repo_types = []
+    for repo_type in s.split(','):
+        repo_type = repo_type.strip()
+        if repo_type not in ['svn', 'git', 'hg']:
+            raise argparse.ArgumentTypeError(
+                    '{} is not a valid repo type.'.format(repo_type))
+        repo_types.append(repo_type)
+    return repo_types
+
+
+def parse_options():
+    parser = argparse.ArgumentParser(description='Using existing commit data, '
+            'refresh the last commit metadata in MongoDB. Run for all repos (no args), '
+            'or restrict by neighborhood, project, or code tool mount point.')
+    parser.add_argument('--nbhd', action='store', default='', dest='nbhd',
+            help='Restrict update to a particular neighborhood, e.g. /p/.')
+    parser.add_argument('--project', action='store', default='', dest='project',
+            help='Restrict update to a particular project. To specify a '
+            'subproject, use a slash: project/subproject.')
+    parser.add_argument('--project-regex', action='store', default='',
+            dest='project_regex',
+            help='Restrict update to projects for which the shortname matches '
+            'the provided regex.')
+    parser.add_argument('--repo-types', action='store', type=repo_type_list,
+            default=['svn', 'git', 'hg'], dest='repo_types',
+            help='Only refresh last commits for repos of the given type(s). Defaults to: '
+            'svn,git,hg. Example: --repo-types=git,hg')
+    parser.add_argument('--mount_point', default='', dest='mount_point',
+            help='Restrict update to repos at the given tool mount point. ')
+    parser.add_argument('--clean', action='store_true', dest='clean',
+            default=False, help='Remove last commit mongo docs for '
+            'project(s) being refreshed before doing the refresh.')
+    parser.add_argument('--dry-run', action='store_true', dest='dry_run',
+            default=False, help='Log names of projects that would have their '
+            'last commits refreshed, but do not perform the actual refresh.')
+    parser.add_argument('--profile', action='store_true', dest='profile',
+            default=False, help='Enable the profiler (slow). Will log '
+            'profiling output to ./refresh.profile')
+    parser.add_argument('--diffs', action='store_true', dest='diffs',
+            default=False, help='Refresh diffs as well as LCDs')
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    import sys
+    faulthandler.enable()
+    sys.exit(main(parse_options()))


Mime
View raw message