incubator-allura-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From john...@apache.org
Subject [23/50] git commit: [#4691] New implementation of LastCommit info
Date Thu, 10 Jan 2013 18:08:21 GMT
[#4691] New implementation of LastCommit info


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/20bad544
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/20bad544
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/20bad544

Branch: refs/heads/cj/4691
Commit: 20bad544f859915a920db24ceb76369cf9dab83f
Parents: c2087a2
Author: Cory Johns <johnsca@geek.net>
Authored: Wed Oct 10 02:28:07 2012 +0000
Committer: Cory Johns <johnsca@geek.net>
Committed: Thu Jan 10 16:27:08 2013 +0000

----------------------------------------------------------------------
 Allura/allura/model/repo.py            |  358 +++++++++++++++++-
 Allura/allura/model/repo_refresh.py    |  126 +++----
 Allura/allura/model/repository.py      |   15 -
 Allura/allura/tests/model/test_repo.py |  544 +++++++++++++++++++++++++++
 ForgeSVN/forgesvn/model/svn.py         |   33 ++-
 scripts/refresh-all-repos.py           |   11 +-
 scripts/refresh-last-commits.py        |  172 +++++++++
 7 files changed, 1149 insertions(+), 110 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index b46f3fa..2bc22a9 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -11,7 +11,7 @@ from difflib import SequenceMatcher, unified_diff
 from pylons import c
 import pymongo.errors
 
-from ming import Field, collection
+from ming import Field, collection, Index
 from ming import schema as S
 from ming.base import Object
 from ming.utils import LazyProperty
@@ -61,9 +61,7 @@ TreeDoc = collection(
     Field('blob_ids', [dict(name=str, id=str)]),
     Field('other_ids', [dict(name=str, id=str, type=SObjType)]))
 
-# Information about the last commit to touch a tree/blob
-# LastCommitDoc.object_id = TreeDoc._id
-LastCommitDoc = collection(
+LastCommitDoc_old = collection(
     'repo_last_commit', project_doc_session,
     Field('_id', str),
     Field('object_id', str, index=True),
@@ -77,6 +75,25 @@ LastCommitDoc = collection(
         shortlink=str,
         summary=str)))
 
+# Information about the last commit to touch a tree
+LastCommitDoc = collection(
+    'repo_last_commit', main_doc_session,
+    Field('_id', S.ObjectId()),
+    Field('commit_ids', [str]),
+    Field('path', str),
+    Index('commit_ids', 'path'),
+    Field('entries', [dict(
+        type=str,
+        name=str,
+        commit_info=dict(
+            id=str,
+            date=datetime,
+            author=str,
+            author_email=str,
+            author_url=str,
+            shortlink=str,
+            summary=str))]))
+
 # List of all trees contained within a commit
 # TreesDoc._id = CommitDoc._id
 # TreesDoc.tree_ids = [ TreeDoc._id, ... ]
@@ -160,7 +177,8 @@ class Commit(RepoObject):
             self.tree_id = self.repo.compute_tree_new(self)
         if self.tree_id is None:
             return None
-        t = Tree.query.get(_id=self.tree_id)
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        t = cache.get(Tree, dict(_id=self.tree_id))
         if t is None:
             self.tree_id = self.repo.compute_tree_new(self)
             t = Tree.query.get(_id=self.tree_id)
@@ -182,13 +200,29 @@ class Commit(RepoObject):
     def symbolic_ids(self):
         return self.repo.symbolics_for_commit(self)
 
-    def parent(self, index=0):
-        ci = None
-        if self.parent_ids:
-            ci = self.query.get(_id=self.parent_ids[index])
-        if ci:
+    def get_parent(self, index=0):
+        '''Get the parent of this commit.
+
+        If there is no parent commit, or if an invalid index is given,
+        returns None.
+        '''
+        try:
+            cache = getattr(c, 'model_cache', '') or ModelCache()
+            ci = cache.get(Commit, dict(_id=self.parent_ids[index]))
             ci.set_context(self.repo)
-        return ci
+            return ci
+        except IndexError as e:
+            return None
+
+    def climb_commit_tree(self):
+        '''
+        Returns a generator that walks up the commit tree along
+        the first-parent ancestry, starting with this commit.'''
+        yield self
+        ancestor = self.get_parent()
+        while ancestor:
+            yield ancestor
+            ancestor = ancestor.get_parent()
 
     def url(self):
         if self.repo is None: self.repo = self.guess_repo()
@@ -293,7 +327,7 @@ class Commit(RepoObject):
         if not removed:
             return []
         copied = []
-        prev_commit = self.parent()
+        prev_commit = self.get_parent()
         for removed_name in removed[:]:
             removed_blob = prev_commit.tree.get_obj_by_path(removed_name)
             rename_info = None
@@ -316,6 +350,43 @@ class Commit(RepoObject):
             cur = cur[part]
         return cur
 
+    @LazyProperty
+    def changed_paths(self):
+        '''
+        Returns a list of paths changed in this commit.
+        Leading and trailing slashes are removed, and
+        the list is complete, meaning that if a sub-path
+        is changed, all of the parent paths are included
+        (including '' to represent the root path).
+
+        Example:
+
+            If the file /foo/bar is changed in the commit,
+            this would return ['', 'foo', 'foo/bar']
+        '''
+        diff_info = DiffInfoDoc.m.get(_id=self._id)
+        diffs = set()
+        for d in diff_info.differences:
+            diffs.add(d.name.strip('/'))
+            node_path = os.path.dirname(d.name)
+            while node_path:
+                diffs.add(node_path)
+                node_path = os.path.dirname(node_path)
+            diffs.add('')  # include '/' if there are any changes
+        return diffs
+
+    @LazyProperty
+    def info(self):
+        return dict(
+            id=self._id,
+            author=self.authored.name,
+            author_email=self.authored.email,
+            date=self.authored.date,
+            author_url=self.author_url,
+            shortlink=self.shorthand_id(),
+            summary=self.summary
+            )
+
 class Tree(RepoObject):
     # Ephemeral attrs
     repo=None
@@ -337,13 +408,14 @@ class Tree(RepoObject):
         return sha_obj.hexdigest()
 
     def __getitem__(self, name):
+        cache = getattr(c, 'model_cache', '') or ModelCache()
         obj = self.by_name[name]
         if obj['type'] == 'blob':
             return Blob(self, name, obj['id'])
-        obj = self.query.get(_id=obj['id'])
+        obj = cache.get(Tree, dict(_id=obj['id']))
         if obj is None:
             oid = self.repo.compute_tree_new(self.commit, self.path() + name + '/')
-            obj = self.query.get(_id=oid)
+            obj = cache.get(Tree, dict(_id=oid))
         if obj is None: raise KeyError, name
         obj.set_context(self, name)
         return obj
@@ -386,22 +458,71 @@ class Tree(RepoObject):
         return None, None
 
     def ls(self):
+        '''
+        List the entries in this tree, with historical commit info for
+        each node.  Eventually, ls_old can be removed and this can be
+        replaced with the following:
+
+            last_commit = LastCommit.get(self)
+            return sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name))
+        '''
+        # look for existing new format first
+        last_commit = LastCommit.query.get(
+                commit_ids=self.commit._id,
+                path=self.path().strip('/'),
+            )
+        if last_commit:
+            sorted_entries = sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name))
+            mapped_entries = [self._dirent_map(e) for e in sorted_entries]
+            return mapped_entries
+        # otherwise, try old format
+        old_style_results = self.ls_old()
+        if old_style_results:
+            return old_style_results
+        # finally, use the new implementation that auto-vivifies
+        last_commit = LastCommit.get(self)
+        sorted_entries = sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name))
+        mapped_entries = [self._dirent_map(e) for e in sorted_entries]
+        return mapped_entries
+
+    def _dirent_map(self, dirent):
+        return dict(
+                kind=dirent.type,
+                name=dirent.name,
+                href=dirent.name + '/',
+                last_commit=dict(
+                        author=dirent.commit_info.author,
+                        author_email=dirent.commit_info.author_email,
+                        author_url=dirent.commit_info.author_url,
+                        date=dirent.commit_info.date,
+                        href=self.repo.url_for_commit(dirent.commit_info['id']),
+                        shortlink=dirent.commit_info.shortlink,
+                        summary=dirent.commit_info.summary,
+                    ),
+            )
+
+    def ls_old(self):
         # Load last commit info
         id_re = re.compile("^{0}:{1}:".format(
             self.repo._id,
             re.escape(h.really_unicode(self.path()).encode('utf-8'))))
         lc_index = dict(
             (lc.name, lc.commit_info)
-            for lc in LastCommitDoc.m.find(dict(_id=id_re)))
+            for lc in LastCommitDoc_old.m.find(dict(_id=id_re)))
 
         # FIXME: Temporarily fall back to old, semi-broken lookup behavior until refresh is done
         oids = [ x.id for x in chain(self.tree_ids, self.blob_ids, self.other_ids) ]
         id_re = re.compile("^{0}:".format(self.repo._id))
         lc_index.update(dict(
             (lc.object_id, lc.commit_info)
-            for lc in LastCommitDoc.m.find(dict(_id=id_re, object_id={'$in': oids}))))
+            for lc in LastCommitDoc_old.m.find(dict(_id=id_re, object_id={'$in': oids}))))
         # /FIXME
 
+        if not lc_index:
+            # allow fallback to new method instead
+            # of showing a bunch of Nones
+            return []
+
         results = []
         def _get_last_commit(name, oid):
             lc = lc_index.get(name, lc_index.get(oid, None))
@@ -569,5 +690,210 @@ class Blob(object):
         differ = SequenceMatcher(v0, v1)
         return differ.get_opcodes()
 
+class LastCommit(RepoObject):
+    def __repr__(self):
+        return '<LastCommit /%s [%s]>' % (self.path, ',\n    '.join(self.commit_ids))
+
+    @classmethod
+    def get(cls, tree):
+        '''Find the LastCommitDoc for the given tree.
+
+        Climbs the commit tree until either:
+
+        1) An LCD is found for the given tree.  (If the LCD was not found for the
+           tree's commit, the commits traversed while searching for it are
+           added to the LCD for faster retrieval in the future.)
+
+        2) The commit in which the tree was most recently modified is found.
+           In this case, we know that the LCD hasn't been constructed for this
+           (chain of) commit(s), and it will have to be built.
+        '''
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        path = tree.path().strip('/')
+        commit_ids = []
+        cache._get_calls += 1
+        gw = 0
+        for commit in tree.commit.climb_commit_tree():
+            last_commit = cache.get(LastCommit, dict(
+                    commit_ids=commit._id,
+                    path=path,
+                ))
+            if last_commit:
+                cache._get_hits += 1
+                # found our LCD; add any traversed commits to it
+                if commit_ids:
+                    last_commit.commit_ids.extend(commit_ids)
+                    for commit_id in commit_ids:
+                        cache.set(LastCommit, dict(commit_ids=commit_id, path=path), last_commit)
+                return last_commit
+            commit_ids.append(commit._id)
+            if path in commit.changed_paths:
+                cache._get_misses += 1
+                # tree was changed but no LCD found; have to build
+                tree = commit.tree
+                if path != '':
+                    tree = tree.get_obj_by_path(path)
+                return cls.build(tree, commit_ids)
+            cache._get_walks += 1
+            gw += 1
+            cache._get_walks_max = max(cache._get_walks_max, gw)
+
+    @classmethod
+    def build(cls, tree, commit_ids=[]):
+        '''
+          Build the LCD record, presuming that this tree is where it was most
+          recently changed.
+
+          To build the LCD, we climb the commit tree, keeping track of which
+          entries we still need info about.  (For multi-parent commits, it
+          doesn't matter which parent we follow because those would be merge
+          commits and ought to have the diff info populated for any file
+          touched by the merge.)  At each step of the walk, we check the following:
+
+            1) If the current tree has an LCD record, we can pull all the remaining
+               info we need from it, and we're done.
+
+            2) If the tree was modified in this commit, then we pull the info for
+               all changed entries, then continue up the tree.  Once we have data
+               for all entries, we're done.
+
+          (It may be possible to optimize this for SVN, if SVN can return all of
+          the LCD info from a single call and if that turns out to be more efficient
+          than walking up the tree.  It is unclear if those hold without testing.)
+        '''
+        cache = getattr(c, 'model_cache', '') or ModelCache()
+        unfilled = set([n.name for n in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)])
+        tree_nodes = set([n.name for n in tree.tree_ids])
+        path = tree.path().strip('/')
+        lcd = cls(
+                    commit_ids=commit_ids,
+                    path=path,
+                    entries=[],
+                )
+        cache._build_calls += 1
+        bw = 0
+        for commit in tree.commit.climb_commit_tree():
+            partial_lcd = cache.get(LastCommit, dict(
+                    commit_ids=commit._id,
+                    path=path,
+                ))
+            for name in list(unfilled):
+                if os.path.join(path, name) in commit.changed_paths:
+                    # changed in this commit, so gather the data
+                    lcd.entries.append(dict(
+                            type=name in tree_nodes and 'DIR' or 'BLOB',
+                            name=name,
+                            commit_info=commit.info,
+                        ))
+                    unfilled.remove(name)
+                elif partial_lcd:
+                    # the partial LCD should contain anything we're missing
+                    entry = partial_lcd.entry_by_name(name)
+                    assert entry
+                    lcd.entries.append(entry)
+                    unfilled.remove(name)
+
+            if not unfilled:
+                break
+            cache._build_walks += 1
+            bw += 1
+            cache._build_walks_max = max(cache._build_walks_max, bw)
+        for commit_id in commit_ids:
+            cache.set(LastCommit, dict(commit_ids=commit_id, path=path), lcd)
+        return lcd
+
+    def entry_by_name(self, name):
+        for entry in self.entries:
+            if entry.name == name:
+                return entry
+        return None
+
 mapper(Commit, CommitDoc, repository_orm_session)
 mapper(Tree, TreeDoc, repository_orm_session)
+mapper(LastCommit, LastCommitDoc, repository_orm_session)
+
+
+class ModelCache(object):
+    '''
+    Cache model instances based on query params passed to get.
+    '''
+    def __init__(self, max_size=2000):
+        '''
+        The max_size of the cache is tracked separately for
+        each model class stored.  I.e., you can have 2000
+        Commit instances and 2000 Tree instances in the cache
+        at once with the default value.
+        '''
+        self._cache = defaultdict(dict)
+        self.max_size = max_size
+        self._insertion_order = defaultdict(list)
+        # temporary, for performance testing
+        self._hits = 0
+        self._misses = 0
+        self._get_calls = 0
+        self._get_walks = 0
+        self._get_walks_max = 0
+        self._get_hits = 0
+        self._get_misses = 0
+        self._build_calls = 0
+        self._build_walks = 0
+        self._build_walks_max = 0
+
+    def _normalize_key(self, key):
+        _key = key
+        if not isinstance(_key, tuple):
+            _key = tuple(sorted(_key.items(), key=lambda k: k[0]))
+        return _key
+
+    def get(self, cls, key):
+        _key = self._normalize_key(key)
+        if _key not in self._cache[cls]:
+            self._misses += 1
+            query = getattr(cls, 'query', getattr(cls, 'm', None))
+            self.set(cls, _key, query.get(**key))
+        else:
+            self._hits += 1
+        return self._cache[cls][_key]
+
+    def set(self, cls, key, val):
+        _key = self._normalize_key(key)
+        self._manage_cache(cls, _key)
+        self._cache[cls][_key] = val
+
+    def _manage_cache(self, cls, key):
+        '''
+        Keep track of insertion order, prevent duplicates,
+        and expire from the cache in a FIFO manner.
+        '''
+        if key in self._cache[cls]:
+            return
+        self._insertion_order[cls].append(key)
+        if len(self._insertion_order[cls]) > self.max_size:
+            _key = self._insertion_order[cls].pop(0)
+            self._cache[cls].pop(_key)
+
+    def size(self):
+        return sum([len(c) for c in self._insertion_order.values()])
+
+    def keys(self, cls):
+        '''
+        Returns all the cache keys for a given class.  Each
+        cache key will be a dict.
+        '''
+        if self._cache[cls]:
+            return [dict(k) for k in self._cache[cls].keys()]
+        return []
+
+    def batch_load(self, cls, query, attrs=None):
+        '''
+        Load multiple results given a query.
+
+        Optionally takes a list of attribute names to use
+        as the cache key.  If not given, uses the keys of
+        the given query.
+        '''
+        if attrs is None:
+            attrs = query.keys()
+        for result in cls.query.find(query):
+            keys = {a: getattr(result, a) for a in attrs}
+            self.set(cls, keys, result)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index 6e0db59..149fcae 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -2,12 +2,13 @@ import logging
 from itertools import chain
 from cPickle import dumps
 import re
+import os
 
 import bson
 
 import tg
 
-from pylons import g
+from pylons import g,c
 
 from ming.base import Object
 from ming.orm import mapper, session
@@ -16,7 +17,7 @@ from allura.lib import utils
 from allura.lib import helpers as h
 from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc
 from allura.model.repo import LastCommitDoc, CommitRunDoc
-from allura.model.repo import Commit
+from allura.model.repo import Commit, Tree, LastCommit, ModelCache
 from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc
 
 log = logging.getLogger(__name__)
@@ -88,17 +89,25 @@ def refresh_repo(repo, all_commits=False, notify=True):
 
     # Compute diffs
     cache = {}
-    # Have to compute_diffs() for all commits to ensure that LastCommitDocs
-    # are set properly for forked repos. For some SCMs, compute_diffs()
-    # we don't want to pre-compute the diffs because that would be too
-    # expensive, so we skip them here and do them on-demand with caching.
+    # For some SCMs, we don't want to pre-compute the diffs because that
+    # would be too expensive, so we skip them here and do them on-demand
+    # with caching.
     if repo._refresh_precompute:
-        for i, oid in enumerate(reversed(all_commit_ids)):
+        for i, oid in enumerate(commit_ids):
             ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
             compute_diffs(repo._id, cache, ci)
             if (i+1) % 100 == 0:
                 log.info('Compute diffs %d: %s', (i+1), ci._id)
 
+    if repo._refresh_precompute:
+        cache = ModelCache()
+        for i, oid in enumerate(reversed(commit_ids)):
+            ci = CommitDoc.m.find(dict(_id=oid), validate=False).next()
+            compute_lcds(ci, cache)
+            if (i+1) % 100 == 0:
+                log.info('Compute last commit info %d: %s', (i+1), ci._id)
+
+
     log.info('Refresh complete for %s', repo.full_fs_path)
     g.post_event(
             'repo_refreshed',
@@ -348,7 +357,6 @@ def compute_diffs(repo_id, tree_cache, rhs_ci):
             dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id))
     # Set last commit data
     rhs_tree = tree_index[rhs_ci.tree_id]
-    refresh_last_commit(repo_id, '/', rhs_tree, lhs_tree, None, commit_info)
     # Build the diffinfo
     di = DiffInfoDoc(dict(
             _id=rhs_ci._id,
@@ -420,18 +428,21 @@ def _diff_trees(lhs, rhs, index, *path):
         (o.name, o.id)
         for o in rhs.tree_ids)
     for o in lhs.tree_ids:
-        rhs_id = rhs_tree_ids.pop(o.name, None)
-        if rhs_id == o.id:
-            continue # no change
-        elif rhs_id is None:
+        rhs_id = rhs_tree_ids.pop(o.name, None)  # remove so won't be picked up as added, below
+        if rhs_id == o.id:  # no change
+            continue
+        elif rhs_id is None:  # removed
             yield (_fq(o.name), o.id, None)
-        else:
-            for difference in _diff_trees(
-                index[o.id], index[rhs_id], index,
-                o.name, *path):
-                yield difference
-    for name, id in rhs_tree_ids.items():
+            rhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
+        else:  # changed
+            rhs_tree = index[rhs_id]
+        for difference in _diff_trees(index[o.id], rhs_tree, index, o.name, *path):
+            yield difference
+    for name, id in rhs_tree_ids.items():  # added
         yield (_fq(name), None, id)
+        lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[])
+        for difference in _diff_trees(lhs_tree, index[id], index, name, *path):
+            yield difference
     # DIff the blobs
     rhs_blob_ids = dict(
         (o.name, o.id)
@@ -462,53 +473,6 @@ def get_commit_info(commit):
         summary=commit.summary
         )
 
-def refresh_last_commit(repo_id, path, tree, lhs_tree, parent_tree, commit_info):
-    '''Build the LastCommit info.
-
-    We only need to create LastCommit info for objects that are in the
-    RHS but not in the LHS, because only those objects are only ones
-    who have had anything changed in them.  (If file x/y/z.txt changes,
-    then it's hash will change, which also forces the hash for tree x/y
-    to change, as well as the hash for tree x.  So as long as an object's
-    hash isn't in the LHS, it means it's new or modified in this commit.)
-
-    In order to uniquely identify the tree or blob that a LastCommitDoc is
-    for, the tree or blob hash is not sufficient; we also need to know
-    either it's full path name, or it's parent tree and name.  Because of
-    this, we have to walk down the commit tree.'''
-    if lhs_tree is not None and tree._id == lhs_tree._id:
-        # tree was not changed in this commit (nor was anything under it)
-        return
-
-    # map LHS entries for easy lookup
-    lhs_map = {}
-    if lhs_tree:
-        for lhs_child in chain(lhs_tree.tree_ids, lhs_tree.blob_ids, lhs_tree.other_ids):
-            lhs_map[lhs_child.name] = lhs_child.id
-
-    # update our children
-    for child in chain(tree.tree_ids, tree.blob_ids, tree.other_ids):
-        if child.id != lhs_map.get(child.name, None):  # check if changed in this commit
-            lc = set_last_commit(repo_id, path, child.name, child.id, commit_info)
-
-    # (re)curse at our child trees
-    for child_tree in tree.tree_ids:
-        child_name = child_tree.name
-        child_tree = TreeDoc.m.get(_id=child_tree.id)
-        lhs_child = None
-        if child_name in lhs_map:
-            lhs_child = TreeDoc.m.get(_id=lhs_map[child_name])
-        refresh_last_commit(repo_id, path + child_name + '/', child_tree, lhs_child, tree, commit_info)
-
-def set_last_commit(repo_id, path, name, oid, commit_info):
-    lc = LastCommitDoc(dict(
-            _id='%s:%s:%s' % (repo_id, path, name),
-            object_id=oid,
-            name=name,
-            commit_info=commit_info))
-    lc.m.save(safe=False, upsert=True)
-    return lc
-
 def last_known_commit_id(all_commit_ids, new_commit_ids):
     """
     Return the newest "known" (cached in mongo) commit id.
@@ -522,3 +486,35 @@ def last_known_commit_id(all_commit_ids, new_commit_ids):
     if not all_commit_ids: return None
     if not new_commit_ids: return all_commit_ids[-1]
     return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1]
+
+
+def compute_lcds(commit, cache):
+    '''
+    Compute LastCommit data for every Tree node under this tree.
+    '''
+    trees = cache.get(TreesDoc, dict(_id=commit._id))
+    if not trees:
+        log.error('Missing TreesDoc for %s; skipping compute_lcd' % commit)
+        return
+    _update_tree_cache(trees.tree_ids, cache)
+    c.model_cache = cache
+    for tree in _walk_commit_tree(commit, cache):
+        lcd = LastCommit.get(tree)  # auto-vivify LCD
+
+def _walk_commit_tree(commit, cache):
+    def _walk_tree(tree):
+        yield tree
+        for x in tree.tree_ids:
+            sub_tree = cache.get(Tree, dict(_id=x.id))
+            sub_tree.set_context(tree, x.name)
+            for xx in _walk_tree(sub_tree):
+                yield xx
+    top_tree = cache.get(Tree, dict(_id=commit.tree_id))
+    top_tree.set_context(commit)
+    return _walk_tree(top_tree)
+
+def _update_tree_cache(tree_ids, cache):
+    current_ids = set(tree_ids)
+    cached_ids = set([k['_id'] for k in cache.keys(Tree)])
+    new_ids = current_ids - cached_ids
+    cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/Allura/allura/model/repository.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repository.py b/Allura/allura/model/repository.py
index 8aa91e8..c322658 100644
--- a/Allura/allura/model/repository.py
+++ b/Allura/allura/model/repository.py
@@ -445,21 +445,6 @@ class Repository(Artifact, ActivityObject):
         with self.push_upstream_context():
             return MergeRequest.query.find(q).count()
 
-    def get_last_commit(self, obj):
-        from .repo import LastCommitDoc
-        lc = LastCommitDoc.m.get(
-            repo_id=self._id, object_id=obj._id)
-        if lc is None:
-            return dict(
-                author=None,
-                author_email=None,
-                author_url=None,
-                date=None,
-                id=None,
-                shortlink=None,
-                summary=None)
-        return lc.commit_info
-
     @property
     def forks(self):
         return self.query.find({'upstream_repo.name': self.url()}).all()

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 90eaac1..0dbff66 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -1,5 +1,11 @@
+from datetime import datetime
+from collections import defaultdict
+import unittest
+import mock
 from nose.tools import assert_equal
 from pylons import c
+from bson import ObjectId
+from ming.orm import session
 
 from alluratest.controller import setup_basic_test, setup_global_objects
 from allura import model as M
@@ -67,3 +73,541 @@ class RepoImplTestBase(object):
         self.assertEqual(run.commit_ids, commit_ids)
         self.assertEqual(len(run.commit_ids), len(run.commit_times))
         self.assertEqual(run.parent_commit_ids, [])
+
+
+class TestLastCommit(unittest.TestCase):
+    def setUp(self):
+        setup_basic_test()
+        setup_global_objects()
+        c.model_cache = M.repo.ModelCache()
+        self.repo = mock.Mock('repo', _commits={}, _last_commit=None)
+        self.repo.shorthand_for_commit = lambda _id: _id[:6]
+
+    def _build_tree(self, commit, path, tree_paths):
+        tree_nodes = []
+        blob_nodes = []
+        sub_paths = defaultdict(list)
+        def n(p):
+            m = mock.Mock()
+            m.name = p
+            return m
+        for p in tree_paths:
+            if '/' in p:
+                node, sub = p.split('/',1)
+                tree_nodes.append(n(node))
+                sub_paths[node].append(sub)
+            else:
+                blob_nodes.append(n(p))
+        tree = mock.Mock(
+                commit=commit,
+                path=mock.Mock(return_value=path),
+                tree_ids=tree_nodes,
+                blob_ids=blob_nodes,
+                other_ids=[],
+            )
+        tree.get_obj_by_path = lambda p: self._build_tree(commit, p, sub_paths[p])
+        return tree
+
+    def _add_commit(self, msg, tree_paths, diff_paths=None, parents=[]):
+        suser = dict(
+                name='test',
+                email='test@example.com',
+                date=datetime(2013, 1, 1 + len(self.repo._commits)),
+            )
+        commit = M.repo.Commit(
+                _id=str(ObjectId()),
+                message=msg,
+                parent_ids=[parent._id for parent in parents],
+                commited=suser,
+                authored=suser,
+                repo=self.repo,
+            )
+        commit.tree = self._build_tree(commit, '/', tree_paths)
+        diffinfo = M.repo.DiffInfoDoc(dict(
+                _id=commit._id,
+                differences=[{'name': p} for p in diff_paths or tree_paths],
+            ))
+        diffinfo.m.save()
+        self.repo._commits[commit._id] = commit
+        return commit
+
+    def test_single_commit(self):
+        commit1 = self._add_commit('Commit 1', [
+                'file1',
+                'dir1/file2',
+            ])
+        lcd = M.repo.LastCommit.get(commit1.tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit1.message])
+        self.assertEqual(lcd.path, '')
+        self.assertEqual(len(lcd.entries), 2)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 1',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 1),
+                author_url=None,
+                id=commit1._id,
+                shortlink=self.repo.shorthand_for_commit(commit1._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('dir1'), dict(
+            type='DIR',
+            name='dir1',
+            commit_info=dict(
+                summary='Commit 1',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 1),
+                author_url=None,
+                id=commit1._id,
+                shortlink=self.repo.shorthand_for_commit(commit1._id),
+            )))
+
+    def test_multiple_commits_no_overlap(self):
+        commit1 = self._add_commit('Commit 1', ['file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1'], ['dir1/file1'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'file2'], ['file2'], [commit2])
+        lcd = M.repo.LastCommit.get(commit3.tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message])
+        self.assertEqual(lcd.commit_ids, [commit3._id])
+        self.assertEqual(lcd.path, '')
+        self.assertEqual(len(lcd.entries), 3)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 1',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 1),
+                author_url=None,
+                id=commit1._id,
+                shortlink=self.repo.shorthand_for_commit(commit1._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('dir1'), dict(
+            type='DIR',
+            name='dir1',
+            commit_info=dict(
+                summary='Commit 2',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 2),
+                author_url=None,
+                id=commit2._id,
+                shortlink=self.repo.shorthand_for_commit(commit2._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file2'), dict(
+            type='BLOB',
+            name='file2',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+
+    def test_multiple_commits_with_overlap(self):
+        commit1 = self._add_commit('Commit 1', ['file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1'], ['dir1/file1'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'file2'], ['file1', 'file2'], [commit2])
+        lcd = M.repo.LastCommit.get(commit3.tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message])
+        self.assertEqual(lcd.path, '')
+        self.assertEqual(len(lcd.entries), 3)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('dir1'), dict(
+            type='DIR',
+            name='dir1',
+            commit_info=dict(
+                summary='Commit 2',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 2),
+                author_url=None,
+                id=commit2._id,
+                shortlink=self.repo.shorthand_for_commit(commit2._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file2'), dict(
+            type='BLOB',
+            name='file2',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+
+    def test_multiple_commits_subdir_change(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file1'], [commit2])
+        lcd = M.repo.LastCommit.get(commit3.tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message])
+        self.assertEqual(lcd.path, '')
+        self.assertEqual(len(lcd.entries), 2)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 1',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 1),
+                author_url=None,
+                id=commit1._id,
+                shortlink=self.repo.shorthand_for_commit(commit1._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('dir1'), dict(
+            type='DIR',
+            name='dir1',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+
+    def test_subdir_lcd(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file1'], [commit2])
+        tree = self._build_tree(commit3, '/dir1', ['file1', 'file2'])
+        lcd = M.repo.LastCommit.get(tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message])
+        self.assertEqual(lcd.path, 'dir1')
+        self.assertEqual(len(lcd.entries), 2)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file2'), dict(
+            type='BLOB',
+            name='file2',
+            commit_info=dict(
+                summary='Commit 2',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 2),
+                author_url=None,
+                id=commit2._id,
+                shortlink=self.repo.shorthand_for_commit(commit2._id),
+            )))
+
+    def test_subdir_lcd_prev_commit(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file1'], [commit2])
+        commit4 = self._add_commit('Commit 4', ['file1', 'dir1/file1', 'dir1/file2', 'file2'], ['file2'], [commit3])
+        tree = self._build_tree(commit4, '/dir1', ['file1', 'file2'])
+        lcd = M.repo.LastCommit.get(tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit4.message, commit3.message])
+        self.assertEqual(lcd.path, 'dir1')
+        self.assertEqual(len(lcd.entries), 2)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file2'), dict(
+            type='BLOB',
+            name='file2',
+            commit_info=dict(
+                summary='Commit 2',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 2),
+                author_url=None,
+                id=commit2._id,
+                shortlink=self.repo.shorthand_for_commit(commit2._id),
+            )))
+
+    def test_subdir_lcd_always_empty(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'file2'], ['file2'], [commit1])
+        tree = self._build_tree(commit2, '/dir1', [])
+        lcd = M.repo.LastCommit.get(tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message, commit1.message])
+        self.assertEqual(lcd.path, 'dir1')
+        self.assertEqual(lcd.entries, [])
+
+    def test_subdir_lcd_emptied(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1'])
+        commit2 = self._add_commit('Commit 2', ['file1'], ['dir1/file1'], [commit1])
+        tree = self._build_tree(commit2, '/dir1', [])
+        lcd = M.repo.LastCommit.get(tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message])
+        self.assertEqual(lcd.path, 'dir1')
+        self.assertEqual(lcd.entries, [])
+
+    def test_existing_lcd_unchained(self):
+        commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['file1'], [commit2])
+        prev_lcd = M.repo.LastCommit(
+                path='dir1',
+                commit_ids=[commit2._id],
+                entries=[
+                    dict(
+                        type='BLOB',
+                        name='file1',
+                        commit_info=dict(
+                            summary='Commit 1',
+                            author='test',
+                            author_email='test@example.com',
+                            date=datetime(2013, 1, 1),
+                            author_url=None,
+                            id=commit1._id,
+                            shortlink=self.repo.shorthand_for_commit(commit1._id),
+                        )),
+                    dict(
+                        type='BLOB',
+                        name='file2',
+                        commit_info=dict(
+                            summary='Commit 2',
+                            author='test',
+                            author_email='test@example.com',
+                            date=datetime(2013, 1, 2),
+                            author_url=None,
+                            id=commit2._id,
+                            shortlink=self.repo.shorthand_for_commit(commit2._id),
+                        )),
+                ],
+            )
+        session(prev_lcd).flush()
+        tree = self._build_tree(commit3, '/dir1', ['file1', 'file2'])
+        lcd = M.repo.LastCommit.get(tree)
+        self.assertEqual(lcd._id, prev_lcd._id)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message, commit3.message])
+        self.assertEqual(lcd.path, 'dir1')
+        self.assertEqual(lcd.entries, prev_lcd.entries)
+
+    def test_existing_lcd_partial(self):
+        commit1 = self._add_commit('Commit 1', ['file1'])
+        commit2 = self._add_commit('Commit 2', ['file1', 'file2'], ['file2'], [commit1])
+        commit3 = self._add_commit('Commit 3', ['file1', 'file2', 'file3'], ['file3'], [commit2])
+        commit4 = self._add_commit('Commit 4', ['file1', 'file2', 'file3', 'file4'], ['file2', 'file4'], [commit3])
+        prev_lcd = M.repo.LastCommit(
+                path='',
+                commit_ids=[commit3._id],
+                entries=[
+                    dict(
+                        type='BLOB',
+                        name='file1',
+                        commit_info=dict(
+                            summary='Existing LCD',    # deliberately wrong summary: proves the stored
+                            author='test',             # entry is reused instead of walking up the tree
+                            author_email='test@example.com',
+                            date=datetime(2013, 1, 1),
+                            author_url=None,
+                            id=commit1._id,
+                            shortlink=self.repo.shorthand_for_commit(commit1._id),
+                        )),
+                    dict(
+                        type='BLOB',
+                        name='file2',
+                        commit_info=dict(
+                            summary='Commit 2',
+                            author='test',
+                            author_email='test@example.com',
+                            date=datetime(2013, 1, 2),
+                            author_url=None,
+                            id=commit2._id,
+                            shortlink=self.repo.shorthand_for_commit(commit2._id),
+                        )),
+                    dict(
+                        type='BLOB',
+                        name='file3',
+                        commit_info=dict(
+                            summary='Commit 3',
+                            author='test',
+                            author_email='test@example.com',
+                            date=datetime(2013, 1, 3),
+                            author_url=None,
+                            id=commit3._id,
+                            shortlink=self.repo.shorthand_for_commit(commit3._id),
+                        )),
+                ],
+            )
+        session(prev_lcd).flush()
+        lcd = M.repo.LastCommit.get(commit4.tree)
+        self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit4.message])
+        self.assertEqual(lcd.path, '')
+        self.assertEqual(lcd.entry_by_name('file1')['commit_info']['summary'], 'Existing LCD')
+        self.assertEqual(len(lcd.entries), 4)
+        self.assertEqual(lcd.entry_by_name('file1'), dict(
+            type='BLOB',
+            name='file1',
+            commit_info=dict(
+                summary='Existing LCD',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 1),
+                author_url=None,
+                id=commit1._id,
+                shortlink=self.repo.shorthand_for_commit(commit1._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file2'), dict(
+            type='BLOB',
+            name='file2',
+            commit_info=dict(
+                summary='Commit 4',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 4),
+                author_url=None,
+                id=commit4._id,
+                shortlink=self.repo.shorthand_for_commit(commit4._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file3'), dict(
+            type='BLOB',
+            name='file3',
+            commit_info=dict(
+                summary='Commit 3',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 3),
+                author_url=None,
+                id=commit3._id,
+                shortlink=self.repo.shorthand_for_commit(commit3._id),
+            )))
+        self.assertEqual(lcd.entry_by_name('file4'), dict(
+            type='BLOB',
+            name='file4',
+            commit_info=dict(
+                summary='Commit 4',
+                author='test',
+                author_email='test@example.com',
+                date=datetime(2013, 1, 4),
+                author_url=None,
+                id=commit4._id,
+                shortlink=self.repo.shorthand_for_commit(commit4._id),
+            )))
+
+
+class TestModelCache(unittest.TestCase):
+    def setUp(self):
+        self.cache = M.repo.ModelCache()
+
+    def test_normalize_key(self):
+        self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+
+    @mock.patch.object(M.repo.Tree.query, 'get')
+    @mock.patch.object(M.repo.LastCommit.query, 'get')
+    def test_get(self, lc_get, tr_get):
+        tr_get.return_value = 'bar'
+        lc_get.return_value = 'qux'
+
+        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
+        tr_get.assert_called_with(_id='foo')
+        self.assertEqual(val, 'bar')
+
+        val = self.cache.get(M.repo.LastCommit, {'_id': 'foo'})
+        lc_get.assert_called_with(_id='foo')
+        self.assertEqual(val, 'qux')
+
+    @mock.patch.object(M.repo.Tree.query, 'get')
+    def test_get_no_dup(self, tr_get):
+        tr_get.return_value = 'bar'
+        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
+        tr_get.assert_called_once_with(_id='foo')
+        self.assertEqual(val, 'bar')
+
+        tr_get.return_value = 'qux'
+        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
+        tr_get.assert_called_once_with(_id='foo')
+        self.assertEqual(val, 'bar')
+
+    @mock.patch.object(M.repo.TreesDoc.m, 'get')
+    def test_get_doc(self, tr_get):
+        tr_get.return_value = 'bar'
+        val = self.cache.get(M.repo.TreesDoc, {'_id': 'foo'})
+        tr_get.assert_called_once_with(_id='foo')
+        self.assertEqual(val, 'bar')
+
+    def test_set(self):
+        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set')
+        self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}})
+
+    def test_keys(self):
+        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
+        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
+        self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
+        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+
+    @mock.patch.object(M.repo.Tree.query, 'find')
+    def test_batch_load(self, tr_find):
+        # batch_load(cls, query, attrs): with attrs omitted, results are cached under the query's field
+        m1 = mock.Mock(foo=1, qux=3)
+        m2 = mock.Mock(foo=2, qux=5)
+        tr_find.return_value = [m1, m2]
+
+        self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}})
+        tr_find.assert_called_with({'foo': {'$in': 'bar'}})
+        self.assertEqual(self.cache._cache[M.repo.Tree], {
+                (('foo', 1),): m1,
+                (('foo', 2),): m2,
+            })
+
+    @mock.patch.object(M.repo.Tree.query, 'find')
+    def test_batch_load_attrs(self, tr_find):
+        # batch_load(cls, query, attrs): an explicit attrs list selects the field(s) used as the cache key
+        m1 = mock.Mock(foo=1, qux=3)
+        m2 = mock.Mock(foo=2, qux=5)
+        tr_find.return_value = [m1, m2]
+
+        self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux'])
+        tr_find.assert_called_with({'foo': {'$in': 'bar'}})
+        self.assertEqual(self.cache._cache[M.repo.Tree], {
+                (('qux', 3),): m1,
+                (('qux', 5),): m2,
+            })
+
+    def test_pruning(self):
+        self.cache.max_size = 2
+        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar')
+        self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
+        self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r')
+        self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
+        self.assertEqual(self.cache._cache, {
+                M.repo.Tree: {
+                    (('_id', 'qux'),): 'zaz',
+                    (('_id', 'f00'),): 'b4r',
+                },
+            })

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/ForgeSVN/forgesvn/model/svn.py
----------------------------------------------------------------------
diff --git a/ForgeSVN/forgesvn/model/svn.py b/ForgeSVN/forgesvn/model/svn.py
index 13f30a6..8412d89 100644
--- a/ForgeSVN/forgesvn/model/svn.py
+++ b/ForgeSVN/forgesvn/model/svn.py
@@ -76,7 +76,7 @@ class Repository(M.Repository):
         while ci is not None and limit > 0:
             yield ci._id
             limit -= 1
-            ci = ci.parent()
+            ci = ci.get_parent()
 
     def latest(self, branch=None):
         if self._impl is None: return None
@@ -386,15 +386,27 @@ class SVNImplementation(M.RepositoryImplementation):
         log.debug('Compute tree for %d paths', len(infos))
         tree_ids = []
         blob_ids = []
+        chg_revno = infos[0][1]['last_changed_rev'].number
+        cur_revno = self._revno(commit._id)
+        commit_ids = [self._oid(revno) for revno in range(chg_revno, cur_revno+1)]
+        lcd = M.repo.LastCommit.query.get(
+                commit_ids=self._oid(chg_revno),
+                path=tree_path.strip('/'),
+            )
+        if lcd:
+            lcd.commit_ids = list(set(lcd.commit_ids + commit_ids))
+            lcd_is_new = False
+        else:
+            # Build the LastCommit doc by hand: the normal auto-vivification
+            # can't be used because SVN repos don't have their diff infos filled out.
+            lcd = M.repo.LastCommit(
+                commit_ids=commit_ids,
+                path=tree_path.strip('/'),
+            )
+            lcd_is_new = True
         for path, info in infos[1:]:
             last_commit_id = self._oid(info['last_changed_rev'].number)
             last_commit = M.repo.Commit.query.get(_id=last_commit_id)
-            M.repo_refresh.set_last_commit(
-                self._repo._id,
-                re.sub(r'/?$', '/', tree_path),  # force it to end with /
-                path,
-                self._tree_oid(commit._id, path),
-                M.repo_refresh.get_commit_info(last_commit))
             if info.kind == pysvn.node_kind.dir:
                 tree_ids.append(Object(
                         id=self._tree_oid(commit._id, path),
@@ -405,6 +417,13 @@ class SVNImplementation(M.RepositoryImplementation):
                         name=path))
             else:
                 assert False
+            if lcd_is_new:
+                lcd.entries.append(dict(
+                        name=path,
+                        type='DIR' if info.kind == pysvn.node_kind.dir else 'BLOB',
+                        commit_info=last_commit.info,
+                    ))
+        session(lcd).flush(lcd)
         tree, is_new = RM.Tree.upsert(tree_id,
                 tree_ids=tree_ids,
                 blob_ids=blob_ids,

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/scripts/refresh-all-repos.py
----------------------------------------------------------------------
diff --git a/scripts/refresh-all-repos.py b/scripts/refresh-all-repos.py
index 822148f..1cf7e3d 100644
--- a/scripts/refresh-all-repos.py
+++ b/scripts/refresh-all-repos.py
@@ -32,6 +32,7 @@ def main(options):
         M.repo.TreesDoc.m.remove({})
         M.repo.DiffInfoDoc.m.remove({})
         M.repo.CommitRunDoc.m.remove({})
+        M.repo.LastCommitDoc.m.remove({})
 
     for chunk in chunked_find(M.Project, q_project):
         for p in chunk:
@@ -72,9 +73,6 @@ def main(options):
                         i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count()
                         log.info("Deleting %i TreeDoc docs...", i)
                         M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}})
-                        i = M.repo.LastCommitDoc.m.find({"object_id": {"$in": tree_ids_chunk}}).count()
-                        log.info("Deleting %i LastCommitDoc docs...", i)
-                        M.repo.LastCommitDoc.m.remove({"object_id": {"$in": tree_ids_chunk}})
                     del tree_ids
 
                     # delete these after TreeDoc and LastCommitDoc so that if
@@ -83,11 +81,10 @@ def main(options):
                     log.info("Deleting %i TreesDoc docs...", i)
                     M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}})
 
-                    # delete LastCommitDocs for non-trees
-                    repo_lastcommit_re = re.compile("^{}:".format(c.app.repo._id))
-                    i = M.repo.LastCommitDoc.m.find(dict(_id=repo_lastcommit_re)).count()
+                    # delete LastCommitDocs
+                    i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
                     log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i)
-                    M.repo.LastCommitDoc.m.remove(dict(_id=repo_lastcommit_re))
+                    M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
 
                     i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count()
                     log.info("Deleting %i DiffInfoDoc docs...", i)

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/20bad544/scripts/refresh-last-commits.py
----------------------------------------------------------------------
diff --git a/scripts/refresh-last-commits.py b/scripts/refresh-last-commits.py
new file mode 100644
index 0000000..8776010
--- /dev/null
+++ b/scripts/refresh-last-commits.py
@@ -0,0 +1,172 @@
+import argparse
+import logging
+import re
+from datetime import datetime
+from contextlib import contextmanager
+
+import faulthandler
+from pylons import c
+from ming.orm import ThreadLocalORMSession
+
+from allura import model as M
+from allura.lib.utils import chunked_find, chunked_list
+
+log = logging.getLogger(__name__)
+
+
+def main(options):
+    q_project = {}
+    if options.nbhd:
+        nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd)
+        if not nbhd:
+            return "Invalid neighborhood url prefix."
+        q_project['neighborhood_id'] = nbhd._id
+    if options.project:
+        q_project['shortname'] = options.project
+    elif options.project_regex:
+        q_project['shortname'] = {'$regex': options.project_regex}
+
+    log.info('Refreshing last commit data')
+
+    for chunk in chunked_find(M.Project, q_project):
+        for p in chunk:
+            log.info("Refreshing last commit data for project '%s'." % p.shortname)
+            if options.dry_run:
+                continue
+            c.project = p
+            if options.mount_point:
+                mount_points = [options.mount_point]
+            else:
+                mount_points = [ac.options.mount_point for ac in
+                                M.AppConfig.query.find(dict(project_id=p._id))]
+            for app in (p.app_instance(mp) for mp in mount_points):
+                c.app = app
+                if not hasattr(app, 'repo'):
+                    continue
+                if c.app.repo.tool.lower() not in options.repo_types:
+                    log.info("Skipping %r: wrong type (%s)", c.app.repo,
+                            c.app.repo.tool.lower())
+                    continue
+
+                ci_ids = list(reversed(list(c.app.repo.all_commit_ids())))
+                #ci_ids = list(c.app.repo.all_commit_ids())
+                if options.clean:
+                    if options.diffs:
+                        # delete DiffInfoDocs
+                        i = M.repo.DiffInfoDoc.m.find(dict(_id={'$in': ci_ids})).count()
+                        log.info("Deleting %i DiffInfoDoc docs, by repo id...", i)
+                        M.repo.DiffInfoDoc.m.remove(dict(_id={'$in': ci_ids}))
+
+                    # delete LastCommitDocs
+                    i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count()
+                    log.info("Deleting %i LastCommitDoc docs, by repo id...", i)
+                    M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids}))
+
+                try:
+                    log.info('Refreshing all last commits in %r', c.app.repo)
+                    if options.profile:
+                        import cProfile
+                        cProfile.runctx('refresh_repo_lcds(ci_ids, options)',
+                                globals(), locals(), '/tmp/refresh_lcds.profile')
+                    else:
+                        refresh_repo_lcds(ci_ids, options)
+                except:
+                    log.exception('Error refreshing %r', c.app.repo)
+                    raise
+        ThreadLocalORMSession.flush_all()
+        ThreadLocalORMSession.close_all()
+
+
+def refresh_repo_lcds(commit_ids, options):
+    tree_cache = {}
+    timings = []
+    if options.diffs:
+        print 'Processing diffs'
+        for commit_id in commit_ids:
+            commit = M.repo.Commit.query.get(_id=commit_id)
+            with time(timings):
+                M.repo_refresh.compute_diffs(c.app.repo._id, tree_cache, commit)
+            if len(timings) % 1000 == 0:
+                mt = max(timings)
+                tt = sum(timings)
+                at = tt / len(timings)
+                print '  Processed %d commits (max: %f, avg: %f, tot: %f, cl: %d)' % (
+                        len(timings), mt, at, tt, len(tree_cache))
+    lcd_cache = M.repo.ModelCache(80000)
+    timings = []
+    print 'Processing last commits'
+    for commit_id in commit_ids:
+        commit = M.repo.Commit.query.get(_id=commit_id)
+        with time(timings):
+            M.repo_refresh.compute_lcds(commit, lcd_cache)
+        if len(timings) % 100 == 0:
+            mt = max(timings)
+            tt = sum(timings)
+            at = tt / len(timings)
+            mat = sum(timings[-100:]) / 100
+            print '  Processed %d commits (max: %f, avg: %f, mavg: %f, tot: %f, lc: %d, lcl: %d, hits: %d, agw: %d, mgw: %d, gh: %d, abw: %d, mbw: %d, ts: %d)' % (
+                    len(timings), mt, at, mat, tt, lcd_cache.size(), len(lcd_cache._cache[M.repo.LastCommit]),
+                    lcd_cache._hits * 100 / (lcd_cache._hits + lcd_cache._misses),
+                    lcd_cache._get_walks / lcd_cache._get_calls, lcd_cache._get_walks_max, lcd_cache._get_hits * 100 / lcd_cache._get_calls,
+                    lcd_cache._build_walks / lcd_cache._build_calls, lcd_cache._build_walks_max,
+                    len(lcd_cache.get(M.repo.TreesDoc, dict(_id=commit._id)).tree_ids))
+            ThreadLocalORMSession.flush_all()
+            ThreadLocalORMSession.close_all()
+        #if len(timings) == 300:
+        #    break
+
+
+@contextmanager
+def time(timings):
+    s = datetime.now()
+    yield
+    timings.append((datetime.now() - s).total_seconds())
+
+
+def repo_type_list(s):
+    repo_types = []
+    for repo_type in s.split(','):
+        repo_type = repo_type.strip()
+        if repo_type not in ['svn', 'git', 'hg']:
+            raise argparse.ArgumentTypeError(
+                    '{} is not a valid repo type.'.format(repo_type))
+        repo_types.append(repo_type)
+    return repo_types
+
+
+def parse_options():
+    parser = argparse.ArgumentParser(description='Using existing commit data, '
+            'refresh the last commit metadata in MongoDB. Run for all repos (no args), '
+            'or restrict by neighborhood, project, or code tool mount point.')
+    parser.add_argument('--nbhd', action='store', default='', dest='nbhd',
+            help='Restrict update to a particular neighborhood, e.g. /p/.')
+    parser.add_argument('--project', action='store', default='', dest='project',
+            help='Restrict update to a particular project. To specify a '
+            'subproject, use a slash: project/subproject.')
+    parser.add_argument('--project-regex', action='store', default='',
+            dest='project_regex',
+            help='Restrict update to projects for which the shortname matches '
+            'the provided regex.')
+    parser.add_argument('--repo-types', action='store', type=repo_type_list,
+            default=['svn', 'git', 'hg'], dest='repo_types',
+            help='Only refresh last commits for repos of the given type(s). Defaults to: '
+            'svn,git,hg. Example: --repo-types=git,hg')
+    parser.add_argument('--mount_point', default='', dest='mount_point',
+            help='Restrict update to repos at the given tool mount point. ')
+    parser.add_argument('--clean', action='store_true', dest='clean',
+            default=False, help='Remove last commit mongo docs for '
+            'project(s) being refreshed before doing the refresh.')
+    parser.add_argument('--dry-run', action='store_true', dest='dry_run',
+            default=False, help='Log names of projects that would have their '
+            'last commits refreshed, but do not perform the actual refresh.')
+    parser.add_argument('--profile', action='store_true', dest='profile',
+            default=False, help='Enable the profiler (slow). Will log '
+            'profiling output to /tmp/refresh_lcds.profile')
+    parser.add_argument('--diffs', action='store_true', dest='diffs',
+            default=False, help='Refresh diffs as well as LCDs')
+    return parser.parse_args()
+
+if __name__ == '__main__':
+    import sys
+    faulthandler.enable()
+    sys.exit(main(parse_options()))


Mime
View raw message