Return-Path: X-Original-To: apmail-incubator-allura-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-allura-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 65BB59509 for ; Fri, 30 Nov 2012 00:17:00 +0000 (UTC) Received: (qmail 64784 invoked by uid 500); 30 Nov 2012 00:17:00 -0000 Delivered-To: apmail-incubator-allura-commits-archive@incubator.apache.org Received: (qmail 64768 invoked by uid 500); 30 Nov 2012 00:17:00 -0000 Mailing-List: contact allura-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: allura-dev@incubator.apache.org Delivered-To: mailing list allura-commits@incubator.apache.org Received: (qmail 64757 invoked by uid 99); 30 Nov 2012 00:17:00 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Fri, 30 Nov 2012 00:17:00 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id 1576D8138EF; Fri, 30 Nov 2012 00:16:59 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: johnsca@apache.org To: allura-commits@incubator.apache.org X-Mailer: ASF-Git Admin Mailer Subject: git commit: [#4691] New implementation of LastCommit info Message-Id: <20121130001700.1576D8138EF@tyr.zones.apache.org> Date: Fri, 30 Nov 2012 00:16:59 +0000 (UTC) Updated Branches: refs/heads/cj/4691 77fb5246f -> cbe221ee1 (forced update) [#4691] New implementation of LastCommit info Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/cbe221ee Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/cbe221ee Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/cbe221ee Branch: refs/heads/cj/4691 Commit: cbe221ee1cf63a5d4b6b2922dcb3f19e3665f811 
Parents: 7bed302 Author: Cory Johns Authored: Wed Oct 10 02:28:07 2012 +0000 Committer: Cory Johns Committed: Fri Nov 30 00:16:41 2012 +0000 ---------------------------------------------------------------------- Allura/allura/model/repo.py | 358 +++++++++++++++++- Allura/allura/model/repo_refresh.py | 126 +++---- Allura/allura/model/repository.py | 15 - Allura/allura/tests/model/test_repo.py | 544 +++++++++++++++++++++++++++ ForgeSVN/forgesvn/model/svn.py | 33 ++- scripts/refresh-all-repos.py | 11 +- scripts/refresh-last-commits.py | 172 +++++++++ 7 files changed, 1149 insertions(+), 110 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/Allura/allura/model/repo.py ---------------------------------------------------------------------- diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py index b46f3fa..2bc22a9 100644 --- a/Allura/allura/model/repo.py +++ b/Allura/allura/model/repo.py @@ -11,7 +11,7 @@ from difflib import SequenceMatcher, unified_diff from pylons import c import pymongo.errors -from ming import Field, collection +from ming import Field, collection, Index from ming import schema as S from ming.base import Object from ming.utils import LazyProperty @@ -61,9 +61,7 @@ TreeDoc = collection( Field('blob_ids', [dict(name=str, id=str)]), Field('other_ids', [dict(name=str, id=str, type=SObjType)])) -# Information about the last commit to touch a tree/blob -# LastCommitDoc.object_id = TreeDoc._id -LastCommitDoc = collection( +LastCommitDoc_old = collection( 'repo_last_commit', project_doc_session, Field('_id', str), Field('object_id', str, index=True), @@ -77,6 +75,25 @@ LastCommitDoc = collection( shortlink=str, summary=str))) +# Information about the last commit to touch a tree +LastCommitDoc = collection( + 'repo_last_commit', main_doc_session, + Field('_id', S.ObjectId()), + Field('commit_ids', [str]), + Field('path', str), + 
Index('commit_ids', 'path'), + Field('entries', [dict( + type=str, + name=str, + commit_info=dict( + id=str, + date=datetime, + author=str, + author_email=str, + author_url=str, + shortlink=str, + summary=str))])) + # List of all trees contained within a commit # TreesDoc._id = CommitDoc._id # TreesDoc.tree_ids = [ TreeDoc._id, ... ] @@ -160,7 +177,8 @@ class Commit(RepoObject): self.tree_id = self.repo.compute_tree_new(self) if self.tree_id is None: return None - t = Tree.query.get(_id=self.tree_id) + cache = getattr(c, 'model_cache', '') or ModelCache() + t = cache.get(Tree, dict(_id=self.tree_id)) if t is None: self.tree_id = self.repo.compute_tree_new(self) t = Tree.query.get(_id=self.tree_id) @@ -182,13 +200,29 @@ class Commit(RepoObject): def symbolic_ids(self): return self.repo.symbolics_for_commit(self) - def parent(self, index=0): - ci = None - if self.parent_ids: - ci = self.query.get(_id=self.parent_ids[index]) - if ci: + def get_parent(self, index=0): + '''Get the parent of this commit. + + If there is no parent commit, or if an invalid index is given, + returns None. 
+ ''' + try: + cache = getattr(c, 'model_cache', '') or ModelCache() + ci = cache.get(Commit, dict(_id=self.parent_ids[index])) ci.set_context(self.repo) - return ci + return ci + except IndexError as e: + return None + + def climb_commit_tree(self): + ''' + Returns a generator that walks up the commit tree along + the first-parent ancestry, starting with this commit.''' + yield self + ancestor = self.get_parent() + while ancestor: + yield ancestor + ancestor = ancestor.get_parent() def url(self): if self.repo is None: self.repo = self.guess_repo() @@ -293,7 +327,7 @@ class Commit(RepoObject): if not removed: return [] copied = [] - prev_commit = self.parent() + prev_commit = self.get_parent() for removed_name in removed[:]: removed_blob = prev_commit.tree.get_obj_by_path(removed_name) rename_info = None @@ -316,6 +350,43 @@ class Commit(RepoObject): cur = cur[part] return cur + @LazyProperty + def changed_paths(self): + ''' + Returns a list of paths changed in this commit. + Leading and trailing slashes are removed, and + the list is complete, meaning that if a sub-path + is changed, all of the parent paths are included + (including '' to represent the root path). 
+ + Example: + + If the file /foo/bar is changed in the commit, + this would return ['', 'foo', 'foo/bar'] + ''' + diff_info = DiffInfoDoc.m.get(_id=self._id) + diffs = set() + for d in diff_info.differences: + diffs.add(d.name.strip('/')) + node_path = os.path.dirname(d.name) + while node_path: + diffs.add(node_path) + node_path = os.path.dirname(node_path) + diffs.add('') # include '/' if there are any changes + return diffs + + @LazyProperty + def info(self): + return dict( + id=self._id, + author=self.authored.name, + author_email=self.authored.email, + date=self.authored.date, + author_url=self.author_url, + shortlink=self.shorthand_id(), + summary=self.summary + ) + class Tree(RepoObject): # Ephemeral attrs repo=None @@ -337,13 +408,14 @@ class Tree(RepoObject): return sha_obj.hexdigest() def __getitem__(self, name): + cache = getattr(c, 'model_cache', '') or ModelCache() obj = self.by_name[name] if obj['type'] == 'blob': return Blob(self, name, obj['id']) - obj = self.query.get(_id=obj['id']) + obj = cache.get(Tree, dict(_id=obj['id'])) if obj is None: oid = self.repo.compute_tree_new(self.commit, self.path() + name + '/') - obj = self.query.get(_id=oid) + obj = cache.get(Tree, dict(_id=oid)) if obj is None: raise KeyError, name obj.set_context(self, name) return obj @@ -386,22 +458,71 @@ class Tree(RepoObject): return None, None def ls(self): + ''' + List the entries in this tree, with historical commit info for + each node. 
Eventually, ls_old can be removed and this can be + replaced with the following: + + last_commit = LastCommit.get(self) + return sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name)) + ''' + # look for existing new format first + last_commit = LastCommit.query.get( + commit_ids=self.commit._id, + path=self.path().strip('/'), + ) + if last_commit: + sorted_entries = sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name)) + mapped_entries = [self._dirent_map(e) for e in sorted_entries] + return mapped_entries + # otherwise, try old format + old_style_results = self.ls_old() + if old_style_results: + return old_style_results + # finally, use the new implementation that auto-vivifies + last_commit = LastCommit.get(self) + sorted_entries = sorted(last_commit.entries, cmp=lambda a,b: cmp(b.type,a.type) or cmp(a.name,b.name)) + mapped_entries = [self._dirent_map(e) for e in sorted_entries] + return mapped_entries + + def _dirent_map(self, dirent): + return dict( + kind=dirent.type, + name=dirent.name, + href=dirent.name + '/', + last_commit=dict( + author=dirent.commit_info.author, + author_email=dirent.commit_info.author_email, + author_url=dirent.commit_info.author_url, + date=dirent.commit_info.date, + href=self.repo.url_for_commit(dirent.commit_info['id']), + shortlink=dirent.commit_info.shortlink, + summary=dirent.commit_info.summary, + ), + ) + + def ls_old(self): # Load last commit info id_re = re.compile("^{0}:{1}:".format( self.repo._id, re.escape(h.really_unicode(self.path()).encode('utf-8')))) lc_index = dict( (lc.name, lc.commit_info) - for lc in LastCommitDoc.m.find(dict(_id=id_re))) + for lc in LastCommitDoc_old.m.find(dict(_id=id_re))) # FIXME: Temporarily fall back to old, semi-broken lookup behavior until refresh is done oids = [ x.id for x in chain(self.tree_ids, self.blob_ids, self.other_ids) ] id_re = re.compile("^{0}:".format(self.repo._id)) lc_index.update(dict( (lc.object_id, 
lc.commit_info) - for lc in LastCommitDoc.m.find(dict(_id=id_re, object_id={'$in': oids})))) + for lc in LastCommitDoc_old.m.find(dict(_id=id_re, object_id={'$in': oids})))) # /FIXME + if not lc_index: + # allow fallback to new method instead + # of showing a bunch of Nones + return [] + results = [] def _get_last_commit(name, oid): lc = lc_index.get(name, lc_index.get(oid, None)) @@ -569,5 +690,210 @@ class Blob(object): differ = SequenceMatcher(v0, v1) return differ.get_opcodes() +class LastCommit(RepoObject): + def __repr__(self): + return '' % (self.path, ',\n '.join(self.commit_ids)) + + @classmethod + def get(cls, tree): + '''Find the LastCommitDoc for the given tree. + + Climbs the commit tree until either: + + 1) An LCD is found for the given tree. (If the LCD was not found for the + tree's commit, the commits traversed while searching for it are + added to the LCD for faster retrieval in the future.) + + 2) The commit in which the tree was most recently modified is found. + In this case, we know that the LCD hasn't been constructed for this + (chain of) commit(s), and it will have to be built. 
+ ''' + cache = getattr(c, 'model_cache', '') or ModelCache() + path = tree.path().strip('/') + commit_ids = [] + cache._get_calls += 1 + gw = 0 + for commit in tree.commit.climb_commit_tree(): + last_commit = cache.get(LastCommit, dict( + commit_ids=commit._id, + path=path, + )) + if last_commit: + cache._get_hits += 1 + # found our LCD; add any traversed commits to it + if commit_ids: + last_commit.commit_ids.extend(commit_ids) + for commit_id in commit_ids: + cache.set(LastCommit, dict(commit_ids=commit_id, path=path), last_commit) + return last_commit + commit_ids.append(commit._id) + if path in commit.changed_paths: + cache._get_misses += 1 + # tree was changed but no LCD found; have to build + tree = commit.tree + if path != '': + tree = tree.get_obj_by_path(path) + return cls.build(tree, commit_ids) + cache._get_walks += 1 + gw += 1 + cache._get_walks_max = max(cache._get_walks_max, gw) + + @classmethod + def build(cls, tree, commit_ids=[]): + ''' + Build the LCD record, presuming that this tree is where it was most + recently changed. + + To build the LCD, we climb the commit tree, keeping track of which + entries we still need info about. (For multi-parent commits, it + doesn't matter which parent we follow because those would be merge + commits and ought to have the diff info populated for any file + touched by the merge.) At each step of the walk, we check the following: + + 1) If the current tree has an LCD record, we can pull all the remaining + info we need from it, and we're done. + + 2) If the tree was modified in this commit, then we pull the info for + all changed entries, then continue up the tree. Once we have data + for all entries, we're done. + + (It may be possible to optimize this for SVN, if SVN can return all of + the LCD info from a single call and if that turns out to be more efficient + than walking up the tree. It is unclear if those hold without testing.) 
+ ''' + cache = getattr(c, 'model_cache', '') or ModelCache() + unfilled = set([n.name for n in chain(tree.tree_ids, tree.blob_ids, tree.other_ids)]) + tree_nodes = set([n.name for n in tree.tree_ids]) + path = tree.path().strip('/') + lcd = cls( + commit_ids=commit_ids, + path=path, + entries=[], + ) + cache._build_calls += 1 + bw = 0 + for commit in tree.commit.climb_commit_tree(): + partial_lcd = cache.get(LastCommit, dict( + commit_ids=commit._id, + path=path, + )) + for name in list(unfilled): + if os.path.join(path, name) in commit.changed_paths: + # changed in this commit, so gather the data + lcd.entries.append(dict( + type=name in tree_nodes and 'DIR' or 'BLOB', + name=name, + commit_info=commit.info, + )) + unfilled.remove(name) + elif partial_lcd: + # the partial LCD should contain anything we're missing + entry = partial_lcd.entry_by_name(name) + assert entry + lcd.entries.append(entry) + unfilled.remove(name) + + if not unfilled: + break + cache._build_walks += 1 + bw += 1 + cache._build_walks_max = max(cache._build_walks_max, bw) + for commit_id in commit_ids: + cache.set(LastCommit, dict(commit_ids=commit_id, path=path), lcd) + return lcd + + def entry_by_name(self, name): + for entry in self.entries: + if entry.name == name: + return entry + return None + mapper(Commit, CommitDoc, repository_orm_session) mapper(Tree, TreeDoc, repository_orm_session) +mapper(LastCommit, LastCommitDoc, repository_orm_session) + + +class ModelCache(object): + ''' + Cache model instances based on query params passed to get. + ''' + def __init__(self, max_size=2000): + ''' + The max_size of the cache is tracked separately for + each model class stored. I.e., you can have 2000 + Commit instances and 2000 Tree instances in the cache + at once with the default value. 
+ ''' + self._cache = defaultdict(dict) + self.max_size = max_size + self._insertion_order = defaultdict(list) + # temporary, for performance testing + self._hits = 0 + self._misses = 0 + self._get_calls = 0 + self._get_walks = 0 + self._get_walks_max = 0 + self._get_hits = 0 + self._get_misses = 0 + self._build_calls = 0 + self._build_walks = 0 + self._build_walks_max = 0 + + def _normalize_key(self, key): + _key = key + if not isinstance(_key, tuple): + _key = tuple(sorted(_key.items(), key=lambda k: k[0])) + return _key + + def get(self, cls, key): + _key = self._normalize_key(key) + if _key not in self._cache[cls]: + self._misses += 1 + query = getattr(cls, 'query', getattr(cls, 'm', None)) + self.set(cls, _key, query.get(**key)) + else: + self._hits += 1 + return self._cache[cls][_key] + + def set(self, cls, key, val): + _key = self._normalize_key(key) + self._manage_cache(cls, _key) + self._cache[cls][_key] = val + + def _manage_cache(self, cls, key): + ''' + Keep track of insertion order, prevent duplicates, + and expire from the cache in a FIFO manner. + ''' + if key in self._cache[cls]: + return + self._insertion_order[cls].append(key) + if len(self._insertion_order[cls]) > self.max_size: + _key = self._insertion_order[cls].pop(0) + self._cache[cls].pop(_key) + + def size(self): + return sum([len(c) for c in self._insertion_order.values()]) + + def keys(self, cls): + ''' + Returns all the cache keys for a given class. Each + cache key will be a dict. + ''' + if self._cache[cls]: + return [dict(k) for k in self._cache[cls].keys()] + return [] + + def batch_load(self, cls, query, attrs=None): + ''' + Load multiple results given a query. + + Optionally takes a list of attribute names to use + as the cache key. If not given, uses the keys of + the given query. 
+ ''' + if attrs is None: + attrs = query.keys() + for result in cls.query.find(query): + keys = {a: getattr(result, a) for a in attrs} + self.set(cls, keys, result) http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/Allura/allura/model/repo_refresh.py ---------------------------------------------------------------------- diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py index 6e0db59..149fcae 100644 --- a/Allura/allura/model/repo_refresh.py +++ b/Allura/allura/model/repo_refresh.py @@ -2,12 +2,13 @@ import logging from itertools import chain from cPickle import dumps import re +import os import bson import tg -from pylons import g +from pylons import g,c from ming.base import Object from ming.orm import mapper, session @@ -16,7 +17,7 @@ from allura.lib import utils from allura.lib import helpers as h from allura.model.repo import CommitDoc, TreeDoc, TreesDoc, DiffInfoDoc from allura.model.repo import LastCommitDoc, CommitRunDoc -from allura.model.repo import Commit +from allura.model.repo import Commit, Tree, LastCommit, ModelCache from allura.model.index import ArtifactReferenceDoc, ShortlinkDoc log = logging.getLogger(__name__) @@ -88,17 +89,25 @@ def refresh_repo(repo, all_commits=False, notify=True): # Compute diffs cache = {} - # Have to compute_diffs() for all commits to ensure that LastCommitDocs - # are set properly for forked repos. For some SCMs, compute_diffs() - # we don't want to pre-compute the diffs because that would be too - # expensive, so we skip them here and do them on-demand with caching. + # For some SCMs, we don't want to pre-compute the diffs because that + # would be too expensive, so we skip them here and do them on-demand + # with caching. 
if repo._refresh_precompute: - for i, oid in enumerate(reversed(all_commit_ids)): + for i, oid in enumerate(commit_ids): ci = CommitDoc.m.find(dict(_id=oid), validate=False).next() compute_diffs(repo._id, cache, ci) if (i+1) % 100 == 0: log.info('Compute diffs %d: %s', (i+1), ci._id) + if repo._refresh_precompute: + cache = ModelCache() + for i, oid in enumerate(reversed(commit_ids)): + ci = CommitDoc.m.find(dict(_id=oid), validate=False).next() + compute_lcds(ci, cache) + if (i+1) % 100 == 0: + log.info('Compute last commit info %d: %s', (i+1), ci._id) + + log.info('Refresh complete for %s', repo.full_fs_path) g.post_event( 'repo_refreshed', @@ -348,7 +357,6 @@ def compute_diffs(repo_id, tree_cache, rhs_ci): dict(name=name, lhs_id=lhs_id, rhs_id=rhs_id)) # Set last commit data rhs_tree = tree_index[rhs_ci.tree_id] - refresh_last_commit(repo_id, '/', rhs_tree, lhs_tree, None, commit_info) # Build the diffinfo di = DiffInfoDoc(dict( _id=rhs_ci._id, @@ -420,18 +428,21 @@ def _diff_trees(lhs, rhs, index, *path): (o.name, o.id) for o in rhs.tree_ids) for o in lhs.tree_ids: - rhs_id = rhs_tree_ids.pop(o.name, None) - if rhs_id == o.id: - continue # no change - elif rhs_id is None: + rhs_id = rhs_tree_ids.pop(o.name, None) # remove so won't be picked up as added, below + if rhs_id == o.id: # no change + continue + elif rhs_id is None: # removed yield (_fq(o.name), o.id, None) - else: - for difference in _diff_trees( - index[o.id], index[rhs_id], index, - o.name, *path): - yield difference - for name, id in rhs_tree_ids.items(): + rhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[]) + else: # changed + rhs_tree = index[rhs_id] + for difference in _diff_trees(index[o.id], rhs_tree, index, o.name, *path): + yield difference + for name, id in rhs_tree_ids.items(): # added yield (_fq(name), None, id) + lhs_tree = Object(_id=None, tree_ids=[], blob_ids=[], other_ids=[]) + for difference in _diff_trees(lhs_tree, index[id], index, name, *path): + yield difference 
# DIff the blobs rhs_blob_ids = dict( (o.name, o.id) @@ -462,53 +473,6 @@ def get_commit_info(commit): summary=commit.summary ) -def refresh_last_commit(repo_id, path, tree, lhs_tree, parent_tree, commit_info): - '''Build the LastCommit info. - - We only need to create LastCommit info for objects that are in the - RHS but not in the LHS, because only those objects are only ones - who have had anything changed in them. (If file x/y/z.txt changes, - then it's hash will change, which also forces the hash for tree x/y - to change, as well as the hash for tree x. So as long as an object's - hash isn't in the LHS, it means it's new or modified in this commit.) - - In order to uniquely identify the tree or blob that a LastCommitDoc is - for, the tree or blob hash is not sufficient; we also need to know - either it's full path name, or it's parent tree and name. Because of - this, we have to walk down the commit tree.''' - if lhs_tree is not None and tree._id == lhs_tree._id: - # tree was not changed in this commit (nor was anything under it) - return - - # map LHS entries for easy lookup - lhs_map = {} - if lhs_tree: - for lhs_child in chain(lhs_tree.tree_ids, lhs_tree.blob_ids, lhs_tree.other_ids): - lhs_map[lhs_child.name] = lhs_child.id - - # update our children - for child in chain(tree.tree_ids, tree.blob_ids, tree.other_ids): - if child.id != lhs_map.get(child.name, None): # check if changed in this commit - lc = set_last_commit(repo_id, path, child.name, child.id, commit_info) - - # (re)curse at our child trees - for child_tree in tree.tree_ids: - child_name = child_tree.name - child_tree = TreeDoc.m.get(_id=child_tree.id) - lhs_child = None - if child_name in lhs_map: - lhs_child = TreeDoc.m.get(_id=lhs_map[child_name]) - refresh_last_commit(repo_id, path + child_name + '/', child_tree, lhs_child, tree, commit_info) - -def set_last_commit(repo_id, path, name, oid, commit_info): - lc = LastCommitDoc(dict( - _id='%s:%s:%s' % (repo_id, path, name), - object_id=oid, - 
name=name, - commit_info=commit_info)) - lc.m.save(safe=False, upsert=True) - return lc - def last_known_commit_id(all_commit_ids, new_commit_ids): """ Return the newest "known" (cached in mongo) commit id. @@ -522,3 +486,35 @@ def last_known_commit_id(all_commit_ids, new_commit_ids): if not all_commit_ids: return None if not new_commit_ids: return all_commit_ids[-1] return all_commit_ids[all_commit_ids.index(new_commit_ids[0]) - 1] + + +def compute_lcds(commit, cache): + ''' + Compute LastCommit data for every Tree node under this tree. + ''' + trees = cache.get(TreesDoc, dict(_id=commit._id)) + if not trees: + log.error('Missing TreesDoc for %s; skipping compute_lcd' % commit) + return + _update_tree_cache(trees.tree_ids, cache) + c.model_cache = cache + for tree in _walk_commit_tree(commit, cache): + lcd = LastCommit.get(tree) # auto-vivify LCD + +def _walk_commit_tree(commit, cache): + def _walk_tree(tree): + yield tree + for x in tree.tree_ids: + sub_tree = cache.get(Tree, dict(_id=x.id)) + sub_tree.set_context(tree, x.name) + for xx in _walk_tree(sub_tree): + yield xx + top_tree = cache.get(Tree, dict(_id=commit.tree_id)) + top_tree.set_context(commit) + return _walk_tree(top_tree) + +def _update_tree_cache(tree_ids, cache): + current_ids = set(tree_ids) + cached_ids = set([k['_id'] for k in cache.keys(Tree)]) + new_ids = current_ids - cached_ids + cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}}) http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/Allura/allura/model/repository.py ---------------------------------------------------------------------- diff --git a/Allura/allura/model/repository.py b/Allura/allura/model/repository.py index 9867cf0..1cf08cd 100644 --- a/Allura/allura/model/repository.py +++ b/Allura/allura/model/repository.py @@ -433,21 +433,6 @@ class Repository(Artifact, ActivityObject): with self.push_upstream_context(): return MergeRequest.query.find(q).count() - def get_last_commit(self, obj): - from .repo import 
LastCommitDoc - lc = LastCommitDoc.m.get( - repo_id=self._id, object_id=obj._id) - if lc is None: - return dict( - author=None, - author_email=None, - author_url=None, - date=None, - id=None, - shortlink=None, - summary=None) - return lc.commit_info - @property def forks(self): return self.query.find({'upstream_repo.name': self.url()}).all() http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/Allura/allura/tests/model/test_repo.py ---------------------------------------------------------------------- diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py index 90eaac1..0dbff66 100644 --- a/Allura/allura/tests/model/test_repo.py +++ b/Allura/allura/tests/model/test_repo.py @@ -1,5 +1,11 @@ +from datetime import datetime +from collections import defaultdict +import unittest +import mock from nose.tools import assert_equal from pylons import c +from bson import ObjectId +from ming.orm import session from alluratest.controller import setup_basic_test, setup_global_objects from allura import model as M @@ -67,3 +73,541 @@ class RepoImplTestBase(object): self.assertEqual(run.commit_ids, commit_ids) self.assertEqual(len(run.commit_ids), len(run.commit_times)) self.assertEqual(run.parent_commit_ids, []) + + +class TestLastCommit(unittest.TestCase): + def setUp(self): + setup_basic_test() + setup_global_objects() + c.model_cache = M.repo.ModelCache() + self.repo = mock.Mock('repo', _commits={}, _last_commit=None) + self.repo.shorthand_for_commit = lambda _id: _id[:6] + + def _build_tree(self, commit, path, tree_paths): + tree_nodes = [] + blob_nodes = [] + sub_paths = defaultdict(list) + def n(p): + m = mock.Mock() + m.name = p + return m + for p in tree_paths: + if '/' in p: + node, sub = p.split('/',1) + tree_nodes.append(n(node)) + sub_paths[node].append(sub) + else: + blob_nodes.append(n(p)) + tree = mock.Mock( + commit=commit, + path=mock.Mock(return_value=path), + tree_ids=tree_nodes, + blob_ids=blob_nodes, + 
other_ids=[], + ) + tree.get_obj_by_path = lambda p: self._build_tree(commit, p, sub_paths[p]) + return tree + + def _add_commit(self, msg, tree_paths, diff_paths=None, parents=[]): + suser = dict( + name='test', + email='test@example.com', + date=datetime(2013, 1, 1 + len(self.repo._commits)), + ) + commit = M.repo.Commit( + _id=str(ObjectId()), + message=msg, + parent_ids=[parent._id for parent in parents], + commited=suser, + authored=suser, + repo=self.repo, + ) + commit.tree = self._build_tree(commit, '/', tree_paths) + diffinfo = M.repo.DiffInfoDoc(dict( + _id=commit._id, + differences=[{'name': p} for p in diff_paths or tree_paths], + )) + diffinfo.m.save() + self.repo._commits[commit._id] = commit + return commit + + def test_single_commit(self): + commit1 = self._add_commit('Commit 1', [ + 'file1', + 'dir1/file2', + ]) + lcd = M.repo.LastCommit.get(commit1.tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit1.message]) + self.assertEqual(lcd.path, '') + self.assertEqual(len(lcd.entries), 2) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 1', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + ))) + self.assertEqual(lcd.entry_by_name('dir1'), dict( + type='DIR', + name='dir1', + commit_info=dict( + summary='Commit 1', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + ))) + + def test_multiple_commits_no_overlap(self): + commit1 = self._add_commit('Commit 1', ['file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1'], ['dir1/file1'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'file2'], ['file2'], [commit2]) + lcd = M.repo.LastCommit.get(commit3.tree) + 
self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message]) + self.assertEqual(lcd.commit_ids, [commit3._id]) + self.assertEqual(lcd.path, '') + self.assertEqual(len(lcd.entries), 3) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 1', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + ))) + self.assertEqual(lcd.entry_by_name('dir1'), dict( + type='DIR', + name='dir1', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + ))) + self.assertEqual(lcd.entry_by_name('file2'), dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + + def test_multiple_commits_with_overlap(self): + commit1 = self._add_commit('Commit 1', ['file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1'], ['dir1/file1'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'file2'], ['file1', 'file2'], [commit2]) + lcd = M.repo.LastCommit.get(commit3.tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message]) + self.assertEqual(lcd.path, '') + self.assertEqual(len(lcd.entries), 3) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + self.assertEqual(lcd.entry_by_name('dir1'), 
dict( + type='DIR', + name='dir1', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + ))) + self.assertEqual(lcd.entry_by_name('file2'), dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + + def test_multiple_commits_subdir_change(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file1'], [commit2]) + lcd = M.repo.LastCommit.get(commit3.tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message]) + self.assertEqual(lcd.path, '') + self.assertEqual(len(lcd.entries), 2) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 1', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + ))) + self.assertEqual(lcd.entry_by_name('dir1'), dict( + type='DIR', + name='dir1', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + + def test_subdir_lcd(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 
'dir1/file2'], ['dir1/file1'], [commit2]) + tree = self._build_tree(commit3, '/dir1', ['file1', 'file2']) + lcd = M.repo.LastCommit.get(tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit3.message]) + self.assertEqual(lcd.path, 'dir1') + self.assertEqual(len(lcd.entries), 2) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + self.assertEqual(lcd.entry_by_name('file2'), dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + ))) + + def test_subdir_lcd_prev_commit(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file1'], [commit2]) + commit4 = self._add_commit('Commit 4', ['file1', 'dir1/file1', 'dir1/file2', 'file2'], ['file2'], [commit3]) + tree = self._build_tree(commit4, '/dir1', ['file1', 'file2']) + lcd = M.repo.LastCommit.get(tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit4.message, commit3.message]) + self.assertEqual(lcd.path, 'dir1') + self.assertEqual(len(lcd.entries), 2) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + 
self.assertEqual(lcd.entry_by_name('file2'), dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + ))) + + def test_subdir_lcd_always_empty(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1']) + commit2 = self._add_commit('Commit 2', ['file1', 'file2'], ['file2'], [commit1]) + tree = self._build_tree(commit2, '/dir1', []) + lcd = M.repo.LastCommit.get(tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message, commit1.message]) + self.assertEqual(lcd.path, 'dir1') + self.assertEqual(lcd.entries, []) + + def test_subdir_lcd_emptied(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1']) + commit2 = self._add_commit('Commit 2', ['file1'], ['dir1/file1'], [commit1]) + tree = self._build_tree(commit2, '/dir1', []) + lcd = M.repo.LastCommit.get(tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message]) + self.assertEqual(lcd.path, 'dir1') + self.assertEqual(lcd.entries, []) + + def test_existing_lcd_unchained(self): + commit1 = self._add_commit('Commit 1', ['file1', 'dir1/file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'dir1/file1', 'dir1/file2'], ['dir1/file2'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'dir1/file1', 'dir1/file2'], ['file1'], [commit2]) + prev_lcd = M.repo.LastCommit( + path='dir1', + commit_ids=[commit2._id], + entries=[ + dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Commit 1', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + )), + dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + 
date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + )), + ], + ) + session(prev_lcd).flush() + tree = self._build_tree(commit3, '/dir1', ['file1', 'file2']) + lcd = M.repo.LastCommit.get(tree) + self.assertEqual(lcd._id, prev_lcd._id) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], [commit2.message, commit3.message]) + self.assertEqual(lcd.path, 'dir1') + self.assertEqual(lcd.entries, prev_lcd.entries) + + def test_existing_lcd_partial(self): + commit1 = self._add_commit('Commit 1', ['file1']) + commit2 = self._add_commit('Commit 2', ['file1', 'file2'], ['file2'], [commit1]) + commit3 = self._add_commit('Commit 3', ['file1', 'file2', 'file3'], ['file3'], [commit2]) + commit4 = self._add_commit('Commit 4', ['file1', 'file2', 'file3', 'file4'], ['file2', 'file4'], [commit3]) + prev_lcd = M.repo.LastCommit( + path='', + commit_ids=[commit3._id], + entries=[ + dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Existing LCD', # lying here to test that it uses this + author='test', # data instead of walking up the tree + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + )), + dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 2', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 2), + author_url=None, + id=commit2._id, + shortlink=self.repo.shorthand_for_commit(commit2._id), + )), + dict( + type='BLOB', + name='file3', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + )), + ], + ) + session(prev_lcd).flush() + lcd = M.repo.LastCommit.get(commit4.tree) + self.assertEqual([self.repo._commits[c].message for c in lcd.commit_ids], 
[commit4.message]) + self.assertEqual(lcd.path, '') + self.assertEqual(lcd.entry_by_name('file1')['commit_info']['summary'], 'Existing LCD') + self.assertEqual(len(lcd.entries), 4) + self.assertEqual(lcd.entry_by_name('file1'), dict( + type='BLOB', + name='file1', + commit_info=dict( + summary='Existing LCD', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 1), + author_url=None, + id=commit1._id, + shortlink=self.repo.shorthand_for_commit(commit1._id), + ))) + self.assertEqual(lcd.entry_by_name('file2'), dict( + type='BLOB', + name='file2', + commit_info=dict( + summary='Commit 4', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 4), + author_url=None, + id=commit4._id, + shortlink=self.repo.shorthand_for_commit(commit4._id), + ))) + self.assertEqual(lcd.entry_by_name('file3'), dict( + type='BLOB', + name='file3', + commit_info=dict( + summary='Commit 3', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 3), + author_url=None, + id=commit3._id, + shortlink=self.repo.shorthand_for_commit(commit3._id), + ))) + self.assertEqual(lcd.entry_by_name('file4'), dict( + type='BLOB', + name='file4', + commit_info=dict( + summary='Commit 4', + author='test', + author_email='test@example.com', + date=datetime(2013, 1, 4), + author_url=None, + id=commit4._id, + shortlink=self.repo.shorthand_for_commit(commit4._id), + ))) + + +class TestModelCache(unittest.TestCase): + def setUp(self): + self.cache = M.repo.ModelCache() + + def test_normalize_key(self): + self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1))) + + @mock.patch.object(M.repo.Tree.query, 'get') + @mock.patch.object(M.repo.LastCommit.query, 'get') + def test_get(self, lc_get, tr_get): + tr_get.return_value = 'bar' + lc_get.return_value = 'qux' + + val = self.cache.get(M.repo.Tree, {'_id': 'foo'}) + tr_get.assert_called_with(_id='foo') + self.assertEqual(val, 'bar') + + val = 
self.cache.get(M.repo.LastCommit, {'_id': 'foo'}) + lc_get.assert_called_with(_id='foo') + self.assertEqual(val, 'qux') + + @mock.patch.object(M.repo.Tree.query, 'get') + def test_get_no_dup(self, tr_get): + tr_get.return_value = 'bar' + val = self.cache.get(M.repo.Tree, {'_id': 'foo'}) + tr_get.assert_called_once_with(_id='foo') + self.assertEqual(val, 'bar') + + tr_get.return_value = 'qux' + val = self.cache.get(M.repo.Tree, {'_id': 'foo'}) + tr_get.assert_called_once_with(_id='foo') + self.assertEqual(val, 'bar') + + @mock.patch.object(M.repo.TreesDoc.m, 'get') + def test_get_doc(self, tr_get): + tr_get.return_value = 'bar' + val = self.cache.get(M.repo.TreesDoc, {'_id': 'foo'}) + tr_get.assert_called_once_with(_id='foo') + self.assertEqual(val, 'bar') + + def test_set(self): + self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set') + self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}}) + + def test_keys(self): + self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo' + self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar' + self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}]) + self.assertEqual(self.cache.keys(M.repo.LastCommit), []) + + @mock.patch.object(M.repo.Tree.query, 'find') + def test_batch_load(self, tr_find): + # cls, query, attrs + m1 = mock.Mock(foo=1, qux=3) + m2 = mock.Mock(foo=2, qux=5) + tr_find.return_value = [m1, m2] + + self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}) + tr_find.assert_called_with({'foo': {'$in': 'bar'}}) + self.assertEqual(self.cache._cache[M.repo.Tree], { + (('foo', 1),): m1, + (('foo', 2),): m2, + }) + + @mock.patch.object(M.repo.Tree.query, 'find') + def test_batch_load_attrs(self, tr_find): + # cls, query, attrs + m1 = mock.Mock(foo=1, qux=3) + m2 = mock.Mock(foo=2, qux=5) + tr_find.return_value = [m1, m2] + + self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux']) + 
tr_find.assert_called_with({'foo': {'$in': 'bar'}}) + self.assertEqual(self.cache._cache[M.repo.Tree], { + (('qux', 3),): m1, + (('qux', 5),): m2, + }) + + def test_pruning(self): + self.cache.max_size = 2 + self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar') + self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz') + self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r') + self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz') + self.assertEqual(self.cache._cache, { + M.repo.Tree: { + (('_id', 'qux'),): 'zaz', + (('_id', 'f00'),): 'b4r', + }, + }) http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/ForgeSVN/forgesvn/model/svn.py ---------------------------------------------------------------------- diff --git a/ForgeSVN/forgesvn/model/svn.py b/ForgeSVN/forgesvn/model/svn.py index 48d2c9c..3bb7727 100644 --- a/ForgeSVN/forgesvn/model/svn.py +++ b/ForgeSVN/forgesvn/model/svn.py @@ -76,7 +76,7 @@ class Repository(M.Repository): while ci is not None and limit > 0: yield ci._id limit -= 1 - ci = ci.parent() + ci = ci.get_parent() def latest(self, branch=None): if self._impl is None: return None @@ -349,15 +349,27 @@ class SVNImplementation(M.RepositoryImplementation): log.debug('Compute tree for %d paths', len(infos)) tree_ids = [] blob_ids = [] + chg_revno = infos[0][1]['last_changed_rev'].number + cur_revno = self._revno(commit._id) + commit_ids = [self._oid(revno) for revno in range(chg_revno, cur_revno+1)] + lcd = M.repo.LastCommit.query.get( + commit_ids=self._oid(chg_revno), + path=tree_path.strip('/'), + ) + if lcd: + lcd.commit_ids = list(set(lcd.commit_ids + commit_ids)) + lcd_is_new = False + else: + # we can't use the normal auto-vivification, because + # SVN repos don't have their diff infos filled out :( + lcd = M.repo.LastCommit( + commit_ids=commit_ids, + path=tree_path.strip('/'), + ) + lcd_is_new = True for path, info in infos[1:]: last_commit_id = self._oid(info['last_changed_rev'].number) last_commit = M.repo.Commit.query.get(_id=last_commit_id) 
- M.repo_refresh.set_last_commit( - self._repo._id, - re.sub(r'/?$', '/', tree_path), # force it to end with / - path, - self._tree_oid(commit._id, path), - M.repo_refresh.get_commit_info(last_commit)) if info.kind == pysvn.node_kind.dir: tree_ids.append(Object( id=self._tree_oid(commit._id, path), @@ -368,6 +380,13 @@ class SVNImplementation(M.RepositoryImplementation): name=path)) else: assert False + if lcd_is_new: + lcd.entries.append(dict( + name=path, + type='DIR' if info.kind == pysvn.node_kind.dir else 'BLOB', + commit_info=last_commit.info, + )) + session(lcd).flush(lcd) tree, is_new = RM.Tree.upsert(tree_id, tree_ids=tree_ids, blob_ids=blob_ids, http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/scripts/refresh-all-repos.py ---------------------------------------------------------------------- diff --git a/scripts/refresh-all-repos.py b/scripts/refresh-all-repos.py index 822148f..1cf7e3d 100644 --- a/scripts/refresh-all-repos.py +++ b/scripts/refresh-all-repos.py @@ -32,6 +32,7 @@ def main(options): M.repo.TreesDoc.m.remove({}) M.repo.DiffInfoDoc.m.remove({}) M.repo.CommitRunDoc.m.remove({}) + M.repo.LastCommitDoc.m.remove({}) for chunk in chunked_find(M.Project, q_project): for p in chunk: @@ -72,9 +73,6 @@ def main(options): i = M.repo.TreeDoc.m.find({"_id": {"$in": tree_ids_chunk}}).count() log.info("Deleting %i TreeDoc docs...", i) M.repo.TreeDoc.m.remove({"_id": {"$in": tree_ids_chunk}}) - i = M.repo.LastCommitDoc.m.find({"object_id": {"$in": tree_ids_chunk}}).count() - log.info("Deleting %i LastCommitDoc docs...", i) - M.repo.LastCommitDoc.m.remove({"object_id": {"$in": tree_ids_chunk}}) del tree_ids # delete these after TreeDoc and LastCommitDoc so that if @@ -83,11 +81,10 @@ def main(options): log.info("Deleting %i TreesDoc docs...", i) M.repo.TreesDoc.m.remove({"_id": {"$in": ci_ids}}) - # delete LastCommitDocs for non-trees - repo_lastcommit_re = re.compile("^{}:".format(c.app.repo._id)) - i = 
M.repo.LastCommitDoc.m.find(dict(_id=repo_lastcommit_re)).count() + # delete LastCommitDocs + i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count() log.info("Deleting %i remaining LastCommitDoc docs, by repo id...", i) - M.repo.LastCommitDoc.m.remove(dict(_id=repo_lastcommit_re)) + M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids})) i = M.repo.DiffInfoDoc.m.find({"_id": {"$in": ci_ids}}).count() log.info("Deleting %i DiffInfoDoc docs...", i) http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/cbe221ee/scripts/refresh-last-commits.py ---------------------------------------------------------------------- diff --git a/scripts/refresh-last-commits.py b/scripts/refresh-last-commits.py new file mode 100644 index 0000000..8776010 --- /dev/null +++ b/scripts/refresh-last-commits.py @@ -0,0 +1,172 @@ +import argparse +import logging +import re +from datetime import datetime +from contextlib import contextmanager + +import faulthandler +from pylons import c +from ming.orm import ThreadLocalORMSession + +from allura import model as M +from allura.lib.utils import chunked_find, chunked_list + +log = logging.getLogger(__name__) + + +def main(options): + q_project = {} + if options.nbhd: + nbhd = M.Neighborhood.query.get(url_prefix=options.nbhd) + if not nbhd: + return "Invalid neighborhood url prefix." + q_project['neighborhood_id'] = nbhd._id + if options.project: + q_project['shortname'] = options.project + elif options.project_regex: + q_project['shortname'] = {'$regex': options.project_regex} + + log.info('Refreshing last commit data') + + for chunk in chunked_find(M.Project, q_project): + for p in chunk: + log.info("Refreshing last commit data for project '%s'." 
% p.shortname) + if options.dry_run: + continue + c.project = p + if options.mount_point: + mount_points = [options.mount_point] + else: + mount_points = [ac.options.mount_point for ac in + M.AppConfig.query.find(dict(project_id=p._id))] + for app in (p.app_instance(mp) for mp in mount_points): + c.app = app + if not hasattr(app, 'repo'): + continue + if c.app.repo.tool.lower() not in options.repo_types: + log.info("Skipping %r: wrong type (%s)", c.app.repo, + c.app.repo.tool.lower()) + continue + + ci_ids = list(reversed(list(c.app.repo.all_commit_ids()))) + #ci_ids = list(c.app.repo.all_commit_ids()) + if options.clean: + if options.diffs: + # delete DiffInfoDocs + i = M.repo.DiffInfoDoc.m.find(dict(commit_ids={'$in': ci_ids})).count() + log.info("Deleting %i DiffInfoDoc docs, by repo id...", i) + M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids})) + + # delete LastCommitDocs + i = M.repo.LastCommitDoc.m.find(dict(commit_ids={'$in': ci_ids})).count() + log.info("Deleting %i LastCommitDoc docs, by repo id...", i) + M.repo.LastCommitDoc.m.remove(dict(commit_ids={'$in': ci_ids})) + + try: + log.info('Refreshing all last commits in %r', c.app.repo) + if options.profile: + import cProfile + cProfile.runctx('refresh_repo_lcds(ci_ids, options)', + globals(), locals(), '/tmp/refresh_lcds.profile') + else: + refresh_repo_lcds(ci_ids, options) + except: + log.exception('Error refreshing %r', c.app.repo) + raise + ThreadLocalORMSession.flush_all() + ThreadLocalORMSession.close_all() + + +def refresh_repo_lcds(commit_ids, options): + tree_cache = {} + timings = [] + if options.diffs: + print 'Processing diffs' + for commit_id in commit_ids: + commit = M.repo.Commit.query.get(_id=commit_id) + with time(timings): + M.repo_refresh.compute_diffs(c.app.repo._id, tree_cache, commit) + if len(timings) % 1000 == 0: + mt = max(timings) + tt = sum(timings) + at = tt / len(timings) + print ' Processed %d commits (max: %f, avg: %f, tot: %f, cl: %d)' % ( + len(timings), mt, 
at, tt, len(tree_cache)) + lcd_cache = M.repo.ModelCache(80000) + timings = [] + print 'Processing last commits' + for commit_id in commit_ids: + commit = M.repo.Commit.query.get(_id=commit_id) + with time(timings): + M.repo_refresh.compute_lcds(commit, lcd_cache) + if len(timings) % 100 == 0: + mt = max(timings) + tt = sum(timings) + at = tt / len(timings) + mat = sum(timings[-100:]) / 100 + print ' Processed %d commits (max: %f, avg: %f, mavg: %f, tot: %f, lc: %d, lcl: %d, hits: %d, agw: %d, mgw: %d, gh: %d, abw: %d, mbw: %d, ts: %d)' % ( + len(timings), mt, at, mat, tt, lcd_cache.size(), len(lcd_cache._cache[M.repo.LastCommit]), + lcd_cache._hits * 100 / (lcd_cache._hits + lcd_cache._misses), + lcd_cache._get_walks / lcd_cache._get_calls, lcd_cache._get_walks_max, lcd_cache._get_hits * 100 / lcd_cache._get_calls, + lcd_cache._build_walks / lcd_cache._build_calls, lcd_cache._build_walks_max, + len(lcd_cache.get(M.repo.TreesDoc, dict(_id=commit._id)).tree_ids)) + ThreadLocalORMSession.flush_all() + ThreadLocalORMSession.close_all() + #if len(timings) == 300: + # break + + +@contextmanager +def time(timings): + s = datetime.now() + yield + timings.append((datetime.now() - s).total_seconds()) + + +def repo_type_list(s): + repo_types = [] + for repo_type in s.split(','): + repo_type = repo_type.strip() + if repo_type not in ['svn', 'git', 'hg']: + raise argparse.ArgumentTypeError( + '{} is not a valid repo type.'.format(repo_type)) + repo_types.append(repo_type) + return repo_types + + +def parse_options(): + parser = argparse.ArgumentParser(description='Using existing commit data, ' + 'refresh the last commit metadata in MongoDB. Run for all repos (no args), ' + 'or restrict by neighborhood, project, or code tool mount point.') + parser.add_argument('--nbhd', action='store', default='', dest='nbhd', + help='Restrict update to a particular neighborhood, e.g. 
/p/.') + parser.add_argument('--project', action='store', default='', dest='project', + help='Restrict update to a particular project. To specify a ' + 'subproject, use a slash: project/subproject.') + parser.add_argument('--project-regex', action='store', default='', + dest='project_regex', + help='Restrict update to projects for which the shortname matches ' + 'the provided regex.') + parser.add_argument('--repo-types', action='store', type=repo_type_list, + default=['svn', 'git', 'hg'], dest='repo_types', + help='Only refresh last commits for repos of the given type(s). Defaults to: ' + 'svn,git,hg. Example: --repo-types=git,hg') + parser.add_argument('--mount_point', default='', dest='mount_point', + help='Restrict update to repos at the given tool mount point. ') + parser.add_argument('--clean', action='store_true', dest='clean', + default=False, help='Remove last commit mongo docs for ' + 'project(s) being refreshed before doing the refresh.') + parser.add_argument('--dry-run', action='store_true', dest='dry_run', + default=False, help='Log names of projects that would have their ' + 'last commits refreshed, but do not perform the actual refresh.') + parser.add_argument('--profile', action='store_true', dest='profile', + default=False, help='Enable the profiler (slow). Will log ' + 'profiling output to ./refresh.profile') + parser.add_argument('--diffs', action='store_true', dest='diffs', + default=False, help='Refresh diffs as well as LCDs') + return parser.parse_args() + +if __name__ == '__main__': + import sys + faulthandler.enable() + sys.exit(main(parse_options()))