From: johnsca@apache.org
To: allura-commits@incubator.apache.org
Subject: [2/4] git commit: [#4691] Improved memory footprint for ModelCache
Message-Id: <20121213023017.3C55781BE0D@tyr.zones.apache.org>
Date: Thu, 13 Dec 2012 02:30:17 +0000 (UTC)

[#4691] Improved memory footprint for ModelCache

Signed-off-by: Cory Johns


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/c73eea0f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/c73eea0f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/c73eea0f

Branch: refs/heads/cj/4691
Commit: c73eea0f98aa385b705ffb0bf218a47a3fa03d93
Parents: a9e99e6
Author: Cory Johns
Authored: Thu Dec 13 02:28:34 2012 +0000
Committer: Cory Johns
Committed: Thu Dec 13 02:28:34 2012 +0000

----------------------------------------------------------------------
 Allura/allura/model/repo.py            |  148 ++++++++++++++++++--------
 Allura/allura/model/repo_refresh.py    |    2 +-
 Allura/allura/tests/model/test_repo.py |  135 ++++++++++++++++---------
 3 files changed, 190 insertions(+), 95 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index 2ed1e87..6d8a61f 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -7,6 +7,7 @@ from itertools import chain
 from datetime import datetime
 from collections import defaultdict, OrderedDict
 from difflib import SequenceMatcher, unified_diff
+import bson
 
 from pylons import c
 import pymongo.errors
@@ -830,17 +831,37 @@ class ModelCache(object):
     '''
     Cache model instances based on query params passed to get.
     '''
-    def __init__(self, max_size=2000):
+    def __init__(self, max_instances=None, max_queries=None):
         '''
-        The max_size of the cache is tracked separately for
-        each model class stored.  I.e., you can have 2000
-        Commit instances and 2000 Tree instances in the cache
-        at once with the default value.
+        By default, each model type can have 2000 instances and
+        8000 queries.  You can override these for specific model
+        types by passing in a dict() for either max_instances or
+        max_queries keyed by the class(es) with the max values.
+        Classes not in the dict() will use the default 2000/8000
+        default.
+
+        If you pass in a number instead of a dict, that value will
+        be used as the max for all classes.
         '''
-        self._cache = defaultdict(OrderedDict)
-        self.max_size = max_size
+        max_instances_default = 2000
+        max_queries_default = 8000
+        if isinstance(max_instances, int):
+            max_instances_default = max_instances
+        if isinstance(max_queries, int):
+            max_queries_default = max_queries
+        self._max_instances = defaultdict(lambda:max_instances_default)
+        self._max_queries = defaultdict(lambda:max_queries_default)
+        if hasattr(max_instances, 'items'):
+            self._max_instances.update(max_instances)
+        if hasattr(max_queries, 'items'):
+            self._max_queries.update(max_queries)
+
+        self._query_cache = defaultdict(OrderedDict)  # keyed by query, holds _id
+        self._instance_cache = defaultdict(OrderedDict)  # keyed by _id
+
         # temporary, for performance testing
-        self._hits = defaultdict(int)
+        self._query_hits = defaultdict(int)
+        self._instance_hits = defaultdict(int)
         self._accesses = defaultdict(int)
         self._get_calls = 0
         self._get_walks = 0
@@ -851,59 +872,94 @@ class ModelCache(object):
         self._build_walks = 0
         self._build_walks_max = 0
 
-    def _normalize_key(self, key):
-        _key = key
-        if not isinstance(_key, tuple):
-            _key = tuple(sorted(_key.items(), key=lambda k: k[0]))
-        return _key
+    def _normalize_query(self, query):
+        _query = query
+        if not isinstance(_query, tuple):
+            _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
+        return _query
+
+    def _model_query(self, cls):
+        if hasattr(cls, 'query'):
+            return cls.query
+        elif hasattr(cls, 'm'):
+            return cls.m
+        else:
+            raise AttributeError('%s has neither "query" nor "m" attribute' % cls)
 
-    def get(self, cls, key):
-        _key = self._normalize_key(key)
-        self._manage_cache(cls, _key)
+    def get(self, cls, query):
         self._accesses[cls] += 1
-        if _key not in self._cache[cls]:
-            query = getattr(cls, 'query', getattr(cls, 'm', None))
-            self.set(cls, _key, query.get(**key))
+        _query = self._normalize_query(query)
+        self._touch(cls, _query)
+        if _query not in self._query_cache[cls]:
+            self.set(cls, _query, self._model_query(cls).get(**query))
         else:
-            self._hits[cls] += 1
-        return self._cache[cls][_key]
-
-    def set(self, cls, key, val):
-        _key = self._normalize_key(key)
-        self._manage_cache(cls, _key)
-        self._cache[cls][_key] = val
-
-    def _manage_cache(self, cls, key):
+            self._query_hits[cls] += 1
+        _id = self._query_cache[cls][_query]
+        if _id not in self._instance_cache[cls]:
+            model_query = getattr(cls, 'query', getattr(cls, 'm', None))
+            self.set(cls, _query, self._model_query(cls).get(**query))
+        else:
+            self._instance_hits[cls] += 1
+        return self._instance_cache[cls][_id]
+
+    def set(self, cls, query, val):
+        _query = self._normalize_query(query)
+        self._touch(cls, _query)
+        _id = self._query_cache[cls].get(_query, getattr(val, '_id', None))
+        if _id is None:
+            _id = 'None_%s' % bson.ObjectId()
+        self._query_cache[cls][_query] = _id
+        self._instance_cache[cls][_id] = val
+        self._check_sizes(cls)
+
+    def _touch(self, cls, query):
         '''
         Keep track of insertion order, prevent duplicates,
         and expire from the cache in a FIFO manner.
         '''
-        if key in self._cache[cls]:
-            # refresh access time in cache
-            val = self._cache[cls].pop(key)
-            self._cache[cls][key] = val
-        elif len(self._cache[cls]) >= self.max_size:
-            # remove the least-recently-used cache item
-            key, instance = self._cache[cls].popitem(last=False)
+        _query = self._normalize_query(query)
+        if _query not in self._query_cache[cls]:
+            return
+        _id = self._query_cache[cls].pop(_query)
+        self._query_cache[cls][_query] = _id
+
+        if _id not in self._instance_cache[cls]:
+            return
+        val = self._instance_cache[cls].pop(_id)
+        self._instance_cache[cls][_id] = val
+
+    def _check_sizes(self, cls):
+        if self.num_queries(cls) > self._max_queries[cls]:
+            self._remove_least_recently_used(self._query_cache[cls])
+        if self.num_instances(cls) > self._max_instances[cls]:
+            instance = self._remove_least_recently_used(self._instance_cache[cls])
             try:
                 inst_session = session(instance)
             except AttributeError:
                 inst_session = None
             if inst_session:
+                inst_session.flush(instance)
                 inst_session.expunge(instance)
 
-    def size(self):
-        return sum([len(c) for c in self._cache.values()])
+    def _remove_least_recently_used(self, cache):
+        # last-used (most-recently-used) is last in cache, so take first
+        key, val = cache.popitem(last=False)
+        return val
 
-    def keys(self, cls, as_dict=True):
-        '''
-        Returns all the cache keys for a given class.  Each
-        cache key will be a dict.
-        '''
-        if as_dict:
-            return [dict(k) for k in self._cache[cls].keys()]
+    def num_queries(self, cls=None):
+        if cls is None:
+            return sum([len(c) for c in self._query_cache.values()])
         else:
-            return self._cache[cls].keys()
+            return len(self._query_cache[cls])
+
+    def num_instances(self, cls=None):
+        if cls is None:
+            return sum([len(c) for c in self._instance_cache.values()])
+        else:
+            return len(self._instance_cache[cls])
+
+    def instance_ids(self, cls):
+        return self._instance_cache[cls].keys()
 
     def batch_load(self, cls, query, attrs=None):
         '''
@@ -915,6 +971,6 @@ class ModelCache(object):
         '''
         if attrs is None:
            attrs = query.keys()
-        for result in cls.query.find(query):
+        for result in self._model_query(cls).find(query):
             keys = {a: getattr(result, a) for a in attrs}
             self.set(cls, keys, result)


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index dfabc5b..b827579 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -523,6 +523,6 @@ def _pull_tree(cache, tree_id, *context):
 
 def _update_tree_cache(tree_ids, cache):
     current_ids = set(tree_ids)
-    cached_ids = set([k[0][1] for k in cache.keys(Tree, as_dict=False)])
+    cached_ids = set(cache.instance_ids(Tree))
     new_ids = current_ids - cached_ids
     cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 040c750..40af7ce 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -525,99 +525,138 @@ class TestModelCache(unittest.TestCase):
     def setUp(self):
         self.cache = M.repo.ModelCache()
 
-    def test_normalize_key(self):
-        self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+    def test_normalize_query(self):
+        self.assertEqual(self.cache._normalize_query({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+
+    def test_model_query(self):
+        q = mock.Mock(spec_set=['query'], query='foo')
+        m = mock.Mock(spec_set=['m'], m='bar')
+        n = mock.Mock(spec_set=['foo'], foo='qux')
+        self.assertEquals(self.cache._model_query(q), 'foo')
+        self.assertEquals(self.cache._model_query(m), 'bar')
+        self.assertRaises(AttributeError, self.cache._model_query, [n])
 
     @mock.patch.object(M.repo.Tree.query, 'get')
     @mock.patch.object(M.repo.LastCommit.query, 'get')
     def test_get(self, lc_get, tr_get):
-        tr_get.return_value = 'bar'
-        lc_get.return_value = 'qux'
+        tree = tr_get.return_value = mock.Mock(_id='foo', val='bar')
+        lcd = lc_get.return_value = mock.Mock(_id='foo', val='qux')
 
         val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree)
 
         val = self.cache.get(M.repo.LastCommit, {'_id': 'foo'})
         lc_get.assert_called_with(_id='foo')
-        self.assertEqual(val, 'qux')
+        self.assertEqual(val, lcd)
 
     @mock.patch.object(M.repo.Tree.query, 'get')
-    def test_get_no_dup(self, tr_get):
-        tr_get.return_value = 'bar'
+    def test_get_no_query(self, tr_get):
+        tree1 = tr_get.return_value = mock.Mock(_id='foo', val='bar')
        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree1)
 
-        tr_get.return_value = 'qux'
+        tree2 = tr_get.return_value = mock.Mock(_id='foo', val='qux')
         val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree1)
 
     @mock.patch.object(M.repo.TreesDoc.m, 'get')
     def test_get_doc(self, tr_get):
-        tr_get.return_value = 'bar'
+        trees = tr_get.return_value = mock.Mock(_id='foo', val='bar')
         val = self.cache.get(M.repo.TreesDoc, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, trees)
 
     def test_set(self):
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set')
-        self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}})
-
-    def test_keys(self):
-        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
-        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
-        self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
-        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
-
-    def test_keys_not_as_dict(self):
-        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
-        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
-        self.assertEqual(self.cache.keys(M.repo.Tree, as_dict=False), [(('_id', 'test_keys'), ('text', 'tko')), (('fubar', 'scm'),)])
-        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+        tree = mock.Mock(_id='foo', val='test_set')
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'foo'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'foo': tree}})
+
+    @mock.patch('bson.ObjectId')
+    def test_set_none_id(self, obj_id):
+        obj_id.return_value = 'OBJID'
+        tree = mock.Mock(_id=None, val='test_set')
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': tree}})
+
+    @mock.patch('bson.ObjectId')
+    def test_set_none_val(self, obj_id):
+        obj_id.return_value = 'OBJID'
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, None)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': None}})
+
+    def test_instance_ids(self):
+        tree1 = mock.Mock(_id='id1', val='tree1')
+        tree2 = mock.Mock(_id='id2', val='tree2')
+        self.cache.set(M.repo.Tree, {'val': 'tree1'}, tree1)
+        self.cache.set(M.repo.Tree, {'val': 'tree2'}, tree2)
+        self.assertEqual(set(self.cache.instance_ids(M.repo.Tree)), set(['id1', 'id2']))
+        self.assertEqual(self.cache.instance_ids(M.repo.LastCommit), [])
 
     @mock.patch.object(M.repo.Tree.query, 'find')
     def test_batch_load(self, tr_find):
         # cls, query, attrs
-        m1 = mock.Mock(foo=1, qux=3)
-        m2 = mock.Mock(foo=2, qux=5)
+        m1 = mock.Mock(_id='id1', foo=1, qux=3)
+        m2 = mock.Mock(_id='id2', foo=2, qux=5)
         tr_find.return_value = [m1, m2]
 
         self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}})
         tr_find.assert_called_with({'foo': {'$in': 'bar'}})
-        self.assertEqual(self.cache._cache[M.repo.Tree], {
-                (('foo', 1),): m1,
-                (('foo', 2),): m2,
+        self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+                (('foo', 1),): 'id1',
+                (('foo', 2),): 'id2',
+            })
+        self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+                'id1': m1,
+                'id2': m2,
             })
 
     @mock.patch.object(M.repo.Tree.query, 'find')
     def test_batch_load_attrs(self, tr_find):
         # cls, query, attrs
-        m1 = mock.Mock(foo=1, qux=3)
-        m2 = mock.Mock(foo=2, qux=5)
+        m1 = mock.Mock(_id='id1', foo=1, qux=3)
+        m2 = mock.Mock(_id='id2', foo=2, qux=5)
         tr_find.return_value = [m1, m2]
 
         self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux'])
         tr_find.assert_called_with({'foo': {'$in': 'bar'}})
-        self.assertEqual(self.cache._cache[M.repo.Tree], {
-                (('qux', 3),): m1,
-                (('qux', 5),): m2,
+        self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+                (('qux', 3),): 'id1',
+                (('qux', 5),): 'id2',
+            })
+        self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+                'id1': m1,
+                'id2': m2,
             })
 
     def test_pruning(self):
-        self.cache.max_size = 3
+        cache = M.repo.ModelCache(max_queries=3, max_instances=2)
         # ensure cache expires as LRU
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar')
-        self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
-        self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r')
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'zaz')
-        self.cache.get(M.repo.Tree, {'_id': 'f00'})
-        self.cache.set(M.repo.Tree, {'_id': 'mee'}, 'you')
-        self.assertEqual(self.cache._cache, {
+        tree1 = mock.Mock(_id='foo', val='bar')
+        tree2 = mock.Mock(_id='qux', val='fuz')
+        tree3 = mock.Mock(_id='f00', val='b4r')
+        tree4 = mock.Mock(_id='foo', val='zaz')
+        cache.set(M.repo.Tree, {'_id': 'foo'}, tree1)
+        cache.set(M.repo.Tree, {'_id': 'qux'}, tree2)
+        cache.set(M.repo.Tree, {'_id': 'f00'}, tree3)
+        cache.set(M.repo.Tree, {'_id': 'foo'}, tree4)
+        cache.get(M.repo.Tree, {'_id': 'f00'})
+        cache.set(M.repo.Tree, {'val': 'b4r'}, tree3)
+        self.assertEqual(cache._query_cache, {
+                M.repo.Tree: {
+                        (('_id', 'foo'),): 'foo',
+                        (('_id', 'f00'),): 'f00',
+                        (('val', 'b4r'),): 'f00',
+                    },
+            })
+        self.assertEqual(cache._instance_cache, {
                 M.repo.Tree: {
-                        (('_id', 'foo'),): 'zaz',
-                        (('_id', 'f00'),): 'b4r',
-                        (('_id', 'mee'),): 'you',
+                        'f00': tree3,
+                        'foo': tree4,
                     },
             })
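----------------------------------------------------------------------

The memory win in this commit comes from splitting the old single mapping
(query -> instance) into a query cache (normalized query tuple -> _id) and an
instance cache (_id -> instance): an object reached through several different
queries is now held only once, and each tier is capped and expired separately,
per model class, via max_instances / max_queries. The snippet below is a
minimal, self-contained sketch of that two-tier idea, not the Allura class
itself; the FakeTree model, the fixed limits, and the absence of any Ming
session flush/expunge handling are assumptions made purely for illustration.

    from collections import OrderedDict, defaultdict

    class FakeTree(object):
        '''Illustrative stand-in for a mapped model class (not part of Allura).'''
        def __init__(self, _id, val):
            self._id = _id
            self.val = val

    class TwoTierCache(object):
        '''Simplified sketch of the query-cache / instance-cache split.'''
        def __init__(self, max_queries=8, max_instances=2):
            self._query_cache = defaultdict(OrderedDict)     # normalized query -> _id
            self._instance_cache = defaultdict(OrderedDict)  # _id -> instance
            self.max_queries = max_queries
            self.max_instances = max_instances

        def _normalize_query(self, query):
            # dict -> order-independent, hashable key
            return tuple(sorted(query.items()))

        def set(self, cls, query, val):
            _id = getattr(val, '_id', None)
            self._query_cache[cls][self._normalize_query(query)] = _id
            self._instance_cache[cls][_id] = val
            # trim each tier independently, oldest entry (front of OrderedDict) first
            while len(self._query_cache[cls]) > self.max_queries:
                self._query_cache[cls].popitem(last=False)
            while len(self._instance_cache[cls]) > self.max_instances:
                self._instance_cache[cls].popitem(last=False)

        def get(self, cls, query):
            _id = self._query_cache[cls].get(self._normalize_query(query))
            return self._instance_cache[cls].get(_id)

    cache = TwoTierCache()
    tree = FakeTree(_id='abc123', val='tree body')
    # Two different queries resolving to the same document share one cached instance.
    cache.set(FakeTree, {'_id': 'abc123'}, tree)
    cache.set(FakeTree, {'path': '/', 'commit_id': 'abc123'}, tree)
    assert cache.get(FakeTree, {'commit_id': 'abc123', 'path': '/'}) is tree
    assert len(cache._instance_cache[FakeTree]) == 1  # one instance, two query keys

In the real ModelCache above, expiry is effectively least-recently-used rather
than strict FIFO (get and set move touched entries to the back of each
OrderedDict, as test_pruning exercises), and evicted instances are flushed and
expunged from their Ming session so they can be garbage collected.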