From: johnsca@apache.org
To: allura-commits@incubator.apache.org
Subject: [2/4] git commit: [#4691] Improved memory footprint for ModelCache
Message-Id: <20121213023017.3C55781BE0D@tyr.zones.apache.org>
Date: Thu, 13 Dec 2012 02:30:17 +0000 (UTC)

[#4691] Improved memory footprint for ModelCache

Signed-off-by: Cory Johns


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/c73eea0f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/c73eea0f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/c73eea0f

Branch: refs/heads/cj/4691
Commit: c73eea0f98aa385b705ffb0bf218a47a3fa03d93
Parents: a9e99e6
Author: Cory Johns
Authored: Thu Dec 13 02:28:34 2012 +0000
Committer: Cory Johns
Committed: Thu Dec 13 02:28:34 2012 +0000

----------------------------------------------------------------------
 Allura/allura/model/repo.py            |  148 ++++++++++++++++++--------
 Allura/allura/model/repo_refresh.py    |    2 +-
 Allura/allura/tests/model/test_repo.py |  135 ++++++++++++++++---------
 3 files changed, 190 insertions(+), 95 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index 2ed1e87..6d8a61f 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -7,6 +7,7 @@ from itertools import chain
 from datetime import datetime
 from collections import defaultdict, OrderedDict
 from difflib import SequenceMatcher, unified_diff
+import bson
 
 from pylons import c
 import pymongo.errors
@@ -830,17 +831,37 @@ class ModelCache(object):
     '''
     Cache model instances based on query params passed to get.
     '''
-    def __init__(self, max_size=2000):
+    def __init__(self, max_instances=None, max_queries=None):
         '''
-        The max_size of the cache is tracked separately for
-        each model class stored.  I.e., you can have 2000
-        Commit instances and 2000 Tree instances in the cache
-        at once with the default value.
+        By default, each model type can have 2000 instances and
+        8000 queries.  You can override these for specific model
+        types by passing in a dict() for either max_instances or
+        max_queries keyed by the class(es) with the max values.
+        Classes not in the dict() will use the default 2000/8000
+        default.
+
+        If you pass in a number instead of a dict, that value will
+        be used as the max for all classes.
         '''
-        self._cache = defaultdict(OrderedDict)
-        self.max_size = max_size
+        max_instances_default = 2000
+        max_queries_default = 8000
+        if isinstance(max_instances, int):
+            max_instances_default = max_instances
+        if isinstance(max_queries, int):
+            max_queries_default = max_queries
+        self._max_instances = defaultdict(lambda:max_instances_default)
+        self._max_queries = defaultdict(lambda:max_queries_default)
+        if hasattr(max_instances, 'items'):
+            self._max_instances.update(max_instances)
+        if hasattr(max_queries, 'items'):
+            self._max_queries.update(max_queries)
+
+        self._query_cache = defaultdict(OrderedDict)  # keyed by query, holds _id
+        self._instance_cache = defaultdict(OrderedDict)  # keyed by _id
+
         # temporary, for performance testing
-        self._hits = defaultdict(int)
+        self._query_hits = defaultdict(int)
+        self._instance_hits = defaultdict(int)
         self._accesses = defaultdict(int)
         self._get_calls = 0
         self._get_walks = 0
@@ -851,59 +872,94 @@ class ModelCache(object):
         self._build_walks = 0
         self._build_walks_max = 0
 
-    def _normalize_key(self, key):
-        _key = key
-        if not isinstance(_key, tuple):
-            _key = tuple(sorted(_key.items(), key=lambda k: k[0]))
-        return _key
+    def _normalize_query(self, query):
+        _query = query
+        if not isinstance(_query, tuple):
+            _query = tuple(sorted(_query.items(), key=lambda k: k[0]))
+        return _query
+
+    def _model_query(self, cls):
+        if hasattr(cls, 'query'):
+            return cls.query
+        elif hasattr(cls, 'm'):
+            return cls.m
+        else:
+            raise AttributeError('%s has neither "query" nor "m" attribute' % cls)
 
-    def get(self, cls, key):
-        _key = self._normalize_key(key)
-        self._manage_cache(cls, _key)
+    def get(self, cls, query):
         self._accesses[cls] += 1
-        if _key not in self._cache[cls]:
-            query = getattr(cls, 'query', getattr(cls, 'm', None))
-            self.set(cls, _key, query.get(**key))
+        _query = self._normalize_query(query)
+        self._touch(cls, _query)
+        if _query not in self._query_cache[cls]:
+            self.set(cls, _query, self._model_query(cls).get(**query))
         else:
-            self._hits[cls] += 1
-        return self._cache[cls][_key]
-
-    def set(self, cls, key, val):
-        _key = self._normalize_key(key)
-        self._manage_cache(cls, _key)
-        self._cache[cls][_key] = val
-
-    def _manage_cache(self, cls, key):
+            self._query_hits[cls] += 1
+        _id = self._query_cache[cls][_query]
+        if _id not in self._instance_cache[cls]:
+            model_query = getattr(cls, 'query', getattr(cls, 'm', None))
+            self.set(cls, _query, self._model_query(cls).get(**query))
+        else:
+            self._instance_hits[cls] += 1
+        return self._instance_cache[cls][_id]
+
+    def set(self, cls, query, val):
+        _query = self._normalize_query(query)
+        self._touch(cls, _query)
+        _id = self._query_cache[cls].get(_query, getattr(val, '_id', None))
+        if _id is None:
+            _id = 'None_%s' % bson.ObjectId()
+        self._query_cache[cls][_query] = _id
+        self._instance_cache[cls][_id] = val
+        self._check_sizes(cls)
+
+    def _touch(self, cls, query):
         '''
         Keep track of insertion order, prevent duplicates,
         and expire from the cache in a FIFO manner.
         '''
-        if key in self._cache[cls]:
-            # refresh access time in cache
-            val = self._cache[cls].pop(key)
-            self._cache[cls][key] = val
-        elif len(self._cache[cls]) >= self.max_size:
-            # remove the least-recently-used cache item
-            key, instance = self._cache[cls].popitem(last=False)
+        _query = self._normalize_query(query)
+        if _query not in self._query_cache[cls]:
+            return
+        _id = self._query_cache[cls].pop(_query)
+        self._query_cache[cls][_query] = _id
+
+        if _id not in self._instance_cache[cls]:
+            return
+        val = self._instance_cache[cls].pop(_id)
+        self._instance_cache[cls][_id] = val
+
+    def _check_sizes(self, cls):
+        if self.num_queries(cls) > self._max_queries[cls]:
+            self._remove_least_recently_used(self._query_cache[cls])
+        if self.num_instances(cls) > self._max_instances[cls]:
+            instance = self._remove_least_recently_used(self._instance_cache[cls])
             try:
                 inst_session = session(instance)
             except AttributeError:
                 inst_session = None
             if inst_session:
+                inst_session.flush(instance)
                 inst_session.expunge(instance)
 
-    def size(self):
-        return sum([len(c) for c in self._cache.values()])
+    def _remove_least_recently_used(self, cache):
+        # last-used (most-recently-used) is last in cache, so take first
+        key, val = cache.popitem(last=False)
+        return val
 
-    def keys(self, cls, as_dict=True):
-        '''
-        Returns all the cache keys for a given class.  Each
-        cache key will be a dict.
-        '''
-        if as_dict:
-            return [dict(k) for k in self._cache[cls].keys()]
+    def num_queries(self, cls=None):
+        if cls is None:
+            return sum([len(c) for c in self._query_cache.values()])
         else:
-            return self._cache[cls].keys()
+            return len(self._query_cache[cls])
+
+    def num_instances(self, cls=None):
+        if cls is None:
+            return sum([len(c) for c in self._instance_cache.values()])
+        else:
+            return len(self._instance_cache[cls])
+
+    def instance_ids(self, cls):
+        return self._instance_cache[cls].keys()
 
     def batch_load(self, cls, query, attrs=None):
         '''
@@ -915,6 +971,6 @@ class ModelCache(object):
         '''
         if attrs is None:
            attrs = query.keys()
-        for result in cls.query.find(query):
+        for result in self._model_query(cls).find(query):
             keys = {a: getattr(result, a) for a in attrs}
             self.set(cls, keys, result)


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index dfabc5b..b827579 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -523,6 +523,6 @@ def _pull_tree(cache, tree_id, *context):
 
 def _update_tree_cache(tree_ids, cache):
     current_ids = set(tree_ids)
-    cached_ids = set([k[0][1] for k in cache.keys(Tree, as_dict=False)])
+    cached_ids = set(cache.instance_ids(Tree))
     new_ids = current_ids - cached_ids
     cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/c73eea0f/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 040c750..40af7ce 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -525,99 +525,138 @@ class TestModelCache(unittest.TestCase):
     def setUp(self):
         self.cache = M.repo.ModelCache()
 
-    def test_normalize_key(self):
-        self.assertEqual(self.cache._normalize_key({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+    def test_normalize_query(self):
+        self.assertEqual(self.cache._normalize_query({'foo': 1, 'bar': 2}), (('bar', 2), ('foo', 1)))
+
+    def test_model_query(self):
+        q = mock.Mock(spec_set=['query'], query='foo')
+        m = mock.Mock(spec_set=['m'], m='bar')
+        n = mock.Mock(spec_set=['foo'], foo='qux')
+        self.assertEquals(self.cache._model_query(q), 'foo')
+        self.assertEquals(self.cache._model_query(m), 'bar')
+        self.assertRaises(AttributeError, self.cache._model_query, [n])
 
     @mock.patch.object(M.repo.Tree.query, 'get')
     @mock.patch.object(M.repo.LastCommit.query, 'get')
     def test_get(self, lc_get, tr_get):
-        tr_get.return_value = 'bar'
-        lc_get.return_value = 'qux'
+        tree = tr_get.return_value = mock.Mock(_id='foo', val='bar')
+        lcd = lc_get.return_value = mock.Mock(_id='foo', val='qux')
 
         val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree)
 
         val = self.cache.get(M.repo.LastCommit, {'_id': 'foo'})
         lc_get.assert_called_with(_id='foo')
-        self.assertEqual(val, 'qux')
+        self.assertEqual(val, lcd)
 
     @mock.patch.object(M.repo.Tree.query, 'get')
-    def test_get_no_dup(self, tr_get):
-        tr_get.return_value = 'bar'
+    def test_get_no_query(self, tr_get):
+        tree1 = tr_get.return_value = mock.Mock(_id='foo', val='bar')
        val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree1)
 
-        tr_get.return_value = 'qux'
+        tree2 = tr_get.return_value = mock.Mock(_id='foo', val='qux')
         val = self.cache.get(M.repo.Tree, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, tree1)
 
     @mock.patch.object(M.repo.TreesDoc.m, 'get')
     def test_get_doc(self, tr_get):
-        tr_get.return_value = 'bar'
+        trees = tr_get.return_value = mock.Mock(_id='foo', val='bar')
         val = self.cache.get(M.repo.TreesDoc, {'_id': 'foo'})
         tr_get.assert_called_once_with(_id='foo')
-        self.assertEqual(val, 'bar')
+        self.assertEqual(val, trees)
 
     def test_set(self):
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'test_set')
-        self.assertEqual(self.cache._cache, {M.repo.Tree: {(('_id', 'foo'),): 'test_set'}})
-
-    def test_keys(self):
-        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
-        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
-        self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
-        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
-
-    def test_keys_not_as_dict(self):
-        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
-        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
-        self.assertEqual(self.cache.keys(M.repo.Tree, as_dict=False), [(('_id', 'test_keys'), ('text', 'tko')), (('fubar', 'scm'),)])
-        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+        tree = mock.Mock(_id='foo', val='test_set')
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'foo'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'foo': tree}})
+
+    @mock.patch('bson.ObjectId')
+    def test_set_none_id(self, obj_id):
+        obj_id.return_value = 'OBJID'
+        tree = mock.Mock(_id=None, val='test_set')
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, tree)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': tree}})
+
+    @mock.patch('bson.ObjectId')
+    def test_set_none_val(self, obj_id):
+        obj_id.return_value = 'OBJID'
+        self.cache.set(M.repo.Tree, {'val': 'test_set'}, None)
+        self.assertEqual(self.cache._query_cache, {M.repo.Tree: {(('val', 'test_set'),): 'None_OBJID'}})
+        self.assertEqual(self.cache._instance_cache, {M.repo.Tree: {'None_OBJID': None}})
+
+    def test_instance_ids(self):
+        tree1 = mock.Mock(_id='id1', val='tree1')
+        tree2 = mock.Mock(_id='id2', val='tree2')
+        self.cache.set(M.repo.Tree, {'val': 'tree1'}, tree1)
+        self.cache.set(M.repo.Tree, {'val': 'tree2'}, tree2)
+        self.assertEqual(set(self.cache.instance_ids(M.repo.Tree)), set(['id1', 'id2']))
+        self.assertEqual(self.cache.instance_ids(M.repo.LastCommit), [])
 
     @mock.patch.object(M.repo.Tree.query, 'find')
     def test_batch_load(self, tr_find):
         # cls, query, attrs
-        m1 = mock.Mock(foo=1, qux=3)
-        m2 = mock.Mock(foo=2, qux=5)
+        m1 = mock.Mock(_id='id1', foo=1, qux=3)
+        m2 = mock.Mock(_id='id2', foo=2, qux=5)
         tr_find.return_value = [m1, m2]
 
         self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}})
         tr_find.assert_called_with({'foo': {'$in': 'bar'}})
-        self.assertEqual(self.cache._cache[M.repo.Tree], {
-                (('foo', 1),): m1,
-                (('foo', 2),): m2,
+        self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+                (('foo', 1),): 'id1',
+                (('foo', 2),): 'id2',
+            })
+        self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+                'id1': m1,
+                'id2': m2,
             })
 
     @mock.patch.object(M.repo.Tree.query, 'find')
     def test_batch_load_attrs(self, tr_find):
         # cls, query, attrs
-        m1 = mock.Mock(foo=1, qux=3)
-        m2 = mock.Mock(foo=2, qux=5)
+        m1 = mock.Mock(_id='id1', foo=1, qux=3)
+        m2 = mock.Mock(_id='id2', foo=2, qux=5)
         tr_find.return_value = [m1, m2]
 
         self.cache.batch_load(M.repo.Tree, {'foo': {'$in': 'bar'}}, ['qux'])
         tr_find.assert_called_with({'foo': {'$in': 'bar'}})
-        self.assertEqual(self.cache._cache[M.repo.Tree], {
-                (('qux', 3),): m1,
-                (('qux', 5),): m2,
+        self.assertEqual(self.cache._query_cache[M.repo.Tree], {
+                (('qux', 3),): 'id1',
+                (('qux', 5),): 'id2',
+            })
+        self.assertEqual(self.cache._instance_cache[M.repo.Tree], {
+                'id1': m1,
+                'id2': m2,
             })
 
     def test_pruning(self):
-        self.cache.max_size = 3
+        cache = M.repo.ModelCache(max_queries=3, max_instances=2)
         # ensure cache expires as LRU
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar')
-        self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
-        self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r')
-        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'zaz')
-        self.cache.get(M.repo.Tree, {'_id': 'f00'})
-        self.cache.set(M.repo.Tree, {'_id': 'mee'}, 'you')
-        self.assertEqual(self.cache._cache, {
+        tree1 = mock.Mock(_id='foo', val='bar')
+        tree2 = mock.Mock(_id='qux', val='fuz')
+        tree3 = mock.Mock(_id='f00', val='b4r')
+        tree4 = mock.Mock(_id='foo', val='zaz')
+        cache.set(M.repo.Tree, {'_id': 'foo'}, tree1)
+        cache.set(M.repo.Tree, {'_id': 'qux'}, tree2)
+        cache.set(M.repo.Tree, {'_id': 'f00'}, tree3)
+        cache.set(M.repo.Tree, {'_id': 'foo'}, tree4)
+        cache.get(M.repo.Tree, {'_id': 'f00'})
+        cache.set(M.repo.Tree, {'val': 'b4r'}, tree3)
+        self.assertEqual(cache._query_cache, {
+                M.repo.Tree: {
+                        (('_id', 'foo'),): 'foo',
+                        (('_id', 'f00'),): 'f00',
+                        (('val', 'b4r'),): 'f00',
+                    },
+            })
+        self.assertEqual(cache._instance_cache, {
                 M.repo.Tree: {
-                        (('_id', 'foo'),): 'zaz',
-                        (('_id', 'f00'),): 'b4r',
-                        (('_id', 'mee'),): 'you',
+                        'f00': tree3,
+                        'foo': tree4,
                     },
             })
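----------------------------------------------------------------------

The memory win in this commit comes from splitting the old single mapping
(query -> instance) into a query cache (normalized query tuple -> _id) and an
instance cache (_id -> instance): an object reached through several different
queries is now held only once, and each tier is capped and expired separately,
per model class, via max_instances / max_queries. The snippet below is a
minimal, self-contained sketch of that two-tier idea, not the Allura class
itself; the FakeTree model, the fixed limits, and the absence of any Ming
session flush/expunge handling are assumptions made purely for illustration.

    from collections import OrderedDict, defaultdict

    class FakeTree(object):
        '''Illustrative stand-in for a mapped model class (not part of Allura).'''
        def __init__(self, _id, val):
            self._id = _id
            self.val = val

    class TwoTierCache(object):
        '''Simplified sketch of the query-cache / instance-cache split.'''
        def __init__(self, max_queries=8, max_instances=2):
            self._query_cache = defaultdict(OrderedDict)     # normalized query -> _id
            self._instance_cache = defaultdict(OrderedDict)  # _id -> instance
            self.max_queries = max_queries
            self.max_instances = max_instances

        def _normalize_query(self, query):
            # dict -> order-independent, hashable key
            return tuple(sorted(query.items()))

        def set(self, cls, query, val):
            _id = getattr(val, '_id', None)
            self._query_cache[cls][self._normalize_query(query)] = _id
            self._instance_cache[cls][_id] = val
            # trim each tier independently, oldest entry (front of OrderedDict) first
            while len(self._query_cache[cls]) > self.max_queries:
                self._query_cache[cls].popitem(last=False)
            while len(self._instance_cache[cls]) > self.max_instances:
                self._instance_cache[cls].popitem(last=False)

        def get(self, cls, query):
            _id = self._query_cache[cls].get(self._normalize_query(query))
            return self._instance_cache[cls].get(_id)

    cache = TwoTierCache()
    tree = FakeTree(_id='abc123', val='tree body')
    # Two different queries resolving to the same document share one cached instance.
    cache.set(FakeTree, {'_id': 'abc123'}, tree)
    cache.set(FakeTree, {'path': '/', 'commit_id': 'abc123'}, tree)
    assert cache.get(FakeTree, {'commit_id': 'abc123', 'path': '/'}) is tree
    assert len(cache._instance_cache[FakeTree]) == 1  # one instance, two query keys

In the real ModelCache above, expiry is effectively least-recently-used rather
than strict FIFO (get and set move touched entries to the back of each
OrderedDict, as test_pruning exercises), and evicted instances are flushed and
expunged from their Ming session so they can be garbage collected.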