Mailing-List: contact allura-commits-help@incubator.apache.org; run by ezmlm
Precedence: bulk
Reply-To: allura-dev@incubator.apache.org
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: johnsca@apache.org
To: allura-commits@incubator.apache.org
Subject: [5/21] git commit: [#4691] ModelCache improvements and more debugging
 for refresh-last-commits.py
Message-Id: <20121207161148.1CE9031D219@tyr.zones.apache.org>
Date: Fri,  7 Dec 2012 16:11:48 +0000 (UTC)

[#4691] ModelCache improvements and more debugging for refresh-last-commits.py

Signed-off-by: Cory Johns <johnsca@geek.net>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/12f487b7
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/12f487b7
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/12f487b7

Branch: refs/heads/cj/4691
Commit: 12f487b72a4cc46dbb545470f846d336dd078c5e
Parents: 1bcbaf1
Author: Cory Johns <johnsca@geek.net>
Authored: Fri Nov 30 19:35:32 2012 +0000
Committer: Cory Johns <johnsca@geek.net>
Committed: Fri Dec 7 16:11:27 2012 +0000

----------------------------------------------------------------------
 Allura/allura/model/repo.py            |   34 ++++++++++++++------------
 Allura/allura/model/repo_refresh.py    |    2 +-
 Allura/allura/tests/model/test_repo.py |   16 ++++++++++--
 scripts/refresh-last-commits.py        |   24 +++++++++++++-----
 4 files changed, 49 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/12f487b7/Allura/allura/model/repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo.py b/Allura/allura/model/repo.py
index 2bc22a9..5a7f002 100644
--- a/Allura/allura/model/repo.py
+++ b/Allura/allura/model/repo.py
@@ -5,7 +5,7 @@ import logging
 from hashlib import sha1
 from itertools import chain
 from datetime import datetime
-from collections import defaultdict
+from collections import defaultdict, OrderedDict
 from difflib import SequenceMatcher, unified_diff
 
 from pylons import c
@@ -824,12 +824,11 @@ class ModelCache(object):
         Commit instances and 2000 Tree instances in the cache
         at once with the default value.
         '''
-        self._cache = defaultdict(dict)
+        self._cache = defaultdict(OrderedDict)
         self.max_size = max_size
-        self._insertion_order = defaultdict(list)
         # temporary, for performance testing
-        self._hits = 0
-        self._misses = 0
+        self._hits = defaultdict(int)
+        self._accesses = defaultdict(int)
         self._get_calls = 0
         self._get_walks = 0
         self._get_walks_max = 0
@@ -847,12 +846,13 @@ class ModelCache(object):
 
     def get(self, cls, key):
         _key = self._normalize_key(key)
+        self._manage_cache(cls, _key)
+        self._accesses[cls] += 1
         if _key not in self._cache[cls]:
-            self._misses += 1
             query = getattr(cls, 'query', getattr(cls, 'm', None))
             self.set(cls, _key, query.get(**key))
         else:
-            self._hits += 1
+            self._hits[cls] += 1
         return self._cache[cls][_key]
 
     def set(self, cls, key, val):
@@ -866,23 +866,25 @@ class ModelCache(object):
         and expire from the cache in a FIFO manner.
         '''
         if key in self._cache[cls]:
-            return
-        self._insertion_order[cls].append(key)
-        if len(self._insertion_order[cls]) > self.max_size:
-            _key = self._insertion_order[cls].pop(0)
-            self._cache[cls].pop(_key)
+            # refresh access time in cache
+            val = self._cache[cls].pop(key)
+            self._cache[cls][key] = val
+        elif len(self._cache[cls]) >= self.max_size:
+            # remove the least-recently-used cache item
+            self._cache[cls].popitem(last=False)
 
     def size(self):
-        return sum([len(c) for c in self._insertion_order.values()])
+        return sum([len(c) for c in self._cache.values()])
 
-    def keys(self, cls):
+    def keys(self, cls, as_dict=True):
         '''
         Returns all the cache keys for a given class.  Each
         cache key will be a dict.
         '''
-        if self._cache[cls]:
+        if as_dict:
             return [dict(k) for k in self._cache[cls].keys()]
-        return []
+        else:
+            return self._cache[cls].keys()
 
     def batch_load(self, cls, query, attrs=None):
         '''

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/12f487b7/Allura/allura/model/repo_refresh.py
----------------------------------------------------------------------
diff --git a/Allura/allura/model/repo_refresh.py b/Allura/allura/model/repo_refresh.py
index 149fcae..4796daa 100644
--- a/Allura/allura/model/repo_refresh.py
+++ b/Allura/allura/model/repo_refresh.py
@@ -515,6 +515,6 @@ def _walk_commit_tree(commit, cache):
 
 def _update_tree_cache(tree_ids, cache):
     current_ids = set(tree_ids)
-    cached_ids = set([k['_id'] for k in cache.keys(Tree)])
+    cached_ids = set([k[0][1] for k in cache.keys(Tree, as_dict=False)])
     new_ids = current_ids - cached_ids
     cache.batch_load(Tree, {'_id': {'$in': list(new_ids)}})

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/12f487b7/Allura/allura/tests/model/test_repo.py
----------------------------------------------------------------------
diff --git a/Allura/allura/tests/model/test_repo.py b/Allura/allura/tests/model/test_repo.py
index 0dbff66..040c750 100644
--- a/Allura/allura/tests/model/test_repo.py
+++ b/Allura/allura/tests/model/test_repo.py
@@ -571,6 +571,12 @@ class TestModelCache(unittest.TestCase):
         self.assertEqual(self.cache.keys(M.repo.Tree), [{'_id': 'test_keys', 'text': 'tko'}, {'fubar': 'scm'}])
         self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
 
+    def test_keys_not_as_dict(self):
+        self.cache._cache[M.repo.Tree][(('_id', 'test_keys'), ('text', 'tko'))] = 'foo'
+        self.cache._cache[M.repo.Tree][(('fubar', 'scm'),)] = 'bar'
+        self.assertEqual(self.cache.keys(M.repo.Tree, as_dict=False), [(('_id', 'test_keys'), ('text', 'tko')), (('fubar', 'scm'),)])
+        self.assertEqual(self.cache.keys(M.repo.LastCommit), [])
+
     @mock.patch.object(M.repo.Tree.query, 'find')
     def test_batch_load(self, tr_find):
         # cls, query, attrs
@@ -600,14 +606,18 @@ class TestModelCache(unittest.TestCase):
             })
 
     def test_pruning(self):
-        self.cache.max_size = 2
+        self.cache.max_size = 3
+        # ensure cache expires as LRU
         self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'bar')
         self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
         self.cache.set(M.repo.Tree, {'_id': 'f00'}, 'b4r')
-        self.cache.set(M.repo.Tree, {'_id': 'qux'}, 'zaz')
+        self.cache.set(M.repo.Tree, {'_id': 'foo'}, 'zaz')
+        self.cache.get(M.repo.Tree, {'_id': 'f00'})
+        self.cache.set(M.repo.Tree, {'_id': 'mee'}, 'you')
         self.assertEqual(self.cache._cache, {
                 M.repo.Tree: {
-                    (('_id', 'qux'),): 'zaz',
+                    (('_id', 'foo'),): 'zaz',
                     (('_id', 'f00'),): 'b4r',
+                    (('_id', 'mee'),): 'you',
                 },
             })

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/12f487b7/scripts/refresh-last-commits.py
----------------------------------------------------------------------
diff --git a/scripts/refresh-last-commits.py b/scripts/refresh-last-commits.py
index 0a59b31..ed753ab 100644
--- a/scripts/refresh-last-commits.py
+++ b/scripts/refresh-last-commits.py
@@ -2,6 +2,7 @@ import sys
 import argparse
 import logging
 import re
+from math import pow, log10
 from datetime import datetime
 from contextlib import contextmanager
 
@@ -97,29 +98,38 @@ def refresh_repo_lcds(commit_ids, options):
                 at = tt / len(timings)
                 print '  Processed %d commits (max: %f, avg: %f, tot: %f, cl: %d)' % (
                         len(timings), mt, at, tt, len(tree_cache))
-    lcd_cache = M.repo.ModelCache(80000)
+    lcd_cache = M.repo.ModelCache(20000)
     timings = []
     print 'Processing last commits'
+    debug_step = int(pow(10, max(0, int(log10(len(commit_ids)) - log10(options.step) - 1))))
     for i, commit_id in enum_step(commit_ids, options.step):
-        print '    Processing commit %s...' % commit_id,
-        sys.stdout.flush()
+        #print '    Processing commit %s...' % commit_id,
+        #sys.stdout.flush()
         commit = M.repo.Commit.query.get(_id=commit_id)
         with time(timings):
             M.repo_refresh.compute_lcds(commit, lcd_cache)
-        print 'done in %fs' % timings[-1]
-        if len(timings) % 10 == 0:
+        #print 'done in %fs [%d%% in %d]' % (
+        #        timings[-1],
+        #        lcd_cache._hits[M.repo.LastCommit] * 100 / lcd_cache._accesses[M.repo.LastCommit],
+        #        len(lcd_cache._cache[M.repo.LastCommit]),
+        #    )
+        if len(timings) % debug_step == 0:
             mt = max(timings)
             tt = sum(timings)
             at = tt / len(timings)
-            mat = sum(timings[-10:]) / 10
+            mat = sum(timings[-debug_step:]) / debug_step
+            hits = sum(lcd_cache._hits.values())
+            accs = sum(lcd_cache._accesses.values())
             print '  Processed %d commits (max: %f, avg: %f, mavg: %f, tot: %f, lc: %d, lcl: %d, hits: %d, agw: %d, mgw: %d, gh: %d, abw: %d, mbw: %d, ts: %d)' % (
                     len(timings), mt, at, mat, tt, lcd_cache.size(), len(lcd_cache._cache[M.repo.LastCommit]),
-                    lcd_cache._hits * 100 / (lcd_cache._hits + lcd_cache._misses),
+                    hits * 100 / accs,
                     lcd_cache._get_walks / lcd_cache._get_calls, lcd_cache._get_walks_max, lcd_cache._get_hits * 100 / lcd_cache._get_calls,
                     lcd_cache._build_walks / lcd_cache._build_calls, lcd_cache._build_walks_max,
                     len(lcd_cache.get(M.repo.TreesDoc, dict(_id=commit._id)).tree_ids))
             ThreadLocalORMSession.flush_all()
             ThreadLocalORMSession.close_all()
+    ThreadLocalORMSession.flush_all()
+    ThreadLocalORMSession.close_all()
 
 
 @contextmanager