incubator-allura-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From john...@apache.org
Subject [02/25] git commit: [#6458] Refactored Google Code project extractor
Date Wed, 14 Aug 2013 18:56:32 GMT
[#6458] Refactored Google Code project extractor

- Introduced a caching get_page() method so that page fetching
  is no longer tightly coupled to extractor instantiation. In
  other words, you can fetch any page you want, regardless of
  the page that you instantiated the extractor with.

- Also removed the wiki-specific stuff and moved that to a
  subclass in the googlecodewikiimporter pkg.

Signed-off-by: Tim Van Steenburgh <tvansteenburgh@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/ee1c5914
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/ee1c5914
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/ee1c5914

Branch: refs/heads/cj/6464
Commit: ee1c5914a92961fb46326c59c456dbea860b10f1
Parents: 3600da2
Author: Tim Van Steenburgh <tvansteenburgh@gmail.com>
Authored: Thu Aug 8 13:33:42 2013 +0000
Committer: Dave Brondsema <dbrondsema@slashdotmedia.com>
Committed: Thu Aug 8 21:25:45 2013 +0000

----------------------------------------------------------------------
 .../forgeimporters/google/__init__.py           | 57 ++++++++++++++------
 .../tests/google/test_extractor.py              | 26 +++++----
 ForgeWiki/forgewiki/wiki_main.py                |  2 +-
 3 files changed, 57 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ee1c5914/ForgeImporters/forgeimporters/google/__init__.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py
index 8c91fd3..a307bcd 100644
--- a/ForgeImporters/forgeimporters/google/__init__.py
+++ b/ForgeImporters/forgeimporters/google/__init__.py
@@ -40,7 +40,6 @@ class GoogleCodeProjectExtractor(object):
     PAGE_MAP = {
             'project_info': BASE_URL + '/p/%s/',
             'source_browse': BASE_URL + '/p/%s/source/browse/',
-            'wiki_index': BASE_URL + '/p/%s/w/list',
         }
 
     LICENSE_MAP = defaultdict(lambda:'Other/Proprietary License', {
@@ -58,17 +57,49 @@ class GoogleCodeProjectExtractor(object):
 
     DEFAULT_ICON = 'http://www.gstatic.com/codesite/ph/images/defaultlogo.png'
 
-    def __init__(self, allura_project, gc_project_name, page):
+    def __init__(self, allura_project, gc_project_name, page=None):
         self.project = allura_project
         self.gc_project_name = gc_project_name
-        self.url = self.PAGE_MAP[page] % urllib.quote(self.gc_project_name)
-        self.page = BeautifulSoup(urllib2.urlopen(self.url))
+        self._page_cache = {}
+        self.url = None
+        self.page = None
+        if page:
+            self.get_page(page)
+
+    def get_page(self, page_name_or_url):
+        """Return a Beautiful soup object for the given page name or url.
+
+        If a page name is provided, the associated url is looked up in
+        :attr:`PAGE_MAP`.
+
+        Results are cached so that subsequent calls for the same page name or
+        url will return the cached result rather than making another HTTP
+        request.
+
+        """
+        if page_name_or_url in self._page_cache:
+            return self._page_cache[page_name_or_url]
+        self.url = (self.get_page_url(page_name_or_url) if page_name_or_url in
+                self.PAGE_MAP else page_name_or_url)
+        self.page = self._page_cache[page_name_or_url] = \
+                BeautifulSoup(urllib2.urlopen(self.url))
+        return self.page
+
+    def get_page_url(self, page_name):
+        """Return the url associated with ``page_name``.
+
+        Raises KeyError if ``page_name`` is not in :attr:`PAGE_MAP`.
+
+        """
+        return self.PAGE_MAP[page_name] % urllib.quote(self.gc_project_name)
 
     def get_short_description(self):
-        self.project.short_description = self.page.find(itemprop='description').string.strip()
+        page = self.get_page('project_info')
+        self.project.short_description = page.find(itemprop='description').string.strip()
 
     def get_icon(self):
-        icon_url = urljoin(self.url, self.page.find(itemprop='image').attrMap['src'])
+        page = self.get_page('project_info')
+        icon_url = urljoin(self.url, page.find(itemprop='image').attrMap['src'])
         if icon_url == self.DEFAULT_ICON:
             return
         icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1]
@@ -81,12 +112,14 @@ class GoogleCodeProjectExtractor(object):
             thumbnail_meta={'project_id': self.project._id, 'category': 'icon'})
 
     def get_license(self):
-        license = self.page.find(text='Code license').findNext().find('a').string.strip()
+        page = self.get_page('project_info')
+        license = page.find(text='Code license').findNext().find('a').string.strip()
         trove = M.TroveCategory.query.get(fullname=self.LICENSE_MAP[license])
         self.project.trove_license.append(trove._id)
 
     def get_repo_type(self):
-        repo_type = self.page.find(id="crumb_root")
+        page = self.get_page('source_browse')
+        repo_type = page.find(id="crumb_root")
         if not repo_type:
             raise Exception("Couldn't detect repo type: no #crumb_root in "
                     "{0}".format(self.url))
@@ -95,11 +128,3 @@ class GoogleCodeProjectExtractor(object):
             return re_match.group(0)
         else:
             raise Exception("Unknown repo type: {0}".format(repo_type.text))
-
-    def get_wiki_pages(self):
-        RE_WIKI_PAGE_URL = r'^/p/{0}/wiki/.*$'.format(self.gc_project_name)
-        seen = set()
-        for a in self.page.find(id="resultstable").findAll("a"):
-            if re.match(RE_WIKI_PAGE_URL, a['href']) and a['href'] not in seen:
-                yield (a.text, self.BASE_URL + a['href'])
-                seen.add(a['href'])

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ee1c5914/ForgeImporters/forgeimporters/tests/google/test_extractor.py
----------------------------------------------------------------------
diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
index 250759f..b4e64c0 100644
--- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py
+++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py
@@ -43,6 +43,18 @@ class TestGoogleCodeProjectExtractor(TestCase):
         self.soup.assert_called_once_with(self.urlopen.return_value)
         self.assertEqual(extractor.page, self.soup.return_value)
 
+    def test_get_page(self):
+        extractor = google.GoogleCodeProjectExtractor(self.project, 'my-project', 'project_info')
+        self.assertEqual(1, self.urlopen.call_count)
+        page = extractor.get_page('project_info')
+        self.assertEqual(1, self.urlopen.call_count)
+        self.assertEqual(page, extractor._page_cache['project_info'])
+
+    def test_get_page_url(self):
+        extractor = google.GoogleCodeProjectExtractor(self.project, 'my-project')
+        self.assertEqual(extractor.get_page_url('project_info'),
+                'http://code.google.com/p/my-project/')
+
     def test_get_short_description(self):
         extractor = google.GoogleCodeProjectExtractor(self.project, 'my-project', 'project_info')
         extractor.page.find.return_value.string = 'My Super Project'
@@ -93,9 +105,10 @@ class TestGoogleCodeProjectExtractor(TestCase):
 
     def _make_extractor(self, html):
         from BeautifulSoup import BeautifulSoup
-        with mock.patch.object(google, 'urllib2') as urllib2:
-            extractor = google.GoogleCodeProjectExtractor(self.project, 'my-project', 'project_info')
+        with mock.patch.object(google, 'urllib2'):
+            extractor = google.GoogleCodeProjectExtractor(self.project, 'my-project')
         extractor.page = BeautifulSoup(html)
+        extractor.get_page = lambda pagename: extractor.page
         extractor.url="http://test/source/browse"
         return extractor
 
@@ -118,12 +131,3 @@ class TestGoogleCodeProjectExtractor(TestCase):
         with self.assertRaises(Exception) as cm:
             extractor.get_repo_type()
         self.assertEqual(str(cm.exception), "Unknown repo type: cvs")
-
-    def test_get_wiki_pages(self):
-        extractor = self._make_extractor('''
-        <div id="resultstable">
-            <a href="#">Link that's not a wiki page</a>
-            <a href="/p/my-project/wiki/PageOne">PageOne</a>
-        </div>''')
-        self.assertEqual(list(extractor.get_wiki_pages()), [
-            ('PageOne', 'http://code.google.com/p/my-project/wiki/PageOne')])

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ee1c5914/ForgeWiki/forgewiki/wiki_main.py
----------------------------------------------------------------------
diff --git a/ForgeWiki/forgewiki/wiki_main.py b/ForgeWiki/forgewiki/wiki_main.py
index 064a9cd..816db78 100644
--- a/ForgeWiki/forgewiki/wiki_main.py
+++ b/ForgeWiki/forgewiki/wiki_main.py
@@ -139,7 +139,7 @@ class ForgeWikiApp(Application):
             elif new_root_page_name != self.default_root_page_name:
                 globals = WM.Globals(app_config_id=self.config._id, root=new_root_page_name)
             if globals is not None:
-                session(globals).flush()
+                session(globals).flush(globals)
 
     @Property
     def show_discussion():


Mime
View raw message