Return-Path: X-Original-To: apmail-incubator-allura-commits-archive@minotaur.apache.org Delivered-To: apmail-incubator-allura-commits-archive@minotaur.apache.org Received: from mail.apache.org (hermes.apache.org [140.211.11.3]) by minotaur.apache.org (Postfix) with SMTP id 5031210FE3 for ; Mon, 26 Aug 2013 19:11:13 +0000 (UTC) Received: (qmail 27493 invoked by uid 500); 26 Aug 2013 19:11:13 -0000 Delivered-To: apmail-incubator-allura-commits-archive@incubator.apache.org Received: (qmail 27432 invoked by uid 500); 26 Aug 2013 19:11:13 -0000 Mailing-List: contact allura-commits-help@incubator.apache.org; run by ezmlm Precedence: bulk List-Help: List-Unsubscribe: List-Post: List-Id: Reply-To: allura-dev@incubator.apache.org Delivered-To: mailing list allura-commits@incubator.apache.org Received: (qmail 27412 invoked by uid 99); 26 Aug 2013 19:11:12 -0000 Received: from tyr.zones.apache.org (HELO tyr.zones.apache.org) (140.211.11.114) by apache.org (qpsmtpd/0.29) with ESMTP; Mon, 26 Aug 2013 19:11:12 +0000 Received: by tyr.zones.apache.org (Postfix, from userid 65534) id ADB818C5B04; Mon, 26 Aug 2013 19:11:12 +0000 (UTC) Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: johnsca@apache.org To: allura-commits@incubator.apache.org Date: Mon, 26 Aug 2013 19:11:13 -0000 Message-Id: In-Reply-To: <4959af6ac01843b7876db5205755bb98@git.apache.org> References: <4959af6ac01843b7876db5205755bb98@git.apache.org> X-Mailer: ASF-Git Admin Mailer Subject: [2/7] git commit: [#6531] Refactored get_page to accept parser argument [#6531] Refactored get_page to accept parser argument Signed-off-by: Cory Johns Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/ff5af166 Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/ff5af166 Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/ff5af166 Branch: refs/heads/master Commit: ff5af166fae48eaa3f0f8c7a2f969b0764d5e4e8 Parents: 2d5cf6c Author: Cory Johns Authored: Fri Aug 23 20:51:59 2013 +0000 Committer: Cory Johns Committed: Mon Aug 26 17:19:57 2013 +0000 ---------------------------------------------------------------------- ForgeImporters/forgeimporters/base.py | 13 +++- .../forgeimporters/google/__init__.py | 65 +++++++++++--------- .../tests/google/functional/test_tracker.py | 2 +- .../tests/google/test_extractor.py | 29 +++++---- 4 files changed, 65 insertions(+), 44 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/base.py ---------------------------------------------------------------------- diff --git a/ForgeImporters/forgeimporters/base.py b/ForgeImporters/forgeimporters/base.py index 3cf6774..19a3d70 100644 --- a/ForgeImporters/forgeimporters/base.py +++ b/ForgeImporters/forgeimporters/base.py @@ -86,12 +86,16 @@ class ProjectExtractor(object): req.add_header('User-Agent', 'Allura Data Importer (http://sf.net/p/allura)') return h.urlopen(req, retries=retries, codes=codes) - def get_page(self, page_name_or_url, **kw): + def get_page(self, page_name_or_url, parser=None, **kw): """Return a Beautiful soup object for the given page name or url. If a page name is provided, the associated url is looked up in :attr:`PAGE_MAP`. + If provided, the class or callable passed in :param:`parser` will be + used to transform the result of the `urlopen` before returning it. + Otherwise, the class's :meth:`parse_page` will be used. + Results are cached so that subsequent calls for the same page name or url will return the cached result rather than making another HTTP request. @@ -104,8 +108,10 @@ class ProjectExtractor(object): if self.url in self._page_cache: self.page = self._page_cache[self.url] else: + if parser is None: + parser = self.parse_page self.page = self._page_cache[self.url] = \ - self.parse_page(self.urlopen(self.url)) + parser(self.urlopen(self.url)) return self.page def get_page_url(self, page_name, **kw): @@ -125,7 +131,8 @@ class ProjectExtractor(object): the html. Subclasses can override to change the behavior or handle other types - of content (like JSON). + of content (like JSON). The parser can also be overridden via the + `parser` parameter to :meth:`get_page` :param page: A file-like object return from :meth:`urlopen` http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/google/__init__.py ---------------------------------------------------------------------- diff --git a/ForgeImporters/forgeimporters/google/__init__.py b/ForgeImporters/forgeimporters/google/__init__.py index 29e5011..849f924 100644 --- a/ForgeImporters/forgeimporters/google/__init__.py +++ b/ForgeImporters/forgeimporters/google/__init__.py @@ -51,6 +51,24 @@ def _as_text(node, chunks=None): _as_text(n, chunks) return ''.join(chunks) +def csv_parser(page): + lines = page.readlines() + if not lines: + return [] + # skip CSV header + lines = lines[1:] + # skip "next page here" info footer + if not lines[-1].startswith('"'): + lines.pop() + # remove CSV wrapping (quotes, commas, newlines) + return [line.strip('",\n') for line in lines] + +def stringio_parser(page): + return { + 'content-type': page.info()['content-type'], + 'data': StringIO(page.read()), + } + class GoogleCodeProjectExtractor(ProjectExtractor): BASE_URL = 'http://code.google.com' @@ -88,11 +106,9 @@ class GoogleCodeProjectExtractor(ProjectExtractor): if icon_url == self.DEFAULT_ICON: return icon_name = urllib.unquote(urlparse(icon_url).path).split('/')[-1] - fp_ish = self.urlopen(icon_url) - fp = StringIO(fp_ish.read()) + icon = File(icon_url, icon_name) M.ProjectFile.save_image( - icon_name, fp, - fp_ish.info()['content-type'].split(';')[0], # strip off charset=x extra param, + icon_name, icon.file, icon.type, square=True, thumbnail_size=(48,48), thumbnail_meta={'project_id': project._id, 'category': 'icon'}) @@ -115,16 +131,6 @@ class GoogleCodeProjectExtractor(ProjectExtractor): raise Exception("Unknown repo type: {0}".format(repo_type.text)) @classmethod - def _get_issue_ids_page(cls, project_name, start): - url = cls.PAGE_MAP['issues_csv'].format(project_name=project_name, start=start) - with closing(cls.urlopen(url)) as fp: - lines = fp.readlines()[1:] # skip CSV header - if not lines[-1].startswith('"'): - lines.pop() # skip "next page here" info footer - issue_ids = [line.strip('",\n') for line in lines] - return issue_ids - - @classmethod def iter_issues(cls, project_name): """ Iterate over all issues for a project, @@ -133,13 +139,14 @@ class GoogleCodeProjectExtractor(ProjectExtractor): start = 0 limit = 100 - while True: - issue_ids = cls._get_issue_ids_page(project_name, start) - if len(issue_ids) <= 0: + extractor = cls(project_name, 'issues_csv', parser=csv_parser, start=start) + while extractor.page: + if len(extractor.page) <= 0: return - for issue_id in issue_ids: + for issue_id in extractor.page: yield (int(issue_id), cls(project_name, 'issue', issue_id=issue_id)) start += limit + extractor.get_page('issues_csv', parser=csv_parser, start=start) def get_issue_summary(self): text = self.page.find(id='issueheader').findAll('td', limit=2)[1].span.string.strip() @@ -256,14 +263,16 @@ class Comment(object): ) return text -class Attachment(object): - def __init__(self, tag): - self.url = urljoin(GoogleCodeProjectExtractor.BASE_URL, tag.get('href')) - self.filename = parse_qs(urlparse(self.url).query)['name'][0] - self.type = None +class File(object): + def __init__(self, url, filename): + extractor = GoogleCodeProjectExtractor(None, url, parser=stringio_parser) + self.url = url + self.filename = filename + self.type = extractor.page['content-type'].split(';')[0] + self.file = extractor.page['data'] - @property - def file(self): - fp_ish = GoogleCodeProjectExtractor(None).urlopen(self.url) - fp = StringIO(fp_ish.read()) - return fp +class Attachment(File): + def __init__(self, tag): + url = urljoin(GoogleCodeProjectExtractor.BASE_URL, tag.get('href')) + filename = parse_qs(urlparse(url).query)['name'][0] + super(Attachment, self).__init__(url, filename) http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py ---------------------------------------------------------------------- diff --git a/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py b/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py index 184f7fd..2e5f542 100644 --- a/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py +++ b/ForgeImporters/forgeimporters/tests/google/functional/test_tracker.py @@ -51,7 +51,7 @@ class TestGCTrackerImporter(TestCase): with mock.patch.object(base.h, 'urlopen') as urlopen,\ mock.patch.object(google.tracker, 'GoogleCodeProjectExtractor') as GPE,\ mock.patch('forgetracker.tasks.update_bin_counts') as ubc: - urlopen.side_effect = lambda req, **kw: mock.Mock(read=req.get_full_url) + urlopen.side_effect = lambda req, **kw: mock.Mock(read=req.get_full_url, info=lambda:{'content-type': 'text/plain'}) GPE.iter_issues.return_value = [(issue_id, issue)] gti = google.tracker.GoogleCodeTrackerImporter() gti.import_tool(self.project, self.user, 'test-issue-project', mount_point='test-issue') http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/ff5af166/ForgeImporters/forgeimporters/tests/google/test_extractor.py ---------------------------------------------------------------------- diff --git a/ForgeImporters/forgeimporters/tests/google/test_extractor.py b/ForgeImporters/forgeimporters/tests/google/test_extractor.py index d5a9f22..668662e 100644 --- a/ForgeImporters/forgeimporters/tests/google/test_extractor.py +++ b/ForgeImporters/forgeimporters/tests/google/test_extractor.py @@ -64,6 +64,10 @@ class TestGoogleCodeProjectExtractor(TestCase): page = extractor.get_page('source_browse') self.assertEqual(2, self.urlopen.call_count) self.assertEqual(page, extractor._page_cache['http://code.google.com/p/my-project/source/browse/']) + parser = mock.Mock(return_value='parsed') + page = extractor.get_page('url', parser=parser) + self.assertEqual(page, 'parsed') + self.assertEqual(page, extractor._page_cache['url']) def test_get_page_url(self): extractor = google.GoogleCodeProjectExtractor('my-project') @@ -79,22 +83,20 @@ class TestGoogleCodeProjectExtractor(TestCase): extractor.page.find.assert_called_once_with(itemprop='description') self.assertEqual(self.project.short_description, 'My Super Project') - @mock.patch.object(google, 'StringIO') + @mock.patch.object(google, 'File') @mock.patch.object(google, 'M') - def test_get_icon(self, M, StringIO): - self.urlopen.return_value.info.return_value = {'content-type': 'image/png'} + def test_get_icon(self, M, File): + File.return_value.type = 'image/png' + File.return_value.file = 'data' extractor = google.GoogleCodeProjectExtractor('my-project', 'project_info') extractor.page.find.return_value.get.return_value = 'http://example.com/foo/bar/my-logo.png' - self.urlopen.reset_mock() extractor.get_icon(self.project) extractor.page.find.assert_called_once_with(itemprop='image') - self.urlopen.assert_called_once_with('http://example.com/foo/bar/my-logo.png') - self.urlopen.return_value.info.assert_called_once_with() - StringIO.assert_called_once_with(self.urlopen.return_value.read.return_value) + File.assert_called_once_with('http://example.com/foo/bar/my-logo.png', 'my-logo.png') M.ProjectFile.save_image.assert_called_once_with( - 'my-logo.png', StringIO.return_value, 'image/png', square=True, + 'my-logo.png', 'data', 'image/png', square=True, thumbnail_size=(48,48), thumbnail_meta={ 'project_id': self.project._id, 'category': 'icon'}) @@ -209,19 +211,22 @@ class TestGoogleCodeProjectExtractor(TestCase): 'OpSys-OSX', ]) - def test_get_issue_attachments(self): + @mock.patch.object(google, 'StringIO') + def test_get_issue_attachments(self, StringIO): + self.urlopen.return_value.info.return_value = {'content-type': 'text/plain; foo'} test_issue = open(pkg_resources.resource_filename('forgeimporters', 'tests/data/google/test-issue.html')).read() gpe = self._make_extractor(test_issue) attachments = gpe.get_issue_attachments() self.assertEqual(len(attachments), 2) self.assertEqual(attachments[0].filename, 'at1.txt') self.assertEqual(attachments[0].url, 'http://allura-google-importer.googlecode.com/issues/attachment?aid=70000000&name=at1.txt&token=3REU1M3JUUMt0rJUg7ldcELt6LA%3A1376059941255') - self.assertIsNone(attachments[0].type) + self.assertEqual(attachments[0].type, 'text/plain') self.assertEqual(attachments[1].filename, 'at2.txt') self.assertEqual(attachments[1].url, 'http://allura-google-importer.googlecode.com/issues/attachment?aid=70000001&name=at2.txt&token=C9Hn4s1-g38hlSggRGo65VZM1ys%3A1376059941255') - self.assertIsNone(attachments[1].type) + self.assertEqual(attachments[1].type, 'text/plain') - def test_iter_comments(self): + @mock.patch.object(google, 'StringIO') + def test_iter_comments(self, StringIO): test_issue = open(pkg_resources.resource_filename('forgeimporters', 'tests/data/google/test-issue.html')).read() gpe = self._make_extractor(test_issue) comments = list(gpe.iter_comments())