allura-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From tvansteenbu...@apache.org
Subject [4/4] git commit: [#5245] escape spaces and html tags when converting plain text to markdown
Date Tue, 13 Nov 2012 02:45:28 GMT
[#5245] escape spaces and html tags when converting plain text to markdown


Project: http://git-wip-us.apache.org/repos/asf/incubator-allura/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-allura/commit/e1b445ad
Tree: http://git-wip-us.apache.org/repos/asf/incubator-allura/tree/e1b445ad
Diff: http://git-wip-us.apache.org/repos/asf/incubator-allura/diff/e1b445ad

Branch: refs/heads/master
Commit: e1b445adef9d58ca997e2824fa8eae6f1c647dc6
Parents: ee3b451
Author: Dave Brondsema <dbrondsema@geek.net>
Authored: Thu Nov 8 22:00:11 2012 +0000
Committer: Dave Brondsema <dbrondsema@geek.net>
Committed: Thu Nov 8 23:02:55 2012 +0000

----------------------------------------------------------------------
 ForgeBlog/forgeblog/command/rssfeeds.py    |   43 ++++++++++++++++++++---
 ForgeBlog/forgeblog/tests/test_commands.py |   36 ++++++++++++++++++-
 2 files changed, 73 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/e1b445ad/ForgeBlog/forgeblog/command/rssfeeds.py
----------------------------------------------------------------------
diff --git a/ForgeBlog/forgeblog/command/rssfeeds.py b/ForgeBlog/forgeblog/command/rssfeeds.py
index c824f96..d213945 100644
--- a/ForgeBlog/forgeblog/command/rssfeeds.py
+++ b/ForgeBlog/forgeblog/command/rssfeeds.py
@@ -1,5 +1,6 @@
 from time import mktime
 from datetime import datetime
+import re
 
 import feedparser
 import html2text
@@ -20,6 +21,39 @@ from allura.lib.decorators import exceptionless
 
 html2text.BODY_WIDTH = 0
 
+re_amp = re.compile(r'''
+    [&]          # amp
+    (?=          # look ahead for:
+      ([a-zA-Z0-9]+;)  # named HTML entity
+      |
+      (\#[0-9]+;)      # decimal entity
+      |
+      (\#x[0-9A-F]+;)  # hex entity
+    )
+    ''', re.VERBOSE)
+re_leading_spaces = re.compile(r'^[ ]+', re.MULTILINE)
+re_preserve_spaces = re.compile(r'''
+    [ ]           # space
+    (?=[ ])       # lookahead for a space
+    ''', re.VERBOSE)
+re_angle_bracket_open = re.compile('<')
+re_angle_bracket_close = re.compile('>')
+def plain2markdown(text, preserve_multiple_spaces=False, has_html_entities=False):
+    if not has_html_entities:
+        # prevent &foo; and &#123; from becoming HTML entities
+        text = re_amp.sub('&amp;', text)
+    # avoid accidental 4-space indentations creating code blocks
+    if preserve_multiple_spaces:
+        text = re_preserve_spaces.sub('&nbsp;', text)
+    else:
+        text = re_leading_spaces.sub('', text)
+    # use html2text for most of the escaping
+    text = html2text.escape_md_section(text, snob=True)
+    # prevent < and > from becoming tags
+    text = re_angle_bracket_open.sub('&lt;', text)
+    text = re_angle_bracket_close.sub('&gt;', text)
+    return text
+
 
 class RssFeedsCommand(base.BlogCommand):
     summary = 'Rss feed client'
@@ -90,17 +124,16 @@ class RssFeedsCommand(base.BlogCommand):
             content = u''
             for ct in e.content:
                 if ct.type != 'text/html':
-                    content += html2text.escape_md_section(ct.value, snob=True)
+                    content += plain2markdown(ct.value)
                 else:
                     html2md = html2text.HTML2Text(baseurl=e.link)
                     html2md.escape_snob = True
                     markdown_content = html2md.handle(ct.value)
                     content += markdown_content
         else:
-            content = html2text.escape_md_section(getattr(e, 'summary',
-                                                    getattr(e, 'subtitle',
-                                                      getattr(e, 'title'))),
-                                                  snob=True)
+            content = plain2markdown(getattr(e, 'summary',
+                                        getattr(e, 'subtitle',
+                                            getattr(e, 'title'))))
 
         content += u' [link](%s)' % e.link
         updated = datetime.utcfromtimestamp(mktime(e.updated_parsed))

http://git-wip-us.apache.org/repos/asf/incubator-allura/blob/e1b445ad/ForgeBlog/forgeblog/tests/test_commands.py
----------------------------------------------------------------------
diff --git a/ForgeBlog/forgeblog/tests/test_commands.py b/ForgeBlog/forgeblog/tests/test_commands.py
index f1c9389..15c054f 100644
--- a/ForgeBlog/forgeblog/tests/test_commands.py
+++ b/ForgeBlog/forgeblog/tests/test_commands.py
@@ -3,7 +3,7 @@ import pylons
 pylons.c = pylons.tmpl_context
 pylons.g = pylons.app_globals
 from pylons import c, g
-from nose.tools import assert_equal
+from datadiff.tools import assert_equal
 
 from html2text import html2text
 
@@ -139,3 +139,37 @@ def test_plaintext_preprocessor_wrapped():
         '<p>#foo bar <a class="" href="../baz">baz</a> foo bar </p>\n'
         '<p>#foo bar <a class="" href="../baz"> baz </a></p></div>'
     )
+
+
+def test_plain2markdown():
+    text = '''paragraph
+
+    4 spaces before this
+
+    *blah*
+
+here's a <tag> that should be <b>preserved</b>
+Literal &gt; &Ograve; &frac14; &amp; &#38; &#x123F;
+M & Ms - doesn't get escaped
+http://blah.com/?x=y&a=b - not escaped either
+'''
+
+    expected = '''paragraph
+
+4 spaces before this
+
+\*blah\*
+
+here's a &lt;tag&gt; that should be &lt;b&gt;preserved&lt;/b&gt;
+Literal &amp;gt; &amp;Ograve; &amp;frac14; &amp;amp; &amp;\#38; &amp;\#x123F;
+M & Ms - doesn't get escaped
+http://blah.com/?x=y&a=b - not escaped either
+'''
+    # note: the \# isn't necessary; it could be just #, but that's the way
+    # html2text currently escapes all #s.  The extra escaping of \# ends up
+    # being ok when rendered, though.
+
+    assert_equal(rssfeeds.plain2markdown(text), expected)
+
+    assert_equal(rssfeeds.plain2markdown('a foo  bar\n\n    code here?', preserve_multiple_spaces=True),
+                'a foo&nbsp; bar\n\n&nbsp;&nbsp;&nbsp; code here?')


Mime
View raw message