lucene-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From rm...@apache.org
Subject svn commit: r1381713 - in /lucene/dev/branches/branch_4x: ./ dev-tools/ dev-tools/scripts/checkJavadocLinks.py lucene/ lucene/analysis/ lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
Date Thu, 06 Sep 2012 18:38:15 GMT
Author: rmuir
Date: Thu Sep  6 18:38:15 2012
New Revision: 1381713

URL: http://svn.apache.org/viewvc?rev=1381713&view=rev
Log:
fix broken unicode in javadocs

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/   (props changed)
    lucene/dev/branches/branch_4x/dev-tools/scripts/checkJavadocLinks.py
    lucene/dev/branches/branch_4x/lucene/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/   (props changed)
    lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java

Modified: lucene/dev/branches/branch_4x/dev-tools/scripts/checkJavadocLinks.py
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/dev-tools/scripts/checkJavadocLinks.py?rev=1381713&r1=1381712&r2=1381713&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/dev-tools/scripts/checkJavadocLinks.py (original)
+++ lucene/dev/branches/branch_4x/dev-tools/scripts/checkJavadocLinks.py Thu Sep  6 18:38:15 2012
@@ -23,6 +23,9 @@ import urllib.parse as urlparse
 reHyperlink = re.compile(r'<a(\s+.*?)>', re.I)
 reAtt = re.compile(r"""(?:\s+([a-z]+)\s*=\s*("[^"]*"|'[^']?'|[^'"\s]+))+""", re.I)
 
+# Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF] /* any Unicode character, excluding the surrogate blocks, FFFE, and FFFF. */
+reValidChar = re.compile("^[\u0009\u000A\u000D\u0020-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]*$")
+
 # silly emacs: '
 
 class FindHyperlinks(HTMLParser):
@@ -79,6 +82,12 @@ class FindHyperlinks(HTMLParser):
                    
 def parse(baseURL, html):
   global failures
+  # look for broken unicode
+  if not reValidChar.match(html):
+    print(' WARNING: invalid characters detected in: %s' % baseURL)
+    failures = True
+    return [], []
+
   parser = FindHyperlinks(baseURL)
   try:
     parser.feed(html)

Modified: lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java?rev=1381713&r1=1381712&r2=1381713&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java (original)
+++ lucene/dev/branches/branch_4x/lucene/analysis/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java Thu Sep  6 18:38:15 2012
@@ -227,8 +227,8 @@ public class SynonymMap {
     /**
      * Add a phrase->phrase synonym mapping.
      * Phrases are character sequences where words are
-     * separated with character zero (\u0000).  Empty words
-     * (two \u0000s in a row) are not allowed in the input nor
+     * separated with character zero (U+0000).  Empty words
+     * (two U+0000s in a row) are not allowed in the input nor
      * the output!
      * 
      * @param input input phrase



Mime
View raw message