commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From chtom...@apache.org
Subject [1/4] [text] 1. Adding a new html4 translator to escape HTML characters only once. This prevents single, double or recursive escaping of html characters. 2. Using SingleLookupTranslator directly, instead of passing it to AggregateTranslator. 3. Added esc
Date Wed, 28 Dec 2016 15:54:09 GMT
Repository: commons-text
Updated Branches:
  refs/heads/master cbb5701cf -> 55c332f3d


1. Adding a new html4 translator to escape HTML characters only once. This prevents single,
double or recursive escaping of html characters.
2. Using SingleLookupTranslator directly, instead of passing it to AggregateTranslator.
3. Added escapeHtml3Once() method.
4. Commenting improvements.


Project: http://git-wip-us.apache.org/repos/asf/commons-text/repo
Commit: http://git-wip-us.apache.org/repos/asf/commons-text/commit/f94e3144
Tree: http://git-wip-us.apache.org/repos/asf/commons-text/tree/f94e3144
Diff: http://git-wip-us.apache.org/repos/asf/commons-text/diff/f94e3144

Branch: refs/heads/master
Commit: f94e314415f2dd0ca1a9659f09f6921eca2783ef
Parents: 8bda461
Author: sampyash <sampanna.kahu@flipkart.com>
Authored: Mon Dec 26 23:08:01 2016 +0530
Committer: sampyash <sampanna.kahu@flipkart.com>
Committed: Mon Dec 26 23:08:01 2016 +0530

----------------------------------------------------------------------
 .../apache/commons/text/StringEscapeUtils.java  | 102 ++++++++++++++-
 .../text/translate/SingleLookupTranslator.java  | 129 +++++++++++++++++++
 .../commons/text/StringEscapeUtilsTest.java     |  44 +++++--
 3 files changed, 262 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/commons-text/blob/f94e3144/src/main/java/org/apache/commons/text/StringEscapeUtils.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/StringEscapeUtils.java b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
index cae872c..16949da 100644
--- a/src/main/java/org/apache/commons/text/StringEscapeUtils.java
+++ b/src/main/java/org/apache/commons/text/StringEscapeUtils.java
@@ -16,12 +16,8 @@
  */
 package org.apache.commons.text;
 
-import java.io.IOException;
-import java.io.Writer;
-
 import org.apache.commons.lang3.CharUtils;
 import org.apache.commons.lang3.StringUtils;
-
 import org.apache.commons.text.translate.AggregateTranslator;
 import org.apache.commons.text.translate.CharSequenceTranslator;
 import org.apache.commons.text.translate.EntityArrays;
@@ -30,9 +26,13 @@ import org.apache.commons.text.translate.LookupTranslator;
 import org.apache.commons.text.translate.NumericEntityEscaper;
 import org.apache.commons.text.translate.NumericEntityUnescaper;
 import org.apache.commons.text.translate.OctalUnescaper;
+import org.apache.commons.text.translate.SingleLookupTranslator;
 import org.apache.commons.text.translate.UnicodeUnescaper;
 import org.apache.commons.text.translate.UnicodeUnpairedSurrogateRemover;
 
+import java.io.IOException;
+import java.io.Writer;
+
 /**
  * <p>Escapes and unescapes {@code String}s for
  * Java, Java Script, HTML and XML.</p>
@@ -197,6 +197,27 @@ public class StringEscapeUtils {
             );
 
     /**
+     * The improved translator object for escaping HTML version 3.0.
+     * The 'improved' part of this translator is that it checks if the html is already translated.
+     * This check prevents double, triple, or recursive translations.
+     *
+     * While {@link #escapeHtml3Once(String)} is the expected method of use, this
+     * object allows the HTML escaping functionality to be used
+     * as the foundation for a custom translator.
+     *
+     * Note that, multiple lookup tables should be passed to this translator
+     * instead of passing multiple instances of this translator to the
+     * AggregateTranslator. Because, a SingleLookupTranslator only checks the values of the
+     * lookup table passed to that instance while deciding whether a value is
+     * already translated or not.
+     *
+     * @since 3.0
+     */
+    public static final CharSequenceTranslator ESCAPE_HTML3_ONCE =
+            new SingleLookupTranslator(EntityArrays.BASIC_ESCAPE(), EntityArrays.ISO8859_1_ESCAPE());
+
+
+    /**
      * Translator object for escaping HTML version 4.0.
      *
      * While {@link #escapeHtml4(String)} is the expected method of use, this 
@@ -211,6 +232,26 @@ public class StringEscapeUtils {
             );
 
     /**
+     * The improved translator object for escaping HTML version 4.0.
+     * The 'improved' part of this translator is that it checks if the html is already translated.
+     * This check prevents double, triple, or recursive translations.
+     *
+     * While {@link #escapeHtml4Once(String)} is the expected method of use, this
+     * object allows the HTML escaping functionality to be used
+     * as the foundation for a custom translator.
+     *
+     * Note that, multiple lookup tables should be passed to this translator
+     * instead of passing multiple instances of this translator to the
+     * AggregateTranslator. Because, a SingleLookupTranslator only checks the values of the
+     * lookup table passed to that instance while deciding whether a value is
+     * already translated or not.
+     *
+     * @since 3.0
+     */
+    public static final CharSequenceTranslator ESCAPE_HTML4_ONCE =
+            new SingleLookupTranslator(EntityArrays.BASIC_ESCAPE(), EntityArrays.ISO8859_1_ESCAPE(), EntityArrays.HTML40_EXTENDED_ESCAPE());
+
+    /**
      * Translator object for escaping individual Comma Separated Values. 
      *
      * While {@link #escapeCsv(String)} is the expected method of use, this 
@@ -683,6 +724,45 @@ public class StringEscapeUtils {
     }
 
     /**
+     * <p>Escapes the characters in a {@code String} using HTML entities.
+     * But escapes them only once. i.e. does not escape already escaped characters.</p>
+     *
+     * <p>
+     * For example:
+     * </p>
+     * <p><code>"bread" &amp; "butter"</code></p>
+     * becomes:
+     * <p>
+     * <code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code>.
+     * </p>
+     *
+     * <p>
+     * But:
+     * </p>
+     * <p><code>&amp;quot;bread&amp;quot; &amp;amp; &amp;quot;butter&amp;quot;</code></p>
+     * remains unaffected.
+     *
+     * <p>Supports all known HTML 4.0 entities, including funky accents.
+     * Note that the commonly used apostrophe escape character (&amp;apos;)
+     * is not a legal entity and so is not supported). </p>
+     *
+     * @param input  the {@code String} to escape, may be null
+     * @return a new escaped {@code String}, {@code null} if null string input
+     *
+     * @see <a href="http://hotwired.lycos.com/webmonkey/reference/special_characters/">ISO Entities</a>
+     * @see <a href="http://www.w3.org/TR/REC-html32#latin1">HTML 3.2 Character Entities for ISO Latin-1</a>
+     * @see <a href="http://www.w3.org/TR/REC-html40/sgml/entities.html">HTML 4.0 Character entity references</a>
+     * @see <a href="http://www.w3.org/TR/html401/charset.html#h-5.3">HTML 4.01 Character References</a>
+     * @see <a href="http://www.w3.org/TR/html401/charset.html#code-position">HTML 4.01 Code positions</a>
+     *
+     * @since 3.0
+     */
+    public static final String escapeHtml4Once(final String input) {
+        return ESCAPE_HTML4_ONCE.translate(input);
+    }
+
+
+    /**
      * <p>Escapes the characters in a {@code String} using HTML entities.</p>
      * <p>Supports only the HTML 3.0 entities. </p>
      *
@@ -693,6 +773,20 @@ public class StringEscapeUtils {
         return ESCAPE_HTML3.translate(input);
     }
 
+    /**
+     * <p>Escapes the characters in a {@code String} using HTML entities.
+     * But escapes them only once. i.e. does not escape already escaped characters.</p>
+     * <p>Supports only the HTML 3.0 entities. </p>
+     *
+     * @param input  the {@code String} to escape, may be null
+     * @return a new escaped {@code String}, {@code null} if null string input
+     *
+     * @since 3.0
+     */
+    public static final String escapeHtml3Once(final String input) {
+        return ESCAPE_HTML3_ONCE.translate(input);
+    }
+
     //-----------------------------------------------------------------------
     /**
      * <p>Unescapes a string containing entity escapes to a string

http://git-wip-us.apache.org/repos/asf/commons-text/blob/f94e3144/src/main/java/org/apache/commons/text/translate/SingleLookupTranslator.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/commons/text/translate/SingleLookupTranslator.java b/src/main/java/org/apache/commons/text/translate/SingleLookupTranslator.java
new file mode 100644
index 0000000..0944d64
--- /dev/null
+++ b/src/main/java/org/apache/commons/text/translate/SingleLookupTranslator.java
@@ -0,0 +1,129 @@
+package org.apache.commons.text.translate;
+
+import java.io.IOException;
+import java.io.Writer;
+import java.util.HashMap;
+import java.util.HashSet;
+
+/**
+ * Translates a value using a lookup table.
+ * But doesn't translate if that value is already translated.
+ *
+ * @since 3.0
+ */
+public class SingleLookupTranslator extends CharSequenceTranslator {
+
+    private final HashMap<String, String> lookupMap;
+    private final HashSet<Character>      prefixSet;
+    private final int                     shortest;
+    private final int                     longest;
+    private final int                     shortestValue;
+    private final int                     longestValue;
+
+    /**
+     * Define the look tables to be used in translation.
+     *
+     * Note that, as of Lang 3.1, the key to the lookup table is converted to a
+     * java.lang.String. This is because we need the key to support hashCode and
+     * equals(Object), allowing it to be the key for a HashMap. See LANG-882.
+     *
+     * Also note that, multiple lookup tables should be passed to this translator
+     * instead of passing multiple instances of this translator to the
+     * AggregateTranslator. Because, this translator only checks the values of the
+     * lookup table passed to this instance while deciding whether a value is
+     * already translated or not.
+     *
+     * @param inputArrays
+     */
+    public SingleLookupTranslator(final String[][]... inputArrays) {
+        String[][] lookup = new String[0][];
+        for (String[][] input : inputArrays) {
+            lookup = append(lookup, input);
+        }
+        lookupMap = new HashMap<String, String>();
+        prefixSet = new HashSet<Character>();
+        int _shortest = Integer.MAX_VALUE;
+        int _longest = 0;
+        int _shortestValue = Integer.MAX_VALUE;
+        int _longestValue = 0;
+        if (lookup != null) {
+            for (final CharSequence[] seq : lookup) {
+                this.lookupMap.put(seq[0].toString(), seq[1].toString());
+                this.prefixSet.add(seq[0].charAt(0));
+                final int sz = seq[0].length();
+                if (sz < _shortest) {
+                    _shortest = sz;
+                }
+                if (sz > _longest) {
+                    _longest = sz;
+                }
+                final int sizeOfValue = seq[1].length();
+                if (sizeOfValue < _shortestValue) {
+                    _shortestValue = sizeOfValue;
+                }
+                if (sizeOfValue > _longestValue) {
+                    _longestValue = sizeOfValue;
+                }
+            }
+        }
+        shortest = _shortest;
+        longest = _longest;
+        shortestValue = _shortestValue;
+        longestValue = _longestValue;
+    }
+
+    private static String[][] append(String[][] a, String[][] b) {
+        String[][] result = new String[a.length + b.length][];
+        System.arraycopy(a, 0, result, 0, a.length);
+        System.arraycopy(b, 0, result, a.length, b.length);
+        return result;
+    }
+
+    /**
+     * Translate a set of codepoints, represented by an int index into a CharSequence,
+     * into another set of codepoints. The number of codepoints consumed must be returned,
+     * and the only IOExceptions thrown must be from interacting with the Writer so that
+     * the top level API may reliably ignore StringWriter IOExceptions.
+     *
+     * @param input CharSequence that is being translated
+     * @param index int representing the current point of translation
+     * @param out   Writer to translate the text to
+     * @return int count of codepoints consumed
+     * @throws IOException if and only if the Writer produces an IOException
+     */
+    @Override
+    public int translate(CharSequence input, int index, Writer out) throws IOException {
+        // check if already translated
+        int maxValue = longestValue;
+        if (index + maxValue > input.length()) {
+            maxValue = input.length() - index;
+        }
+        // implement greedy algorithm to check all the possible 'value' matches for which we need to skip translation.
+        for (int i = maxValue; i >= shortestValue; i--) {
+            final CharSequence subSeq = input.subSequence(index, index + i);
+            // If the sub-string is already translated, return without translating.
+            if (lookupMap.containsValue(subSeq.toString())) {
+                return 0;
+            }
+        }
+
+        // check if translation exists for the input at position index
+        if (prefixSet.contains(input.charAt(index))) {
+            int max = longest;
+            if (index + longest > input.length()) {
+                max = input.length() - index;
+            }
+            // implement greedy algorithm by trying maximum match first
+            for (int i = max; i >= shortest; i--) {
+                final CharSequence subSeq = input.subSequence(index, index + i);
+                final String result = lookupMap.get(subSeq.toString());
+
+                if (result != null) {
+                    out.write(result);
+                    return i;
+                }
+            }
+        }
+        return 0;
+    }
+}

http://git-wip-us.apache.org/repos/asf/commons-text/blob/f94e3144/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
index c0f7d05..ac2fac2 100644
--- a/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
+++ b/src/test/java/org/apache/commons/text/StringEscapeUtilsTest.java
@@ -16,14 +16,7 @@
  */
 package org.apache.commons.text;
 
-import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
-import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertFalse;
-import static org.junit.Assert.assertNotNull;
-import static org.junit.Assert.assertNull;
-import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
+import org.junit.Test;
 
 import java.io.IOException;
 import java.io.StringWriter;
@@ -34,7 +27,14 @@ import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Paths;
 
-import org.junit.Test;
+import static org.apache.commons.text.StringEscapeUtils.escapeXSI;
+import static org.apache.commons.text.StringEscapeUtils.unescapeXSI;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
 
 /**
  * Unit tests for {@link StringEscapeUtils}.
@@ -241,6 +241,32 @@ public class StringEscapeUtilsTest {
     }
 
     @Test
+    public void testEscapeHtml4Once() {
+        for (final String[] element : HTML_ESCAPES) {
+            final String message = element[0];
+            final String expected = element[1];
+            final String original = element[2];
+            assertEquals(message, expected, org.apache.commons.lang3.StringEscapeUtils.escapeHtml4Once(original));
+            assertEquals(message, expected, org.apache.commons.lang3.StringEscapeUtils.escapeHtml4Once(expected));
+            final StringWriter sw = new StringWriter();
+            try {
+                org.apache.commons.lang3.StringEscapeUtils.ESCAPE_HTML4_ONCE.translate(original, sw);
+            } catch (final IOException e) {
+            }
+            final String actual = original == null ? null : sw.toString();
+            assertEquals(message, expected, actual);
+            final StringWriter sw2 = new StringWriter();
+            try {
+                org.apache.commons.lang3.StringEscapeUtils.ESCAPE_HTML4_ONCE.translate(expected, sw2);
+            } catch (final IOException e) {
+            }
+            final String actual2 = original == null ? null : sw2.toString();
+            assertEquals(message, expected, actual2);
+        }
+    }
+
+
+    @Test
     public void testUnescapeHtml4() {
         for (final String[] element : HTML_ESCAPES) {
             final String message = element[0];


Mime
View raw message