freemarker-notifications mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From ddek...@apache.org
Subject [2/2] incubator-freemarker git commit: Improvements related to NodeModel static convenience methods:
Date Wed, 18 Jan 2017 23:54:41 GMT
Improvements related to NodeModel static convenience methods:

- Bug fixed: NodeModel.mergeAdjacentText(Node) didn't merge all adjacent text nodes, only
pairs of adjacent text nodes. (Luckily this method is hardly ever used, and the more often
used NodeModel.simplify(Node) was working correctly.)
- Performance improvements in the static utility methods of NodeModel: simplify(Node), mergeAdjacentText(Node),
removeComments(Node), removePIs(Node).
- Added warning to the JavaDoc of NodeModel.parse methods to inform users about the possibility
of XML External Entity (XXE) attacks if the source XML (not a template) comes from an untrusted
source. This is just an XML fact (i.e., that in an XML you can have external entities and
they can be exploited), and doesn't have much to do with FreeMarker. Also note that FreeMarker itself
never calls NodeModel.parse; these are merely convenience methods that some applications directly
call themselves to create a NodeModel from an XML file. As this method encapsulates the call
to the platform XML parser, we thought it's better to point this risk out.


Project: http://git-wip-us.apache.org/repos/asf/incubator-freemarker/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-freemarker/commit/ef638589
Tree: http://git-wip-us.apache.org/repos/asf/incubator-freemarker/tree/ef638589
Diff: http://git-wip-us.apache.org/repos/asf/incubator-freemarker/diff/ef638589

Branch: refs/heads/2.3-gae
Commit: ef638589b547917b15bb1bddbefab0c75595c253
Parents: ebc5c8b
Author: ddekany <ddekany@apache.org>
Authored: Sun Jan 15 23:31:22 2017 +0100
Committer: ddekany <ddekany@apache.org>
Committed: Thu Jan 19 00:54:16 2017 +0100

----------------------------------------------------------------------
 src/main/java/freemarker/ext/dom/NodeModel.java | 294 ++++++++++++-------
 src/manual/en_US/book.xml                       |  58 ++++
 .../ext/dom/DOMConvenienceStaticsTest.java      | 197 +++++++++++++
 3 files changed, 438 insertions(+), 111 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-freemarker/blob/ef638589/src/main/java/freemarker/ext/dom/NodeModel.java
----------------------------------------------------------------------
diff --git a/src/main/java/freemarker/ext/dom/NodeModel.java b/src/main/java/freemarker/ext/dom/NodeModel.java
index ea96bac..8b47457 100644
--- a/src/main/java/freemarker/ext/dom/NodeModel.java
+++ b/src/main/java/freemarker/ext/dom/NodeModel.java
@@ -22,6 +22,7 @@ package freemarker.ext.dom;
 
 import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.lang.ref.WeakReference;
 import java.net.MalformedURLException;
 import java.util.Collections;
@@ -112,9 +113,17 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     private NodeModel parent;
     
     /**
-     * Sets the DOM Parser implementation to be used when building NodeModel
-     * objects from XML files.
+     * Sets the DOM parser implementation to be used when building {@link NodeModel} objects
from XML files or from
+     * {@link InputStream} with the static convenience methods of {@link NodeModel}. Otherwise
FreeMarker itself doesn't
+     * use this.
+     * 
+     * @see #getDocumentBuilderFactory()
+     * 
+     * @deprecated It's a bad practice to change static fields, as if multiple independent
components do that in the
+     *             same JVM, they unintentionally affect each other. Therefore it's recommended
to leave this static
+     *             value at its default.
      */
+    @Deprecated
     static public void setDocumentBuilderFactory(DocumentBuilderFactory docBuilderFactory)
{
         synchronized (STATIC_LOCK) {
             NodeModel.docBuilderFactory = docBuilderFactory;
@@ -122,8 +131,11 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
     
     /**
-     * @return the DOM Parser implementation that is used when 
-     * building NodeModel objects from XML files.
+     * Returns the DOM parser implementation that is used when building {@link NodeModel}
objects from XML files or from
+     * {@link InputStream} with the static convenience methods of {@link NodeModel}. Otherwise
FreeMarker itself doesn't
+     * use this.
+     * 
+     * @see #setDocumentBuilderFactory(DocumentBuilderFactory)
      */
     static public DocumentBuilderFactory getDocumentBuilderFactory() {
         synchronized (STATIC_LOCK) {
@@ -138,8 +150,15 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
     
     /**
-     * sets the error handler to use when parsing the document.
+     * Sets the error handler to use when parsing the document with the static convenience
methods of {@link NodeModel}.
+     * 
+     * @deprecated It's a bad practice to change static fields, as if multiple independent
components do that in the
+     *             same JVM, they unintentionally affect each other. Therefore it's recommended
to leave this static
+     *             value at its default.
+     *             
+     * @see #getErrorHandler()
      */
+    @Deprecated
     static public void setErrorHandler(ErrorHandler errorHandler) {
         synchronized (STATIC_LOCK) {
             NodeModel.errorHandler = errorHandler;
@@ -147,7 +166,9 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
 
     /**
-     * @since 2.3.20 
+     * @since 2.3.20
+     * 
+     * @see #setErrorHandler(ErrorHandler)
      */
     static public ErrorHandler getErrorHandler() {
         synchronized (STATIC_LOCK) {
@@ -156,12 +177,32 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
     
     /**
-     * Create a NodeModel from a SAX input source. Adjacent text nodes will be merged (and
CDATA sections
-     * are considered as text nodes).
-     * @param removeComments whether to remove all comment nodes 
-     * (recursively) from the tree before processing
-     * @param removePIs whether to remove all processing instruction nodes
-     * (recursively from the tree before processing
+     * Convenience method to create a {@link NodeModel} from a SAX {@link InputSource}; please
see the security warning
+     * further down. Adjacent text nodes will be merged (and CDATA sections are considered
as text nodes) as with
+     * {@link #mergeAdjacentText(Node)}. Further simplifications are applied depending on
the parameters. If all
+     * simplifications are turned on, then it applies {@link #simplify(Node)} on the loaded
DOM.
+     * 
+     * <p>
+     * Note that {@code parse(...)} is only a convenience method, and FreeMarker itself doesn't
use it (except when you
+     * call the other similar static convenience methods, as they may build on each other).
In particular, if you want
+     * full control over the {@link DocumentBuilderFactory} used, create the {@link Node}
with your own
+     * {@link DocumentBuilderFactory}, apply {@link #simplify(Node)} (or such) on it, then
call
+     * {@link NodeModel#wrap(Node)}.
+     * 
+     * <p>
+     * <b>Security warning:</b> If the XML to load is coming from a source that
you can't fully trust, you shouldn't use
+     * this method, as the {@link DocumentBuilderFactory} it uses by default supports external
entities, and so it
+     * doesn't prevent XML External Entity (XXE) attacks. Note that XXE attacks are not specific
to FreeMarker, they
+     * affect all XML parsers in general. If that's a problem in your application, OWASP
has a cheat sheet to set up a
+     * {@link DocumentBuilderFactory} that has limited functionality but is immune to XXE
attacks. Because it's just a
+     * convenience method, you can just use your own {@link DocumentBuilderFactory} and do
a few extra steps instead
+     * (see earlier).
+     * 
+     * @param removeComments
+     *            Whether to remove all comment nodes (recursively); this is like calling
{@link #removeComments(Node)}
+     * @param removePIs
+     *            Whether to remove all processing instruction nodes (recursively); this
is like calling
+     *            {@link #removePIs(Node)}
      */
     static public NodeModel parse(InputSource is, boolean removeComments, boolean removePIs)
         throws SAXException, IOException, ParserConfigurationException {
@@ -198,46 +239,43 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
     
     /**
-     * Create a NodeModel from an XML input source. By default,
-     * all comments and processing instruction nodes are 
-     * stripped from the tree.
+     * Same as {@link #parse(InputSource, boolean, boolean) parse(is, true, true)}; don't
miss the security warnings
+     * documented there.
      */
-    static public NodeModel parse(InputSource is) 
-    throws SAXException, IOException, ParserConfigurationException {
+    static public NodeModel parse(InputSource is) throws SAXException, IOException, ParserConfigurationException
{
         return parse(is, true, true);
     }
     
     
     /**
-     * Create a NodeModel from an XML file.
-     * @param removeComments whether to remove all comment nodes 
-     * (recursively) from the tree before processing
-     * @param removePIs whether to remove all processing instruction nodes
-     * (recursively from the tree before processing
+     * Same as {@link #parse(InputSource, boolean, boolean)}, but loads from a {@link File};
don't miss the security
+     * warnings documented there.
      */
     static public NodeModel parse(File f, boolean removeComments, boolean removePIs) 
-        throws SAXException, IOException, ParserConfigurationException {
+    throws SAXException, IOException, ParserConfigurationException {
         DocumentBuilder builder = getDocumentBuilderFactory().newDocumentBuilder();
         ErrorHandler errorHandler = getErrorHandler();
         if (errorHandler != null) builder.setErrorHandler(errorHandler);
         Document doc = builder.parse(f);
-        if (removeComments) {
-            removeComments(doc);
-        }
-        if (removePIs) {
-            removePIs(doc);
+        if (removeComments && removePIs) {
+            simplify(doc);
+        } else {
+            if (removeComments) {
+                removeComments(doc);
+            }
+            if (removePIs) {
+                removePIs(doc);
+            }
+            mergeAdjacentText(doc);
         }
-        mergeAdjacentText(doc);
         return wrap(doc);
     }
     
     /**
-     * Create a NodeModel from an XML file. By default,
-     * all comments and processing instruction nodes are 
-     * stripped from the tree.
+     * Same as {@link #parse(InputSource, boolean, boolean) parse(source, true, true)}, but
loads from a {@link File};
+     * don't miss the security warnings documented there.
      */
-    static public NodeModel parse(File f) 
-    throws SAXException, IOException, ParserConfigurationException {
+    static public NodeModel parse(File f) throws SAXException, IOException, ParserConfigurationException
{
         return parse(f, true, true);
     }
     
@@ -411,120 +449,154 @@ implements TemplateNodeModelEx, TemplateHashModel, TemplateSequenceModel,
     }
     
     /**
-     * Recursively removes all comment nodes
-     * from the subtree.
+     * Recursively removes all comment nodes from the subtree.
      *
      * @see #simplify
      */
-    static public void removeComments(Node node) {
-        NodeList children = node.getChildNodes();
-        int i = 0;
-        int len = children.getLength();
-        while (i < len) {
-            Node child = children.item(i);
-            if (child.hasChildNodes()) {
+    static public void removeComments(Node parent) {
+        Node child = parent.getFirstChild();
+        while (child != null) {
+            Node nextSibling = child.getNextSibling();
+            if (child.getNodeType() == Node.COMMENT_NODE) {
+                parent.removeChild(child);
+            } else if (child.hasChildNodes()) {
                 removeComments(child);
-                i++;
-            } else {
-                if (child.getNodeType() == Node.COMMENT_NODE) {
-                    node.removeChild(child);
-                    len--;
-                } else {
-                    i++;
-                }
             }
+            child = nextSibling;
         }
     }
     
     /**
-     * Recursively removes all processing instruction nodes
-     * from the subtree.
+     * Recursively removes all processing instruction nodes from the subtree.
      *
      * @see #simplify
      */
-    static public void removePIs(Node node) {
-        NodeList children = node.getChildNodes();
-        int i = 0;
-        int len = children.getLength();
-        while (i < len) {
-            Node child = children.item(i);
-            if (child.hasChildNodes()) {
+    static public void removePIs(Node parent) {
+        Node child = parent.getFirstChild();
+        while (child != null) {
+            Node nextSibling = child.getNextSibling();
+            if (child.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
+                parent.removeChild(child);
+            } else if (child.hasChildNodes()) {
                 removePIs(child);
-                i++;
-            } else {
-                if (child.getNodeType() == Node.PROCESSING_INSTRUCTION_NODE) {
-                    node.removeChild(child);
-                    len--;
-                } else {
-                    i++;
-                }
             }
+            child = nextSibling;
         }
     }
     
     /**
-     * Merges adjacent text/cdata nodes, so that there are no 
-     * adjacent text/cdata nodes. Operates recursively 
-     * on the entire subtree. You thus lose information
-     * about any CDATA sections occurring in the doc.
+     * Merges adjacent text nodes (where CDATA counts as text node too). Operates recursively
on the entire subtree.
+     * The merged node will have the type of the first node of the adjacent merged nodes.
+     * 
+     * <p>Because XPath assumes that there are no adjacent text nodes in the tree,
not doing this can have
+     * undesirable side effects. Xalan queries like {@code text()} will only return the first
of a list of matching
+     * adjacent text nodes instead of all of them, while Jaxen will return all of them as
intuitively expected. 
      *
      * @see #simplify
      */
-    static public void mergeAdjacentText(Node node) {
-        Node child = node.getFirstChild();
+    static public void mergeAdjacentText(Node parent) {
+        mergeAdjacentText(parent, null);
+    }
+    
+    static private void mergeAdjacentText(Node parent, StringBuilder collectorBuf) {
+        Node child = parent.getFirstChild();
         while (child != null) {
-            if (child instanceof Text || child instanceof CDATASection) {
-                Node next = child.getNextSibling();
-                if (next instanceof Text || next instanceof CDATASection) {
-                    String fullText = child.getNodeValue() + next.getNodeValue();
-                    ((CharacterData) child).setData(fullText);
-                    node.removeChild(next);
+            Node next = child.getNextSibling();
+            if (child instanceof Text) {
+                boolean atFirstText = true;
+                while (next instanceof Text) { //
+                    if (atFirstText) {
+                        if (collectorBuf == null) {
+                            collectorBuf = new StringBuilder(
+                                    child.getNodeValue().length() + next.getNodeValue().length());
+                        } else {
+                            collectorBuf.setLength(0);
+                        }
+                        collectorBuf.append(child.getNodeValue());
+                        atFirstText = false;
+                    }
+                    collectorBuf.append(next.getNodeValue());
+                    
+                    parent.removeChild(next);
+                    
+                    next = child.getNextSibling();
+                }
+                if (!atFirstText && collectorBuf.length() != 0) {
+                    ((CharacterData) child).setData(collectorBuf.toString());
                 }
             } else {
-                mergeAdjacentText(child);
+                mergeAdjacentText(child, collectorBuf);
             }
-            child = child.getNextSibling();
+            child = next;
         }
     }
     
     /**
-     * Removes comments and processing instruction, and then unites adjacent text nodes.
-     * Note that CDATA sections count as text nodes.
+     * Removes all comments and processing instruction, and unites adjacent text nodes (here
CDATA counts as text as
+     * well). This is similar to applying {@link #removeComments(Node)}, {@link #removePIs(Node)},
and finally
+     * {@link #mergeAdjacentText(Node)}, but it does all that somewhat faster.
      */    
-    static public void simplify(Node node) {
-        NodeList children = node.getChildNodes();
-        int i = 0;
-        int len = children.getLength();
-        Node prevTextChild = null;
-        while (i < len) {
-            Node child = children.item(i);
+    static public void simplify(Node parent) {
+        simplify(parent, null);
+    }
+    
+    static private void simplify(Node parent, StringBuilder collectorTextChildBuff) {
+        Node collectorTextChild = null;
+        Node child = parent.getFirstChild();
+        while (child != null) {
+            Node next = child.getNextSibling();
             if (child.hasChildNodes()) {
-                simplify(child);
-                prevTextChild = null;
-                i++;
+                if (collectorTextChild != null) {
+                    // Commit pending text node merge:
+                    if (collectorTextChildBuff != null && collectorTextChildBuff.length()
!= 0) {
+                        ((CharacterData) collectorTextChild).setData(collectorTextChildBuff.toString());
+                        collectorTextChildBuff.setLength(0);
+                    }
+                    collectorTextChild = null;
+                }
+                
+                simplify(child, collectorTextChildBuff);
             } else {
                 int type = child.getNodeType();
-                if (type == Node.PROCESSING_INSTRUCTION_NODE) {
-                    node.removeChild(child);
-                    len--;
-                } else if (type == Node.COMMENT_NODE) {
-                    node.removeChild(child);
-                    len--;
-                } else if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE ) {
-                    if (prevTextChild != null) {
-                        CharacterData ptc = (CharacterData) prevTextChild;
-                        ptc.setData(ptc.getNodeValue() + child.getNodeValue());
-                        node.removeChild(child);
-                        len--;
+                if (type == Node.TEXT_NODE || type == Node.CDATA_SECTION_NODE ) {
+                    if (collectorTextChild != null) {
+                        if (collectorTextChildBuff == null) {
+                            collectorTextChildBuff = new StringBuilder(
+                                    collectorTextChild.getNodeValue().length() + child.getNodeValue().length());
+                        }
+                        if (collectorTextChildBuff.length() == 0) {
+                            collectorTextChildBuff.append(collectorTextChild.getNodeValue());
+                        }
+                        collectorTextChildBuff.append(child.getNodeValue());
+                        parent.removeChild(child);
                     } else {
-                        prevTextChild = child;
-                        i++;
+                        collectorTextChild = child;
+                        if (collectorTextChildBuff != null) {
+                            collectorTextChildBuff.setLength(0);
+                        }
                     }
-                } else {
-                    prevTextChild = null;
-                    i++;
+                } else if (type == Node.COMMENT_NODE) {
+                    parent.removeChild(child);
+                } else if (type == Node.PROCESSING_INSTRUCTION_NODE) {
+                    parent.removeChild(child);
+                } else if (collectorTextChild != null) {
+                    // Commit pending text node merge:
+                    if (collectorTextChildBuff != null && collectorTextChildBuff.length()
!= 0) {
+                        ((CharacterData) collectorTextChild).setData(collectorTextChildBuff.toString());
+                        collectorTextChildBuff.setLength(0);
+                    }
+                    collectorTextChild = null;
                 }
             }
+            child = next;
+        }
+        
+        if (collectorTextChild != null) {
+            // Commit pending text node merge:
+            if (collectorTextChildBuff != null && collectorTextChildBuff.length()
!= 0) {
+                ((CharacterData) collectorTextChild).setData(collectorTextChildBuff.toString());
+                collectorTextChildBuff.setLength(0);
+            }
         }
     }
     

http://git-wip-us.apache.org/repos/asf/incubator-freemarker/blob/ef638589/src/manual/en_US/book.xml
----------------------------------------------------------------------
diff --git a/src/manual/en_US/book.xml b/src/manual/en_US/book.xml
index e88d1a0..38da320 100644
--- a/src/manual/en_US/book.xml
+++ b/src/manual/en_US/book.xml
@@ -26813,6 +26813,64 @@ TemplateModel x = env.getVariable("x");  // get variable x</programlisting>
               implementation, which just sees it as a syntactical
               error.)</para>
             </listitem>
+
+            <listitem>
+              <para>Bug fixed:
+              <literal>NodeModel.mergeAdjacentText(Node)</literal> didn't
+              merge all adjacent text nodes, only pairs of adjacent text
+              nodes. (Luckily this method is hardly ever used, and the more
+              often used <literal>NodeModel.simplify(Node)</literal> was
+              working correctly.)</para>
+            </listitem>
+
+            <listitem>
+              <para>Performance improvements in the static utility methods of
+              <literal>NodeModel</literal>: <literal>simplify(Node)</literal>,
+              <literal>mergeAdjacentText(Node)</literal>,
+              <literal>removeComments(Node)</literal>,
+              <literal>removePIs(Node)</literal>.</para>
+            </listitem>
+
+            <listitem>
+              <para>Added warning to the JavaDoc of
+              <literal>NodeModel.parse</literal> methods to inform users about
+              the possibility of XML External Entity (XXE) attacks if the
+              source XML (not a template) comes from an untrusted source. This is
+              just an XML fact (i.e., that in an XML you can have external
+              entities and they can be exploited), and doesn't have much to do with
+              FreeMarker. Also note that FreeMarker itself never calls
+              <literal>NodeModel.parse</literal>; these are merely convenience
+              methods that some applications directly call themselves to
+              create a <literal>NodeModel</literal> from an XML file. As this
+              method encapsulates the call to the platform XML parser, we
+              thought it's better to point this risk out.</para>
+            </listitem>
+          </itemizedlist>
+        </section>
+
+        <section>
+          <title>Other changes</title>
+
+          <itemizedlist>
+            <listitem>
+              <para><link
+              xlink:href="https://issues.apache.org/jira/browse/FREEMARKER-17">FREEMARKER-17</link>:
+              Removed the Servlet- and JSP-related <literal>*.dtd</literal>
+              files to simplify licensing. We can operate without them as
+              before, as validation with them was disabled earlier too. At
+              this point, everything in the source code of the FreeMarker
+              engine, and everything in the produced
+              <literal>freemarker.jar</literal> was created inside the
+              FreeMarker project.</para>
+            </listitem>
+
+            <listitem>
+              <para><link
+              xlink:href="https://issues.apache.org/jira/browse/FREEMARKER-27">FREEMARKER-27</link>:
+              Moved some content from the <literal>NOTICES</literal> files
+              over to the <literal>LICENSE</literal> file, to follow the
+              Apache Incubator guidelines closer.</para>
+            </listitem>
           </itemizedlist>
         </section>
 

http://git-wip-us.apache.org/repos/asf/incubator-freemarker/blob/ef638589/src/test/java/freemarker/ext/dom/DOMConvenienceStaticsTest.java
----------------------------------------------------------------------
diff --git a/src/test/java/freemarker/ext/dom/DOMConvenienceStaticsTest.java b/src/test/java/freemarker/ext/dom/DOMConvenienceStaticsTest.java
new file mode 100644
index 0000000..e412230
--- /dev/null
+++ b/src/test/java/freemarker/ext/dom/DOMConvenienceStaticsTest.java
@@ -0,0 +1,197 @@
+package freemarker.ext.dom;
+
+import static org.junit.Assert.assertEquals;
+
+import java.io.IOException;
+import java.io.StringReader;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.junit.Test;
+import org.w3c.dom.CDATASection;
+import org.w3c.dom.Comment;
+import org.w3c.dom.Document;
+import org.w3c.dom.DocumentType;
+import org.w3c.dom.Element;
+import org.w3c.dom.Node;
+import org.w3c.dom.ProcessingInstruction;
+import org.w3c.dom.Text;
+import org.xml.sax.ErrorHandler;
+import org.xml.sax.InputSource;
+import org.xml.sax.SAXException;
+
+public class DOMConvenienceStaticsTest {
+
+    private static final String COMMON_TEST_XML
+            = "<!DOCTYPE a []><?p?><a>x<![CDATA[y]]><!--c--><?p?>z<?p?><b><!--c--></b><c></c>"
+              + "<d>a<e>c</e>b<!--c--><!--c--><!--c--><?p?><?p?><?p?></d>"
+              + "<f><![CDATA[1]]>2</f></a><!--c-->";
+
+    private static final String TEXT_MERGE_CONTENT =
+            "<a>"
+            + "a<!--c--><s/>"
+            + "<!--c-->a<s/>"
+            + "a<!--c-->b<s/>"
+            + "<!--c-->a<!--c-->b<!--c--><s/>"
+            + "a<b>b</b>c<s/>"
+            + "a<b>b</b><!--c-->c<s/>"
+            + "a<!--c-->1<b>b<!--c--></b>c<!--c-->1<s/>"
+            + "a<!--c-->1<b>b<!--c-->c</b>d<!--c-->1<s/>"
+            + "a<!--c-->1<b>b<!--c-->c</b>d<!--c-->1<s/>"
+            + "a<!--c-->1<b>b<!--c-->1<e>c<!--c-->1</e>d<!--c-->1</b>e<!--c-->1<s/>"
+            + "</a>";
+    private static final String TEXT_MERGE_EXPECTED =
+            "<a>"
+            + "%a<s/>"
+            + "%a<s/>"
+            + "%ab<s/>"
+            + "%ab<s/>"
+            + "%a<b>%b</b>%c<s/>"
+            + "%a<b>%b</b>%c<s/>"
+            + "%a1<b>%b</b>%c1<s/>"
+            + "%a1<b>%bc</b>%d1<s/>"
+            + "%a1<b>%bc</b>%d1<s/>"
+            + "%a1<b>%b1<e>%c1</e>%d1</b>%e1<s/>"
+            + "</a>";
+    
+    @Test
+    public void testTest() throws Exception {
+        String expected = "<!DOCTYPE ...><?p?><a>%x<![CDATA[y]]><!--c--><?p?>%z<?p?><b><!--c--></b><c/>"
+                   + "<d>%a<e>%c</e>%b<!--c--><!--c--><!--c--><?p?><?p?><?p?></d>"
+                   + "<f><![CDATA[1]]>%2</f></a><!--c-->";
+        assertEquals(expected, toString(toDOM(COMMON_TEST_XML)));
+    }
+
+    @Test
+    public void testMergeAdjacentText() throws Exception {
+        Document dom = toDOM(COMMON_TEST_XML);
+        NodeModel.mergeAdjacentText(dom);
+        assertEquals(
+                "<!DOCTYPE ...><?p?><a>%xy<!--c--><?p?>%z<?p?><b><!--c--></b><c/>"
+                + "<d>%a<e>%c</e>%b<!--c--><!--c--><!--c--><?p?><?p?><?p?></d>"
+                + "<f><![CDATA[12]]></f></a><!--c-->",
+                toString(dom));
+    }
+
+    @Test
+    public void testRemoveComments() throws Exception {
+        Document dom = toDOM(COMMON_TEST_XML);
+        NodeModel.removeComments(dom);
+        assertEquals(
+                "<!DOCTYPE ...><?p?><a>%x<![CDATA[y]]><?p?>%z<?p?><b/><c/>"
+                + "<d>%a<e>%c</e>%b<?p?><?p?><?p?></d>"
+                + "<f><![CDATA[1]]>%2</f></a>",
+                toString(dom));
+    }
+
+    @Test
+    public void testRemovePIs() throws Exception {
+        Document dom = toDOM(COMMON_TEST_XML);
+        NodeModel.removePIs(dom);
+        assertEquals(
+                "<!DOCTYPE ...><a>%x<![CDATA[y]]><!--c-->%z<b><!--c--></b><c/>"
+                + "<d>%a<e>%c</e>%b<!--c--><!--c--><!--c--></d>"
+                + "<f><![CDATA[1]]>%2</f></a><!--c-->",
+                toString(dom));
+    }
+    
+    @Test
+    public void testSimplify() throws Exception {
+        testSimplify(
+                "<!DOCTYPE ...><a>%xyz<b/><c/>"
+                + "<d>%a<e>%c</e>%b</d><f><![CDATA[12]]></f></a>",
+                COMMON_TEST_XML);
+    }
+
+    @Test
+    public void testSimplify2() throws Exception {
+        testSimplify(TEXT_MERGE_EXPECTED, TEXT_MERGE_CONTENT);
+    }
+
+    @Test
+    public void testSimplify3() throws Exception {
+        testSimplify("<a/>", "<a/>");
+    }
+    
+    private void testSimplify(String expected, String content)
+            throws SAXException, IOException, ParserConfigurationException {
+        {
+            Document dom = toDOM(content);
+            NodeModel.simplify(dom);
+            assertEquals(expected, toString(dom));
+        }
+        
+        // Must be equivalent:
+        {
+            Document dom = toDOM(content);
+            NodeModel.removeComments(dom);
+            NodeModel.removePIs(dom);
+            NodeModel.mergeAdjacentText(dom);
+            assertEquals(expected, toString(dom));
+        }
+        
+        // Must be equivalent:
+        {
+            Document dom = toDOM(content);
+            NodeModel.removeComments(dom);
+            NodeModel.removePIs(dom);
+            NodeModel.simplify(dom);
+            assertEquals(expected, toString(dom));
+        }
+    }
+
+    private Document toDOM(String content) throws SAXException, IOException, ParserConfigurationException
{
+        DocumentBuilder builder =  NodeModel.getDocumentBuilderFactory().newDocumentBuilder();
+        ErrorHandler errorHandler =  NodeModel.getErrorHandler();
+        if (errorHandler != null) builder.setErrorHandler(errorHandler);
+        return builder.parse(toInputSource(content));
+    }
+
+    private InputSource toInputSource(String content) {
+        return new InputSource(new StringReader(content));
+    }
+
+    private String toString(Document doc) {
+        StringBuilder sb = new StringBuilder();
+        toString(doc, sb);
+        return sb.toString();
+    }
+
+    private void toString(Node node, StringBuilder sb) {
+        if (node instanceof Document) {
+            childrenToString(node, sb);
+        } else if (node instanceof Element) {
+            if (node.hasChildNodes()) {
+                sb.append("<").append(node.getNodeName()).append(">");
+                childrenToString(node, sb);
+                sb.append("</").append(node.getNodeName()).append(">");
+            } else {
+                sb.append("<").append(node.getNodeName()).append("/>");
+            }
+        } else if (node instanceof Text) {
+            if (node instanceof CDATASection) {
+                sb.append("<![CDATA[").append(node.getNodeValue()).append("]]>");
+            } else {
+                sb.append("%").append(node.getNodeValue());
+            }
+        } else if (node instanceof Comment) {
+            sb.append("<!--").append(node.getNodeValue()).append("-->");
+        } else if (node instanceof ProcessingInstruction) {
+            sb.append("<?").append(node.getNodeName()).append("?>");
+        } else if (node instanceof DocumentType) {
+            sb.append("<!DOCTYPE ...>");
+        } else {
+            throw new IllegalStateException("Unhandled node type: " + node.getClass().getName());
+        }
+    }
+
+    private void childrenToString(Node node, StringBuilder sb) {
+        Node child = node.getFirstChild();
+        while (child != null) {
+            toString(child, sb);
+            child = child.getNextSibling();
+        }
+    }
+    
+}


Mime
View raw message