portals-jetspeed-dev mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From pau...@apache.org
Subject cvs commit: jakarta-jetspeed/src/java/org/apache/jetspeed/util HTMLRewriter.java
Date Fri, 11 Jan 2002 23:29:56 GMT
paulsp      02/01/11 15:29:56

  Modified:    src/java/org/apache/jetspeed/util HTMLRewriter.java
  Log:
  o Fixed bug in <SCRIPT> processing. Thanks to patch provided by Ozgur Balsoy
  o Comments that contain script and style code are now included
  o Added "openInNewWindow" parameter to insert TARGET="_BLANK" attribute in <A>.
  o Added support for HREF in <BASE>
  o Removed support for BACKGROUND attribute in <BODY>
  o Removed URL rewriting from VALUE attribute in <OPTION>
  o Added default CODEBASE attribute to <APPLET> and <OBJECT>
  o General cleanup of import statements
  o Added documentation
  
  Revision  Changes    Path
  1.7       +264 -70   jakarta-jetspeed/src/java/org/apache/jetspeed/util/HTMLRewriter.java
  
  Index: HTMLRewriter.java
  ===================================================================
  RCS file: /home/cvs/jakarta-jetspeed/src/java/org/apache/jetspeed/util/HTMLRewriter.java,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- HTMLRewriter.java	9 Nov 2001 07:11:59 -0000	1.6
  +++ HTMLRewriter.java	11 Jan 2002 23:29:56 -0000	1.7
  @@ -67,8 +67,73 @@
    *          <INPUT SRCs, <APPLET CODEBASEs
    *      = Removal of <SCRIPT>, <STYLE>, <HEAD>, <EMBED>, <OBJECT>, <APPLET>,
    *          <NOSCRIPT>
  - *      
  + * 
  + ****
  + * Please include the following section in the WebPagePortlet documentation     
  + ****
  + * <CODE>
  + *
  + * The following describes how HTML tags are rewritten
    *
  + * <!-- --> (HTML Comments)
  + *   o Unless otherwise mentioned, comments are stripped.
  + * 
  + * <A>
  + *   o HREF attribute   - URL merged with base URL (See Note 1)
  + *   o TARGET attribute - Set to "_BLANK" if it does not exist 
  + *                        and openInNewWindow = TRUE
  + * <APPLET>
  + *   o Optionally included
  + *   o CODEBASE attribute - Set to the current path if it does
  + *                          not exist.
  + * 
  + * <BASE>
  + *   o <HEAD> does NOT have to be included.
  + *   o HREF attribute  - Set the Base URL of the page, but the tag
  + *                       not set in resulting HTML. URL merged with
  + *                       base URL (See Note 1)
  + * 
  + * <BODY>
  + *   o Background attribute - Always stripped.
  + * 
  + * <EMBED>
  + *   o May not work.  Not supported by JDK 1.3/
  + * 
  + * <FORM>
  + *   o ACTION attribute - Set to the current URL if it does
  + *                        not exist. URL merged with base
  + *                        URL (See Note 1)
  + * 
  + * <IMG>
  + *   o SRC attribute - URL merged with base URL (See Note 1)
  + * 
  + * <INPUT>
  + *   o SRC attribute - URL merged with base URL (See Note 1)
  + * 
  + * <OBJECT>
  + *   o Optionally included
  + *   o CODEBASE attribute - Set to the current path if it does
  + *                          not exist. URL merged with base
  + *                          URL (See Note 1)
  + * 
  + * <SCRIPT>
  + *   o Optionally included
  + *   o Contents may be stripped if this tag appears in the <HEAD>
  + *     and the contents are NOT in a comment
  + *   o SRC attribute - URL merged with base URL (See Note 1)
  + * 
  + * <TD>
  + *   o BACKGROUND attribute - URL merged with base URL (See Note 1)
  + * 
  + * Note 1: URL Merging.
  + *   This is done because the source of the page sent to the
  + *   user's browser is different than the source of the current page.
  + *   Example:
  + *     Base URL........ http://jakarta.apache.org/jetspeed
  + *     URL............. logo.gif
  + *     Resulting URL... http://jakarta.apache.org/jetspeed/logo.gif
  + * 
  + * </CODE>
    *  KNOWN PROBLEMS
    *
    *
  @@ -91,22 +156,24 @@
    *
    *
    */
  -
  -
   package org.apache.jetspeed.util;
   
  -import javax.swing.text.html.parser.*;
  -import javax.swing.text.html.*;
  +import org.apache.turbine.util.Log;
  +
  +import java.io.Reader;
  +import java.io.StringWriter;
  +import java.net.MalformedURLException;
  +import java.net.URL;
  +import java.util.Enumeration;
  +import javax.swing.text.html.HTML;
   import javax.swing.text.html.HTMLEditorKit;
  -import java.io.*;
  -import java.util.*;
  -import javax.swing.text.*;
  -import java.net.*;
  +import javax.swing.text.MutableAttributeSet;
   
   /**
    *
    * @author  Ingo Rammer (rammer@sycom.at)
    * @author <a href="mailto:sgala@apache.org">Santiago Gala</a>
  + * @author <a href="mailto:paulsp@apache.org">Paul Spencer</a>
    * @version 0.2
    */
   
  @@ -124,14 +191,78 @@
    * @param removeHead Shall HEAD-Tags and their content be removed
    * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
    */    
  -    public HTMLRewriter (boolean removeScript,
  -                         boolean removeStyle,
  -                         boolean removeNoScript,
  -                         boolean removeMeta,
  -                         boolean removeApplet,
  -                         boolean removeObject,
  -                         boolean removeHead,
  -                         boolean removeOnSomething) 
  +    public HTMLRewriter(boolean removeScript,
  +                        boolean removeStyle,
  +                        boolean removeNoScript,
  +                        boolean removeMeta,
  +                        boolean removeApplet,
  +                        boolean removeObject,
  +                        boolean removeHead,
  +                        boolean removeOnSomething) {
  +        init ( removeScript,
  +        removeStyle,
  +        removeNoScript,
  +        removeMeta,
  +        removeApplet,
  +        removeObject,
  +        removeHead,
  +        removeOnSomething,
  +        false);
  +    }
  +        
  +    /**
  +     * Sets the parameters for the HTMLRewriter
  +     * @param removeScript Shall SCRIPT-Tags and their content be removed
  +     * @param removeStyle Shall STYLE-Tags and their content be removed
  +     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
  +     * @param removeMeta Shall META-Tags be removed
  +     * @param removeApplet Shall APPLET-Tags and their content be removed
  +     * @param removeObject Shall OBJECT-Tags and their content be removed
  +     * @param removeHead Shall HEAD-Tags and their content be removed
  +     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
  +     */
  +    public HTMLRewriter(boolean removeScript,
  +                        boolean removeStyle,
  +                        boolean removeNoScript,
  +                        boolean removeMeta,
  +                        boolean removeApplet,
  +                        boolean removeObject,
  +                        boolean removeHead,
  +                        boolean removeOnSomething,
  +                        boolean openInNewWindow ) {
  +        init ( removeScript,
  +        removeStyle,
  +        removeNoScript,
  +        removeMeta,
  +        removeApplet,
  +        removeObject,
  +        removeHead,
  +        removeOnSomething,
  +        openInNewWindow ); 
  +    }
  +
  +    /**
  +     * Sets the parameters for the HTMLRewriter
  +     *
  +     * @param removeScript Shall SCRIPT-Tags and their content be removed
  +     * @param removeStyle Shall STYLE-Tags and their content be removed
  +     * @param removeNoScript Shall NOSCRIPT-Tags and their content be removed
  +     * @param removeMeta Shall META-Tags be removed
  +     * @param removeApplet Shall APPLET-Tags and their content be removed
  +     * @param removeObject Shall OBJECT-Tags and their content be removed
  +     * @param removeHead Shall HEAD-Tags and their content be removed
  +     * @param removeOnSomething Shall onClick, onBlur, etc. -Attributes be removed
  +     * @param openInNewWindow Shall links set Target="_blank"
  +     */
  +    private void init (boolean removeScript,
  +                       boolean removeStyle,
  +                       boolean removeNoScript,
  +                       boolean removeMeta,
  +                       boolean removeApplet,
  +                       boolean removeObject,
  +                       boolean removeHead,
  +                       boolean removeOnSomething,
  +                       boolean openInNewWindow ) 
       {
           cb.removeScript = removeScript;
           cb.removeStyle = removeStyle; 
  @@ -141,18 +272,20 @@
           cb.removeObject = removeObject;
           cb.removeHead = removeHead;
           cb.removeOnSomething = removeOnSomething;    
  +        cb.openInNewWindow = openInNewWindow;    
       }
       
  -/** Does the conversion of the HTML
  - * @param HTMLrdr Reader for HTML to be converted
  - * @param BaseUrl URL from which this HTML was taken. We be the base-Url
  - * for all URL-rewritings.
  - * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
  - * the document could not be converted. Should not happen
  - * normally, even in badly formatted HTML.
  - * @return HTML-String with rewritten URLs and removed (according
  - * to constructor-settings) tags
  - */    
  +    /**
  +     * Does the conversion of the HTML
  +     * @param HTMLrdr Reader for HTML to be converted
  +     * @param BaseUrl URL from which this HTML was taken. Will be the base URL
  +     * for all URL-rewritings.
  +     * @throws MalformedURLException If the BaseUrl is not a valid URL or if an URL inside
  +     * the document could not be converted. Should not happen
  +     * normally, even in badly formatted HTML.
  +     * @return HTML-String with rewritten URLs and removed (according
  +     * to constructor-settings) tags
  +     */
       public synchronized String convertURLs(Reader HTMLrdr, String BaseUrl) throws MalformedURLException
       {
           HTMLEditorKit.Parser parse = new HTMLRewriter.ParserGetter().getParser();     
  
  @@ -209,10 +342,14 @@
           private boolean removeApplet = true;
           private boolean removeObject = true;
           private boolean removeHead = true;
  +        private boolean openInNewWindow = false;
           
           // remove the onClick=, onBlur=, etc. - Attributes
           private boolean removeOnSomething = true;
           
  +        private boolean inScript = false;
  +        private boolean inStyle = false;
  +        
           private StringWriter result = new StringWriter();
           
           private Callback () {
  @@ -223,7 +360,7 @@
           {
               // to allow for implementation using Stringbuffer or StringWriter
               // I don't know yet, which one is better in this case
  -            if (ignoreLevel > 0 ) return this;
  +            if (ignoreLevel > 0) return this;
   
               try {
                   result.write(txt.toString());
  @@ -234,7 +371,7 @@
           private Callback addToResult(char[] txt)
           {
               if (ignoreLevel > 0) return this;
  -            
  +
               try {
                   result.write(txt);
               } catch (Exception e) { /* ignore */ }
  @@ -260,8 +397,20 @@
               // nothing to do here ...
           }
   
  +        /** 
  +         * Because scripts and styles are sometimes defined in comments, those
  +         * will be written. Otherwise comments are removed
  +         */
           public void handleComment(char[] values,int param) {
  -            // we ignore them 
  +            if ( !( inStyle || inScript))
  +                return;
  +
  +            try {
  +                result.write("<!--");
  +                result.write(values);
  +                result.write("-->");
  +            } catch (Exception e) { /* ignore */ }
  +          // we ignore them 
           }
   
           public void handleEndOfLineString(java.lang.String str) {
  @@ -275,13 +424,12 @@
           public void handleSimpleTag(HTML.Tag tag,MutableAttributeSet attrs,int param) {
               if (removeMeta && (tag == HTML.Tag.META)) {
                   return;
  -            }
  -            
  +            }            
               appendTagToResult(tag,attrs);        
           }
   
           public void handleStartTag(HTML.Tag tag,  MutableAttributeSet attrs, int position) {
  -          appendTagToResult(tag,attrs);
  +            appendTagToResult(tag,attrs);
           }
   
           public void handleEndTag(HTML.Tag tag, int position) {
  @@ -297,6 +445,12 @@
               }
               
               
  +            if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
  +                inScript = false;
  +            } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
  +                inStyle = false;
  +            }
  +
               if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
                   ignoreLevel --;
               } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
  @@ -328,47 +482,86 @@
               
               convertURLS(tag,attrs);
               Enumeration e = attrs.getAttributeNames();
  +            if (tag == HTML.Tag.BASE)
  +                return;
  +            
               addToResult("<").addToResult(tag);
               while (e.hasMoreElements()) {
  -              Object attr = e.nextElement();
  -              String onsomething = attr.toString().substring(0,2).toUpperCase();
  -              if (removeOnSomething && (! onsomething.equals("ON"))) {  
  -                  // filter the onClick, onThis, onThat-Attributes
  -                  String value = attrs.getAttribute(attr).toString();
  -                  addToResult(" ").addToResult(attr).addToResult("=\"").addToResult(value).addToResult("\"");
  -              }
  +                Object attr = e.nextElement();
  +                String attrName = attr.toString();
  +                String value = attrs.getAttribute(attr).toString();
  +
  +                // include attribute only when Not(RemoveOnSomething = True and starts with "on")
  +                if (!(removeOnSomething
  +                && attrName.toLowerCase().startsWith("on")
  +                && (attrName.length() > 2))) {
  +                    // Attribute included
  +                    addToResult(" ").addToResult(attr).addToResult("=\"")
  +                    .addToResult(value).addToResult("\"");
  +                }
               }
               addToResult(">");
           }
  -
  -        
  +                   
           /** Here the magic happens.
            *
            * If someone wants new types of URLs to be rewritten, add them here
            * @param tag TAG from the Callback-Interface
            * @param attrs Attribute-Set from the Callback-Interface
  - */
  +         */
           
           private void convertURLS( HTML.Tag tag, MutableAttributeSet attrs ) {
   
              // first we do an URL-rewrite on different tags
               
  -            if ((tag == HTML.Tag.A) && (attrs.getAttribute(HTML.Attribute.HREF) != null)) {
  -                // ---- CHECKING <A HREF
  -                addConvertedAttribute( HTML.Attribute.HREF,
  -                                       attrs );
  -            } else if (((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT)) && (attrs.getAttribute(HTML.Attribute.SRC) != null)) {
  +            if (tag == HTML.Tag.A) {
  +                if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
  +                    // ---- CHECKING <A HREF
  +                    addConvertedAttribute( HTML.Attribute.HREF,
  +                    attrs );
  +                }
  +                if ((attrs.getAttribute(HTML.Attribute.TARGET) == null) && cb.openInNewWindow) {
  +                    attrs.addAttribute(HTML.Attribute.TARGET, "_BLANK");
  +                }
  +            } else if (((tag == HTML.Tag.IMG) || (tag == HTML.Tag.INPUT) || (tag == HTML.Tag.SCRIPT))
  +                         && (attrs.getAttribute(HTML.Attribute.SRC) != null)) {
                   // ---- CHECKING <IMG SRC & <INPUT SRC
                   addConvertedAttribute( HTML.Attribute.SRC,
                                          attrs );
  -            } else if (((tag == HTML.Tag.OPTION) ) && (attrs.getAttribute(HTML.Attribute.VALUE) != null)) {
  -                // ---- CHECKING <OPTION 
  -                addConvertedAttribute( HTML.Attribute.VALUE,
  -                                       attrs );
               } else if ( tag == HTML.Tag.APPLET ) {
                   // ---- CHECKING <APPLET CODEBASE=
  -                addConvertedAttribute( HTML.Attribute.CODEBASE,
  -                                       attrs );
  +                if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
  +                    int endOfPath = baseUrl.toString().lastIndexOf("/");
  +                    attrs.addAttribute(HTML.Attribute.CODEBASE, 
  +                                       baseUrl.toString().substring(0,endOfPath +1));
  +                } else {
  +                    addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
  +                }
  +            } else if (tag == HTML.Tag.OBJECT) {
  +                // ---- CHECKING <OBJECT CODEBASE=
  +                if (attrs.getAttribute(HTML.Attribute.CODEBASE) == null) {
  +                    int endOfPath = baseUrl.toString().lastIndexOf("/");
  +                    attrs.addAttribute(HTML.Attribute.CODEBASE, 
  +                                       baseUrl.toString().substring(0,endOfPath +1));
  +                } else {
  +                    addConvertedAttribute( HTML.Attribute.CODEBASE, attrs );
  +                }
  +            } else if (tag == HTML.Tag.BODY) {
  +                if (attrs.getAttribute(HTML.Attribute.BACKGROUND) != null) {
  +                    // background images are applied to the ENTIRE page, this remove them!
  +                    attrs.removeAttribute( HTML.Attribute.BACKGROUND);
  +                }
  +            } else if (tag == HTML.Tag.BASE) {
  +                if (attrs.getAttribute(HTML.Attribute.HREF) != null) {
  +                    try {
  +                        baseUrl = new URL(attrs.getAttribute(HTML.Attribute.HREF).toString());
  +                    } catch (Throwable t) {
  +                        Log.error( "HTMLRewriter: Setting BASE=" 
  +                        + attrs.getAttribute(HTML.Attribute.HREF).toString()
  +                        + t.getMessage());
  +                    }
  +                    attrs.removeAttribute(HTML.Attribute.HREF);
  +                }
               } else if (tag == HTML.Tag.FORM) {
                   // ---- CHECKING <FORM ACTION=
                     inForm = true; // buggy <form> handling in jdk 1.3 
  @@ -396,6 +589,12 @@
               // don't forget to add changes to  handleEndTag() as well, else 
               // things will get screwed up!
               
  +            if ( (removeScript == false) && (tag == HTML.Tag.SCRIPT)) {
  +                inScript = true;
  +            } else if ( (removeStyle == false) && (tag == HTML.Tag.STYLE)) {
  +                inStyle = true;
  +            }
  +
               if ( removeScript && (tag == HTML.Tag.SCRIPT)) {
                     ignoreLevel ++;
               } else if ( removeStyle && (tag == HTML.Tag.STYLE)) {
  @@ -409,7 +608,6 @@
               } else if (removeNoScript && (tag.toString().equalsIgnoreCase("NOSCRIPT"))) {
                     ignoreLevel ++;
               }
  -
           }
   
           /**
  @@ -427,29 +625,25 @@
           }
                 
                 
  -        private String generateNewUrl(String oldURL) 
  -        {
  +        private String generateNewUrl(String oldURL) {
               try {
                   URL x = new URL(baseUrl,oldURL);
                   return x.toString();
  -            } catch (Throwable t)
  -            {
  -                //FIXME: transient print to debug...
  -                System.err.print( "HTMLRewriter: BASE=" );
  -                System.err.print( baseUrl );
  -                System.err.print( "old=" );
  -                System.err.println( oldURL );
  -                t.printStackTrace();
  -                return oldURL; // default behaviour ... 
  +            } catch (Throwable t) {
  +                if (oldURL.toLowerCase().startsWith("javascript:")) {
  +                    return oldURL;
  +                }
  +                Log.error( "HTMLRewriter: Setting BASE="
  +                + baseUrl
  +                + " Old = "
  +                + oldURL
  +                + t.getMessage());
  +                return oldURL; // default behaviour ...
               }
           }
   
           public void handleText(char[] values,int param) {
               addToResult(values);
           }
  -        
       }
  -    
   }
  -
  -
  
  
  

--
To unsubscribe, e-mail:   <mailto:jetspeed-dev-unsubscribe@jakarta.apache.org>
For additional commands, e-mail: <mailto:jetspeed-dev-help@jakarta.apache.org>


Mime
View raw message