incubator-droids-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From thors...@apache.org
Subject svn commit: r939648 - /incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
Date Fri, 30 Apr 2010 12:00:39 GMT
Author: thorsten
Date: Fri Apr 30 12:00:39 2010
New Revision: 939648

URL: http://svn.apache.org/viewvc?rev=939648&view=rev
Log:
DROIDS-72
Reporter: Richard Frovarp
Patch: Richard Frovarp
review: thorsten

Modified:
    incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java

Modified: incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java
URL: http://svn.apache.org/viewvc/incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java?rev=939648&r1=939647&r2=939648&view=diff
==============================================================================
--- incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java	(original)
+++ incubator/droids/trunk/droids-core/src/main/java/org/apache/droids/parse/html/LinkExtractor.java	Fri Apr 30 12:00:39 2010
@@ -17,6 +17,7 @@
 package org.apache.droids.parse.html;
 
 import java.net.URI;
+import java.net.URISyntaxException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
@@ -34,6 +35,17 @@ import org.xml.sax.helpers.DefaultHandle
 
 public class LinkExtractor extends DefaultHandler 
 {
+
+  /**
+   * Name of element that may contain base URI
+   */
+  private static final String BASE_ELEMENT = "base";
+
+  /**
+   * Name of attribute for base URI
+   */
+  private static final String BASE_ATTRIBUTE = "href";
+
   protected final Log log = LogFactory.getLog(this.getClass());
 
   /**
@@ -55,7 +67,17 @@ public class LinkExtractor extends Defau
    * Set of URIs visited yet
    */
   private Set<String> history = null;
-
+  
+  /**
+   * Base URI for resolving
+   */
+  private URI baseUri = null;
+  
+  /**
+   * Check for base elements
+   */
+  private boolean checkBase = true;
+  
   /**
    * The parsed link
    */
@@ -65,6 +87,7 @@ public class LinkExtractor extends Defau
     super();
     this.base = base;
     this.elements = elements;
+    this.baseUri = base.getURI();
   }
   
   @Override
@@ -76,6 +99,17 @@ public class LinkExtractor extends Defau
   @Override
   public void startElement(String uri, String loc, String raw, Attributes att) throws SAXException
   {
+    if(checkBase && BASE_ELEMENT.equalsIgnoreCase(loc) && att.getValue(BASE_ATTRIBUTE) != null) {
+      try {
+        baseUri = new URI(att.getValue(BASE_ATTRIBUTE));
+        log.debug("Found base URI: " + baseUri);
+        checkBase = false;
+      } 
+      catch ( URISyntaxException e) {
+        log.error("Base URI not valid: " + att.getValue(BASE_ATTRIBUTE));
+      }
+    }
+    
     Iterator<String> it = elements.keySet().iterator();
     String elem, linkAtt;
     while (it.hasNext()) {
@@ -132,7 +166,7 @@ public class LinkExtractor extends Defau
     try {
       if (!target.toLowerCase().startsWith("javascript")
           && !target.contains(":/")) {
-        return base.getURI().resolve(target.split("#")[0]);
+        return baseUri.resolve(target.split("#")[0]);
       } 
       else if (!target.toLowerCase().startsWith("javascript")) {
         return new URI(target.split("#")[0]);



Mime
View raw message