commons-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject svn commit: r1649223 - /commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java
Date Sat, 03 Jan 2015 17:35:15 GMT
Author: sebb
Date: Sat Jan  3 17:35:15 2015
New Revision: 1649223

URL: http://svn.apache.org/r1649223
Log:
Parse html page to get tld types and comments

Modified:
    commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java

Modified: commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java
URL: http://svn.apache.org/viewvc/commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java?rev=1649223&r1=1649222&r2=1649223&view=diff
==============================================================================
--- commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java (original)
+++ commons/proper/validator/trunk/src/test/java/org/apache/commons/validator/routines/DomainValidatorTest.java Sat Jan  3 17:35:15 2015
@@ -28,14 +28,17 @@ import java.lang.reflect.Modifier;
 import java.net.HttpURLConnection;
 import java.net.URL;
 import java.text.SimpleDateFormat;
-import java.util.ArrayList;
 import java.util.Arrays;
-import java.util.Collections;
 import java.util.Date;
+import java.util.HashMap;
 import java.util.HashSet;
-import java.util.List;
+import java.util.Iterator;
 import java.util.Locale;
+import java.util.Map;
 import java.util.Set;
+import java.util.TreeMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
 
 import junit.framework.TestCase;
 
@@ -105,29 +108,29 @@ public class DomainValidatorTest extends
         assertFalse("empty string shouldn't validate as TLD", validator.isValid(""));
         assertFalse("null shouldn't validate as TLD", validator.isValid(null));
     }
-    
+
     public void testAllowLocal() {
        DomainValidator noLocal = DomainValidator.getInstance(false);
        DomainValidator allowLocal = DomainValidator.getInstance(true);
-       
+
        // Default is false, and should use singletons
        assertEquals(noLocal, validator);
-       
+
        // Default won't allow local
        assertFalse("localhost.localdomain should validate", noLocal.isValid("localhost.localdomain"));
        assertFalse("localhost should validate", noLocal.isValid("localhost"));
-       
+
        // But it may be requested
        assertTrue("localhost.localdomain should validate", allowLocal.isValid("localhost.localdomain"));
        assertTrue("localhost should validate", allowLocal.isValid("localhost"));
        assertTrue("hostname should validate", allowLocal.isValid("hostname"));
        assertTrue("machinename should validate", allowLocal.isValid("machinename"));
-       
+
        // Check the localhost one with a few others
        assertTrue("apache.org should validate", allowLocal.isValid("apache.org"));
        assertFalse("domain name with spaces shouldn't validate", allowLocal.isValid(" apache.org
"));
     }
-    
+
     public void testIDN() {
        assertTrue("b\u00fccher.ch in IDN should validate", validator.isValid("www.xn--bcher-kva.ch"));
     }
@@ -162,39 +165,17 @@ public class DomainValidatorTest extends
     public static void main(String a[]) throws Exception {
         Set ianaTlds = new HashSet(); // keep for comparison with array contents
         DomainValidator dv = DomainValidator.getInstance();;
-        File f = new File("target/tlds-alpha-by-domain.txt");
-        String tldurl="http://data.iana.org/TLD/tlds-alpha-by-domain.txt";
-        HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
-        if (f.canRead()) {
-            SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z");//Sun,
06 Nov 1994 08:49:37 GMT
-            long modTime = f.lastModified();
-            String since = sdf.format(new Date(modTime));
-            hc.addRequestProperty("If-Modified-Since", since);
-            System.out.println("Found " + f + " with date " + since);       
-        }   
-        if (hc.getResponseCode() == 304) {
-            System.out.println("Already have most recent " + tldurl);       
-        } else {
-            System.out.println("Downloading " + tldurl);
-            byte buff[] = new byte[1024];
-            InputStream is = hc.getInputStream();
-            
-            FileOutputStream fos = new FileOutputStream(f);
-            int len;
-            while((len=is.read(buff)) != -1) {
-                fos.write(buff, 0, len);
-            }
-            fos.close();
-            is.close();
-            System.out.println("Done");
-        }
-        BufferedReader br = new BufferedReader(new FileReader(f));
+        File txtFile = new File("target/tlds-alpha-by-domain.txt");
+        long timestamp = download(txtFile, "http://data.iana.org/TLD/tlds-alpha-by-domain.txt",
0L);
+        final File htmlFile = new File("target/tlds-alpha-by-domain.html");
+        download(htmlFile,"http://www.iana.org/domains/root/db", timestamp);
+
+        BufferedReader br = new BufferedReader(new FileReader(txtFile));
         String line;
         final String header;
         line = br.readLine(); // header
         if (line.startsWith("# Version ")) {
             header = line.substring(2);
-            System.out.println("        // Taken from " + header);
         } else {
             br.close();
             throw new IOException("File does not have expected Version header");
@@ -204,33 +185,46 @@ public class DomainValidatorTest extends
             System.err.println("Cannot convert XN-- entries (no access to java.net.IDN)");
         }
 
-        List missing = new ArrayList();
+        // Parse html page to get entries
+        Map htmlInfo = getHtmlInfo(htmlFile);
+        Map missingTLD = new TreeMap(); // stores entry and comments as String[]
+        Map missingCC = new TreeMap();
         while((line = br.readLine()) != null) {
             if (!line.startsWith("#")) {
                 final String item;
+                final String key = line.toLowerCase(Locale.ENGLISH);
                 if (line.startsWith("XN--")) {
                     if (toUnicode != null) {
-                        item = toUnicode(toUnicode, line);                        
+                        item = toUnicode(toUnicode, line);
                     } else {
                         continue;
                     }
                 } else {
-                    item = line.toLowerCase(Locale.ENGLISH);
+                    item = key;
                 }
                 if (!dv.isValidTld(item)) {
-                    missing.add(item);
+                    String [] info = (String[]) htmlInfo.get(key);
+                    if (info != null) {
+                        String type = info[0];
+                        String comment = info[1];
+                        if ("country-code".equals(type)) {
+                            missingCC.put(item, key + " " + comment);
+                        } else {
+                            missingTLD.put(item, key + " " + comment);
+                        }
+                    } else {
+                        System.err.println("Expected to find info for "+ key);
+                    }
                 }
                 ianaTlds.add(item);
             }
         }
         br.close();
-        if (!missing.isEmpty()) {
-            System.out.println("Entries missing from TLD List\n");
-            Collections.sort(missing); // XN-- entries are not in sorted order once converted
-            for(int i = 0; i < missing.size(); i++) {
-              System.out.println("        \""+missing.get(i)+"\",");
-            }
-            System.out.println("\nDone");
+        if (!missingTLD.isEmpty()) {
+            printMap(header, missingTLD, "TLD");
+        }
+        if (!missingCC.isEmpty()) {
+            printMap(header, missingCC, "CC");
         }
         // Check if internal tables contain any additional entries
         isInIanaList("INFRASTRUCTURE_TLDS", ianaTlds);
@@ -239,6 +233,106 @@ public class DomainValidatorTest extends
         // Don't check local TLDS isInIanaList("LOCAL_TLDS", ianaTlds);
     }
 
+    private static void printMap(final String header, Map map, String string) {
+        System.out.println("Entries missing from "+ string +" List\n");
+        if (header != null) {
+            System.out.println("        // Taken from " + header);
+        }
+        Iterator it = map.entrySet().iterator();
+        while(it.hasNext()){
+            Map.Entry me = (Map.Entry)it.next();
+            System.out.println("        \"" + me.getKey() + "\", // " + me.getValue());
+        }
+        System.out.println("\nDone");
+    }
+
+    private static Map getHtmlInfo(final File f) throws IOException {
+        final Map info = new HashMap();
+
+//        <td><span class="domain tld"><a href="/domains/root/db/ax.html">.ax</a></span></td>
+        final Pattern domain = Pattern.compile(".*<a href=\"/domains/root/db/([^.]+)\\.html");
+//        <td>country-code</td>
+        final Pattern type = Pattern.compile("\\s+<td>([^<]+)</td>");
+//        <!-- <td>Åland Islands<br/><span class="tld-table-so">Ålands
landskapsregering</span></td> </td> -->
+//        <td>Ålands landskapsregering</td>
+        final Pattern comment = Pattern.compile("\\s+<td>([^<]+)</td>");
+
+        final BufferedReader br = new BufferedReader(new FileReader(f));
+        String line;
+        while((line=br.readLine())!=null){
+            Matcher m = domain.matcher(line);
+            if (m.lookingAt()) {
+                String dom = m.group(1);
+                String typ = "??";
+                String com = "??";
+                line = br.readLine();
+                Matcher t = type.matcher(line);
+                if (t.lookingAt()) {
+                    typ = t.group(1);
+                    line = br.readLine();
+                    if (line.matches("\\s+<!--.*")) {
+                        while(!line.matches(".*-->.*")){
+                            line = br.readLine();
+                        }
+                        line = br.readLine();
+                    }
+                    // Should have comment; is it wrapped?
+                    while(!line.matches(".*</td>.*")){
+                        line += " " +br.readLine();
+                    }
+                    Matcher n = comment.matcher(line);
+                    if (n.lookingAt()) {
+                        com = n.group(1);
+                    }
+                    info.put(dom.toLowerCase(Locale.ENGLISH), new String[]{typ, com});
+//                    System.out.println(dom + " " + typ + " " +com);
+                }
+            }
+        }
+        br.close();
+        return info;
+    }
+
+    /*
+     * Download a file if it is more recent than our cached copy.
+     * Unfortunately the server does not seem to honour If-Modified-Since for the
+     * Html page, so we check if it is newer than the txt file and skip download if so
+     */
+    private static long download(File f, String tldurl, long timestamp) throws IOException
{
+        if (timestamp > 0 && f.canRead()) {
+            long modTime = f.lastModified();            
+            if (modTime > timestamp) {
+                System.out.println("Skipping download - found recent " + f);
+                return modTime;
+            }
+        }
+        HttpURLConnection hc = (HttpURLConnection) new URL(tldurl).openConnection();
+        if (f.canRead()) {
+            long modTime = f.lastModified();            
+            SimpleDateFormat sdf = new SimpleDateFormat("EEE, dd MMM yyyy HH:mm:ss z");//Sun,
06 Nov 1994 08:49:37 GMT
+            String since = sdf.format(new Date(modTime));
+            hc.addRequestProperty("If-Modified-Since", since);
+            System.out.println("Found " + f + " with date " + since);
+        }
+        if (hc.getResponseCode() == 304) {
+            System.out.println("Already have most recent " + tldurl);
+        } else {
+            System.out.println("Downloading " + tldurl);
+            byte buff[] = new byte[1024];
+            InputStream is = hc.getInputStream();
+
+            FileOutputStream fos = new FileOutputStream(f);
+            int len;
+            while((len=is.read(buff)) != -1) {
+                fos.write(buff, 0, len);
+            }
+            fos.close();
+            is.close();
+            System.out.println("Done");
+        }
+        return f.lastModified();
+    }
+
     private static String toUnicode(Method m, String line) {
         try {
             return (String) m.invoke(null, new String[]{line.toLowerCase(Locale.ENGLISH)});



Mime
View raw message