hc-httpclient-users mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From Zhu Wayne <zhuw.chic...@gmail.com>
Subject Can't get a complete page source with HttpClient
Date Wed, 25 May 2011 14:46:40 GMT
Greetings!
I tried to get a complete page source like the one generated by a web
browser. However, the size of the file produced by HttpClient changes
from run to run, and many lines are missing compared to the browser
version. I am using an Amazon URL as an example since its page is
always very large.
I am really puzzled by this. What could be the cause: the user agent,
the buffer size, or something else? Thanks.

Here is the source code:

import java.io.BufferedReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintWriter;


import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.params.ClientPNames;
import org.apache.http.client.params.CookiePolicy;
import org.apache.http.impl.client.DefaultConnectionKeepAliveStrategy;
import org.apache.http.impl.client.DefaultHttpClient;



/**
 * Downloads the body of a web page with Apache HttpClient, flattens it to a
 * single line, and (from {@link #main}) saves it to a file named after the URL.
 */
public class RawHttpWebPageFetcher {

    /** Number of characters requested from the response reader per read call. */
    private static final int READ_BUFFER_CHARS = 8192;

    /**
     * Fetches {@code url} with an HTTP GET and returns the response body with
     * every carriage return and line feed removed.
     *
     * <p>The body is always decoded as UTF-8, whatever charset the response
     * declares. The reader and the client's connection manager are released
     * before this method returns, including when reading fails part-way.
     *
     * @param url the address to request
     * @return the response body as one line, or an empty string when the
     *         response carries no entity
     * @throws ClientProtocolException if HttpClient reports an HTTP protocol error
     * @throws IOException if the request or the read of the body fails
     */
    public static String getRaw(String url) throws
ClientProtocolException, IOException {
        HttpClient httpclient = new DefaultHttpClient();
        try {
            HttpResponse response = httpclient.execute(new HttpGet(url));
            HttpEntity entity = response.getEntity();
            StringBuilder sb = new StringBuilder();
            if (entity != null) {
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(entity.getContent(), "UTF-8"));
                try {
                    // Read in chunks rather than one character per call.
                    char[] buffer = new char[READ_BUFFER_CHARS];
                    int count;
                    while ((count = reader.read(buffer, 0, buffer.length)) != -1) {
                        for (int i = 0; i < count; i++) {
                            char ch = buffer[i];
                            if (ch != '\n' && ch != '\r') {
                                sb.append(ch);
                            }
                        }
                    }
                } finally {
                    // Closing the reader closes the entity's content stream.
                    reader.close();
                }
            }
            return sb.toString();
        } finally {
            // This client is used for a single request, so release its
            // connections instead of leaving them open until garbage collection.
            httpclient.getConnectionManager().shutdown();
        }
    }

    /**
     * Derives an output file name from a URL: drops everything up to and
     * including the first {@code "://"} (when present) and replaces each
     * {@code '/'} and {@code '.'} with {@code '-'}.
     *
     * @param url the URL to convert
     * @return the derived name, without a file extension
     */
    private static String toFileName(String url) {
        int schemeEnd = url.indexOf("://");
        // A URL without a scheme is used as-is instead of failing on a missing
        // second element, as indexing the result of split("://") would.
        String withoutScheme = schemeEnd >= 0 ? url.substring(schemeEnd + 3) : url;
        return withoutScheme.replace('/', '-').replace('.', '-');
    }

    /**
     * Fetches a fixed sample page and writes its one-line form to
     * {@code <derived-name>.html} in the working directory, printing the
     * derived name to standard output.
     *
     * @param args ignored
     * @throws ClientProtocolException if HttpClient reports an HTTP protocol error
     * @throws IOException if the download fails or the file cannot be written
     */
    static  public void main(String [] args) throws
ClientProtocolException, IOException {
        String url =
"http://www.amazon.com/Nikon-D3100-Digital-18-55mm-3-5-5-6/dp/B003ZYF3LO/ref=zg_bs_281052_3";
        String oneLiner = RawHttpWebPageFetcher.getRaw(url);
        String outputFileName = toFileName(url);
        // NOTE(review): FileWriter encodes with the platform default charset,
        // which may differ from the UTF-8 used to decode the page.
        PrintWriter out = new PrintWriter(new
FileWriter(outputFileName.concat(".html")));
        try {
            System.out.println(outputFileName);
            out.print(oneLiner);
            // PrintWriter swallows write errors; checkError() flushes and
            // reports them so a truncated file does not go unnoticed.
            if (out.checkError()) {
                throw new IOException("Failed to write " + outputFileName + ".html");
            }
        } finally {
            out.close();
        }
    }
}

---------------------------------------------------------------------
To unsubscribe, e-mail: httpclient-users-unsubscribe@hc.apache.org
For additional commands, e-mail: httpclient-users-help@hc.apache.org


Mime
View raw message