利用java的HttpURLConnection捕获网页信息,同时记录链接的sessionId。
考虑到网页编码方式的不同,程序会从响应的Content-Type中自动捕获网页的编码方式(charset),并按该编码方式读取网页内容。但也存在一些不规范的网页,捕获不到charset,此时代码默认按UTF-8读取网页内容(必要时也可以改为先按iso-8859-1读取,再转码)。
其中记录sessionId是为了下次访问时继续使用同一个sessionId,让服务器认为仍处于同一个会话中,比如用于登录验证之类的场景。
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;

/**
 * Fetches the content of a web page with {@link HttpURLConnection}.
 *
 * <p>The cookie captured from a response is kept in a static field and sent back on
 * later requests, so that consecutive calls can share one server-side session
 * (useful for login-related flows).
 *
 * @author Arthur126
 * @date 2015-8-21 10:00:14 PM
 */
public class CallHttpTest {

    /** Charset name used when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Cookie ("name=value") captured from a previous response; empty until one has
     * been captured. Once set, it is sent as the {@code Cookie} request header.
     */
    private static String SESSION_ID = "";

    /**
     * Sends a POST request to the given URL and returns the response body as text.
     *
     * <p>If a session cookie has already been recorded it is attached to the request;
     * otherwise the response headers are scanned for a {@code Set-Cookie} header and its
     * leading "name=value" pair is recorded. The body is decoded with the charset
     * declared in the response Content-Type (see {@link #getChareset(String)}).
     * Line terminators are not preserved: lines are concatenated as they are read.
     *
     * @param callURL the URL to request
     * @return the response body with line terminators removed
     * @throws Exception if the URL is malformed, the request fails, or the declared
     *     charset is not supported
     */
    public static String callHttp(String callURL) throws Exception {
        URL u0 = new URL(callURL);
        HttpURLConnection conn = (HttpURLConnection) u0.openConnection();
        try {
            conn.setRequestMethod("POST");
            conn.setRequestProperty("Content-Type", "text/plain");
            conn.setRequestProperty("Content-Language", "en-US");
            conn.setConnectTimeout(30000);
            conn.setReadTimeout(30000);
            conn.setUseCaches(false);
            conn.setDoInput(true);
            conn.setDoOutput(true);

            if (SESSION_ID != null && !"".equals(SESSION_ID)) {
                // A session cookie is already recorded: replay it on this request.
                conn.setRequestProperty("Cookie", SESSION_ID);
            } else {
                // No cookie yet: try to capture one from the response headers.
                captureSessionId(conn);
            }

            // Decode the body with the charset announced by the server.
            String charset = getChareset(conn.getContentType());
            StringBuilder buffer = new StringBuilder();
            try (BufferedReader reader =
                    new BufferedReader(new InputStreamReader(conn.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    buffer.append(line);
                }
            }
            return buffer.toString();
        } finally {
            // Release the connection even when reading the response fails.
            conn.disconnect();
        }
    }

    /**
     * Records the leading "name=value" pair of the response's {@code Set-Cookie}
     * header in {@link #SESSION_ID}; does nothing if no such header is present.
     */
    private static void captureSessionId(HttpURLConnection conn) {
        String key;
        // Start at 1: per the HttpURLConnection docs, index 0 may be the status line.
        for (int i = 1; (key = conn.getHeaderFieldKey(i)) != null; i++) {
            if (key.equalsIgnoreCase("set-cookie")) {
                String cookie = conn.getHeaderField(key);
                if (cookie != null) {
                    // Keep only "name=value"; a cookie without attributes has no ';'.
                    int end = cookie.indexOf(';');
                    SESSION_ID = end == -1 ? cookie : cookie.substring(0, end);
                }
                break;
            }
        }
    }

    /**
     * Extracts the charset name from a Content-Type header value.
     *
     * <p>The parameter name is matched case-insensitively; other parameters,
     * surrounding whitespace and surrounding quotes are not part of the result.
     *
     * @param contentType the Content-Type header value, may be {@code null}
     * @return the declared charset name, or {@code "UTF-8"} when {@code contentType}
     *     is {@code null} or declares no (non-empty) charset
     */
    public static String getChareset(String contentType) {
        if (contentType == null) {
            return DEFAULT_CHARSET;
        }
        for (String part : contentType.split(";")) {
            String param = part.trim();
            if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String value = param.substring(CHARSET_PARAM.length()).trim();
            if (value.length() >= 2) {
                char first = value.charAt(0);
                char last = value.charAt(value.length() - 1);
                // Strip a matching pair of surrounding quotes, e.g. charset="utf-8".
                if ((first == '"' || first == '\'') && first == last) {
                    value = value.substring(1, value.length() - 1).trim();
                }
            }
            return value.isEmpty() ? DEFAULT_CHARSET : value;
        }
        return DEFAULT_CHARSET;
    }

    /** Fetches a sample page and prints its content followed by the captured cookie. */
    public static void main(String[] args) throws Exception {
        System.out.println(callHttp("https://www.baidu.com/"));
        System.out.println(SESSION_ID);
    }
}
测试结果:
页面不存在_百度搜索 .......略 __bsi=14410402226605058380_00_12_R_N_2_0301_002F_N_I_I_0