记录一次java语言使用httpclient爬取网站接口数据的经历
需要用到的依赖:
httpclient和httpcore是封装了HTTP请求的工具库
jsoup可以解析返回的网页HTML,并用选择器定位到你需要的节点,很方便
<!-- Apache HttpClient: sends the HTTP requests -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version> <!-- check for and use the latest version -->
</dependency>
<!-- Apache HttpCore: low-level HTTP components (provides EntityUtils used below) -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpcore</artifactId>
<version>4.4.14</version> <!-- check for and use the latest version -->
</dependency>
<!-- jsoup: HTML parser used to select nodes from the returned page -->
<dependency>
<groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId>
<version>1.13.1</version>
</dependency>
java类:
需要将网站请求中的Cookie逐个配置到BasicClientCookie对象中,再加入CookieStore随请求一起发送;如何获取Cookie,见文章最后的截图。
package com.utils;
import java.io.IOException;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
/**
 * Example crawler that replays a logged-in browser session against a cookie-protected site.
 *
 * <p>Each page is fetched with Apache HttpClient, every {@code <table>} element is extracted
 * with jsoup, and the extracted HTML is handed to {@code JDBCBean.executeUpdate} together with
 * the page number.
 */
public class HttpClientWithCookieExample {

    /** Domain that every session cookie is scoped to. */
    private static final String COOKIE_DOMAIN = "abc.com";

    /** Path that every session cookie is scoped to. */
    private static final String COOKIE_PATH = "/";

    /** Pause after each request, in milliseconds, to avoid hammering the server. */
    private static final long REQUEST_INTERVAL_MILLIS = 1000L;

    /**
     * Builds the cookie store and HTTP client, then crawls the area list.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void main(String[] args) throws InterruptedException {
        // The store can hold any number of cookies; all of them are sent with each request.
        CookieStore cookieStore = new BasicCookieStore();
        // NOTE(review): session id and credentials are hard-coded for the demo; load them from
        // configuration or the environment before using this against a real site.
        cookieStore.addCookie(newCookie("ASP.NET_SessionId", "mkuq512333ljwcqkfq4i"));
        cookieStore.addCookie(newCookie("Email", "abc@qq.com"));
        cookieStore.addCookie(newCookie("Password", "511B0D5F341BDDBD9A5348923B48D14C"));

        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);

        // try-with-resources closes the client (and its connection pool) when crawling is done.
        try (CloseableHttpClient httpClient =
                HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
            extracted_area(context, httpClient);
        } catch (IOException e) {
            // Only close() can raise IOException here; page-level failures are handled per page.
            System.out.println("关闭HttpClient失败: " + e);
        }
    }

    /**
     * Crawls the area list (a single, unpaged page).
     *
     * @param context request context carrying the cookie store
     * @param httpClient client used to execute the request
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted_area(HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/adminKdUser/GuanLi/AreaList.aspx", false, 1);
    }

    /**
     * Crawls pages 1..984 of the FanYi list.
     *
     * @param context request context carrying the cookie store
     * @param httpClient client used to execute the requests
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted_fanyi(HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/123/GuanLi/FanYiList.aspx", true, 984);
    }

    /**
     * Crawls pages 1..2415 of the GoodRecord list.
     *
     * @param context request context carrying the cookie store
     * @param httpClient client used to execute the requests
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted( HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/123/User/GoodRecordList.aspx", true, 2415);
    }

    /** Creates a cookie scoped to {@link #COOKIE_DOMAIN} and {@link #COOKIE_PATH}. */
    private static BasicClientCookie newCookie(String name, String value) {
        BasicClientCookie cookie = new BasicClientCookie(name, value);
        cookie.setDomain(COOKIE_DOMAIN);
        cookie.setPath(COOKIE_PATH);
        return cookie;
    }

    /**
     * Fetches pages {@code 1..lastPage}, extracts all tables from each and stores them.
     *
     * <p>Crawling stops early at the first successfully fetched page that contains no
     * {@code <table>} element. A page that fails with an I/O error or a non-2xx status is
     * reported and skipped. The method pauses after every request, including failed ones.
     *
     * @param context request context carrying the cookie store
     * @param httpClient client used to execute the requests
     * @param baseUrl page URL without the {@code page} query parameter
     * @param paged whether {@code ?page=<n>} is appended to {@code baseUrl}
     * @param lastPage number of the last page to fetch (inclusive)
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void crawlPages(HttpClientContext context, HttpClient httpClient,
            String baseUrl, boolean paged, int lastPage) throws InterruptedException {
        for (int pageNo = 1; pageNo <= lastPage; pageNo++) {
            String url = paged ? baseUrl + "?page=" + pageNo : baseUrl;
            HttpGet request = new HttpGet(url);
            try {
                HttpResponse response = httpClient.execute(request, context);
                int statusCode = response.getStatusLine().getStatusCode();
                if (statusCode < 200 || statusCode >= 300) {
                    // Drain the body so the connection goes back to the pool, and do not
                    // persist an error page as if it were data.
                    EntityUtils.consumeQuietly(response.getEntity());
                    System.out.println("第" + pageNo + "页===HTTP状态码" + statusCode + ",跳过");
                    continue;
                }

                String body = EntityUtils.toString(response.getEntity(), "utf-8");
                Document doc = Jsoup.parse(body);
                Elements tables = doc.select("table");
                // select() never returns null; an empty result means the page has no tables.
                if (tables.isEmpty()) {
                    System.out.println("第" + pageNo + "页===终止");
                    break;
                }

                String tableHtml = tables.html();
                System.out.println("第" + pageNo + "页===" + tableHtml);
                JDBCBean.executeUpdate(pageNo, "<table>" + tableHtml + "</table>");
            } catch (IOException e) {
                // Best effort: report the failure with its cause and move on to the next page.
                System.out.println(pageNo + "解析失败: " + e);
            } finally {
                Thread.sleep(REQUEST_INTERVAL_MILLIS);
            }
        }
    }
}
此处不方便透露实际网站,就用百度来作例子:取请求标头中的Cookie内容,按“名称=值”逐项拆分后,分别填入BasicClientCookie即可。