// 爬虫爬取快递100数据 (Crawler that scrapes express-tracking data from Kuaidi100)
import java.io.BufferedReader;
import java.io.Closeable;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.lang.Integer;
import java.net.HttpURLConnection;
import java.net.InetSocketAddress;
import java.net.Socket;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.regex.Pattern;
import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import HRP.Comm.Util.DotNetToJavaStringHelper;
import HRP.Comm.Util.HLogger;
import HRP.Comm.Util.HttpClientUtil;
import net.sf.json.JSONArray;
import net.sf.json.JSONException;
import net.sf.json.JSONObject;
public class Kuaidi100Util {
//查询快递单号所属物流公司编码
public static String QUERYCOMPANYURL = “http://m.kuaidi100.com/apicenter/kdquerytools.do?method=autoComNum&text=”;
//查询快递单号物流信息
public static final String LOGISTICSURL = “https://www.kuaidi100.com/query?type=%s&postid=%s&temp=%s&phone=%s”;
//获取CSRFTOKEND
public static final String TOKENURL = “https://www.kuaidi100.com/”;
public static final String ACCEPT = “*/*”;
public static final String ACCEPTENCODING = “gzip, deflate, br”;
public static final String ACCEPTLANGUAGE = “zh-CN,zh;q=0.9”;
public static final String CONNECTION = “keep-alive”;
public static String COOKIE = “csrftoken=%s;Hm_lvt_22ea01af58ba2be0fec7c11b25e88e6c=1560406501,1560413520,1560483221,1560492961; WWWID=WWWBE7FD902D10B00E861E80874FE337FF6; Hm_lpvt_22ea01af58ba2be0fec7c11b25e88e6c=1560498038”;
public static String TOKENCOOKIE = “Hm_lvt_22ea01af58ba2be0fec7c11b25e88e6c=1560406501,1560413520,1560483221,1560492961; WWWID=WWWBE7FD902D10B00E861E80874FE337FF6; Hm_lpvt_22ea01af58ba2be0fec7c11b25e88e6c=1560498038”;
public static final String HOST = “www.kuaidi100.com”;
public static final String REFERER = “https://www.kuaidi100.com/all/sf.shtml”;
public static final String USERAGENT = “Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36”;
public static String companyName = “”;
public String resultCode = “failed”;
private static final String ERRORCODE1 = “快递公司参数异常:验证码错误”;
private static final String ERRORCODE2 = “非法访问:IP禁止访问”;
private static List<Map<String, String>> proxyList = new ArrayList<Map<String, String>>();
//爬取4页共60个代理IP
private static String[] proxyUrls = {
”http://www.89ip.cn/index_1.html”,
”http://www.89ip.cn/index_2.html”,
”http://www.89ip.cn/index_3.html”,
”http://www.89ip.cn/index_4.html”
};
/**
* 获取CSRFTOKEND
*/
public static String getScrtoken() {
String csrftoken = “”;
CloseableHttpClient client = HttpClients.createDefault();
HttpGet httpGet = new HttpGet(Kuaidi100Util.TOKENURL);
//设置请求头信息
httpGet.setHeader(“Accept”, Kuaidi100Util.ACCEPT);
httpGet.setHeader(“Accept-Encoding”, Kuaidi100Util.ACCEPTENCODING);
httpGet.setHeader(“Accept-Language”, Kuaidi100Util.ACCEPTLANGUAGE);
httpGet.setHeader(“Connection”, Kuaidi100Util.CONNECTION);
httpGet.setHeader(“Cookie”, Kuaidi100Util.TOKENCOOKIE);
httpGet.setHeader(“Host”, Kuaidi100Util.HOST);
httpGet.setHeader(“User-Agent”, Kuaidi100Util.USERAGENT);
CloseableHttpResponse response = null;
try {
response = client.execute(httpGet);
int code = response.getStatusLine().getStatusCode();
if (code == 200) {
Header[] header = response.getHeaders(“Set-Cookie”);
if (header.length > 0) {
for (int i = 0; i < header.length; i++) {
String value = header[i].getValue();
if (value.indexOf(“csrftoken”) != -1) {
//根据规则获取csrftoken
csrftoken = getScrtokenInfo(value);
}
}
} else {
HLogger.info(“get scrtoken failed!”);
csrftoken = “”;
}
} else {
HLogger.info(“get scrtoken failed!”);
csrftoken = “”;
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return csrftoken;
}
/**
* 获取快递单号所属物流公司
* @param trackingNumber 快递单号
*/
@SuppressWarnings(“unchecked”)
public static String getTrackCom(String trackingNumber) {
String url = Kuaidi100Util.QUERYCOMPANYURL + trackingNumber;
String getResult=””;
try {
getResult= HttpClientUtil.doGet(url, false);
JSONObject jsonObject = JSONObject.fromObject(getResult);
Map<String, String> respMapJsons = JSONObject.fromObject(jsonObject);
//根据返回数据规则,获取快递单号所属物流公司
JSONArray arrInfo = JSONArray.fromObject(respMapJsons.get(“auto”));
Boolean flag = arrInfo.isEmpty();
if (!flag) {
JSONObject autoInfo = arrInfo.getJSONObject(0);
Kuaidi100Util.companyName = String.valueOf(autoInfo.get(“name”));
String comCode = String.valueOf(autoInfo.get(“comCode”));
return comCode;
} else {
return “”;
}
} catch (Exception e) {
e.printStackTrace();
}
return “”;
}
/**
* 获取快递单号的物流信息
* @param companyCode 物流公司编码
* @param trackingNumber 快递单号
* @param verification 顺丰物流手机号后四位
*/
public static Map<String, Object> getLogisticsInfo(String companyCode, String trackingNumber, String verification){
Map<String, Object> resultMap = new HashMap<String, Object>();
String tempNum = getTemp();
String csrftoken = getScrtoken();
if (DotNetToJavaStringHelper.isNullOrEmpty(csrftoken)) {
resultMap.put(“code”, “error”);
resultMap.put(“csrftoken”, “csrftoken”);
resultMap.put(“message”, “请求获取物流信息失败”);
return resultMap;
}
String url = String.format(Kuaidi100Util.LOGISTICSURL, companyCode, trackingNumber, tempNum, verification);
String cookie = String.format(Kuaidi100Util.COOKIE, csrftoken);
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader(“Accept”, Kuaidi100Util.ACCEPT);
httpGet.setHeader(“Accept-Encoding”, Kuaidi100Util.ACCEPTENCODING);
httpGet.setHeader(“Accept-Language”, Kuaidi100Util.ACCEPTLANGUAGE);
httpGet.setHeader(“Connection”, Kuaidi100Util.CONNECTION);
httpGet.setHeader(“Cookie”, cookie);
httpGet.setHeader(“Host”, Kuaidi100Util.HOST);
httpGet.setHeader(“Referer”, Kuaidi100Util.REFERER);
httpGet.setHeader(“User-Agent”, Kuaidi100Util.USERAGENT);
CloseableHttpClient client = HttpClients.createDefault();
//设置代理
setProxy(httpGet);
Map<String, String> paramMap = new HashMap<String, String>();
paramMap.put(“trackingNumber”, trackingNumber);
resultMap = getResult(client, httpGet, paramMap);
return resultMap;
}
/**
* 处理请求结果
* @param client
* @param httpGet
* @param params
*/
private static Map<String, Object> getResult(CloseableHttpClient client, HttpGet httpGet, Map<String, String> params) {
Map<String, Object> resultMap = new HashMap<String, Object>();
CloseableHttpResponse response = null;
try {
response = client.execute(httpGet);
int code = response.getStatusLine().getStatusCode();
if (code == 200) {
HttpEntity entity = response.getEntity();
JSONObject jsonObject = JSONObject.fromObject(EntityUtils.toString(entity, “utf-8”));
//200:查询成功 201:快递公司参数异常:单号不存在或者已经过 400:参数错误
String status = jsonObject.getString(“status”);
String nu = jsonObject.getString(“nu”);
String trackingNumber = params.get(“trackingNumber”);
if (“200”.equals(status) && trackingNumber.equals(nu)) {
JSONArray logisticsInfo = JSONArray.fromObject(jsonObject.get(“data”));
List<Map<String, Object>> dataList = new ArrayList<Map<String, Object>>();
for (int i = 0; i < logisticsInfo.size(); i++) {
Map<String, Object> dataMap = new LinkedHashMap<String, Object>();
JSONObject temp = (JSONObject) logisticsInfo.get(i);
dataMap.put(“time”, (String) temp.get(“time”));
dataMap.put(“context”, (String) temp.get(“context”));
dataList.add(dataMap);
}
resultMap.put(“code”, “success”);
resultMap.put(“companyName”, Kuaidi100Util.companyName);
resultMap.put(“data”, dataList);
return resultMap;
} else {
resultMap.put(“code”, “error”);
String message = jsonObject.getString(“message”);
if (ERRORCODE2.equals(message)) {
//非法访问:IP禁止访问
resultMap.put(“message”, “查询物流信息失败,请稍后再查”);
} else if (ERRORCODE1.equals(message)) {
resultMap.put(“message”, “验证码输入有误”);
} else {
resultMap.put(“message”, “请求获取物流信息失败,请稍后再查”);
}
}
} else {
resultMap.put(“code”, “error”);
resultMap.put(“message”, “请求获取物流信息失败”);
}
} catch (ClientProtocolException e) {
e.printStackTrace();
} catch (IOException e) {
e.printStackTrace();
}
return resultMap;
}
//生成一个16位的小数
private static String getTemp() {
int random = (int) (Math.random()*9+1);
String valueOf = String.valueOf(random);
int hashCode = UUID.randomUUID().toString().hashCode();
if(hashCode<0){
hashCode = -hashCode;
}
String value = “0.” + valueOf + String.format(“%015d”, hashCode);
return value;
}
//截取csrftoken
private static String getScrtokenInfo(String str) {
String result = “”;
if (str != null) {
String[] strArr = str.split(“;”);
for (int i = 0; i < strArr.length; i++) {
String s = strArr[i];
if (s.indexOf(“csrftoken”) != -1) {
result = s.substring(s.indexOf(“=”) + 1);
}
}
}
return result;
}
/**
* 设置代理
* @param httpGet
*/
private static void setProxy(HttpGet httpGet){
proxyList.clear();
for (String proxyUrl : proxyUrls) {
getProxyInfo(proxyUrl);
}
String proxyIp = “”;
int proxyPort = 9999;
int size = proxyList.size();
if (size > 0) {
//连接三次代理,连接不成功则返回失败
for (int i = 0; i < 3; i++) {
int index = (int)(Math.random()*size);
Map<String, String> map = proxyList.get(index);
proxyIp = map.get(“ip”);
try {
proxyPort = Integer.parseInt(map.get(“port”));
} catch (NumberFormatException e) {
e.printStackTrace();
}
boolean flag = isHostConnectable(proxyIp, proxyPort);
if (flag) {
break;
}
}
}
if (!DotNetToJavaStringHelper.isNullOrEmpty(proxyIp)) {
HttpHost proxy = new HttpHost(proxyIp, proxyPort, “http”);
httpGet.setHeader(“x-forwarded-for”, proxyIp);
//RequestConfig requestConfig = RequestConfig.custom().setProxy(proxy).setConnectTimeout(6000).setSocketTimeout(6000).setConnectionRequestTimeout(6000).build();
RequestConfig requestConfig = RequestConfig.custom().setProxy(proxy).build();
httpGet.setConfig(requestConfig);
}
}
//获取代理IP
private static void getProxyInfo(String proxyUrl) {
StringBuffer str = new StringBuffer();
Map<String, String> map = new HashMap<String, String>();
try {
URL url = new URL(proxyUrl);
URLConnection urlConnection = url.openConnection();
urlConnection.setRequestProperty(“User-Agent”,”Mozilla/4.0 (compatible; MSIE 7.0; NT 5.1; GTB5; .NET CLR 2.0.50727; CIBA)”);
HttpURLConnection connection = null;
if(urlConnection instanceof HttpURLConnection) {
connection = (HttpURLConnection) urlConnection;
} else {
return;
}
BufferedReader in = new BufferedReader(
new InputStreamReader(connection.getInputStream(),”utf-8″));
String current;
while((current = in.readLine()) != null)
{
if (current.indexOf(“</td>”) != -1) {
str.append(current);
}
}
String[] arrAtr = str.toString().split(“</td>”);
int len = arrAtr.length;
for (int i = 0; i < len; i++) {
if (i%5 == 0) {
//判断IP格式和范围
String regex = “([1-9]|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])(\\.(\\d|[1-9]\\d|1\\d{2}|2[0-4]\\d|25[0-5])){3}”;
String portRegex = “^[0-9]*$”;
String ip = arrAtr[i].trim();
String port = arrAtr[i + 1].trim();
if (ip.matches(regex) || ip.length() > 7 || ip.length() < 15 || “”.equals(ip) || port.matches(portRegex)) {
map.put(“ip”, ip);
map.put(“port”, port);
proxyList.add(map);
}
}
}
}catch(IOException e)
{
e.printStackTrace();
}
}
//查看IP端口是否连接正常
private static boolean isHostConnectable(String host, int port) {
Socket socket = new Socket();
try {
socket.connect(new InetSocketAddress(host, port));
} catch (IOException e) {
return false;
} finally {
try {
socket.close();
} catch (IOException e) {
e.printStackTrace();
}
}
return true;
}
}