您好,登錄後才能下訂單哦!
本篇文章為大家展示了如何使用 Java 爬取代理 IP,內容簡明扼要並且容易理解,絕對能使你眼前一亮,希望通過這篇文章的詳細介紹,你能有所收穫。
<!-- fastjson: used by the crawler below (JSONObject.toJSON) to serialize the ProxyResponse result to a JSON string. -->
<!-- NOTE(review): 1.2.28 is an old release; check published fastjson security advisories before using this version — confirm. -->
<dependency> <groupId>com.alibaba</groupId> <artifactId>fastjson</artifactId> <version>1.2.28</version> </dependency>
<!-- jsoup: used by the crawler below (Jsoup.connect) to fetch the proxy-list pages and to probe candidates through a proxy. -->
<dependency> <groupId>org.jsoup</groupId> <artifactId>jsoup</artifactId> <version>1.10.2</version> </dependency>
完整的代碼如下:
package com.tuniu.fcm.facade.IPProxy; import com.alibaba.fastjson.JSONObject; import org.jsoup.Jsoup; import org.jsoup.nodes.Document; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; /** * 獲取代理IP,需要 * com.alibaba.fastjson.JSONObject以及Jsoup */ public class ProxyCralwerUnusedVPN { ThreadLocal<Integer> localWantedNumber = new ThreadLocal<Integer>(); ThreadLocal<List<ProxyInfo>> localProxyInfos = new ThreadLocal<List<ProxyInfo>>(); public static void main(String[] args) { ProxyCralwerUnusedVPN proxyCrawler = new ProxyCralwerUnusedVPN(); /** * 想要獲取的代理IP個數,由需求方自行指定。(如果個數太多,將導致返回變慢) */ proxyCrawler.startCrawler(1); } /** * 暴露給外部模塊調用的入口 * @param wantedNumber 調用方期望獲取到的代理IP個數 */ public String startCrawler(int wantedNumber) { localWantedNumber.set(wantedNumber); kuaidailiCom("http://www.xicidaili.com/nn/", 15); kuaidailiCom("http://www.xicidaili.com/nt/", 15); kuaidailiCom("http://www.xicidaili.com/wt/", 15); kuaidailiCom("http://www.kuaidaili.com/free/inha/", 15); kuaidailiCom("http://www.kuaidaili.com/free/intr/", 15); kuaidailiCom("http://www.kuaidaili.com/free/outtr/", 15); /** * 構造返回數據 */ ProxyResponse response = new ProxyResponse(); response.setSuccess("true"); Map<String, Object> dataInfoMap = new HashMap<String, Object>(); dataInfoMap.put("numFound", localProxyInfos.get().size()); dataInfoMap.put("pageNum", 1); dataInfoMap.put("proxy", localProxyInfos.get()); response.setData(dataInfoMap); String responseString = JSONObject.toJSON(response).toString(); System.out.println(responseString); return responseString; } private void kuaidailiCom(String baseUrl, int totalPage) { String ipReg = "\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3} \\d{1,6}"; Pattern ipPtn = Pattern.compile(ipReg); for (int i = 1; i < totalPage; i++) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { return; } try { Document doc = Jsoup.connect(baseUrl + i + "/") .header("Accept", 
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8") .header("Accept-Encoding", "gzip, deflate, sdch") .header("Accept-Language", "zh-CN,zh;q=0.8,en;q=0.6") .header("Cache-Control", "max-age=0") .header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36") .header("Cookie", "Hm_lvt_7ed65b1cc4b810e9fd37959c9bb51b31=1462812244; _gat=1; _ga=GA1.2.1061361785.1462812244") .header("Host", "www.kuaidaili.com") .header("Referer", "http://www.kuaidaili.com/free/outha/") .timeout(30 * 1000) .get(); Matcher m = ipPtn.matcher(doc.text()); while (m.find()) { if (getCurrentProxyNumber() >= localWantedNumber.get()) { break; } String[] strs = m.group().split(" "); if (checkProxy(strs[0], Integer.parseInt(strs[1]))) { System.out.println("獲取到可用代理IP\t" + strs[0] + "\t" + strs[1]); addProxy(strs[0], strs[1], "http"); } } } catch (Exception e) { e.printStackTrace(); } } } private static boolean checkProxy(String ip, Integer port) { try { //http://1212.ip138.com/ic.asp 可以換成任何比較快的網頁 Jsoup.connect("http://1212.ip138.com/ic.asp") .timeout(2 * 1000) .proxy(ip, port) .get(); return true; } catch (Exception e) { return false; } } private int getCurrentProxyNumber() { List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); localProxyInfos.set(proxyInfos); return 0; } else { return proxyInfos.size(); } } private void addProxy(String ip, String port, String protocol){ List<ProxyInfo> proxyInfos = localProxyInfos.get(); if (proxyInfos == null) { proxyInfos = new ArrayList<ProxyInfo>(); proxyInfos.add(new ProxyInfo(ip, port, protocol)); } else { proxyInfos.add(new ProxyInfo(ip, port, protocol)); } } } class ProxyInfo { private String userName = ""; private String ip; private String password = ""; private String type; private String port; private int is_internet = 1; public ProxyInfo(String ip, String port, String type) { 
this.ip = ip; this.type = type; this.port = port; } public String getUserName() { return userName; } public void setUserName(String userName) { this.userName = userName; } public String getIp() { return ip; } public void setIp(String ip) { this.ip = ip; } public String getPassword() { return password; } public void setPassword(String password) { this.password = password; } public String getType() { return type; } public void setType(String type) { this.type = type; } public String getPort() { return port; } public void setPort(String port) { this.port = port; } public int getIs_internet() { return is_internet; } public void setIs_internet(int is_internet) { this.is_internet = is_internet; } } class ProxyResponse { private String success; private Map<String, Object> data; public String getSuccess() { return success; } public void setSuccess(String success) { this.success = success; } public Map<String, Object> getData() { return data; } public void setData(Map<String, Object> data) { this.data = data; } }
上述內容就是如何使用 Java 爬取代理 IP,你們學到知識或技能了嗎?如果還想學到更多技能或者豐富自己的知識儲備,歡迎關注億速雲行業資訊頻道。
免責聲明:本站發佈的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯繫站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。