[ 登录 ] - [ 注册 ] | 开发者(推广)分成 无忧代理IP最新上线 | 代码示例 | IP测试视频 |

Java配合爬虫代理IP采集大众点评店铺信息

作者:数据无忧   时间:2018-10-13 20:18:14
大众点评店铺网址格式如下:
http://www.dianping.com/shop/6000000/
http://www.dianping.com/shop/6000001/

shop后面的ID是连续的,范围是1-1500万。其中有许多店铺并不存在(返回404错误),实际的店铺数量在700万左右。这里用的是穷举法(逐个ID遍历),当然也可以从网页入口按链接深度逐层抓取。

程序采集过程中会发现,大众点评采取了严格的反爬虫措施:如果同一个IP以每秒一个页面的速度进行采集,大约采集500-1000个页面后就会出现403错误,说明该IP已被冻结,需要一段时间后才会解封;如果在冻结期间仍不死心、继续大量采集,该IP就会被永久冻结。

其实这个问题很好解决:使用爬虫代理IP,403问题就迎刃而解了。爬虫代理IP网址:http://www.data5u.com/buy/dynamic.html

代码如下:


import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.ProxyConfig;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.WebResponse;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
import com.gargoylesoftware.htmlunit.util.NameValuePair;

/**
 * 这个DEMO主要为了测试爬虫(动态)代理IP的稳定性
 * 完美支持企业信息天眼查、电商Ebay、亚马逊、新浪微博、法院文书、分类信息等
 * 也可以作为爬虫参考项目,如需使用,请自行修改webParseHtml方法
 */
public class TestDynamicIpContinue {
	
	public static List ipList = new ArrayList<>();
	public static boolean gameOver = false;
	
	public static void main(String[] args) throws Exception {
		// 每隔几秒提取一次IP
		long fetchIpSeconds = 5;
		int testTime = 3;
		
		// 请填写无忧代理IP订单号,填写之后才可以提取到IP哦
		String order = "88888888888888888888888888888";
		
		// 你要抓去的目标网址
		String targetUrl = "http://www.dianping.com/shop/6000000/";
		
		// 设置referer信息,如果抓取淘宝、天猫需要设置
		String referer = "";
		// 开启对https的支持
		boolean https = true;
		// 是否输出Header信息
		boolean outputHeaderInfo = false;
		// 是否加载JS,加载JS会导致速度变慢
		boolean useJS = false;
		// 请求超时时间,单位毫秒,默认5秒
		int timeOut = 10000;
		
		if (order == null || "".equals(order)) {
			System.err.println("请输入爬虫(动态)代理订单号");
			return;
		}
		System.out.println(">>>>>>>>>>>>>>动态IP测试开始<<<<<<<<<<<<<<");
		System.out.println("***************");
		System.out.println("提取IP间隔 " + fetchIpSeconds + " 秒 ");
		System.out.println("爬虫目标网址  " + targetUrl);
		System.out.println("***************\n");
		TestDynamicIpContinue tester = new TestDynamicIpContinue();
		new Thread(tester.new GetIP(fetchIpSeconds * 1000, testTime, order, targetUrl, useJS, timeOut, referer, https, outputHeaderInfo)).start();
	
		while(!gameOver){
			try {
				Thread.sleep(100);
			} catch (InterruptedException e) {
				e.printStackTrace();
			}
		}
		System.out.println(">>>>>>>>>>>>>>动态IP测试结束<<<<<<<<<<<<<<");
		System.exit(0);
	}
    
	// 抓取IP138,检测IP
	public class Crawler extends Thread{
		@Override
		public void run() {
			webParseHtml(targetUrl);
		}
		
		long sleepMs = 200;
		boolean useJs = false;
		String targetUrl = "";
		int timeOut = 5000;
		String ipport = "";
		
		String referer;
		boolean https;
		boolean outputHeaderInfo;
		
		public Crawler(long sleepMs, String targetUrl, boolean useJs, int timeOut, String ipport, String referer, boolean https, boolean outputHeader) {
			this.sleepMs = sleepMs;
			this.targetUrl = targetUrl;
			this.useJs = useJs;
			this.timeOut = timeOut;
			this.ipport = ipport;
			
			this.referer = referer;
			this.https = https;
			this.outputHeaderInfo = outputHeader;
		}
		public String webParseHtml(String url) {
			String html = "";
			BrowserVersion[] versions = { BrowserVersion.CHROME, BrowserVersion.FIREFOX_38, BrowserVersion.INTERNET_EXPLORER_11, BrowserVersion.INTERNET_EXPLORER_8};
			WebClient client = new WebClient(versions[(int)(versions.length * Math.random())]);
			try {
				client.getOptions().setThrowExceptionOnFailingStatusCode(false);
				client.getOptions().setJavaScriptEnabled(useJs);
				client.getOptions().setCssEnabled(false);
				client.getOptions().setThrowExceptionOnScriptError(false);
				client.getOptions().setTimeout(timeOut);
				client.getOptions().setAppletEnabled(true);
				client.getOptions().setGeolocationEnabled(true);
				client.getOptions().setRedirectEnabled(true);
				
				// 对于HTTPS网站,加上这行代码可以跳过SSL验证
				client.getOptions().setUseInsecureSSL(https);
				
				if (referer != null && !"".equals(referer)) {
					client.addRequestHeader("Referer", referer);
				}
				
				if (ipport != null) {
					ProxyConfig proxyConfig = new ProxyConfig((ipport.split(",")[0]).split(":")[0], Integer.parseInt((ipport.split(",")[0]).split(":")[1]));
					client.getOptions().setProxyConfig(proxyConfig);
				}else {
					System.out.print(".");
					return "";
				}
			
				long startMs = System.currentTimeMillis();
				
				Page page = client.getPage(url);
				WebResponse response = page.getWebResponse();
				
				if (outputHeaderInfo) {
					// 输出header信息
					List headers = response.getResponseHeaders();
					for (NameValuePair nameValuePair : headers) {
						System.out.println(nameValuePair.getName() + "-->" + nameValuePair.getValue());
					}
				}
				
				boolean isJson = false ;
				if (response.getContentType().equals("application/json")) {
					html = response.getContentAsString();
					isJson = true ;
				}else if(page.isHtmlPage()){
					html = ((HtmlPage)page).asXml();
				}
				
				long endMs = System.currentTimeMillis();
				
				Document doc = Jsoup.parse(html);System.out.println(getName() + " " + ipport + " 用时 " + (endMs - startMs) + "毫秒 :" + doc.select("title").text());				
} catch (Exception e) { System.err.println(ipport + ":" + e.getMessage()); } finally { client.close(); } return html; } } // 定时获取动态IP public class GetIP implements Runnable{ long sleepMs = 1000; int maxTime = 3; String order = ""; String targetUrl; boolean useJs; int timeOut; String referer; boolean https; boolean outputHeaderInfo; public GetIP(long sleepMs, int maxTime, String order, String targetUrl, boolean useJs, int timeOut, String referer, boolean https, boolean outputHeaderInfo) { this.sleepMs = sleepMs; this.maxTime = maxTime; this.order = order; this.targetUrl = targetUrl; this.useJs = useJs; this.timeOut = timeOut; this.referer=referer; this.https=https; this.outputHeaderInfo=outputHeaderInfo; } @Override public void run() { int time = 1; while(!gameOver){ if(time >= 4){ gameOver = true; break; } try { java.net.URL url = new java.net.URL("http://api.ip.data5u.com/dynamic/get.html?order=" + order + "&ttl&random=true"); HttpURLConnection connection = (HttpURLConnection)url.openConnection(); connection.setConnectTimeout(3000); connection = (HttpURLConnection)url.openConnection(); InputStream raw = connection.getInputStream(); InputStream in = new BufferedInputStream(raw); byte[] data = new byte[in.available()]; int bytesRead = 0; int offset = 0; while(offset < data.length) { bytesRead = in.read(data, offset, data.length - offset); if(bytesRead == -1) { break; } offset += bytesRead; } in.close(); raw.close(); String[] res = new String(data, "UTF-8").split("\n"); System.out.println(">>>>>>>>>>>>>>当前返回IP量 " + res.length); for (String ip : res) { new Crawler(100, targetUrl, useJs, timeOut, ip, referer, https, outputHeaderInfo).start(); } } catch (Exception e) { System.err.println(">>>>>>>>>>>>>>获取IP出错, " + e.getMessage()); } try { Thread.sleep(sleepMs); } catch (InterruptedException e) { e.printStackTrace(); } } } } public String joinList(List list){ StringBuilder re = new StringBuilder(); for (String string : list) { re.append(string).append(","); } return 
re.toString(); } public String trim(String html) { if (html != null) { return html.replaceAll(" ", "").replaceAll("\n", ""); } return null; } }


无忧代理IP(www.data5u.com)原创文章,转载请注明出处。

抽取IP代金券
返回顶部
在线咨询
人工客服
 QQ交流群 - 001
QQ:340529947(满)
 QQ交流群 - 002
QQ:828393818
 售前咨询 - 001
QQ:1598729786
 技术支持 - 001
QQ:2562178315
为了避免 QQ 丢消息,请尽量添加客服为 QQ 好友。
客服工作时间08:30──17:30
关注微信
在线咨询
人工客服
 QQ交流群 - 001
QQ:340529947(满)
 QQ交流群 - 002
QQ:828393818
 售前咨询 - 001
QQ:1598729786
 技术支持 - 001
QQ:2562178315
为了避免 QQ 丢消息,请尽量添加客服为 QQ 好友。
客服工作时间08:30──17:30
电话:4007-745-096
QQ:
周一至周日8:30-18:00 技术部电话热线