// NOTE(review): stray scraped-page text ("您好,登錄后才能下訂單哦!") commented out so the file compiles.
package work;
import org.jsoup.Jsoup;
import org.jsoup.safety.Whitelist;
import org.springframework.dao.DuplicateKeyException;
import org.springframework.jdbc.core.JdbcTemplate;
import cn.edu.hfut.dmic.contentextractor.ContentExtractor;
import cn.edu.hfut.dmic.contentextractor.News;
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.plugin.net.OkHttpRequester;
import db.JDBCHelper;
import okhttp3.Request;
import util.HtmlTools;
/**
* Crawling news from hfut news
* use 2.72 lib
* @author hu
*/
public class ChujiingNewstpl extends BreadthCrawler {
// Seed URL the crawl starts from.
public String seedUrl="http://news.cnhubei.com/";
// Regex matching content pages whose article body should be extracted.
public String contentRegUrl="http://news.cnhubei.com/.*/p/.*?.html\\.*";
// Number of crawler threads.
public int threads_num=10;
// Upper bound of pages fetched per iteration.
public int topn_num=10;
// Crawl depth (number of breadth-first levels passed to start()).
public static int levelnum=10;
// Whether a stopped crawl can resume from its previous persisted state.
public static boolean resumable=true;
public int executeTime=20000; //ms
// Maximum retry attempts for a failed fetch.
public static int MaxExecuteCount=2;
public int connectTimeout=50;
public int readTimeout=60;
// Destination table for extracted articles.
private String contentTable="news_content";
// Shared JDBC handle; initialized in main() before the crawl starts.
private static JdbcTemplate dbHandler;
/**
 * Visits one fetched page; if it matches the content-page regex, extracts
 * the article title/body, sanitizes both with jsoup, and inserts the row.
 */
@Override
public void visit(Page page, CrawlDatums next) {
if (!page.matchUrl(contentRegUrl)) {
return;
}
try {
// Extract title and content with the generic content extractor,
// then sanitize through jsoup before persisting.
News n = ContentExtractor.getNewsByHtml(page.html());
String title = Jsoup.clean(n.getTitle(), Whitelist.none()).trim();
String content = HtmlTools.stripNewLine(
Jsoup.clean(n.getContent(), HtmlTools.getWhitelist()));
System.out.println(" get content :"+title );
if(!title.isEmpty() && !content.isEmpty()) {
// FIX: standard SQL keyword is VALUES (MySQL tolerates VALUE,
// most other databases reject it). Parameterized — no injection risk.
ChujiingNewstpl.dbHandler.update("insert into "+contentTable+"(title,content) values(?,?)",title,content);
}
} catch(DuplicateKeyException e) {
// A unique constraint fired: this article is already stored.
System.out.println(" duplicate item ");
} catch (Exception e) {
// FIX: keep the full stack trace instead of only getMessage()
// (which may be null), so extraction/DB failures are diagnosable.
e.printStackTrace();
}
}
/**
 * Custom request plugin that sends a Baiduspider User-Agent.
 * Declared static — it never touches the enclosing crawler instance,
 * so there is no reason to hold a hidden outer-class reference.
 */
public static class MyRequester extends OkHttpRequester {
String userAgent = "Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html)";
// Called before every request to build it; customize headers here
// via OkHttp's Request.Builder.
@Override
public Request.Builder createRequestBuilder(CrawlDatum crawlDatum) {
return super.createRequestBuilder(crawlDatum).header("User-Agent", userAgent);
}
}
/**
 * @param crawlPath directory holding the crawler's persistent state
 * @param autoParse whether links are auto-extracted via the regex rules
 */
public ChujiingNewstpl(String crawlPath, boolean autoParse) {
super(crawlPath, autoParse);
// Optional request plugin:
//setRequester(new MyRequester());
/*start page*/
this.addSeed(seedUrl);
this.addRegex(contentRegUrl);
// Skip static resources (images, CSS, JS, fonts).
this.addRegex("-.*\\.(jpg|png|gif|css|js|font).*");
setThreads(threads_num);
Configuration cnf=getConf();
cnf.setTopN(topn_num);
// cnf.setExecuteInterval(executeTime);
// cnf.setConnectTimeout(connectTimeout);
// cnf.setReadTimeout(readTimeout);
}
public static void main(String[] args) throws Exception {
dbHandler=JDBCHelper.db();
ChujiingNewstpl crawler = new ChujiingNewstpl("spiderdata"+java.io.File.separator+ChujiingNewstpl.class.getName(), true);
crawler.setResumable(resumable);
// FIX: configure the retry limit BEFORE start() — start() blocks until
// the crawl finishes, so the original post-start call never took effect.
crawler.setMaxExecuteCount(MaxExecuteCount);
crawler.start(levelnum);
}
}
// NOTE(review): trailing scraped-page text (source download link and site disclaimer) commented out so the file compiles.
// 源碼地址 https://down.51cto.com/data/2461609