I had been following this framework for a long time, but only recently found the time to look at it properly. I'm using Scrapy 0.24 here.
Let's start with a finished result to get a feel for how convenient the framework is; over the next while I'll sort out my notes and post what I've learned about it to this blog.
I've also been meaning to learn git lately, so the code is up on git-osc:
https://git.oschina.net/1992mrwang/doubangroupspider
First, a word about what this toy spider is meant to do:
it crawls the groups found on the seed URL pages and extracts the related group links, the member count, the group name, and so on.
The resulting data looks roughly like this:
{'RelativeGroups': [u'http://www.douban.com/group/10127/',
                    u'http://www.douban.com/group/seventy/',
                    u'http://www.douban.com/group/lovemuseum/',
                    u'http://www.douban.com/group/486087/',
                    u'http://www.douban.com/group/lovesh/',
                    u'http://www.douban.com/group/NoAstrology/',
                    u'http://www.douban.com/group/shanghaijianzhi/',
                    u'http://www.douban.com/group/12658/',
                    u'http://www.douban.com/group/shanghaizufang/',
                    u'http://www.douban.com/group/gogo/',
                    u'http://www.douban.com/group/117546/',
                    u'http://www.douban.com/group/159755/'],
 'groupName': u'\u4e0a\u6d77\u8c46\u74e3',
 'groupURL': 'http://www.douban.com/group/Shanghai/',
 'totalNumber': u'209957'}
What is it good for? With this data you can, for example, analyse how closely groups are related to each other, and with a bit more effort you could capture far more information. I won't go into that here, since this post is mainly about getting a quick hands-on feel for the framework.
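As a purely hypothetical illustration (not part of the project), here is one way you could measure how related two groups are once a few of them are sitting in MongoDB; the connection parameters and field names match the settings and item shown later in this post:

# Hypothetical sketch: estimate group-to-group association by the overlap
# of their RelativeGroups lists. Assumes data was stored with the pipeline
# and MONGODB_* settings shown later in this post (Python 2 / old pymongo API).
import pymongo

connection = pymongo.Connection('localhost', 27017)
col = connection['douban']['doubanGroup']

def relative_set(group_url):
    doc = col.find_one({'groupURL': group_url})
    return set(doc['RelativeGroups']) if doc else set()

a = relative_set('http://www.douban.com/group/Shanghai/')
b = relative_set('http://www.douban.com/group/gogo/')
print 'shared related groups: %d' % len(a & b)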
First, start a new project named douban:
# scrapy startproject douban
# cd douban
This is the complete directory layout of the project. (PS: when I pushed it to git-osc I renamed the top-level project directory to look tidier; it makes no difference once cloned.)

mrwang@mrwang-ubuntu:~/student/py/douban$ tree
.
├── douban
│   ├── __init__.py
│   ├── items.py              # item definitions
│   ├── pipelines.py          # item pipeline
│   ├── settings.py           # settings
│   └── spiders
│       ├── BasicGroupSpider.py   # the spider that does the actual crawling
│       └── __init__.py
├── nohup.out                 # log file produced by running in the background with nohup
├── scrapy.cfg
├── start.sh                  # a very simple start script, written for convenience
├── stop.sh                   # a very simple stop script, written for convenience
└── test.log                  # the crawl log; it is named in the start script
Write the item definitions in items.py, mainly so that the scraped data can be persisted conveniently:
mrwang@mrwang-ubuntu:~/student/py/douban$ cat douban/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

from scrapy.item import Item, Field


class DoubanItem(Item):
    # define the fields for your item here like:
    # name = Field()
    groupName = Field()
    groupURL = Field()
    totalNumber = Field()
    RelativeGroups = Field()
    ActiveUesrs = Field()
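A quick sanity check of my own (not in the repo): an Item behaves like a dict, which is exactly what lets the pipeline call dict(item) before inserting it into MongoDB. Run it from inside the project (for example in scrapy shell) so that the douban package is importable:

# Illustration only: DoubanItem supports dict-style access,
# so the pipeline can later convert it with dict(item).
from douban.items import DoubanItem

item = DoubanItem()
item['groupName'] = u'\u4e0a\u6d77\u8c46\u74e3'
item['groupURL'] = 'http://www.douban.com/group/Shanghai/'
item['totalNumber'] = u'209957'
item['RelativeGroups'] = ['http://www.douban.com/group/gogo/']
print dict(item)   # a plain dict, ready to be stored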
Write the spider, with some custom rules for processing the data:
mrwang@mrwang-ubuntu:~/student/py/douban$ cat douban/spiders/BasicGroupSpider.py
# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.selector import HtmlXPathSelector
from scrapy.item import Item
from douban.items import DoubanItem
import re


class GroupSpider(CrawlSpider):
    # spider name
    name = "Group"
    allowed_domains = ["douban.com"]
    # seed URLs
    start_urls = [
        "http://www.douban.com/group/explore?tag=%E8%B4%AD%E7%89%A9",
        "http://www.douban.com/group/explore?tag=%E7%94%9F%E6%B4%BB",
        "http://www.douban.com/group/explore?tag=%E7%A4%BE%E4%BC%9A",
        "http://www.douban.com/group/explore?tag=%E8%89%BA%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E5%AD%A6%E6%9C%AF",
        "http://www.douban.com/group/explore?tag=%E6%83%85%E6%84%9F",
        "http://www.douban.com/group/explore?tag=%E9%97%B2%E8%81%8A",
        "http://www.douban.com/group/explore?tag=%E5%85%B4%E8%B6%A3"
    ]

    # when a link matches a rule, it is handled by the function named in callback
    rules = [
        Rule(SgmlLinkExtractor(allow=('/group/[^/]+/$', )),
             callback='parse_group_home_page', process_request='add_cookie'),
        Rule(SgmlLinkExtractor(allow=('/group/explore\?tag', )),
             follow=True, process_request='add_cookie'),
    ]

    def __get_id_from_group_url(self, url):
        m = re.search("^http://www.douban.com/group/([^/]+)/$", url)
        if m:
            return m.group(1)
        else:
            return 0

    def add_cookie(self, request):
        # replace() returns a new request, so re-assign it before returning
        request = request.replace(cookies=[])
        return request

    def parse_group_topic_list(self, response):
        self.log("Fetch group topic list page: %s" % response.url)
        pass

    def parse_group_home_page(self, response):
        self.log("Fetch group home page: %s" % response.url)

        # XPath-based selector
        hxs = HtmlXPathSelector(response)
        item = DoubanItem()

        # get group name
        item['groupName'] = hxs.select('//h2/text()').re("^\s+(.*)\s+$")[0]

        # get group id
        item['groupURL'] = response.url
        groupid = self.__get_id_from_group_url(response.url)

        # get group members number
        members_url = "http://www.douban.com/group/%s/members" % groupid
        members_text = hxs.select('//a[contains(@href, "%s")]/text()' % members_url).re("\((\d+)\)")
        item['totalNumber'] = members_text[0]

        # get relative groups
        item['RelativeGroups'] = []
        groups = hxs.select('//div[contains(@class, "group-list-item")]')
        for group in groups:
            url = group.select('div[contains(@class, "title")]/a/@href').extract()[0]
            item['RelativeGroups'].append(url)

        return item
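If you just want to check the group-id extraction without starting a crawl, a standalone snippet like the following (my own sketch, mirroring the spider's __get_id_from_group_url) is enough:

# Standalone sketch of the same regex the spider uses to pull the group id
# out of a group home-page URL; handy for testing without running Scrapy.
import re

def get_id_from_group_url(url):
    m = re.search(r"^http://www.douban.com/group/([^/]+)/$", url)
    return m.group(1) if m else 0

print get_id_from_group_url('http://www.douban.com/group/Shanghai/')   # -> Shanghai
print get_id_from_group_url('http://www.douban.com/group/explore')     # -> 0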
Write the item pipeline; at this stage I store the data collected by the spider in MongoDB:
mrwang@mrwang-ubuntu:~/student/py/douban$ cat douban/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import pymongo

from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem


class DoubanPipeline(object):
    def __init__(self):
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        connection = pymongo.Connection(self.server, self.port)
        db = connection[self.db]
        self.collection = db[self.col]

    def process_item(self, item, spider):
        self.collection.insert(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        return item
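Once a crawl has run, it is easy to peek at what actually landed in MongoDB. A minimal sketch, assuming the MONGODB_* values from the settings shown next:

# Minimal inspection sketch, assuming MONGODB_SERVER/PORT/DB/COLLECTION
# are left at the values shown in settings.py below.
import pymongo

connection = pymongo.Connection('localhost', 27017)
col = connection['douban']['doubanGroup']

print 'groups stored: %d' % col.count()
for doc in col.find().limit(5):
    print doc['groupURL'], doc['totalNumber']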
In the settings file, configure the item pipeline to use, the MongoDB connection parameters, and a user agent so the crawler is less likely to get banned:
mrwang@mrwang-ubuntu:~/student/py/douban$ cat douban/settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only the most important settings by
# default. All the other settings are documented here:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#

BOT_NAME = 'douban'

SPIDER_MODULES = ['douban.spiders']
NEWSPIDER_MODULE = 'douban.spiders'

# add a delay between requests to ease the load on the server and keep a low profile
DOWNLOAD_DELAY = 2
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.54 Safari/536.5'
COOKIES_ENABLED = True

# item pipeline to use
ITEM_PIPELINES = ['douban.pipelines.DoubanPipeline']

MONGODB_SERVER = 'localhost'
MONGODB_PORT = 27017
MONGODB_DB = 'douban'
MONGODB_COLLECTION = 'doubanGroup'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'douban (+http://www.yourdomain.com)'
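Because the pipeline opens its MongoDB connection in __init__, the whole run fails immediately if the database is unreachable, so it is worth a quick check before the first crawl. A small sketch of my own:

# Quick connectivity check (illustration only) using the same parameters
# as the pipeline; if this fails, fix MongoDB before starting the crawl.
import pymongo

try:
    connection = pymongo.Connection('localhost', 27017)
    print 'MongoDB OK, databases:', connection.database_names()
except pymongo.errors.ConnectionFailure, e:
    print 'MongoDB not reachable:', e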
OK, that's the whole toy spider.
Start it with:
nohup scrapy crawl Group --logfile=test.log &
=========================== 2014/12/02 Update ===================================
On GitHub I found that someone had already had the same idea and written a replacement scheduler that keeps the pages still to be visited in MongoDB, so I modelled my own version on it:
mrwang@mrwang-ThinkPad-Edge-E431:~/student/py/douban$ cat douban/scheduler.py
from scrapy.utils.reqser import request_to_dict, request_from_dict

import pymongo
import datetime


class Scheduler(object):
    def __init__(self, mongodb_server, mongodb_port, mongodb_db, persist, queue_key, queue_order):
        self.mongodb_server = mongodb_server
        self.mongodb_port = mongodb_port
        self.mongodb_db = mongodb_db
        self.queue_key = queue_key
        self.persist = persist
        self.queue_order = queue_order

    def __len__(self):
        # number of requests currently waiting in the queue
        return self.collection.count()

    @classmethod
    def from_crawler(cls, crawler):
        settings = crawler.settings
        mongodb_server = settings.get('MONGODB_QUEUE_SERVER', 'localhost')
        mongodb_port = settings.get('MONGODB_QUEUE_PORT', 27017)
        mongodb_db = settings.get('MONGODB_QUEUE_DB', 'scrapy')
        persist = settings.get('MONGODB_QUEUE_PERSIST', True)
        queue_key = settings.get('MONGODB_QUEUE_NAME', None)
        queue_type = settings.get('MONGODB_QUEUE_TYPE', 'FIFO')

        if queue_type not in ('FIFO', 'LIFO'):
            raise ValueError('MONGODB_QUEUE_TYPE must be FIFO (default) or LIFO')

        if queue_type == 'LIFO':
            queue_order = -1
        else:
            queue_order = 1

        return cls(mongodb_server, mongodb_port, mongodb_db, persist, queue_key, queue_order)

    def open(self, spider):
        self.spider = spider
        if self.queue_key is None:
            self.queue_key = "%s_queue" % spider.name

        connection = pymongo.Connection(self.mongodb_server, self.mongodb_port)
        self.db = connection[self.mongodb_db]
        self.collection = self.db[self.queue_key]

        # notice if there are requests already in the queue
        size = self.collection.count()
        if size > 0:
            spider.log("Resuming crawl (%d requests scheduled)" % size)

    def close(self, reason):
        if not self.persist:
            self.collection.drop()

    def enqueue_request(self, request):
        data = request_to_dict(request, self.spider)
        self.collection.insert({
            'data': data,
            'created': datetime.datetime.utcnow()
        })

    def next_request(self):
        entry = self.collection.find_and_modify(sort={"$natural": self.queue_order}, remove=True)

        if entry:
            request = request_from_dict(entry['data'], self.spider)
            return request

        return None

    def has_pending_requests(self):
        return self.collection.count() > 0
All of these have sensible defaults; if you want to customise them you can also set them in douban/settings.py (a sketch of those lines follows the list below).
The configurable options are:
Parameter                Default     Meaning
MONGODB_QUEUE_SERVER     localhost   MongoDB server
MONGODB_QUEUE_PORT       27017       port
MONGODB_QUEUE_DB         scrapy      database name
MONGODB_QUEUE_PERSIST    True        whether to keep the task queue in MongoDB after the crawl closes (if False, the collection is dropped)
MONGODB_QUEUE_NAME       None        queue collection name; if None it defaults to "<spider name>_queue"
MONGODB_QUEUE_TYPE       FIFO        FIFO (first in, first out) or LIFO (last in, first out)
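To actually route requests through this scheduler, Scrapy's SCHEDULER setting has to point at the class. A minimal sketch of the extra lines for douban/settings.py, assuming the code above was saved as douban/scheduler.py; the MONGODB_QUEUE_* lines are optional and simply restate the defaults:

# Sketch: switch the crawl over to the MongoDB-backed scheduler.
SCHEDULER = 'douban.scheduler.Scheduler'

# Optional overrides -- these are the defaults read in from_crawler().
MONGODB_QUEUE_SERVER = 'localhost'
MONGODB_QUEUE_PORT = 27017
MONGODB_QUEUE_DB = 'scrapy'
MONGODB_QUEUE_PERSIST = True
MONGODB_QUEUE_NAME = None       # falls back to "<spider name>_queue"
MONGODB_QUEUE_TYPE = 'FIFO'     # or 'LIFO'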
With the task queue split out like this, it becomes easy later on to turn the spider into a distributed one and get past the limits of a single machine. The git-osc repo has been updated accordingly.
Some people will wonder about the efficiency of the task queue. On my own machine I ran a fairly complex query against MongoDB with the queue at close to a million entries, without any indexes, and the results were still decent: 8 GB of RAM and an i5, memory not exhausted, with plenty of other programs open at the same time. If you're following along you can run the same test yourself; it's really not bad.
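If you want to repeat that rough measurement yourself, something along these lines (entirely my own sketch; numbers will vary by machine, and filling the collection with single inserts takes a while) loads a queue-shaped collection and times an unindexed query:

# Rough benchmark sketch: fill a throwaway collection with ~1M queue-like
# documents, then time a query that has no index to help it.
import time
import datetime
import pymongo

connection = pymongo.Connection('localhost', 27017)
col = connection['scrapy']['benchmark_queue']
col.drop()

for i in xrange(1000000):   # single inserts -- expect this to take a while
    col.insert({'data': {'url': 'http://www.douban.com/group/%d/' % i},
                'created': datetime.datetime.utcnow()})

start = time.time()
n = col.find({'data.url': {'$regex': '/group/9999'}}).count()
print 'matched %d documents in %.2f s without an index' % (n, time.time() - start)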