在Scrapy中處理多級頁面跳轉通常可以通過兩種方式來實現:
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class MyCrawlSpider(CrawlSpider):
    """Crawl spider that follows links automatically via declarative Rules.

    Any link whose URL matches the LinkExtractor pattern is requested and
    its response is routed to ``parse_item``.
    """

    name = 'my_crawl_spider'
    allowed_domains = ['example.com']
    start_urls = ['http://www.example.com']

    # One rule: extract links whose URL contains 'item' and hand each
    # resulting page to parse_item. (With a callback set, CrawlSpider
    # does not follow further links from those pages by default.)
    rules = (
        Rule(LinkExtractor(allow='item'), callback='parse_item'),
    )

    def parse_item(self, response):
        """Extract data from a page matched by the rule above."""
        # Extract data here.
        pass
import scrapy
class MySpider(scrapy.Spider):
    """Spider that handles multi-level page navigation manually.

    ``parse`` handles the start page, then schedules a request for the
    next page (if a pagination link exists) whose response is handled by
    ``parse_next_page``.
    """

    name = 'my_spider'
    start_urls = ['http://www.example.com']

    def parse(self, response):
        """Parse the landing page and follow the pagination link, if any."""
        # Extract data here.

        # .get() is the modern replacement for the deprecated
        # .extract_first(); both return the first match or None.
        next_page_url = response.css('a.next_page::attr(href)').get()
        if next_page_url:
            # response.follow resolves relative URLs against the current
            # response, so the extracted href can be relative or absolute.
            yield response.follow(next_page_url, callback=self.parse_next_page)

    def parse_next_page(self, response):
        """Parse a follow-up page reached from the pagination link."""
        # Extract data here.
        pass
使用以上兩種方法之一,你可以很方便地處理多級頁面跳轉並提取需要的數據。