您好,登錄後才能下訂單哦!
本篇內容介紹了“Python爬蟲入門案例之實現爬取二手房源數據”的有關知識,在實際案例的操作過程中,不少人都會遇到這樣的困境,接下來就讓小編帶領大家學習一下如何處理這些情況吧!希望大家仔細閱讀,能夠學有所成!
系統分析網頁性質
結構化的數據解析
csv數據保存
python 3.8
pycharm 專業版 >>> 激活碼
#模塊使用
requests >>> pip install requests
parsel >>> pip install parsel
csv
【付費VIP完整版】只要看了就能學會的教程,80集Python基礎入門視頻教學
點這裡即可免費在線觀看
爬蟲代碼實現步驟: 發送請求 >>> 獲取數據 >>> 解析數據 >>> 保存數據
# Scraper dependencies. Each import sits on its own line: when they were all
# written on one line, everything after the first "#" was a comment and
# parsel, re and csv were never actually imported.
import csv
import re

import parsel  # data-parsing module; third-party: pip install parsel
import requests  # HTTP request module; third-party: pip install requests
# Listing page to scrape (page 1 of the second-hand housing listings).
url = 'https://bj.lianjia.com/ershoufang/pg1/'

# A request header is sent so the server sees the script as a browser.
# User-Agent carries the browser's basic identification.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
}

# requests has no default timeout; without one a stalled connection would
# block the script forever.
response = requests.get(url=url, headers=headers, timeout=10)
print(response.text)
# Parse the listing page into a Selector and collect every detail-page link.
selector_1 = parsel.Selector(response.text)
href = selector_1.css('div.leftContent li div.title a::attr(href)').getall()

# Extracts the total floor count, e.g. "共5層" -> "5".
# NOTE(review): the Simplified form "层" is accepted as well because the
# target site presumably serves Simplified Chinese - confirm against a live page.
floor_pattern = re.compile(r'共(\d+)[層层]')

# The writer is created BEFORE the loop that uses it, and the file is managed
# by a context manager so it is flushed and closed even if a request fails.
# NOTE(review): mode='a' plus writeheader() appends a new header row on every
# run - confirm whether that is intended.
with open('二手房數據.csv', mode='a', encoding='utf-8', newline='') as f:
    csv_writer = csv.DictWriter(f, fieldnames=[
        '標題',
        '市區',
        '小區',
        '戶型',
        '朝向',
        '樓層',
        '裝修情況',
        '電梯',
        '面積(㎡)',
        '價格(萬元)',
        '年份',
    ])
    csv_writer.writeheader()

    for link in href:
        # Fetch and parse one detail page.
        html_data = requests.get(url=link, headers=headers, timeout=10).text
        selector = parsel.Selector(html_data)

        # CSS selectors; .get() yields None when nothing matches, so every
        # value that is post-processed uses default='' to avoid AttributeError.
        title = selector.css('.title h2::text').get()  # title
        area = selector.css('.areaName .info a:nth-child(1)::text').get()  # district
        community_name = selector.css('.communityName .info::text').get()  # community
        room = selector.css('.room .mainInfo::text').get()  # layout
        room_type = selector.css('.type .mainInfo::text').get()  # orientation

        # Floor text looks like "中樓層/共5層": keep the part after the last
        # "/" and pull the number out of it.
        floor_text = selector.css('.room .subInfo::text').get(default='').split('/')[-1]
        floor_match = floor_pattern.search(floor_text)
        height = floor_match.group(1) if floor_match else ''  # total floors

        sub_info = selector.css('.type .subInfo::text').get(default='').split('/')[-1]  # decoration
        # The author left an optional normalisation disabled here: mapping
        # '暫無數據電梯' / None to '無電梯'.
        Elevator = selector.css('.content li:nth-child(12)::text').get()  # elevator
        house_area = selector.css('.content li:nth-child(3)::text').get(default='').replace('㎡', '')  # floor area
        price = selector.css('.price .total::text').get()  # price (unit: 萬元)
        date = selector.css('.area .subInfo::text').get(default='').replace('年建', '')  # build year

        dit = {
            '標題': title,
            '市區': area,
            '小區': community_name,
            '戶型': room,
            '朝向': room_type,
            '樓層': height,
            '裝修情況': sub_info,
            '電梯': Elevator,
            '面積(㎡)': house_area,
            '價格(萬元)': price,
            '年份': date,
        }
        csv_writer.writerow(dit)
        print(title, area, community_name, room, room_type, height, sub_info, Elevator, house_area, price, date, sep='|')
# Visualisation dependencies, one import per line (several import statements
# fused on a single line are a syntax error).
import pandas as pd
from pyecharts import options as opts
from pyecharts.charts import Bar
from pyecharts.charts import Grid
from pyecharts.charts import Line
from pyecharts.charts import Map
from pyecharts.charts import Pie
from pyecharts.charts import Scatter
# Load the housing data set into a DataFrame and preview its first rows.
# NOTE(review): the scraper above writes '二手房數據.csv' while this reads
# '鏈家.csv' - confirm the two files are meant to differ.
df = pd.read_csv(
    '鏈家.csv',
    encoding='utf-8',
)
df.head()
# Map of listing counts per district. `region` and `count` are defined
# outside this snippet.
new = [name + '區' for name in region]  # append the district suffix to each name

m = Map()
m.add(
    '',
    [[district, total] for district, total in zip(new, count)],
    '北京',
)
m.set_global_opts(
    title_opts=opts.TitleOpts(title='北京市二手房各區分布'),
    visualmap_opts=opts.VisualMapOpts(max_=3000),
)
m.render_notebook()
# Combined chart: listing count per district as bars, price as a line on a
# second y-axis. `df_price`, `region` and `count` are defined outside this
# snippet.
df_price.values.tolist()
price = [round(value, 2) for value in df_price.values.tolist()]

# Bar series plus the secondary (price) axis.
bar = Bar()
bar.add_xaxis(region)
bar.add_yaxis('數量', count, label_opts=opts.LabelOpts(is_show=True))
bar.extend_axis(
    yaxis=opts.AxisOpts(
        name="價格(萬元)",
        type_="value",
        min_=200,
        max_=900,
        interval=100,
        axislabel_opts=opts.LabelOpts(formatter="{value}"),
    )
)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='各城區二手房數量-平均價格柱狀圖'),
    tooltip_opts=opts.TooltipOpts(
        is_show=True, trigger="axis", axis_pointer_type="cross"
    ),
    xaxis_opts=opts.AxisOpts(
        type_="category",
        axispointer_opts=opts.AxisPointerOpts(is_show=True, type_="shadow"),
    ),
    yaxis_opts=opts.AxisOpts(
        name='數量',
        axistick_opts=opts.AxisTickOpts(is_show=True),
        splitline_opts=opts.SplitLineOpts(is_show=False),
    ),
)

# Price line bound to the secondary axis (yaxis_index=1).
line2 = Line()
line2.add_xaxis(xaxis_data=region)
line2.add_yaxis(
    series_name="價格",
    yaxis_index=1,
    y_axis=price,
    label_opts=opts.LabelOpts(is_show=True),
    z=10,
)

bar.overlap(line2)
grid = Grid()
grid.add(
    bar,
    opts.GridOpts(pos_left="5%", pos_right="20%"),
    is_control_axis_index=True,
)
grid.render_notebook()
# Bar chart of price per community. `top_price` is defined outside this
# snippet.
# NOTE(review): the series and x-axis are both labelled '數量' although the
# plotted values come from the '價格(萬元)' column - confirm the labels.
area0 = top_price['小區'].values.tolist()
count = top_price['價格(萬元)'].values.tolist()

bar = Bar()
bar.add_xaxis(area0)
bar.add_yaxis('數量', count, category_gap='50%')
bar.set_global_opts(
    yaxis_opts=opts.AxisOpts(name='價格(萬元)'),
    xaxis_opts=opts.AxisOpts(name='數量'),
)
bar.render_notebook()
# Scatter plot of the '面積(㎡)' column against the '價格(萬元)' column of df.
s = Scatter()
s.add_xaxis(df['面積(㎡)'].values.tolist())
s.add_yaxis('', df['價格(萬元)'].values.tolist())
# The x-axis is set to a value axis (type_='value').
s.set_global_opts(xaxis_opts=opts.AxisOpts(type_='value'))
s.render_notebook()
# Ring pie chart of house-orientation shares. `df_direction` is defined
# outside this snippet; its index supplies the labels and its values the counts.
directions = df_direction.index.tolist()
count = df_direction.values.tolist()
c1 = (
    Pie(init_opts=opts.InitOpts(
            width='800px', height='600px',
            )
       )
    .add(
        '',
        [list(z) for z in zip(directions, count)],
        radius=['20%', '60%'],
        center=['40%', '50%'],
        # rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='房屋朝向占比', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="80%", pos_top="25%", orient="vertical"),
    )
    # Label template: {b} = slice name, {c} = value, {d} = percentage.
    # The percentage placeholder had been mangled into literal text, so every
    # label showed garbage instead of the share.
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} ({d}%)'), position="outside")
)
c1.render_notebook()
# Combined chart: horizontal bars for decoration status with a rose pie
# overlaid. `df_fitment` and `df_direction` are defined outside this snippet.
fitment = df_fitment.index.tolist()
count1 = df_fitment.values.tolist()
# NOTE(review): the pie is titled '有/無電梯' but is fed from df_direction -
# confirm whether an elevator series was intended here.
directions = df_direction.index.tolist()
count2 = df_direction.values.tolist()

bar = (
    Bar()
    .add_xaxis(fitment)
    .add_yaxis('', count1, category_gap='50%')
    .reversal_axis()  # swap axes so the bars run horizontally
    .set_series_opts(label_opts=opts.LabelOpts(position='right'))
    .set_global_opts(
        xaxis_opts=opts.AxisOpts(name='數量'),
        title_opts=opts.TitleOpts(title='裝修情況/有無電梯玫瑰圖(組合圖)', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="58%", orient="vertical"),
    )
)

c2 = (
    Pie(init_opts=opts.InitOpts(
            width='800px', height='600px',
            )
       )
    .add(
        '',
        [list(z) for z in zip(directions, count2)],
        radius=['10%', '30%'],
        center=['75%', '65%'],
        rosetype="radius",
        label_opts=opts.LabelOpts(is_show=True),
    )
    .set_global_opts(
        title_opts=opts.TitleOpts(title='有/無電梯', pos_left='33%', pos_top="5%"),
        legend_opts=opts.LegendOpts(type_="scroll", pos_left="90%", pos_top="15%", orient="vertical"),
    )
    # Label template: {b} = slice name, {c} = value, {d} = percentage.
    # The percentage placeholder had been mangled into literal text.
    .set_series_opts(label_opts=opts.LabelOpts(formatter='{b}:{c} \n ({d}%)'), position="outside")
)

bar.overlap(c2)
bar.render_notebook()
# Bar chart of listings per floor value with a slider data-zoom control.
# `df_floor` is defined outside this snippet.
floor = df_floor.index.tolist()
count = df_floor.values.tolist()

bar = Bar()
bar.add_xaxis(floor)
bar.add_yaxis('數量', count)
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='二手房樓層分布柱狀縮放圖'),
    yaxis_opts=opts.AxisOpts(name='數量'),
    xaxis_opts=opts.AxisOpts(name='樓層'),
    datazoom_opts=opts.DataZoomOpts(type_='slider'),
)
bar.render_notebook()
# Horizontal bar chart of listings per floor-area bucket. `df_area` is
# defined outside this snippet.
area = df_area.index.tolist()
count = df_area.values.tolist()

bar = Bar()
bar.add_xaxis(area)
bar.add_yaxis('數量', count)
bar.reversal_axis()  # swap axes so the bars run horizontally
bar.set_series_opts(label_opts=opts.LabelOpts(position="right"))
bar.set_global_opts(
    title_opts=opts.TitleOpts(title='房屋面積分布縱向柱狀圖'),
    yaxis_opts=opts.AxisOpts(name='面積(㎡)'),
    xaxis_opts=opts.AxisOpts(name='數量'),
)
bar.render_notebook()
“Python爬蟲入門案例之實現爬取二手房源數據”的內容就介紹到這裡了,感謝大家的閱讀。如果想了解更多行業相關的知識可以關注億速云網站,小編將為大家輸出更多高質量的實用文章!
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯繫站長郵箱:is@yisu.com進行舉報,並提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。