您好,登錄后才能下訂單哦!
from future import print_function
#python2.X中print不需要括號,而在python3.X中則需要。在開頭加上這句之后,即使在
python2.X,使用print就得像python3.X那樣加括號使用
import requests
這個網址的前兩句下載pip 用 pip ×××tall requests 下載requests
requests是發起請求獲取網頁源代碼
from bs4 import BeautifulSoup
BeautifulSoup庫,是用于解析html代碼的,可以幫助你更方便的通過標簽定位你需要的信息
import pymongo
#源碼安裝mongodb數據庫 pip安裝pymongo 是python鏈接mongodb的第三方庫是驅動程
序,使python程序能夠使用Mongodb數據庫,使用python編寫而成.
import json
#json 是輕量級的文本數據交換格式。是用來存儲和交換文本信息的語法。
1.源碼安裝mongodb https://fastdl.mongodb.org/linux/mongodb-linux-x86_64-rhel70-3.2.5.tgz 解壓mongodb 源碼包, 放在 /usr/local
2 mkdir -p /data/db
3.cd /usr/local/mongodb/bin
./mongod &
./mongo
exit退出
查看數據庫內容:
cd/usr/local/mongodb/bin
./mongo
show dbs
數據庫 : iaaf
use iaaf
show collections
db.athletes.find()
第一步:提取網站HTML信息
#需要的網址
url = 'https://www.iaaf.org/records/toplists/jumps/long-jump/outdoor/men/senior/2018?regionType=world&windReading=regular&page={}&bestResultsOnly=true'
#使用headers設置請求頭,將代碼偽裝成瀏覽器
headers = { 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15', }
for i in range(1,23):
res = requests.get(url.format(i), headers=headers)
html = res.text
print(i)
soup = BeautifulSoup(html, 'html.parser') #使用BeautifulSoup解析這段代碼
#tbody_l = soup.find_all('tbody')
record_table = soup.find_all('table', class_='records-table')
list_re = record_table[2]
tr_l = list_re.find_all('tr')
for i in tr_l: # 針對每一個tr 也就是一行
td_l = i.find_all('td') # td的列表 第三項是 帶href
# 只要把td_l里面的每一項賦值就好了 組成json數據 {} 插入到mongo
# 再從mongo里面取href 訪問 得到 生涯數據 再存回這個表
# 再 把所有數據 存到 excel
j_data = {}
try:
j_data['Rank'] = td_l[0].get_text().strip()
j_data['Mark'] = td_l[1].get_text().strip()
j_data['WIND'] = td_l[2].get_text().strip()
j_data['Competitior'] = td_l[3].get_text().strip()
j_data['DOB'] = td_l[4].get_text().strip()
j_data['Nat'] = td_l[5].get_text().strip()
j_data['Pos'] = td_l[6].get_text().strip()
j_data['Venue'] = td_l[8].get_text().strip()
j_data['Date'] = td_l[9].get_text().strip()
j_data['href'] = td_l[3].find('a')['href']
#把想要的數據存到字典里
#!/usr/bin/env python
#encoding=utf-8
from future import print_function
import requests
from bs4 import BeautifulSoup as bs
def long_jump(url):
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
res = requests.get(url, headers=headers)
html = res.text
soup = bs(html,'html.parser')
div = soup.find('div', id='progression')
h3_l = []
if div != None:
h3_l = div.find_all('h3')
tbody_l = []
outdoor = []
indoor = []
for i in h3_l: # 得到h3 標簽
text = str(i.get_text().strip())
if "Long Jump" in text and "View Graph" in text:
tbody = i.parent.parent.table.tbody
#print(tbody) # 可以拿到里面的數據
# 兩份 一份是室外 一份是室內
tbody_l.append(tbody)
# 拿到兩個元素的tbody 一個為室外 一個室內 用try except
# 組兩個json數據 outdoor={} indoor={}
# db.×××ert() 先打印
try:
tbody_out = tbody_l[0]
tbody_in = tbody_l[1]
tr_l = tbody_out.find_all('tr')
for i in tr_l:
# print(i)
# print('+++++++++++++')
td_l = i.find_all('td')
td_dict = {}
td_dict['Year'] = str(td_l[0].get_text().strip())
td_dict['Performance'] = str(td_l[1].get_text().strip())
td_dict['Wind'] = str(td_l[2].get_text().strip())
td_dict['Place'] = str(td_l[3].get_text().strip())
td_dict['Date'] = str(td_l[4].get_text().strip())
outdoor.append(td_dict)
# print(outdoor)
# print('+++++++++++++++')
tr_lin = tbody_in.find_all('tr')
for i in tr_lin:
td_l = i.find_all('td')
td_dict = {}
td_dict['Year'] = str(td_l[0].get_text().strip())
td_dict['Performance'] = str(td_l[1].get_text().strip())
td_dict['Place'] = str(td_l[2].get_text().strip())
td_dict['Date'] = str(td_l[3].get_text().strip())
indoor.append(td_dict)
# print(indoor)
except:
pass
return outdoor, indoor
if __name__ == '__main__':
long_jump(url'https://www.iaaf.org/athletes/cuba/juan-miguel-echevarria-294120')
在獲取到整個頁面的HTML代碼后,我們需要從整個網頁中提取運動員跳遠的數據
#!/usr/bin/env python
#coding=utf-8
from future import print_function
import pymongo
import requests
from bs4 import BeautifulSoup
import json
from long_jump import *
db = pymongo.MongoClient().iaaf
headers = {'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.0 Safari/605.1.15'}
def get_href():
href_list = db.athletes.find()
# 794
count = 0
for i in href_list:
# 取id 根據id把爬來的生涯數據插回去
print(count)
href = i.get('href')
outdoor = []
indoor = []
if href == None:
pass
else:
url = 'https://www.iaaf.org'+ str(href)
outdoor, indoor = long_jump(url)
db.athletes.update({'_id':i.get('_id')},{"$set":{"outdoor":outdoor,"indoor":indoor}})
count += 1
def get_progression():
pass
if name == 'main':
get_href()
#!/usr/bin/env python
#coding=utf-8
from future import print_function
import xlwt
import pymongo
def write_into_xls(cursor):
title = ['Rank','Mark','age','Competitior','DOB','Nat','country','Venue','Date','out_year','out_performance','out_wind','out_place','out_date','in_year','in_performance','in_place','in_date']
book = xlwt.Workbook(encoding='utf-8',style_compression=0)
sheet = book.add_sheet('iaaf',cell_overwrite_ok=True)
for i in range(len(title)):
sheet.write(0, i, title[i])
# db = pymongo.MongoClient().iaaf
# cursor = db.athletes.find()
flag = 1
db = pymongo.MongoClient().iaaf
country_l = ['CUB', 'RSA', 'CHN', 'USA', 'RUS', 'AUS', 'CZE', 'URU', 'GRE', 'JAM', 'TTO', 'UKR', 'GER', 'IND', 'BRA', 'GBR', 'CAN', 'SRI', 'FRA', 'NGR', 'POL', 'SWE', 'JPN', 'INA', 'GUY', 'TKS', 'KOR', 'TPE', 'BER', 'MAR', 'ALG', 'ESP', 'SUI', 'EST', 'SRB', 'BEL', 'ITA', 'NED', 'FIN', 'CHI', 'BUL', 'CRO', 'ALB', 'KEN', 'POR', 'BAR', 'DEN', 'PER', 'ROU', 'MAS', 'CMR', 'TUR', 'PHI', 'HUN', 'VEN', 'HKG', 'PAN', 'BLR', 'MEX', 'LAT', 'GHA', 'MRI', 'IRL', 'ISV', 'BAH', 'KUW', 'NOR', 'SKN', 'UZB', 'BOT', 'AUT', 'PUR', 'DMA', 'KAZ', 'ARM', 'BEN', 'DOM', 'CIV', 'LUX', 'COL', 'ANA', 'MLT', 'SVK', 'THA', 'MNT', 'ISR', 'LTU', 'VIE', 'IRQ', 'NCA', 'ARU', 'KSA', 'ZIM', 'SLO', 'ECU', 'SYR', 'TUN', 'ARG', 'ZAM', 'SLE', 'BUR', 'NZL', 'AZE', 'GRN', 'OMA', 'CYP', 'GUA', 'ISL', 'SUR', 'TAN', 'GEO', 'BOL', 'ANG', 'QAT', 'TJK', 'MDA', 'MAC']
for i in country_l:
cursor = db.athletes.find({'Nat':i})
for i in cursor:
print(i)
count_out = len(i['outdoor'])
count_in = len(i['indoor'])
count = 1
if count_out >= count_in:
count = count_out
else:
count = count_in
if count == 0:
count = 1
# count 為這條數據占的行數
# title = ['Rank','Mark','Wind','Competitior','DOB','Nat','Pos','Venue',
# 'Date','out_year','out_performance','out_wind','out_place','out_date',
# 'in_year','in_performance','in_place','in_date']
sheet.write(flag, 0, i.get('Rank'))
sheet.write(flag, 1, i.get('Mark'))
sheet.write(flag, 2, i.get('age'))
sheet.write(flag, 3, i.get('Competitior'))
sheet.write(flag, 4, i.get('DOB'))
sheet.write(flag, 5, i.get('Nat'))
sheet.write(flag, 6, i.get('country'))
sheet.write(flag, 7, i.get('Venue'))
sheet.write(flag, 8, i.get('Date'))
if count_out > 0:
for j in range(count_out):
sheet.write(flag+j, 9, i['outdoor'][j]['Year'])
sheet.write(flag+j, 10, i['outdoor'][j]['Performance'])
sheet.write(flag+j, 11, i['outdoor'][j]['Wind'])
sheet.write(flag+j, 12, i['outdoor'][j]['Place'])
sheet.write(flag+j, 13, i['outdoor'][j]['Date'])
if count_in > 0:
for k in range(count_in):
sheet.write(flag+k, 14, i['indoor'][k]['Year'])
sheet.write(flag+k, 15, i['indoor'][k]['Performance'])
sheet.write(flag+k, 16, i['indoor'][k]['Place'])
sheet.write(flag+k, 17, i['indoor'][k]['Date'])
flag = flag + count
book.save(r'iaaf.xls')
# 開始從第一行 輸入數據 從數據庫取
if name == 'main':
write_into_xls(cursor=None)
運行完上述代碼后,我們得到的結果是
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。