您好,登錄后才能下訂單哦!
beautifulsoup解析頁面
import re

from bs4 import BeautifulSoup

# Sample page so the lookups below have something to match; in a crawler this
# would be the HTML text of the downloaded page.
htmltxt = (
    '<html><head><meta itemprop="datePublished" content="2018-07-05"></head>'
    '<body><a class="title" id="t1" alog-action="qb-ask-uname">uname</a>'
    '<p class="title">first</p><p class="title">second</p></body></html>'
)

soup = BeautifulSoup(htmltxt, "lxml")

# --- The three parsers repair malformed markup differently ---
# html.parser: a lone start tag is closed automatically, a lone end tag is
# ignored.
soup = BeautifulSoup("<a></p>", "html.parser")
# Result: <a></a>

soup = BeautifulSoup("<a></p>", "lxml")
# Result: <html><body><a></a></body></html>

# html5lib: every tag it encounters is completed, including the stray </p>.
soup = BeautifulSoup("<a></p>", "html5lib")
# Result: <html><head></head><body><a><p></p></a></body></html>

# --- Finding tags by name, id, class and arbitrary attributes ---
# Re-parse the real page: `soup` currently holds the "<a></p>" demo document.
soup = BeautifulSoup(htmltxt, "lxml")

# Combine tag name, class, id and the value of the `alog-action` attribute.
# Attribute names that are not valid Python identifiers go through `attrs`.
soup.find("a", class_="title", id="t1", attrs={"alog-action": "qb-ask-uname"})

# Read the value of one attribute of a tag. find() returns None when nothing
# matches, so guard before touching `.attrs`.
meta_tag = soup.find("meta", attrs={"itemprop": "datePublished"})
pubtime = meta_tag.attrs["content"] if meta_tag is not None else None

# Every tag whose class is "title".
for tag in soup.find_all(class_="title"):
    print(tag.get_text())

# At most `limit` tags whose class is "title".
for tag in soup.find_all(class_="title", limit=2):
    print(tag.get_text())

# get_text() accepts a separator placed between the text of different tags
# and can strip the surrounding whitespace of each piece.
soup = BeautifulSoup(
    '<p class="title" id="p1"><b> The Dormouses story </b></p>'
    '<p class="title" id="p1"><b>The Dormouses story</b></p>',
    "html5lib",
)
# Called on the whole document so that both paragraphs contribute a piece;
# find(class_="title") would return only the first <p>.
soup.get_text("|", strip=True)
# Result: The Dormouses story|The Dormouses story

# The id of the first tag whose class is "title".
soup.find(class_="title").get("id")

# Match the class name with a regular expression.
soup.find_all(class_=re.compile("tit"))

# recursive=False restricts the search to direct children of the tag.
# <title> sits inside <head>, not directly under <html>, so this returns [].
soup = BeautifulSoup('<html><head><title>abc', 'lxml')
soup.html.find_all("title", recursive=False)
unicode編碼轉中文
# Text scraped from pages or JSON often carries *literal* backslash escapes
# such as \u65f6. A raw string reproduces that input; a normal string literal
# would already be decoded by Python and the round-trip below would turn it
# into mojibake.
content = r"\u65f6\u75c7\u5b85"

# The unicode_escape codec decodes its input bytes as Latin-1, so this is only
# safe when everything outside the escapes is plain ASCII.
content = content.encode("utf8", "ignore").decode('unicode_escape')
url encode的編碼與解碼
from urllib import parse

# Encode: non-ASCII characters become percent-escaped UTF-8 bytes.
x = "中國你好"
y = parse.quote(x)
print(y)

# Decode: the inverse operation restores the original text.
x = parse.unquote(y)
print(x)
html轉義字符的解碼
# html.unescape() is the supported API; HTMLParser().unescape() was removed in
# Python 3.9.
from html import unescape

# Input containing HTML character references.
htmls = "&lt;div&gt;&lt;p&gt;"
txt = unescape(htmls)
print(txt)  # prints <div><p>
base64的編碼與解碼
import base64

# Encode: base64 works on bytes, so encode the text first; the final decode
# turns the ASCII base64 bytes back into a str.
content = "測試轉碼文本123"
contents_base64 = base64.b64encode(content.encode('utf-8', 'ignore')).decode("utf-8")

# Decode: b64decode() returns bytes, so decode them to get the text back.
contents = base64.b64decode(contents_base64).decode("utf-8")
過濾emoji表情
import re

# Every code point outside the Basic Multilingual Plane. Compiled once at
# import time instead of on every call. The surrogate-pair fallback of the
# original was only reachable on narrow Python 2 builds and is dropped.
_NON_BMP_RE = re.compile('[\U00010000-\U0010ffff]')


def filter_emoji(desstr, restr=''):
    """Replace every character above U+FFFF in *desstr* with *restr*.

    Most emoji live in the supplementary planes, which is what makes this a
    cheap emoji filter (e.g. before writing to a 3-byte ``utf8`` MySQL
    column). It is range based, not emoji aware: other supplementary-plane
    characters are removed too, and emoji encoded inside the BMP are kept.

    Args:
        desstr: Text to clean.
        restr: Replacement for each removed character; defaults to ``''``.

    Returns:
        The cleaned string.
    """
    return _NON_BMP_RE.sub(restr, desstr)
完全過濾script和style標簽
# NOTE(review): `requests` is not used in this snippet; presumably it fetches
# `htmls` in the full crawler, so the import is kept.
import requests
from bs4 import BeautifulSoup

# Sample input; replace with the HTML text of the page being cleaned.
htmls = (
    "<html><head><style>p {color: red}</style></head>"
    "<body><p>abc</p><script>alert(1)</script></body></html>"
)

soup = BeautifulSoup(htmls, "lxml")

# Calling the soup is shorthand for find_all(); extract() detaches each
# <script>/<style> element, contents included, from the tree.
for tag in soup(["script", "style"]):
    tag.extract()

print(soup)
過濾html的標簽,但保留標簽里的內容
import re

# --- Strip the tags but keep the text inside them ---
htmls = "<p>abc</p>"
dr = re.compile(r'<[^>]+>', re.S)
htmls2 = dr.sub('', htmls)
print(htmls2)  # abc

# --- Extract fields with a regular expression (typically from JSON/JSONP) ---
# Sample JSONP response. The trailing "..." marks further entries elided from
# the sample, which is why it is searched with a regex rather than parsed.
htmls = r'''rollback({
    "response": { "code": "0", "msg": "Success", "dext": "" },
    "data": {
        "count": 3,
        "page": 1,
        "article_info": [{
            "title": "“小庫里”:適應比賽是首要任務 投籃終會找到節奏",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/035378.htm",
            "time": "2018-07-04 16:58:36",
            "column": "NBA",
            "img": "",
            "desc": ""
        }, {
            "title": "首鋼體育助力國家冰球集訓隊 中國冰球聯賽年底啟動",
            "url": "http:\/\/sports.qq.com\/a\/20180704\/034698.htm",
            "time": "2018-07-04 16:34:44",
            "column": "綜合體育",
            "img": "",
            "desc": ""
        }...]
    }
})'''

# Pull the title and url of every news entry.
# (.*?) captures a wanted value, a bare .*? skips over whatever lies between,
# and \s* tolerates optional whitespace after the colon (compact or pretty
# JSON). DOTALL lets .*? run across line breaks.
reg_str = r'"title":\s*"(.*?)",.*?"url":\s*"(.*?)"'
pattern = re.compile(reg_str, re.DOTALL)
items = re.findall(pattern, htmls)

# JSON escapes "/" as "\/"; undo that so the URLs are directly usable.
news = [(title, url.replace("\\/", "/")) for title, url in items]
for title, url in news:
    print(title, url)
時間操作
import datetime
import time

TIME_FORMAT = "%Y-%m-%d %H:%M:%S"

# Current date
today = datetime.date.today()
print(today)  # e.g. 2018-07-05

# Current local time, formatted
time_now = time.strftime(TIME_FORMAT, time.localtime(time.time()))
print(time_now)  # e.g. 2018-07-05 14:20:55

# Format a Unix timestamp (rendered in the machine's local time zone)
a = 1502691655
time_a = time.strftime(TIME_FORMAT, time.localtime(int(a)))
print(time_a)  # 2017-08-14 14:20:55 on a UTC+8 machine

# Parse a string into a datetime. The variable is deliberately not called
# `str`, which would shadow the builtin.
time_str = "2018-07-01 00:00:00"
parsed_time = datetime.datetime.strptime(time_str, TIME_FORMAT)

# Convert a time string into a Unix timestamp (interpreted as local time)
time_line = "2018-07-16 10:38:50"
time_tuple = time.strptime(time_line, TIME_FORMAT)
time_line2 = int(time.mktime(time_tuple))

# Tomorrow's date
tomorrow = today + datetime.timedelta(days=1)
print(tomorrow)  # e.g. 2018-07-06

# The moment three days ago
three_days_ago = datetime.datetime.today() + datetime.timedelta(days=-3)
print(three_days_ago)  # e.g. 2018-07-02 13:37:00.107703

# Time difference between a start time and now
start = "2018-07-03 00:00:00"
start_dt = datetime.datetime.strptime(start, TIME_FORMAT)
delta = datetime.datetime.now() - start_dt
# timedelta.seconds holds only the part below one day, so the whole days have
# to be added back to get the total.
minutes = delta.seconds / 60
days = delta.days
all_minutes = days * 24 * 60 + minutes
print(minutes)      # e.g. 821.7666666666667
print(days)         # e.g. 2
print(all_minutes)  # e.g. 3701.7666666666664
數據庫操作
import pymysql

# NOTE(review): host and credentials are hard-coded for the demo; load them
# from configuration or the environment in real code.
conn = pymysql.connect(
    host='10.0.8.81',
    port=3306,
    user='root',
    password='root',
    database='xxx',
    charset='utf8',
)
cur = conn.cursor()

insert_sql = "insert into tbl_name(id,name,age) values(%s,%s,%s)"
row_id = 1  # not named `id`, which would shadow the builtin
name = "like"
age = 26
data_list = []
data = (row_id, name, age)

# Single-row insert
cur.execute(insert_sql, data)
conn.commit()

# Batch insert
data_list.append(data)
cur.executemany(insert_sql, data_list)
conn.commit()

# Special characters (e.g. quotes inside `name`): values passed as query
# parameters are escaped by the driver. Calling escape_string() on them as
# well would store the backslashes literally, so pass the raw value.
data = (row_id, name, age)

# Update: bind the values instead of formatting them into the SQL text, which
# would be open to SQL injection.
content = "text with 'quotes' and \"double quotes\""
update_sql = "update tbl_name set content = %s where id = %s"
cur.execute(update_sql, (content, row_id))
conn.commit()

# Batch update: collect parameter tuples and flush them in chunks.
update_sql = "UPDATE tbl_recieve SET content = %s ,title = %s , is_spider = %s WHERE id = %s"
update_data_list = []
# Placeholder values; in the crawler these presumably come from the scraped
# record, with one_new being the database row that is refreshed (verify
# against the caller).
contents = "article body"
title = "article title"
is_spider = 1
one_new = (row_id,)
update_data = (contents, title, is_spider, one_new[0])
update_data_list.append(update_data)
if len(update_data_list) > 500:
    try:
        cur.executemany(update_sql, update_data_list)
        conn.commit()
    except pymysql.MySQLError:
        # Leave no half-applied batch behind, then surface the error.
        conn.rollback()
        raise
    else:
        update_data_list.clear()

# A real crawler should also flush the rows still buffered in
# update_data_list before shutting down.
cur.close()
conn.close()
總結
以上就是這篇文章的全部內容了,希望本文的內容對大家的學習或者工作具有一定的參考學習價值,謝謝大家對億速云的支持。如果你想了解更多相關內容請查看下面相關鏈接
免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。