1. Use XPath to remove unnecessary tag elements and tags with no content
import re

from loguru import logger
from lxml import etree


def xpath_clean(self, text: str, xpath_dict: dict) -> str:
    '''
    xpath: remove the unnecessary elements
    :param text: html_content
    :param xpath_dict: xpath expressions of the nodes to remove
    :return: html_content as a string
    '''
    remove_by_xpath = xpath_dict if xpath_dict else dict()

    # Items that are removed in practically every case, barring extreme situations
    remove_by_xpath.update({
        '_remove_2': '//iframe',
        '_remove_4': '//button',
        '_remove_5': '//form',
        '_remove_6': '//input',
        '_remove_7': '//select',
        '_remove_8': '//option',
        '_remove_9': '//textarea',
        '_remove_10': '//figure',
        '_remove_11': '//figcaption',
        '_remove_12': '//frame',
        '_remove_13': '//video',
        '_remove_14': '//script',
        '_remove_15': '//style'
    })

    parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
    selector = etree.HTML(text, parser=parser)

    # Routine removal: delete the tags we do not want
    for xpath in remove_by_xpath.values():
        for bad in selector.xpath(xpath):
            bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
            logger.debug(f"clean article content : {bad_string}")
            bad.getparent().remove(bad)

    skip_tip = "name()='img' or name()='tr' or " \
               "name()='th' or name()='tbody' or " \
               "name()='thead' or name()='table'"
    # Check whether each remaining tag has any content; delete it outright if not
    for p in selector.xpath(f"//*[not({skip_tip})]"):
        # Skip logic: keep nodes that contain whitelisted tags or non-blank text
        if p.xpath(f".//*[{skip_tip}]") or \
                bool(re.sub(r'\s', '', p.xpath('string(.)'))):
            continue
        bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
        logger.debug(f"clean p tag : {bad_p}")
        p.getparent().remove(p)

    return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()
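To get a feel for what this pass does, here is a tiny standalone sketch of the same two ideas (drop blacklisted tags, then drop empty elements) applied directly with lxml. The sample HTML is made up for illustration; this is not the full method above.

from lxml import etree

# A made-up fragment: one real paragraph, one blank paragraph, one script tag.
html = '<div><p>real text</p><p>   </p><script>var x = 1;</script></div>'
parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
selector = etree.HTML(html, parser=parser)

# Step 1: remove blacklisted tags (only //script here, for brevity)
for bad in selector.xpath('//script'):
    bad.getparent().remove(bad)

# Step 2: remove elements whose string value is blank
for node in selector.xpath('//p'):
    if not node.xpath('string(.)').strip():
        node.getparent().remove(node)

print(etree.tostring(selector, encoding='utf-8', pretty_print=True).decode())
# only <p>real text</p> survives inside the original <div>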
2. Use PyQuery to clean tag attributes, returning the processed HTML source and the plain text
#!/usr/bin/env python
# -*-coding:utf-8-*-
from loguru import logger
from pyquery import PyQuery as pq


def pyquery_clean(self, text, url, pq_dict) -> object:
    '''
    pyquery: perform the remaining necessary cleanup
    :param text:
    :param url:
    :param pq_dict:
    :return:
    '''
    # Dictionary of pyquery expressions to remove
    remove_by_pq = pq_dict if pq_dict else dict()
    # Whitelist of tag attributes to keep
    attr_white_list = ['rowspan', 'colspan']
    # Attribute keys that may carry an image link
    img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
    # Build the pyquery object
    dom = pq(text)

    # Remove useless tags
    for bad_tag in remove_by_pq.values():
        for bad in dom(bad_tag):
            bad_string = pq(bad).html()
            logger.debug(f"clean article content : {bad_string}")
        dom.remove(bad_tag)

    # Process the attributes of every tag
    for tag in dom('*'):
        for key, value in tag.attrib.items():
            # Skip logic: keep the table attributes rowspan and colspan
            if key in attr_white_list:
                continue
            # Image links: complete a relative url, then replace the attribute
            if key in img_key_list:
                img_url = self.absolute_url(url, value)
                pq(tag).remove_attr(key)
                pq(tag).attr('src', img_url)
                pq(tag).attr('alt', '')
            # Keep the alt attribute of img tags, but empty its value
            elif key == 'alt':
                pq(tag).attr(key, '')
            # Delete every other attribute
            else:
                pq(tag).remove_attr(key)
    return dom.text(), dom.html()
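The attribute rules are easy to see in isolation. Below is a standalone sketch of the same idea applied directly with pyquery, without the logging or the pq_dict removals; the sample HTML and base URL are made up for illustration.

from urllib.parse import urljoin, urlsplit

from pyquery import PyQuery as pq

# Made-up fragment: a styled table cell plus a lazily loaded image with a relative link.
html = ('<div><table><tr><td colspan="2" style="color:red">cell</td></tr></table>'
        '<img data-src="/img/a.png" class="lazy"></div>')
base_url = 'https://example.com/post/1.html'  # assumed article url

dom = pq(html)
for tag in dom('*'):
    for key, value in list(tag.attrib.items()):
        if key in ('rowspan', 'colspan'):          # table attributes stay
            continue
        if key in ('src', 'data-src', 'data-echo', 'data-original'):
            full = value if urlsplit(value).scheme else urljoin(base_url, value)
            pq(tag).remove_attr(key)
            pq(tag).attr('src', full)              # relative link becomes absolute
        else:
            pq(tag).remove_attr(key)               # everything else is dropped

print(dom.html())
# roughly: <table><tr><td colspan="2">cell</td></tr></table><img src="https://example.com/img/a.png">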
3. Use regular expressions to clean up spaces and newlines
#!/usr/bin/env python
# -*-coding:utf-8-*-
import re


def regular_clean(self, str1: str, str2: str):
    '''
    Clean up the data format with regular expressions
    :param str1: content
    :param str2: html_content
    :return: the processed results
    '''

    def new_line(text):
        text = re.sub(r'<br\s?/?>', '<br>', text)
        text = re.sub(
            r'</?a>|</?em>|</?html>|</?body>|'
            r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
            r'</?strong>|</?blockquote>|</?b>|'
            r'</?span>|</?i>|</?hr>|</?font>',
            '', text)
        text = re.sub(r'\n', '', text)
        text = re.sub(r'<h[1-6]>', '<p>', text)
        text = re.sub(r'</h[1-6]>', '</p>', text)
        text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
        return text

    str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues
    # TODO html_content processing:
    #   1. remove redundant, unusable tags and tags that break the display
    #   2. normalize and replace newlines
    str2 = new_line(text=str2)
    return str1, str2
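Since the inner new_line helper only works on strings, its rules can be tried on their own. A minimal sketch with a made-up sample string (only a few of the inline-tag patterns are repeated here):

import re

sample = '<h2>Title</h2><p>First line<b> bold</b><br />second line</p>'

text = re.sub(r'<br\s?/?>', '<br>', sample)            # normalize <br/> variants
text = re.sub(r'</?b>|</?span>|</?strong>', '', text)  # strip a few inline wrappers
text = re.sub(r'<h[1-6]>', '<p>', text)                # demote headings to paragraphs
text = re.sub(r'</h[1-6]>', '</p>', text)
text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
print(text)
# <p>Title</p>
# <p>First line bold<br/>second line</p>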
Finally, the complete class that wraps all of the methods above
#!/usr/bin/env python
# -*-coding:utf-8-*-

'''
author: szhan
date: 2020-08-17
summary: clean html_content and extract a pure-text version of the data
'''

import re
from lxml import etree
from pyquery import PyQuery as pq
from urllib.parse import urlsplit, urljoin
from loguru import logger


class CleanArticle:

    def __init__(
            self,
            text: str,
            url: str = '',
            xpath_dict: dict = None,
            pq_dict: dict = None
    ):
        self.text = text
        self.url = url
        self.xpath_dict = xpath_dict or dict()
        self.pq_dict = pq_dict or dict()

    @staticmethod
    def absolute_url(baseurl: str, url: str) -> str:
        '''
        Complete a relative url
        :param baseurl: scheme url
        :param url: target url
        :return: complete url
        '''
        target_url = url if urlsplit(url).scheme else urljoin(baseurl, url)
        return target_url

    @staticmethod
    def clean_blank(text):
        '''
        Clean up blank characters
        :param text:
        :return:
        '''
        text = text.replace(' ', '').replace('\u3000', '').replace('\t', '').replace('\xa0', '')
        text = re.sub(r'\s{2,}', '', text)
        text = re.sub(r'\n{2,}', '\n', text)
        text = text.strip('\n').strip()
        return text

    def run(self):
        '''
        :return: the processed content and html_content
        '''
        if (not bool(self.text)) or (not isinstance(self.text, str)):
            raise ValueError('html_content has a bad type value')
        # Step 1: use xpath to strip blank text and comments, and to remove tags
        # such as iframe, button, form, script, style, video, etc.
        text = self.xpath_clean(self.text, self.xpath_dict)
        # Step 2: use pyquery to handle the finer details
        str1, str2 = self.pyquery_clean(text, self.url, self.pq_dict)
        # Final regular-expression pass
        content, html_content = self.regular_clean(str1, str2)
        return content, html_content

    def xpath_clean(self, text: str, xpath_dict: dict) -> str:
        '''
        xpath: remove the unnecessary elements
        :param text: html_content
        :param xpath_dict: xpath expressions of the nodes to remove
        :return: html_content as a string
        '''
        remove_by_xpath = xpath_dict if xpath_dict else dict()

        # Items that are removed in practically every case, barring extreme situations
        remove_by_xpath.update({
            '_remove_2': '//iframe',
            '_remove_4': '//button',
            '_remove_5': '//form',
            '_remove_6': '//input',
            '_remove_7': '//select',
            '_remove_8': '//option',
            '_remove_9': '//textarea',
            '_remove_10': '//figure',
            '_remove_11': '//figcaption',
            '_remove_12': '//frame',
            '_remove_13': '//video',
            '_remove_14': '//script',
            '_remove_15': '//style'
        })

        parser = etree.HTMLParser(remove_blank_text=True, remove_comments=True)
        selector = etree.HTML(text, parser=parser)

        # Routine removal: delete the tags we do not want
        for xpath in remove_by_xpath.values():
            for bad in selector.xpath(xpath):
                bad_string = etree.tostring(bad, encoding='utf-8', pretty_print=True).decode()
                logger.debug(f"clean article content : {bad_string}")
                bad.getparent().remove(bad)

        skip_tip = "name()='img' or name()='tr' or " \
                   "name()='th' or name()='tbody' or " \
                   "name()='thead' or name()='table'"
        # Check whether each remaining tag has any content; delete it outright if not
        for p in selector.xpath(f"//*[not({skip_tip})]"):
            # Skip logic: keep nodes that contain whitelisted tags or non-blank text
            if p.xpath(f".//*[{skip_tip}]") or \
                    bool(re.sub(r'\s', '', p.xpath('string(.)'))):
                continue
            bad_p = etree.tostring(p, encoding='utf-8', pretty_print=True).decode()
            logger.debug(f"clean p tag : {bad_p}")
            p.getparent().remove(p)

        return etree.tostring(selector, encoding='utf-8', pretty_print=True).decode()

    def pyquery_clean(self, text, url, pq_dict) -> object:
        '''
        pyquery: perform the remaining necessary cleanup
        :param text:
        :param url:
        :param pq_dict:
        :return:
        '''
        # Dictionary of pyquery expressions to remove
        remove_by_pq = pq_dict if pq_dict else dict()
        # Whitelist of tag attributes to keep
        attr_white_list = ['rowspan', 'colspan']
        # Attribute keys that may carry an image link
        img_key_list = ['src', 'data-echo', 'data-src', 'data-original']
        # Build the pyquery object
        dom = pq(text)

        # Remove useless tags
        for bad_tag in remove_by_pq.values():
            for bad in dom(bad_tag):
                bad_string = pq(bad).html()
                logger.debug(f"clean article content : {bad_string}")
            dom.remove(bad_tag)

        # Process the attributes of every tag
        for tag in dom('*'):
            for key, value in tag.attrib.items():
                # Skip logic: keep the table attributes rowspan and colspan
                if key in attr_white_list:
                    continue
                # Image links: complete a relative url, then replace the attribute
                if key in img_key_list:
                    img_url = self.absolute_url(url, value)
                    pq(tag).remove_attr(key)
                    pq(tag).attr('src', img_url)
                    pq(tag).attr('alt', '')
                # Keep the alt attribute of img tags, but empty its value
                elif key == 'alt':
                    pq(tag).attr(key, '')
                # Delete every other attribute
                else:
                    pq(tag).remove_attr(key)
        return dom.text(), dom.html()

    def regular_clean(self, str1: str, str2: str):
        '''
        Clean up the data format with regular expressions
        :param str1: content
        :param str2: html_content
        :return: the processed results
        '''

        def new_line(text):
            text = re.sub(r'<br\s?/?>', '<br>', text)
            text = re.sub(
                r'</?a>|</?em>|</?html>|</?body>|'
                r'</?head>|<[a-zA-Z]{1,10}\s?/>|'
                r'</?strong>|</?blockquote>|</?b>|'
                r'</?span>|</?i>|</?hr>|</?font>',
                '', text)
            text = re.sub(r'\n', '', text)
            text = re.sub(r'<h[1-6]>', '<p>', text)
            text = re.sub(r'</h[1-6]>', '</p>', text)
            text = text.replace('</p>', '</p>\n').replace('<br>', '<br/>')
            return text

        str1, str2 = self.clean_blank(str1), self.clean_blank(str2)  # TODO handle blank-line issues
        # TODO html_content processing:
        #   1. remove redundant, unusable tags and tags that break the display
        #   2. normalize and replace newlines
        str2 = new_line(text=str2)
        return str1, str2


if __name__ == '__main__':
    with open('html_content.html', 'r', encoding='utf-8') as f:
        lines = f.readlines()
        html = ''
        for line in lines:
            html += line

    ca = CleanArticle(text=html)
    _, html_content = ca.run()
    print(html_content)
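When a specific site needs extra cleanup beyond the defaults, the two constructor dictionaries are the extension point: xpath_dict entries are removed during the xpath pass and pq_dict entries during the pyquery pass. A hedged usage sketch follows; the selectors and the url below are invented for illustration, and html_content.html is the same input file as in the __main__ block above.

# Hypothetical site-specific rules; these selectors are examples, not part of the original tool.
extra_xpath = {'remove_ads': '//div[@class="advert"]'}     # handled in the xpath pass
extra_pq = {'remove_related': 'div.related-articles'}      # handled in the pyquery pass

with open('html_content.html', 'r', encoding='utf-8') as f:
    html = f.read()

ca = CleanArticle(
    text=html,
    url='https://example.com/news/1.html',   # base url used to complete relative image links
    xpath_dict=extra_xpath,
    pq_dict=extra_pq,
)
content, html_content = ca.run()
print(content)        # plain text
print(html_content)   # cleaned html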
Summary
This concludes this detailed look at a format-cleaning tool built on XPath selectors, PyQuery, and regular expressions. For more on PyQuery and regex-based cleanup tools, search 億速云's earlier articles or browse the related articles below. We hope you will continue to support 億速云!