91超碰碰碰碰久久久久久综合_超碰av人澡人澡人澡人澡人掠_国产黄大片在线观看画质优化_txt小说免费全本

溫馨提示×

溫馨提示×

您好,登錄后才能下訂單哦!

密碼登錄×
登錄注冊×
其他方式登錄
點擊 登錄注冊 即表示同意《億速云用戶服務條款》

Python3 中怎么解析html

發布時間:2021-06-17 15:28:27 來源:億速云 閱讀:168 作者:Leah 欄目:開發技術

Python3 中怎么解析html,針對這個問題,這篇文章詳細介紹了相對應的分析和解答,希望可以幫助更多想解決這個問題的小伙伴找到更簡單易行的方法。

輔助函數,主要用于獲取html并輸入解析后的結束

#把傳遞解析函數,便于下面的修改
def get_html(url, paraser=bs4_paraser):
 headers = {
  'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate, sdch',
  'Accept-Language': 'zh-CN,zh;q=0.8',
  'Host': 'www.360kan.com',
  'Proxy-Connection': 'keep-alive',
  'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'
 }
 request = urllib2.Request(url, headers=headers)
 response = urllib2.urlopen(request)
 response.encoding = 'utf-8'
 if response.code == 200:
  data = StringIO.StringIO(response.read())
  gzipper = gzip.GzipFile(fileobj=data)
  data = gzipper.read()
  value = paraser(data) # open('E:/h6/haPkY0osd0r5UB.html').read()
  return value
 else:
  pass
 
 
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
for row in value:
 print row

1,lxml.html的方式進行解析,

The lxml XML toolkit is a Pythonic binding for the C libraries libxml2 and libxslt. It is unique in that it combines the speed and XML feature completeness of these libraries with the simplicity of a native Python API, mostly compatible but superior to the well-known ElementTree API. The latest release works with all CPython versions from 2.6 to 3.5. See the introduction for more information about background and goals of the lxml project. Some common questions are answered in the FAQ. [官網](http://lxml.de/)

def lxml_parser(page):
 data = []
 doc = etree.HTML(page)
 all_div = doc.xpath('//div[@class="yingping-list-wrap"]')
 for row in all_div:
  # 獲取每一個影評,即影評的item
  all_div_item = row.xpath('.//div[@class="item"]') # find_all('div', attrs={'class': 'item'})
  for r in all_div_item:
   value = {}
   # 獲取影評的標題部分
   title = r.xpath('.//div[@class="g-clear title-wrap"][1]')
   value['title'] = title[0].xpath('./a/text()')[0]
   value['title_href'] = title[0].xpath('./a/@href')[0]
   score_text = title[0].xpath('./div/span/span/@style')[0]
   score_text = re.search(r'\d+', score_text).group()
   value['score'] = int(score_text) / 20
   # 時間
   value['time'] = title[0].xpath('./div/span[@class="time"]/text()')[0]
   # 多少人喜歡
   value['people'] = int(
     re.search(r'\d+', title[0].xpath('./div[@class="num"]/span/text()')[0]).group())
   data.append(value)
 return data

2,使用BeautifulSoup,不多說了,大家網上找資料看看

def bs4_paraser(html):
 all_value = []
 value = {}
 soup = BeautifulSoup(html, 'html.parser')
 # 獲取影評的部分
 all_div = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
 for row in all_div:
  # 獲取每一個影評,即影評的item
  all_div_item = row.find_all('div', attrs={'class': 'item'})
  for r in all_div_item:
   # 獲取影評的標題部分
   title = r.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
   if title is not None and len(title) > 0:
    value['title'] = title[0].a.string
    value['title_href'] = title[0].a['href']
    score_text = title[0].div.span.span['style']
    score_text = re.search(r'\d+', score_text).group()
    value['score'] = int(score_text) / 20
    # 時間
    value['time'] = title[0].div.find_all('span', attrs={'class': 'time'})[0].string
    # 多少人喜歡
    value['people'] = int(
      re.search(r'\d+', title[0].find_all('div', attrs={'class': 'num'})[0].span.string).group())
   # print r
   all_value.append(value)
   value = {}
 return all_value

3,使用SGMLParser,主要是通過start、end tag的方式進行了,解析工程比較明朗,但是有點麻煩,而且該案例的場景不太適合該方法,(哈哈)

class CommentParaser(SGMLParser):
 def __init__(self):
  SGMLParser.__init__(self)
  self.__start_div_yingping = False
  self.__start_div_item = False
  self.__start_div_gclear = False
  self.__start_div_ratingwrap = False
  self.__start_div_num = False
  # a
  self.__start_a = False
  # span 3中狀態
  self.__span_state = 0
  # 數據
  self.__value = {}
  self.data = []
 
 def start_div(self, attrs):
  for k, v in attrs:
   if k == 'class' and v == 'yingping-list-wrap':
    self.__start_div_yingping = True
   elif k == 'class' and v == 'item':
    self.__start_div_item = True
   elif k == 'class' and v == 'g-clear title-wrap':
    self.__start_div_gclear = True
   elif k == 'class' and v == 'rating-wrap g-clear':
    self.__start_div_ratingwrap = True
   elif k == 'class' and v == 'num':
    self.__start_div_num = True
 
 def end_div(self):
  if self.__start_div_yingping:
   if self.__start_div_item:
    if self.__start_div_gclear:
     if self.__start_div_num or self.__start_div_ratingwrap:
      if self.__start_div_num:
       self.__start_div_num = False
      if self.__start_div_ratingwrap:
       self.__start_div_ratingwrap = False
     else:
      self.__start_div_gclear = False
    else:
     self.data.append(self.__value)
     self.__value = {}
     self.__start_div_item = False
   else:
    self.__start_div_yingping = False
 
 def start_a(self, attrs):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
   self.__start_a = True
   for k, v in attrs:
    if k == 'href':
     self.__value['href'] = v
 
 def end_a(self):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
   self.__start_a = False
 
 def start_span(self, attrs):
  if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
   if self.__start_div_ratingwrap:
    if self.__span_state != 1:
     for k, v in attrs:
      if k == 'class' and v == 'rating':
       self.__span_state = 1
      elif k == 'class' and v == 'time':
       self.__span_state = 2
    else:
     for k, v in attrs:
      if k == 'style':
       score_text = re.search(r'\d+', v).group()
     self.__value['score'] = int(score_text) / 20
     self.__span_state = 3
   elif self.__start_div_num:
    self.__span_state = 4
 
 def end_span(self):
  self.__span_state = 0
 
 def handle_data(self, data):
  if self.__start_a:
   self.__value['title'] = data
  elif self.__span_state == 2:
   self.__value['time'] = data
  elif self.__span_state == 4:
   score_text = re.search(r'\d+', data).group()
   self.__value['people'] = int(score_text)
  pass
def sgl_parser(html):
 parser = CommentParaser()
 parser.feed(html)
 return parser.data

4,HTMLParaer,與3原理相識,就是調用的方法不太一樣,基本上可以公用,

class CommentHTMLParser(HTMLParser.HTMLParser):
 def __init__(self):
  HTMLParser.HTMLParser.__init__(self)
  self.__start_div_yingping = False
  self.__start_div_item = False
  self.__start_div_gclear = False
  self.__start_div_ratingwrap = False
  self.__start_div_num = False
  # a
  self.__start_a = False
  # span 3中狀態
  self.__span_state = 0
  # 數據
  self.__value = {}
  self.data = []
 
 def handle_starttag(self, tag, attrs):
  if tag == 'div':
   for k, v in attrs:
    if k == 'class' and v == 'yingping-list-wrap':
     self.__start_div_yingping = True
    elif k == 'class' and v == 'item':
     self.__start_div_item = True
    elif k == 'class' and v == 'g-clear title-wrap':
     self.__start_div_gclear = True
    elif k == 'class' and v == 'rating-wrap g-clear':
     self.__start_div_ratingwrap = True
    elif k == 'class' and v == 'num':
     self.__start_div_num = True
  elif tag == 'a':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
    self.__start_a = True
    for k, v in attrs:
     if k == 'href':
      self.__value['href'] = v
  elif tag == 'span':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
    if self.__start_div_ratingwrap:
     if self.__span_state != 1:
      for k, v in attrs:
       if k == 'class' and v == 'rating':
        self.__span_state = 1
       elif k == 'class' and v == 'time':
        self.__span_state = 2
     else:
      for k, v in attrs:
       if k == 'style':
        score_text = re.search(r'\d+', v).group()
      self.__value['score'] = int(score_text) / 20
      self.__span_state = 3
    elif self.__start_div_num:
     self.__span_state = 4
 
 def handle_endtag(self, tag):
  if tag == 'div':
   if self.__start_div_yingping:
    if self.__start_div_item:
     if self.__start_div_gclear:
      if self.__start_div_num or self.__start_div_ratingwrap:
       if self.__start_div_num:
        self.__start_div_num = False
       if self.__start_div_ratingwrap:
        self.__start_div_ratingwrap = False
      else:
       self.__start_div_gclear = False
     else:
      self.data.append(self.__value)
      self.__value = {}
      self.__start_div_item = False
    else:
     self.__start_div_yingping = False
  elif tag == 'a':
   if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear and self.__start_a:
    self.__start_a = False
  elif tag == 'span':
   self.__span_state = 0
 
 def handle_data(self, data):
  if self.__start_a:
   self.__value['title'] = data
  elif self.__span_state == 2:
   self.__value['time'] = data
  elif self.__span_state == 4:
   score_text = re.search(r'\d+', data).group()
   self.__value['people'] = int(score_text)
  pass
def html_parser(html):
 parser = CommentHTMLParser()
 parser.feed(html)
 return parser.data

關于Python3 中怎么解析html問題的解答就分享到這里了,希望以上內容可以對大家有一定的幫助,如果你還有很多疑惑沒有解開,可以關注億速云行業資訊頻道了解更多相關知識。

向AI問一下細節

免責聲明:本站發布的內容(圖片、視頻和文字)以原創、轉載和分享為主,文章觀點不代表本網站立場,如果涉及侵權請聯系站長郵箱:is@yisu.com進行舉報,并提供相關證據,一經查實,將立刻刪除涉嫌侵權內容。

AI

徐水县| 汉阴县| 通榆县| 三都| 图片| 六枝特区| 黄大仙区| 麟游县| 佛坪县| 西峡县| 大田县| 巩留县| 利津县| 阿鲁科尔沁旗| 安丘市| 宣武区| 禹州市| 乐亭县| 灌云县| 通城县| 芦山县| 大庆市| 鄄城县| 保山市| 兰州市| 正宁县| 秦安县| 洪雅县| 江孜县| 丹寨县| 临沭县| 淅川县| 宁安市| 广平县| 揭阳市| 蚌埠市| 改则县| 丰镇市| 清河县| 同德县| 南丰县|