千家信息网

Python3中如何解析html

发表于:2025-01-23 作者:千家信息网编辑
千家信息网最后更新 2025年01月23日,这篇"Python3中如何解析html"文章的知识点大部分人都不太理解,所以小编给大家总结了以下内容,内容详细,步骤清晰,具有一定的借鉴价值,希望大家阅读完这篇文章能有所收获,下面我们一起来看看这篇"
千家信息网最后更新 2025年01月23日Python3中如何解析html

这篇"Python3中如何解析html"文章的知识点大部分人都不太理解,所以小编给大家总结了以下内容,内容详细,步骤清晰,具有一定的借鉴价值,希望大家阅读完这篇文章能有所收获,下面我们一起来看看这篇"Python3中如何解析html"文章吧。

辅助函数,主要用于获取html并输出解析后的结果

# Pass the parsing function in as an argument so it is easy to swap below.
def get_html(url, paraser=bs4_paraser):
    """Download ``url`` and return the result of running ``paraser`` on it.

    Args:
        url: Address of the page to fetch.
        paraser: Callable that receives the downloaded body (raw bytes,
            gunzipped when the server compressed it) and returns the parsed
            result. The default, ``bs4_paraser``, is evaluated when this
            ``def`` statement runs, so it must already be defined by then.

    Returns:
        Whatever ``paraser`` returns when the server answers with status
        200, otherwise ``None``.
    """
    import contextlib
    import gzip
    import io
    try:  # Python 3
        from urllib.request import Request, urlopen
    except ImportError:  # Python 2
        from urllib2 import Request, urlopen

    headers = {
        'Accept': '*/*',
        # Only advertise gzip: it is the only content encoding decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # No explicit 'Host' header: urllib derives it from ``url``, whereas a
        # hard-coded host would be wrong for every other site.
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }
    request = Request(url, headers=headers)
    # closing() guarantees the connection is released on every code path.
    with contextlib.closing(urlopen(request)) as response:
        if response.getcode() != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding') or ''

    # Gunzip only when the server says the body is gzip-compressed; feeding
    # an uncompressed body to GzipFile would raise.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=io.BytesIO(body)).read()
    return paraser(body)


# Example usage: fetch one film page and print every parsed review.
# These statements must run after the parser functions have been defined.
value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 answer, so fall back to an empty list.
for row in value or []:
    print(row)

1,使用lxml(etree.HTML)的方式进行解析:

def lxml_parser(page):
    """Extract the film reviews from ``page`` with lxml XPath queries.

    Args:
        page: HTML document; it is handed directly to ``etree.HTML``.

    Returns:
        A list with one dict per review item. Each dict has the keys
        ``title``, ``title_href``, ``score``, ``time`` and ``people``.

    Raises:
        IndexError: If a review item lacks one of the expected nodes.
        AttributeError: If the style value or the like-count text contains
            no digits.
    """
    data = []
    doc = etree.HTML(page)
    for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every div.item below the wrapper is one review.
        for item in wrap.xpath('.//div[@class="item"]'):
            # Title block of the review.
            title = item.xpath('.//div[@class="g-clear title-wrap"][1]')[0]
            review = {
                'title': title.xpath('./a/text()')[0],
                'title_href': title.xpath('./a/@href')[0],
            }

            # The score is the first number in the inner span's style value,
            # divided by 20. Floor division keeps it an integer on both
            # Python 2 and Python 3.
            style = title.xpath('./div/span/span/@style')[0]
            review['score'] = int(re.search(r'\d+', style).group()) // 20

            # Publication time.
            review['time'] = title.xpath('./div/span[@class="time"]/text()')[0]

            # Number of people who liked the review.
            likes_text = title.xpath('./div[@class="num"]/span/text()')[0]
            review['people'] = int(re.search(r'\d+', likes_text).group())

            data.append(review)
    return data

2,使用BeautifulSoup,不多说了,大家网上找资料看看

def bs4_paraser(html):
    """Extract the film reviews from ``html`` with BeautifulSoup.

    Args:
        html: HTML document; it is handed to ``BeautifulSoup`` together with
            the ``'html.parser'`` backend.

    Returns:
        A list with one dict per ``div.item`` found in the first
        ``div.yingping-list-wrap``. A dict carries the keys ``title``,
        ``title_href``, ``score``, ``time`` and ``people``; an item that has
        no title block contributes an empty dict.
    """
    all_value = []
    soup = BeautifulSoup(html, 'html.parser')
    # Review section of the page (only the first match is used).
    wrappers = soup.find_all('div', attrs={'class': 'yingping-list-wrap'}, limit=1)
    for wrapper in wrappers:
        # Every div.item inside the wrapper is one review.
        for item in wrapper.find_all('div', attrs={'class': 'item'}):
            value = {}
            # Title block of the review.
            titles = item.find_all('div', attrs={'class': 'g-clear title-wrap'}, limit=1)
            if titles:
                title = titles[0]
                value['title'] = title.a.string
                value['title_href'] = title.a['href']

                # The score is the first number in the inner span's style
                # value, divided by 20. Floor division keeps it an integer
                # on both Python 2 and Python 3.
                style = title.div.span.span['style']
                value['score'] = int(re.search(r'\d+', style).group()) // 20

                # Publication time.
                value['time'] = title.div.find_all('span', attrs={'class': 'time'})[0].string

                # Number of people who liked the review.
                likes_text = title.find_all('div', attrs={'class': 'num'})[0].span.string
                value['people'] = int(re.search(r'\d+', likes_text).group())
            all_value.append(value)
    return all_value

3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法(哈哈)。注意:sgmllib模块在Python 3中已被移除,该方法只能在Python 2下运行。

class CommentParaser(SGMLParser):
    """Event-driven review extractor built on ``SGMLParser``.

    The parser tracks which of the relevant ``<div>`` blocks are open
    (``yingping-list-wrap`` > ``item`` > ``g-clear title-wrap`` >
    ``rating-wrap g-clear`` / ``num``) and fills one dict per review with the
    keys ``href``, ``title``, ``score``, ``time`` and ``people``. Finished
    reviews are collected in ``self.data``.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # Flags telling which of the tracked <div> blocks are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the <a> of the title block.
        self.__start_a = False
        # Current <span> context: 0 = none, 1 = inside span.rating,
        # 2 = inside span.time, 3 = nested score span handled,
        # 4 = inside a span of div.num.
        self.__span_state = 0
        # Review currently being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def _in_title_block(self):
        """Return a true value while inside wrapper > item > title block."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def start_div(self, attrs):
        """Record which tracked block an opening ``<div>`` belongs to."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Close the innermost tracked block; store the review when an item ends."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself is closing: the review is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Remember the link target of the title anchor."""
        if not self._in_title_block():
            return
        self.__start_a = True
        for k, v in attrs:
            if k == 'href':
                self.__value['href'] = v

    def end_a(self):
        """Leave the title anchor."""
        if self._in_title_block() and self.__start_a:
            self.__start_a = False

    def start_span(self, attrs):
        """Classify an opening ``<span>`` and read the score from its style."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested in span.rating: its style value holds the score.
                # A missing style or one without digits leaves the score
                # unset instead of raising.
                for k, v in attrs:
                    if k == 'style':
                        match = re.search(r'\d+', v or '')
                        if match:
                            # Floor division keeps the score an integer on
                            # both Python 2 and Python 3.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Reset the span context."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text that belongs to the title, the time or the like count."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits (for example whitespace) is ignored.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())


def sgl_parser(html):
    """Parse ``html`` with ``CommentParaser`` and return the collected reviews.

    Args:
        html: HTML document to feed to the parser.

    Returns:
        The parser's ``data`` list: one dict per review item.
    """
    parser = CommentParaser()
    parser.feed(html)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.data

4,HTMLParser,与3原理相似,就是调用的方法不太一样,基本上可以共用:

class CommentHTMLParser(HTMLParser.HTMLParser):
    """Event-driven review extractor built on ``HTMLParser``.

    The parser tracks which of the relevant ``<div>`` blocks are open
    (``yingping-list-wrap`` > ``item`` > ``g-clear title-wrap`` >
    ``rating-wrap g-clear`` / ``num``) and fills one dict per review with the
    keys ``href``, ``title``, ``score``, ``time`` and ``people``. Finished
    reviews are collected in ``self.data``.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # Flags telling which of the tracked <div> blocks are currently open.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside the <a> of the title block.
        self.__start_a = False
        # Current <span> context: 0 = none, 1 = inside span.rating,
        # 2 = inside span.time, 3 = nested score span handled,
        # 4 = inside a span of div.num.
        self.__span_state = 0
        # Review currently being assembled, and the finished reviews.
        self.__value = {}
        self.data = []

    def _in_title_block(self):
        """Return a true value while inside wrapper > item > title block."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def _open_div(self, attrs):
        """Record which tracked block an opening ``<div>`` belongs to."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def _open_anchor(self, attrs):
        """Remember the link target of the title anchor."""
        if not self._in_title_block():
            return
        self.__start_a = True
        for k, v in attrs:
            if k == 'href':
                self.__value['href'] = v

    def _open_span(self, attrs):
        """Classify an opening ``<span>`` and read the score from its style."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # Span nested in span.rating: its style value holds the score.
                # A missing style or one without digits leaves the score
                # unset instead of raising.
                for k, v in attrs:
                    if k == 'style':
                        match = re.search(r'\d+', v or '')
                        if match:
                            # Floor division keeps the score an integer on
                            # both Python 2 and Python 3.
                            self.__value['score'] = int(match.group()) // 20
                self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def _close_div(self):
        """Close the innermost tracked block; store the review when an item ends."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item itself is closing: the review is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def handle_starttag(self, tag, attrs):
        """Dispatch opening ``div``, ``a`` and ``span`` tags."""
        if tag == 'div':
            self._open_div(attrs)
        elif tag == 'a':
            self._open_anchor(attrs)
        elif tag == 'span':
            self._open_span(attrs)

    def handle_endtag(self, tag):
        """Dispatch closing ``div``, ``a`` and ``span`` tags."""
        if tag == 'div':
            self._close_div()
        elif tag == 'a':
            if self._in_title_block() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text that belongs to the title, the time or the like count."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            # Text without digits (for example whitespace) is ignored.
            match = re.search(r'\d+', data)
            if match:
                self.__value['people'] = int(match.group())


def html_parser(html):
    """Parse ``html`` with ``CommentHTMLParser`` and return the collected reviews.

    Args:
        html: HTML document to feed to the parser.

    Returns:
        The parser's ``data`` list: one dict per review item.
    """
    parser = CommentHTMLParser()
    parser.feed(html)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    return parser.data

以上就是关于"Python3中如何解析html"这篇文章的内容,相信大家都有了一定的了解,希望小编分享的内容对大家有帮助,若想了解更多相关的知识内容,请关注行业资讯频道。

0