Python3中如何解析html
发表于:2025-01-23 作者:千家信息网编辑
千家信息网最后更新 2025年01月23日,这篇"Python3中如何解析html"文章的知识点大部分人都不太理解,所以小编给大家总结了以下内容,内容详细,步骤清晰,具有一定的借鉴价值,希望大家阅读完这篇文章能有所收获,下面我们一起来看看这篇"Python3中如何解析html"文章吧。
千家信息网最后更新 2025年01月23日 Python3中如何解析html
这篇"Python3中如何解析html"文章的知识点大部分人都不太理解,所以小编给大家总结了以下内容,内容详细,步骤清晰,具有一定的借鉴价值,希望大家阅读完这篇文章能有所收获,下面我们一起来看看这篇"Python3中如何解析html"文章吧。
辅助函数,主要用于获取html并输出解析后的结果
# Take the parser as an argument so the parsing strategy can be swapped below.
def get_html(url, paraser=bs4_paraser):
    """Fetch *url* and return ``paraser`` applied to the response body.

    Args:
        url: Address of the page to download.
        paraser: Callable invoked with the downloaded body (gunzipped when
            the server compressed it); its result is passed straight through.

    Returns:
        The value returned by ``paraser``, or ``None`` when the response
        status code is not 200.
    """
    headers = {
        'Accept': '*/*',
        # Only advertise gzip: it is the only content encoding decoded below.
        'Accept-Encoding': 'gzip',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        # NOTE(review): Host is fixed to this site, so URLs on other hosts
        # are requested with a mismatched Host header.
        'Host': 'www.360kan.com',
        'Proxy-Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }
    request = urllib2.Request(url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        if response.code != 200:
            return None
        body = response.read()
        content_encoding = response.info().get('Content-Encoding', '') or ''
    finally:
        # Release the connection whether or not reading succeeded.
        response.close()

    # The server is free to ignore Accept-Encoding and answer uncompressed;
    # gunzip only when it declares that the body is gzip-compressed.
    if 'gzip' in content_encoding.lower():
        body = gzip.GzipFile(fileobj=StringIO.StringIO(body)).read()
    return paraser(body)


value = get_html('http://www.360kan.com/m/haPkY0osd0r5UB.html', paraser=lxml_parser)
# get_html returns None for a non-200 response, which must not be iterated.
for row in value or []:
    print(row)
1,lxml.html的方式进行解析,
def lxml_parser(page):
    """Extract the review entries from *page* with lxml XPath queries.

    Args:
        page: HTML source handed to ``etree.HTML``.

    Returns:
        A list holding one dict per ``div.item`` found inside a
        ``div.yingping-list-wrap``. When the item has a title block the dict
        carries the keys 'title', 'title_href', 'score', 'time' and 'people';
        when it has none the dict is left empty instead of raising IndexError.
    """
    data = []
    doc = etree.HTML(page)
    for wrap in doc.xpath('//div[@class="yingping-list-wrap"]'):
        # Every div.item is one review.
        for item in wrap.xpath('.//div[@class="item"]'):
            value = {}
            # Title section of the review.
            title_nodes = item.xpath('.//div[@class="g-clear title-wrap"][1]')
            if title_nodes:
                title = title_nodes[0]
                value['title'] = title.xpath('./a/text()')[0]
                value['title_href'] = title.xpath('./a/@href')[0]
                # The score is the first number in the inner span's style
                # attribute, divided by 20.
                # NOTE(review): "/" floors under Python 2 and is true division
                # under Python 3 - confirm which result is intended.
                style = title.xpath('./div/span/span/@style')[0]
                value['score'] = int(re.search(r'\d+', style).group()) / 20
                # Timestamp of the review.
                value['time'] = title.xpath('./div/span[@class="time"]/text()')[0]
                # Number of people who liked the review.
                likes = title.xpath('./div[@class="num"]/span/text()')[0]
                value['people'] = int(re.search(r'\d+', likes).group())
            data.append(value)
    return data
2,使用BeautifulSoup,不多说了,大家网上找资料看看
def bs4_paraser(html):
    """Extract the review entries from *html* with BeautifulSoup.

    Args:
        html: HTML source handed to ``BeautifulSoup`` with the
            ``'html.parser'`` backend.

    Returns:
        A list with one dict per ``div.item`` inside the first
        ``div.yingping-list-wrap``. Items that have a title block carry the
        keys 'title', 'title_href', 'score', 'time' and 'people'; items
        without one contribute an empty dict.
    """
    reviews = []
    soup = BeautifulSoup(html, 'html.parser')

    # Only the first review-list container is inspected.
    container = soup.find('div', attrs={'class': 'yingping-list-wrap'})
    if container is None:
        return reviews

    # Every div.item is one review.
    for item in container.find_all('div', attrs={'class': 'item'}):
        entry = {}
        # Title section of the review.
        heading = item.find('div', attrs={'class': 'g-clear title-wrap'})
        if heading is not None:
            entry['title'] = heading.a.string
            entry['title_href'] = heading.a['href']
            # The score is the first number in the nested span's style
            # attribute, divided by 20.
            style = heading.div.span.span['style']
            digits = re.search(r'\d+', style).group()
            entry['score'] = int(digits) / 20
            # Timestamp of the review.
            time_spans = heading.div.find_all('span', attrs={'class': 'time'})
            entry['time'] = time_spans[0].string
            # Number of people who liked the review.
            num_divs = heading.find_all('div', attrs={'class': 'num'})
            entry['people'] = int(re.search(r'\d+', num_divs[0].span.string).group())
        reviews.append(entry)
    return reviews
3,使用SGMLParser,主要是通过start、end tag的方式进行解析,解析过程比较明朗,但是有点麻烦,而且该案例的场景不太适合该方法(哈哈)。注意:SGMLParser来自Python 2的sgmllib模块,Python 3中已移除该模块。
class CommentParaser(SGMLParser):
    """Collect review entries through SGMLParser start/end tag callbacks.

    Finished entries are appended to ``self.data`` as dicts that may hold the
    keys 'href', 'title', 'score', 'time' and 'people'.
    """

    def __init__(self):
        SGMLParser.__init__(self)
        # One flag per tracked <div> class; set when such a div is opened.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # Which <span> is open: 0 none, 1 rating, 2 time, 3 score, 4 like count.
        self.__span_state = 0
        # Entry currently being filled, and the list of finished entries.
        self.__value = {}
        self.data = []

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None."""
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else None

    def start_div(self, attrs):
        """Flag the tracked container whose class matches the opened div."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def end_div(self):
        """Close the innermost tracked container that is still flagged open."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item div itself is closing: the entry is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def start_a(self, attrs):
        """Record the href of an anchor opened inside the title block."""
        if self.__start_div_yingping and self.__start_div_item and self.__start_div_gclear:
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def end_a(self):
        """Leave the title anchor."""
        if (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear and self.__start_a):
            self.__start_a = False

    def start_span(self, attrs):
        """Update the span state and read the score from the style attribute."""
        if not (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear):
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # A span nested inside the rating span carries the score.
                for k, v in attrs:
                    if k == 'style':
                        number = self._first_int(v)
                        # A style without digits no longer raises.
                        if number is not None:
                            self.__value['score'] = number / 20
                        self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def end_span(self):
        """Any closing span resets the span state."""
        self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            count = self._first_int(data)
            # Text without digits is skipped rather than raising.
            if count is not None:
                self.__value['people'] = count


def sgl_parser(html):
    """Feed *html* to a fresh CommentParaser and return its collected data."""
    parser = CommentParaser()
    parser.feed(html)
    return parser.data
4,HTMLParser,与3原理相似,只是调用的方法不太一样,基本上可以共用。
class CommentHTMLParser(HTMLParser.HTMLParser):
    """Collect review entries through HTMLParser tag callbacks.

    Finished entries are appended to ``self.data`` as dicts that may hold the
    keys 'href', 'title', 'score', 'time' and 'people'.
    """

    def __init__(self):
        HTMLParser.HTMLParser.__init__(self)
        # One flag per tracked <div> class; set when such a div is opened.
        self.__start_div_yingping = False
        self.__start_div_item = False
        self.__start_div_gclear = False
        self.__start_div_ratingwrap = False
        self.__start_div_num = False
        # True while inside an <a> of the title block.
        self.__start_a = False
        # Which <span> is open: 0 none, 1 rating, 2 time, 3 score, 4 like count.
        self.__span_state = 0
        # Entry currently being filled, and the list of finished entries.
        self.__value = {}
        self.data = []

    @staticmethod
    def _first_int(text):
        """Return the first run of digits in *text* as an int, or None."""
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else None

    def _in_title_block(self):
        """Return True while the list, item and title-wrap divs are all open."""
        return (self.__start_div_yingping and self.__start_div_item
                and self.__start_div_gclear)

    def _open_div(self, attrs):
        """Flag the tracked container whose class matches the opened div."""
        for k, v in attrs:
            if k != 'class':
                continue
            if v == 'yingping-list-wrap':
                self.__start_div_yingping = True
            elif v == 'item':
                self.__start_div_item = True
            elif v == 'g-clear title-wrap':
                self.__start_div_gclear = True
            elif v == 'rating-wrap g-clear':
                self.__start_div_ratingwrap = True
            elif v == 'num':
                self.__start_div_num = True

    def _open_anchor(self, attrs):
        """Record the href of an anchor opened inside the title block."""
        if self._in_title_block():
            self.__start_a = True
            for k, v in attrs:
                if k == 'href':
                    self.__value['href'] = v

    def _open_span(self, attrs):
        """Update the span state and read the score from the style attribute."""
        if not self._in_title_block():
            return
        if self.__start_div_ratingwrap:
            if self.__span_state != 1:
                for k, v in attrs:
                    if k == 'class' and v == 'rating':
                        self.__span_state = 1
                    elif k == 'class' and v == 'time':
                        self.__span_state = 2
            else:
                # A span nested inside the rating span carries the score.
                for k, v in attrs:
                    if k == 'style':
                        number = self._first_int(v)
                        # A style without digits (or without a value) no
                        # longer raises.
                        if number is not None:
                            self.__value['score'] = number / 20
                        self.__span_state = 3
        elif self.__start_div_num:
            self.__span_state = 4

    def _close_div(self):
        """Close the innermost tracked container that is still flagged open."""
        if not self.__start_div_yingping:
            return
        if not self.__start_div_item:
            self.__start_div_yingping = False
            return
        if not self.__start_div_gclear:
            # The item div itself is closing: the entry is complete.
            self.data.append(self.__value)
            self.__value = {}
            self.__start_div_item = False
            return
        if self.__start_div_num or self.__start_div_ratingwrap:
            self.__start_div_num = False
            self.__start_div_ratingwrap = False
        else:
            self.__start_div_gclear = False

    def handle_starttag(self, tag, attrs):
        """Dispatch an opening div, a or span tag to its handler."""
        if tag == 'div':
            self._open_div(attrs)
        elif tag == 'a':
            self._open_anchor(attrs)
        elif tag == 'span':
            self._open_span(attrs)

    def handle_endtag(self, tag):
        """Update the container, anchor or span state for a closing tag."""
        if tag == 'div':
            self._close_div()
        elif tag == 'a':
            if self._in_title_block() and self.__start_a:
                self.__start_a = False
        elif tag == 'span':
            self.__span_state = 0

    def handle_data(self, data):
        """Store text as title, time or like count depending on the state."""
        if self.__start_a:
            self.__value['title'] = data
        elif self.__span_state == 2:
            self.__value['time'] = data
        elif self.__span_state == 4:
            count = self._first_int(data)
            # Text without digits is skipped rather than raising.
            if count is not None:
                self.__value['people'] = count


def html_parser(html):
    """Feed *html* to a fresh CommentHTMLParser and return its collected data."""
    parser = CommentHTMLParser()
    parser.feed(html)
    return parser.data
以上就是关于"Python3中如何解析html"这篇文章的内容,相信大家都有了一定的了解,希望小编分享的内容对大家有帮助,若想了解更多相关的知识内容,请关注行业资讯频道。
影评
内容
部分
函数
就是
数据
文章
方式
方法
时间
标题
状态
知识
篇文章
明朗
价值
原理
场景
大部分
工程
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
服务器系统升级通知
获取数据库的年龄
数据库 扩展表
主流网络技术有几种
数据库属性名添加注视
方舟生存进化服务器炸服两天
广州三途网络技术有限公司
数据库sc的意思
sql 服务器时间
西北工业大学网络安全学院的电话
龙岩服务器
智慧电梯软件开发价位
文山计算机网络技术培训
护苗网络安全课观后感百度
网络安全教育知识 小学生
服务器ip登不上管理口
美国服务器租用 选择
苹果手机丢失数据库
网络服务器架设价格
网无法连接服务器失败
完善网络安全体系
手机软件开发网
如果服务器对上传的压缩包
武汉大学网络安全法学院
软件开发金融信息化
yjk的数据库文件
senecadata监控服务器
网络安全的主要侧重点是什么
计算机网络技术实训教案
徐州软件开发技术项目实训中心