How to crawl Weibo hot searches with the Scrapy framework in Python
Published: 2025-02-01  Author: 千家信息网 editor
This article shares how to use the Scrapy framework in Python to crawl Weibo hot searches. It is quite practical, so it is shared here for reference; let's walk through it together.
Main features implemented:
0. As you would expect, the various anti-crawling measures are bypassed.
1. Crawl the main content of every hot search.
2. Crawl the related Weibo posts for each hot search.
3. Crawl the comments on each related post, together with detailed information about the commenting users.
4. Automatic translation is implemented. In theory any detail related to a hot search can be collected, but the data volume is fairly large, so optimizing this crawler with a database is recommended (at the time I had not learned databases yet, so the data is simply stored locally in a fixed format; a minimal SQLite sketch follows pipelines.py below).
(Features not yet implemented):
Build a social network from the crawled data: with some Python data analysis, the crawled users could be linked into a social graph (a minimal sketch follows below).
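As a rough illustration of that unimplemented idea, the sketch below assumes the crawled items have already been loaded into memory with the author and user fields produced by the spider (the loading step and the demo data are assumptions, not part of the original project), and links every commenter to the post author with networkx:

# Minimal sketch (assumption): build a commenter-to-author graph with networkx
# from items shaped like {'author': ..., 'user': [[username, usertime, usercontent], ...]}.
import networkx as nx

def build_social_graph(items):
    """items: iterable of dicts with 'author' and 'user' as produced by the spider."""
    G = nx.Graph()
    for it in items:
        author = it['author']
        for username, _usertime, _usercontent in it['user']:
            # An edge means "this user commented on a post by this author".
            G.add_edge(username, author)
    return G

# Example usage with made-up data:
if __name__ == '__main__':
    demo = [{'author': 'A', 'user': [['u1', 't1', ['hi']], ['u2', 't2', ['hello']]]}]
    G = build_social_graph(demo)
    print(nx.degree_centrality(G))  # which accounts sit at the centre of the discussion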
Project structure:
weibo.py
Crawls the required data; the callbacks parse each response and hand the result to an item, which is then passed to the pipeline for processing, including persisting the data.
import scrapy
from copy import deepcopy
from time import sleep
import json
from lxml import etree
import re


class WeiboSpider(scrapy.Spider):
    name = 'weibo'
    start_urls = ['https://s.weibo.com/top/summary?Refer=top_hot&topnav=1&wvr=6']
    home_page = "https://s.weibo.com/"

    # Send the first request with cookies attached
    def start_requests(self):
        cookies = ""  # paste your own cookie string here
        cookies = {i.split("=")[0]: i.split("=")[1] for i in cookies.split("; ")}
        yield scrapy.Request(
            self.start_urls[0],
            callback=self.parse,
            cookies=cookies
        )

    # Parse the hot searches and their links
    def parse(self, response, **kwargs):
        page_text = response.text
        with open('first.html', 'w', encoding='utf-8') as fp:
            fp.write(page_text)
        item = {}
        tr = response.xpath('//*[@id="pl_top_realtimehot"]/table//tr')[1:]
        for t in tr:
            item['title'] = t.xpath('./td[2]//text()').extract()[1]
            print('title : ', item['title'])
            detail_url = self.home_page + t.xpath('./td[2]//@href').extract_first()
            item['href'] = detail_url
            print("href:", item['href'])
            yield scrapy.Request(detail_url, callback=self.parse_item,
                                 meta={'item': deepcopy(item)})
            sleep(3)

    # Parse the front-page posts under each hot search
    def parse_item(self, response, **kwargs):
        item = response.meta['item']
        div_list = response.xpath('//div[@id="pl_feedlist_index"]//div[@class="card-wrap"]')[1:]
        # A text file named after the title stores this hot search (the writing happens in the pipeline)
        name = item['title']
        file_path = './' + name
        for div in div_list:
            author = div.xpath('.//div[@class="info"]/div[2]/a/@nick-name').extract_first()
            brief_con = div.xpath('.//p[@node-type="feed_list_content_full"]//text()').extract()
            if not brief_con:  # extract() returns a list, so test for emptiness rather than None
                brief_con = div.xpath('.//p[@class="txt"]//text()').extract()
            brief_con = ''.join(brief_con)
            print("brief_con : ", brief_con)
            link = div.xpath('.//p[@class="from"]/a/@href').extract_first()
            if author is None or link is None:
                continue
            link = "https:" + link + '_&type=comment'
            news_id = div.xpath('./@mid').extract_first()
            print("news_id : ", news_id)
            news_time = div.xpath(".//p[@class='from']/a/text()").extract()
            news_time = ''.join(news_time)
            print("news_time:", news_time)
            print("author:", author)
            item['author'] = author
            item['news_id'] = news_id
            item['news_time'] = news_time
            item['brief_con'] = brief_con
            item['details_url'] = link
            # Comment JSON URL template:
            # https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4577307216321742&from=singleWeiBo
            link = "https://weibo.com/aj/v6/comment/big?ajwvr=6&id=" + news_id + "&from=singleWeiBo"
            yield scrapy.Request(link, callback=self.parse_detail,
                                 meta={'item': deepcopy(item)})

    # Parse the details and comments of each post
    # Detail page:  https://weibo.com/1649173367/JwjbPDW00?refer_flag=1001030103__&type=comment
    # JSON payload: https://weibo.com/aj/v6/comment/big?ajwvr=6&id=4577307216321742&from=singleWeiBo&__rnd=1606879908312
    def parse_detail(self, response, **kwargs):
        item = response.meta['item']
        comment_html = json.loads(response.text)['data']['html']
        with open('3.html', 'w', encoding='utf-8') as fp:
            fp.write(comment_html)
        tree = etree.HTML(comment_html)
        # Alternative: each comment is preceded by a Chinese colon, so a regex such as
        # re.findall(r':(.*?)<', comment_html) also works well here.
        div_lists = tree.xpath('.//div[@class="list_con"]')
        final_lists = []
        for div in div_lists:
            user_record = []
            username = div.xpath('./div[@class="WB_text"]/a[1]/text()')[0]
            usertime = div.xpath('.//div[@class="WB_from S_txt2"]/text()')[0]
            usercontent = div.xpath('./div[@class="WB_text"]/text()')
            item['username'] = username
            item['usertime'] = usertime
            item['usercontent'] = usercontent
            user_record.append(username)
            user_record.append(usertime)
            user_record.append(usercontent)
            final_lists.append(user_record)
        item['user'] = final_lists
        yield item
items.py
The parsed fields are defined here and then handed over to the pipeline for processing.
import scrapy


class WeiboproItem(scrapy.Item):
    # Hot-search title
    title = scrapy.Field()
    # Link to the hot search
    href = scrapy.Field()
    # Author of each related post
    author = scrapy.Field()
    # Time of each related post
    news_time = scrapy.Field()
    # Content of each related post
    brief_con = scrapy.Field()
    # Detail-page link of each related post
    details_url = scrapy.Field()
    # Detail-page ID, needed to fetch the comment JSON
    news_id = scrapy.Field()
    # Comment author on the post's detail page
    username = scrapy.Field()
    # Comment time on the post's detail page
    usertime = scrapy.Field()
    # Comment text on the post's detail page
    usercontent = scrapy.Field()
    # All comments and commenters
    user = scrapy.Field()
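One detail worth noting: the spider above actually fills a plain dict rather than this item class. Scrapy accepts both, but using the declared item catches misspelled field names early. A minimal sketch of the difference (the example values are made up):

# Minimal sketch (assumption): filling the declared item instead of a plain dict.
from weiboPro.items import WeiboproItem

item = WeiboproItem()
item['title'] = 'some hot search'          # declared field: fine
item['href'] = 'https://s.weibo.com/...'   # declared field: fine
# item['titel'] = '...'                    # typo: raises KeyError; a plain dict would silently accept it
print(dict(item))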
middlewares.py
Middleware that handles the traffic between the spider and the server; here it attaches the cookie and a random User-Agent to each request.
import random


# Custom downloader middleware for the Weibo requests
class WeiboproDownloaderMiddleware(object):

    def process_request(self, request, spider):
        # Attach the cookie (paste your own cookie string here)
        cookies = ""
        cookies = {i.split("=")[0]: i.split("=")[1] for i in cookies.split("; ")}
        request.cookies = cookies
        # Attach a random User-Agent from the pool defined in settings.py
        ua = random.choice(spider.settings.get("USER_AGENT_LIST"))
        request.headers["User-Agent"] = ua
        return None
pipelines.py
Persists each item: every hot search gets its own text file, to which the related posts and their comments are appended.

from itemadapter import ItemAdapter


class WeiboproPipeline:
    fp = None

    def open_spider(self, spider):
        print("starting...")

    def process_item(self, item, spider):
        title = item['title']
        href = item['href']
        author = item['author']
        news_time = item['news_time']
        brief_con = item['brief_con']
        details_url = item['details_url']
        news_id = item['news_id']
        user = item['user']
        filepath = './' + title + '.txt'
        with open(filepath, 'a', encoding='utf-8') as fp:
            fp.write('title:\n' + title + '\n' +
                     'href:\n' + href + '\n' +
                     'author:\n' + author + '\n' +
                     'news_time:\n' + news_time + '\n' +
                     'brief_con:\n' + brief_con + '\n' +
                     'details_url:\n' + details_url + '\n' +
                     'news_id:\n' + news_id + '\n')
            for u in user:
                # u is [username, usertime, usercontent]; usercontent is a list of text nodes
                fp.write('username:' + u[0] + '\n' + u[1] + '\n' +
                         'usercontent:\n' + ''.join(u[2]) + '\n\n\n')
            fp.write('---------------------------------------------------------\n')
        return item
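As mentioned in the feature list, a database is a better fit than per-title text files once the data volume grows. The sketch below is a minimal, hypothetical alternative pipeline based on Python's built-in sqlite3 module; the table layout and database file name are assumptions, not part of the original project.

# Minimal sketch (assumption): a SQLite-backed pipeline as an alternative to the
# text-file storage above. Table layout and database name are made up for illustration.
import json
import sqlite3


class WeiboSqlitePipeline:

    def open_spider(self, spider):
        self.conn = sqlite3.connect('weibo.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS hot_news ('
            'title TEXT, href TEXT, author TEXT, news_time TEXT, '
            'brief_con TEXT, details_url TEXT, news_id TEXT, user TEXT)'
        )

    def process_item(self, item, spider):
        # Store the nested comment list as a JSON string in a single column.
        self.conn.execute(
            'INSERT INTO hot_news VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
            (item['title'], item['href'], item['author'], item['news_time'],
             item['brief_con'], item['details_url'], item['news_id'],
             json.dumps(item['user'], ensure_ascii=False))
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

To try it, it would be registered under ITEM_PIPELINES in settings.py in place of (or alongside) WeiboproPipeline.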
settings.py
Configures the spider's properties: a pool of browser request headers has been added here, and the number of concurrent requests, crawl rate and so on can be set, giving the spider a stronger defence against anti-crawling measures.
# Scrapy settings for the weiboPro project
#
# For simplicity, this file contains only the settings considered important or
# commonly used. You can find more settings in the documentation:
#     https://docs.scrapy.org/en/latest/topics/settings.html
#     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://docs.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'weiboPro'

SPIDER_MODULES = ['weiboPro.spiders']
NEWSPIDER_MODULE = 'weiboPro.spiders'

MEDIA_ALLOW_REDIRECTS = True

# Pool of User-Agent strings; the downloader middleware picks one at random per request
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    # Firefox
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    # Safari
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
    # Chrome
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.71 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16",
    # 360 Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
    # Taobao Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
    # Liebao (Cheetah) Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
    # QQ Browser
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    # Sogou Browser
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
    # Maxthon Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
    # UC Browser
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36"
]

LOG_LEVEL = 'ERROR'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests to the same website (default: 0)
# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
#DOWNLOAD_DELAY = 3

# Enable the downloader middleware defined in middlewares.py
DOWNLOADER_MIDDLEWARES = {
    'weiboPro.middlewares.WeiboproDownloaderMiddleware': 543,
}

# Enable the item pipeline defined in pipelines.py
ITEM_PIPELINES = {
    'weiboPro.pipelines.WeiboproPipeline': 300,
}
scrapy.cfg
The project configuration file; there is nothing special to add here.
[settings]
default = weiboPro.settings

[deploy]
#url = http://localhost:6800/
project = weiboPro
The remaining two __init__.py files can be left empty; they are not used.
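With all the files in place, the crawler is started from the project root with the standard Scrapy command line, e.g. scrapy crawl weibo (the spider name defined in weibo.py); the pipeline then appends each hot search and its comments to a text file named after the hot-search title.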
Thanks for reading! That is all for "How to crawl Weibo hot searches with the Scrapy framework in Python". I hope the content above is helpful and that you learned something new; if you found the article useful, feel free to share it so more people can see it!