python中scrapy爬虫的示例分析
发表于:2025-01-18 作者:千家信息网编辑
千家信息网最后更新 2025年01月18日,这篇文章主要为大家展示了"python中scrapy爬虫的示例分析",内容简而易懂,条理清晰,希望能够帮助大家解决疑惑,下面让小编带领大家一起研究并学习一下"python中scrapy爬虫的示例分析"
千家信息网最后更新 2025年01月18日python中scrapy爬虫的示例分析
这篇文章主要为大家展示了"python中scrapy爬虫的示例分析",内容简而易懂,条理清晰,希望能够帮助大家解决疑惑,下面让小编带领大家一起研究并学习一下"python中scrapy爬虫的示例分析"这篇文章吧。
具体如下。
例程1: douban
目录树
douban--douban --spiders --__init__.py --bookspider.py --douban_comment_spider.py --doumailspider.py --__init__.py --items.py --pipelines.py --settings.py--scrapy.cfg
-spiders-init.py
# This package will contain the spiders of your Scrapy project## Please refer to the documentation for information on how to create and manage# your spiders.
bookspider.py
# -*- coding:utf-8 -*-'''by sudo rm -rf http://imchenkun.com'''import scrapyfrom douban.items import DoubanBookItemclass BookSpider(scrapy.Spider): name = 'douban-book' allowed_domains = ['douban.com'] start_urls = [ 'https://book.douban.com/top250' ] def parse(self, response): # 请求第一页 yield scrapy.Request(response.url, callback=self.parse_next) # 请求其它页 for page in response.xpath('//div[@class="paginator"]/a'): link = page.xpath('@href').extract()[0] yield scrapy.Request(link, callback=self.parse_next) def parse_next(self, response): for item in response.xpath('//tr[@class="item"]'): book = DoubanBookItem() book['name'] = item.xpath('td[2]/div[1]/a/@title').extract()[0] book['content'] = item.xpath('td[2]/p/text()').extract()[0] book['ratings'] = item.xpath('td[2]/div[2]/span[2]/text()').extract()[0] yield book
douban_comment_spider.py
# -*- coding:utf-8 -*-import scrapyfrom faker import Factoryfrom douban.items import DoubanMovieCommentItemimport urlparsef = Factory.create()class MailSpider(scrapy.Spider): name = 'douban-comment' allowed_domains = ['accounts.douban.com', 'douban.com'] start_urls = [ 'https://www.douban.com/' ] headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Connection': 'keep-alive', 'Host': 'accounts.douban.com', 'User-Agent': f.user_agent() } formdata = { 'form_email': '你的邮箱', 'form_password': '你的密码', # 'captcha-solution': '', # 'captcha-id': '', 'login': '登录', 'redir': 'https://www.douban.com/', 'source': 'None' } def start_requests(self): return [scrapy.Request(url='https://www.douban.com/accounts/login', headers=self.headers, meta={'cookiejar': 1}, callback=self.parse_login)] def parse_login(self, response): # 如果有验证码要人为处理 if 'captcha_image' in response.body: print 'Copy the link:' link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0] print link captcha_solution = raw_input('captcha-solution:') captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id'] self.formdata['captcha-solution'] = captcha_solution self.formdata['captcha-id'] = captcha_id return [scrapy.FormRequest.from_response(response, formdata=self.formdata, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.after_login )] def after_login(self, response): print response.status self.headers['Host'] = "www.douban.com" yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews', meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_comment_url) yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews', meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_next_page, dont_filter = True) #不去重 def parse_next_page(self, response): print response.status try: next_url = response.urljoin(response.xpath('//span[@class="next"]/a/@href').extract()[0]) print "下一页" print next_url yield scrapy.Request(url=next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_comment_url, dont_filter = True) yield scrapy.Request(url=next_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_next_page, dont_filter = True) except: print "Next page Error" return def parse_comment_url(self, response): print response.status for item in response.xpath('//div[@class="main review-item"]'): comment_url = item.xpath('header/h4[@class="title"]/a/@href').extract()[0] comment_title = item.xpath('header/h4[@class="title"]/a/text()').extract()[0] print comment_title print comment_url yield scrapy.Request(url=comment_url, meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_comment) def parse_comment(self, response): print response.status for item in response.xpath('//div[@id="content"]'): comment = DoubanMovieCommentItem() comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip() comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip() comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0] comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0] comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0] data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0] print "data_type: "+data_type if data_type == '0': comment['comment'] = "\t#####\t".join(map(lambda x:x.strip(), item.xpath('//div[@id="link-report"]/div/p/text()').extract())) elif data_type == '1': comment['comment'] = "\t#####\t".join(map(lambda x:x.strip(), item.xpath('//div[@id="link-report"]/div[1]/text()').extract())) comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0] comment['comment_page_url'] = response.url #print comment yield comment
doumailspider.py
# -*- coding:utf-8 -*-'''by sudo rm -rf http://imchenkun.com'''import scrapyfrom faker import Factoryfrom douban.items import DoubanMailItemimport urlparsef = Factory.create()class MailSpider(scrapy.Spider): name = 'douban-mail' allowed_domains = ['accounts.douban.com', 'douban.com'] start_urls = [ 'https://www.douban.com/' ] headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Connection': 'keep-alive', 'Host': 'accounts.douban.com', 'User-Agent': f.user_agent() } formdata = { 'form_email': '你的邮箱', 'form_password': '你的密码', # 'captcha-solution': '', # 'captcha-id': '', 'login': '登录', 'redir': 'https://www.douban.com/', 'source': 'None' } def start_requests(self): return [scrapy.Request(url='https://www.douban.com/accounts/login', headers=self.headers, meta={'cookiejar': 1}, callback=self.parse_login)] def parse_login(self, response): # 如果有验证码要人为处理 if 'captcha_image' in response.body: print 'Copy the link:' link = response.xpath('//img[@class="captcha_image"]/@src').extract()[0] print link captcha_solution = raw_input('captcha-solution:') captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id'] self.formdata['captcha-solution'] = captcha_solution self.formdata['captcha-id'] = captcha_id return [scrapy.FormRequest.from_response(response, formdata=self.formdata, headers=self.headers, meta={'cookiejar': response.meta['cookiejar']}, callback=self.after_login )] def after_login(self, response): print response.status self.headers['Host'] = "www.douban.com" return scrapy.Request(url='https://www.douban.com/doumail/', meta={'cookiejar': response.meta['cookiejar']}, headers=self.headers, callback=self.parse_mail) def parse_mail(self, response): print response.status for item in response.xpath('//div[@class="doumail-list"]/ul/li'): mail = DoubanMailItem() mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0] mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0] mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0] mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0] print mail yield mail
init.py
(此文件内无代码)
items.py
# -*- coding: utf-8 -*-import scrapyclass DoubanBookItem(scrapy.Item): name = scrapy.Field() # 书名 price = scrapy.Field() # 价格 edition_year = scrapy.Field() # 出版年份 publisher = scrapy.Field() # 出版社 ratings = scrapy.Field() # 评分 author = scrapy.Field() # 作者 content = scrapy.Field()class DoubanMailItem(scrapy.Item): sender_time = scrapy.Field() # 发送时间 sender_from = scrapy.Field() # 发送人 url = scrapy.Field() # 豆邮详细地址 title = scrapy.Field() # 豆邮标题class DoubanMovieCommentItem(scrapy.Item): useful_num = scrapy.Field() # 多少人评论有用 no_help_num = scrapy.Field() # 多少人评论无用 people = scrapy.Field() # 评论者 people_url = scrapy.Field() # 评论者页面 star = scrapy.Field() # 评分 comment = scrapy.Field() # 评论 title = scrapy.Field() # 标题 comment_page_url = scrapy.Field()# 当前页
pipelines.py
# -*- coding: utf-8 -*-class DoubanBookPipeline(object): def process_item(self, item, spider): info = item['content'].split(' / ') # [法] 圣埃克苏佩里 / 马振聘 / 人民文学出版社 / 2003-8 / 22.00元 item['name'] = item['name'] item['price'] = info[-1] item['edition_year'] = info[-2] item['publisher'] = info[-3] return itemclass DoubanMailPipeline(object): def process_item(self, item, spider): item['title'] = item['title'].replace(' ', '').replace('\\n', '') return itemclass DoubanMovieCommentPipeline(object): def process_item(self, item, spider): return item
settings.py
# -*- coding: utf-8 -*-# Scrapy settings for douban project## For simplicity, this file contains only settings considered important or# commonly used. You can find more settings consulting the documentation:## http://doc.scrapy.org/en/latest/topics/settings.html# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.htmlBOT_NAME = 'douban'SPIDER_MODULES = ['douban.spiders']NEWSPIDER_MODULE = 'douban.spiders'# Crawl responsibly by identifying yourself (and your website) on the user-agentfrom faker import Factoryf = Factory.create()USER_AGENT = f.user_agent()# Obey robots.txt rulesROBOTSTXT_OBEY = True# Configure maximum concurrent requests performed by Scrapy (default: 16)#CONCURRENT_REQUESTS = 32# Configure a delay for requests for the same website (default: 0)# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay# See also autothrottle settings and docs#DOWNLOAD_DELAY = 3# The download delay setting will honor only one of:#CONCURRENT_REQUESTS_PER_DOMAIN = 16#CONCURRENT_REQUESTS_PER_IP = 16# Disable cookies (enabled by default)#COOKIES_ENABLED = False# Disable Telnet Console (enabled by default)#TELNETCONSOLE_ENABLED = False# Override the default request headers:DEFAULT_REQUEST_HEADERS = { 'Host': 'book.douban.com', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate, br', 'Connection': 'keep-alive',}#DEFAULT_REQUEST_HEADERS = {# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',# 'Accept-Language': 'en',#}# Enable or disable spider middlewares# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html#SPIDER_MIDDLEWARES = {# 'douban.middlewares.MyCustomSpiderMiddleware': 543,#}# Enable or disable downloader middlewares# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#DOWNLOADER_MIDDLEWARES = {# 'douban.middlewares.MyCustomDownloaderMiddleware': 543,#}# Enable or disable extensions# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html#EXTENSIONS = {# 'scrapy.extensions.telnet.TelnetConsole': None,#}# Configure item pipelines# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.htmlITEM_PIPELINES = { #'douban.pipelines.DoubanBookPipeline': 300, #'douban.pipelines.DoubanMailPipeline': 600, 'douban.pipelines.DoubanMovieCommentPipeline': 900,}# Enable and configure the AutoThrottle extension (disabled by default)# See http://doc.scrapy.org/en/latest/topics/autothrottle.html#AUTOTHROTTLE_ENABLED = True# The initial download delay#AUTOTHROTTLE_START_DELAY = 5# The maximum download delay to be set in case of high latencies#AUTOTHROTTLE_MAX_DELAY = 60# The average number of requests Scrapy should be sending in parallel to# each remote server#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0# Enable showing throttling stats for every response received:#AUTOTHROTTLE_DEBUG = False# Enable and configure HTTP caching (disabled by default)# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings#HTTPCACHE_ENABLED = True#HTTPCACHE_EXPIRATION_SECS = 0#HTTPCACHE_DIR = 'httpcache'#HTTPCACHE_IGNORE_HTTP_CODES = []#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg
# Automatically created by: scrapy startproject## For more information about the [deploy] section see:# https://scrapyd.readthedocs.org/en/latest/deploy.html[settings]default = douban.settings[deploy]#url = http://localhost:6800/project = douban
例程2: douban_imgs
目录树
douban_imgs--douban --spiders --__init__.py --download_douban.py --__init__.py --items.py --pipelines.py --run_spider.py --settings.py--scrapy.cfg
-spiders-init.py
# This package will contain the spiders of your Scrapy project## Please refer to the documentation for information on how to create and manage# your spiders.
download_douban.py
# coding=utf-8from scrapy.spiders import Spiderimport refrom scrapy import Requestfrom douban_imgs.items import DoubanImgsItemclass download_douban(Spider): name = 'download_douban' default_headers = { 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate, sdch, br', 'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6', 'Cache-Control': 'max-age=0', 'Connection': 'keep-alive', 'Host': 'www.douban.com', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', } def __init__(self, url='1638835355', *args, **kwargs): self.allowed_domains = ['douban.com'] self.start_urls = [ 'http://www.douban.com/photos/album/%s/' % (url)] self.url = url # call the father base function #super(download_douban, self).__init__(*args, **kwargs) def start_requests(self): for url in self.start_urls: yield Request(url=url, headers=self.default_headers, callback=self.parse) def parse(self, response): list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract() if list_imgs: item = DoubanImgsItem() item['image_urls'] = list_imgs yield item
init.py
(此文件内无代码)
items.py
# -*- coding: utf-8 -*-# Define here the models for your scraped items## See documentation in:# http://doc.scrapy.org/en/latest/topics/items.htmlimport scrapyfrom scrapy import Item, Fieldclass DoubanImgsItem(scrapy.Item): # define the fields for your item here like: # name = scrapy.Field() image_urls = Field() images = Field() image_paths = Field()
pipelines.py
# -*- coding: utf-8 -*-# Define your item pipelines here## Don't forget to add your pipeline to the ITEM_PIPELINES setting# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.htmlfrom scrapy.pipelines.images import ImagesPipelinefrom scrapy.exceptions import DropItemfrom scrapy import Requestfrom scrapy import logclass DoubanImgsPipeline(object): def process_item(self, item, spider): return itemclass DoubanImgDownloadPipeline(ImagesPipeline): default_headers = { 'accept': 'image/webp,image/*,*/*;q=0.8', 'accept-encoding': 'gzip, deflate, sdch, br', 'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6', 'cookie': 'bid=yQdC/AzTaCw', 'referer': 'https://www.douban.com/photos/photo/2370443040/', 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36', } def get_media_requests(self, item, info): for image_url in item['image_urls']: self.default_headers['referer'] = image_url yield Request(image_url, headers=self.default_headers) def item_completed(self, results, item, info): image_paths = [x['path'] for ok, x in results if ok] if not image_paths: raise DropItem("Item contains no images") item['image_paths'] = image_paths return item
run_spider.py
from scrapy import cmdlinecmd_str = 'scrapy crawl download_douban'cmdline.execute(cmd_str.split(' '))
settings.py
# -*- coding: utf-8 -*-# Scrapy settings for douban_imgs project## For simplicity, this file contains only settings considered important or# commonly used. You can find more settings consulting the documentation:## http://doc.scrapy.org/en/latest/topics/settings.html# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.htmlBOT_NAME = 'douban_imgs'SPIDER_MODULES = ['douban_imgs.spiders']NEWSPIDER_MODULE = 'douban_imgs.spiders'# Crawl responsibly by identifying yourself (and your website) on the user-agent# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'# Configure maximum concurrent requests performed by Scrapy (default: 16)# CONCURRENT_REQUESTS=32# Configure a delay for requests for the same website (default: 0)# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay# See also autothrottle settings and docs# DOWNLOAD_DELAY=3# The download delay setting will honor only one of:# CONCURRENT_REQUESTS_PER_DOMAIN=16# CONCURRENT_REQUESTS_PER_IP=16# Disable cookies (enabled by default)# COOKIES_ENABLED=False# Disable Telnet Console (enabled by default)# TELNETCONSOLE_ENABLED=False# Override the default request headers:# DEFAULT_REQUEST_HEADERS = {# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',# 'Accept-Language': 'en',# }# Enable or disable spider middlewares# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html# SPIDER_MIDDLEWARES = {# 'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,# }# Enable or disable downloader middlewares# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html# DOWNLOADER_MIDDLEWARES = {# 'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,# }# Enable or disable extensions# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html# EXTENSIONS = {# 'scrapy.telnet.TelnetConsole': None,# }# Configure item pipelines# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.htmlITEM_PIPELINES = { 'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300,}IMAGES_STORE = 'D:\\doubanimgs'#IMAGES_STORE = '/tmp'IMAGES_EXPIRES = 90# Enable and configure the AutoThrottle extension (disabled by default)# See http://doc.scrapy.org/en/latest/topics/autothrottle.html# NOTE: AutoThrottle will honour the standard settings for concurrency and delay# AUTOTHROTTLE_ENABLED=True# The initial download delay# AUTOTHROTTLE_START_DELAY=5# The maximum download delay to be set in case of high latencies# AUTOTHROTTLE_MAX_DELAY=60# Enable showing throttling stats for every response received:# AUTOTHROTTLE_DEBUG=False# Enable and configure HTTP caching (disabled by default)# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings# HTTPCACHE_ENABLED=True# HTTPCACHE_EXPIRATION_SECS=0# HTTPCACHE_DIR='httpcache'# HTTPCACHE_IGNORE_HTTP_CODES=[]# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'
scrapy.cfg
# Automatically created by: scrapy startproject## For more information about the [deploy] section see:# https://scrapyd.readthedocs.org/en/latest/deploy.html[settings]default = douban_imgs.settings[deploy]#url = http://localhost:6800/project = douban_imgs
以上是"python中scrapy爬虫的示例分析"这篇文章的所有内容,感谢各位的阅读!相信大家都有了一定的了解,希望分享的内容对大家有所帮助,如果还想学习更多知识,欢迎关注行业资讯频道!
utf-8
评论
爬虫
示例
分析
内容
篇文章
代码
出版社
密码
文件
标题
目录
要人
评论者
邮箱
出版
处理
学习
帮助
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
代理服务器查找
北京快鱼服务器参数
核心工业软件开发
开数据库vip可以开发票吗
云网络技术收费标准
计算机网络技术的本科专业
新浪云创建数据库
上海校准服务器
战术小队打开后没有服务器
众生互联网科技有限公司招聘
长城宽带访问不了公司服务器
软件开发流程ppt课件
什么是异构网络技术
广州微商软件开发外包
拾金网络技术服务有限公司
服务器如何启动无敌模式
运营商清理数据库
c 内存数据库
网络安全模式怎么打字
html遍历表格数据库
山西仁君互动网络技术有限公司
山东传奇网络技术服务有限公司
三支一扶服务器满后
数据库采用并发控制技术
面向方面软件开发
网络安全报送材料
战术小队打开后没有服务器
戴尔R510服务器管理网址
大学本科计算机软件开发课程
万方中文期刊数据库