
[Docker in Practice] Python Docker crawler techniques: scraping app data with a Python script (13)


This is an original article and reposting is welcome. When reposting, please credit the source: IT人故事会. Thank you!
Original article: 「docker实战篇」python的docker爬虫技术-python脚本app抓取(13)

In the last installment we worked out the app's actual request URLs. This time the focus is on the Python side: writing the code that scrapes the data served to the app. Source code: https://github.com/limingios/dockerpython.git

Analyzing the app's data packets

Inspecting the capture

The headers extracted from the capture

Nox emulator configuration

Python code: crawling the category list

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import requests


# The header has a lot of fields because every vendor builds its requests differently.
# Fiddler captures many fields, and some of them are probably optional; the only way
# to find out is to comment a few out and try the request again.
def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    print(response.text)


handle_index()
```

Crawling the details: the detail information is reached through the categories

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    # walk the three-level category tree and queue one search task per leaf category
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400"
                }
                # print(data_2)
                queue_list.put(data_2)


handle_index()
print(queue_list.qsize())
```

Detail information of the recipes within each category

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0"
                }
                # print(data_2)
                queue_list.put(data_2)


def handle_caipu_list(data):
    print("Current ingredient:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        # only items of type 13 carry the recipe payload used here
        if caipu_item["type"] == 13:
            caipu_info["user_name"] = caipu_item["r"]["an"]
            caipu_info["shicai_id"] = caipu_item["r"]["id"]
            caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
            caipu_info["caipu_name"] = caipu_item["r"]["n"]
            caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
            print(caipu_info)
        else:
            continue


handle_index()
handle_caipu_list(queue_list.get())
```

Detail information inside each dish

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0"
                }
                # print(data_2)
                queue_list.put(data_2)


def handle_caipu_list(data):
    print("Current ingredient:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        if caipu_item["type"] == 13:
            caipu_info["user_name"] = caipu_item["r"]["an"]
            caipu_info["shicai_id"] = caipu_item["r"]["id"]
            caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
            caipu_info["caipu_name"] = caipu_item["r"]["n"]
            caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
            # print(caipu_info)
            # fetch the full recipe detail (tips and cook steps)
            detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
            detail_data = {
                "client": "4",
                "_session": "1547000257341354730010002552",
                "author_id": "0",
                "_vs": "2803",
                "ext": '{"query": {"kw": "' + data["keyword"] + '", "src": "2803", "idx": "1", "type": "13", "id": ' + str(caipu_info["shicai_id"]) + '}}'
            }
            detail_response = handle_request(detail_url, detail_data)
            detail_response_dic = json.loads(detail_response.text)
            caipu_info["tips"] = detail_response_dic["result"]["recipe"]["tips"]
            caipu_info["cookstep"] = detail_response_dic["result"]["recipe"]["cookstep"]
            print(json.dumps(caipu_info))
        else:
            continue


handle_index()
handle_caipu_list(queue_list.get())
```
Saving the data in MongoDB
  • Install the virtual machine with Vagrant

    vagrant up
  • Enter the virtual machine

    IP: 192.168.66.100

    su -
    # password: vagrant
    docker

  • Pull the MongoDB image

    https://hub.docker.com/r/bitnami/mongodb
    Default port: 27017

    docker pull bitnami/mongodb:latest

  • Create the MongoDB container

    mkdir bitnami
    cd bitnami
    mkdir mongodb
    docker run -d -v /path/to/mongodb-persistence:/root/bitnami -p 27017:27017 bitnami/mongodb:latest

# stop the firewall so the port is reachable from outside the VM
systemctl stop firewalld
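
Before wiring the spider to the database, it can be worth confirming that MongoDB inside the container is reachable from the machine running Python. This is only a minimal sketch, assuming the VM address 192.168.66.100 and the default port 27017 used above:

```python
# Minimal connectivity check (assumes MongoDB is exposed at 192.168.66.100:27017 as set up above).
import pymongo

client = pymongo.MongoClient(host="192.168.66.100", port=27017, serverSelectionTimeoutMS=3000)
# server_info() forces a real round trip; it raises ServerSelectionTimeoutError if the server cannot be reached.
print(client.server_info()["version"])
```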

> Connect with a third-party MongoDB client tool.
> The module that handles the MongoDB connection:

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/11 0:53
# @Author  : liming
# @Site    :
# @File    : handle_mongo.py
# @url     : idig8.com
# @Software: PyCharm
import pymongo
from pymongo.collection import Collection


class Connect_mongo(object):
    def __init__(self):
        self.client = pymongo.MongoClient(host="192.168.66.100", port=27017)
        self.db_data = self.client["dou_guo_mei_shi"]

    def insert_item(self, item):
        db_collection = Collection(self.db_data, 'dou_guo_mei_shi_item')
        db_collection.insert(item)


# expose a single shared instance for the spider to import
mongo_info = Connect_mongo()
```
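
Once the spider has inserted a few documents, a quick way to confirm they actually landed in the collection is to read one back. A sketch, reusing the host, database, and collection names from the module above:

```python
# Read back what the spider stored (assumes the database and collection used by handle_mongo).
import pymongo

client = pymongo.MongoClient(host="192.168.66.100", port=27017)
collection = client["dou_guo_mei_shi"]["dou_guo_mei_shi_item"]
print(collection.count_documents({}))   # number of recipes stored so far
print(collection.find_one())            # peek at one stored recipe document
```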

Saving the data scraped by Python into the MongoDB container running on CentOS 7 under Docker

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0"
                }
                # print(data_2)
                queue_list.put(data_2)


def handle_caipu_list(data):
    print("Current ingredient:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        if caipu_item["type"] == 13:
            caipu_info["user_name"] = caipu_item["r"]["an"]
            caipu_info["shicai_id"] = caipu_item["r"]["id"]
            caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
            caipu_info["caipu_name"] = caipu_item["r"]["n"]
            caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
            # print(caipu_info)
            detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
            detail_data = {
                "client": "4",
                "_session": "1547000257341354730010002552",
                "author_id": "0",
                "_vs": "2803",
                "ext": '{"query": {"kw": "' + data["keyword"] + '", "src": "2803", "idx": "1", "type": "13", "id": ' + str(caipu_info["shicai_id"]) + '}}'
            }
            detail_response = handle_request(detail_url, detail_data)
            detail_response_dic = json.loads(detail_response.text)
            caipu_info["tips"] = detail_response_dic["result"]["recipe"]["tips"]
            caipu_info["cookstep"] = detail_response_dic["result"]["recipe"]["cookstep"]
            # print(json.dumps(caipu_info))
            # store the recipe document in MongoDB
            mongo_info.insert_item(caipu_info)
        else:
            continue


handle_index()
handle_caipu_list(queue_list.get())
```

Scraping with a Python thread pool (multithreading)
  • In Python 3 the thread pool comes from concurrent.futures: from concurrent.futures import ThreadPoolExecutor

    Import the thread pool; a minimal sketch of the queue-plus-pool pattern follows, then the full spider.
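
The sketch below shows the pattern the spider uses: tasks go into a multiprocessing.Queue and a ThreadPoolExecutor drains it. The sample items and the handle function are placeholders, not real data from the app.

```python
# Minimal sketch: a thread pool draining a queue of tasks (the same pattern as the spider below).
from multiprocessing import Queue
from concurrent.futures import ThreadPoolExecutor

queue_list = Queue()
for item in ["task-1", "task-2", "task-3"]:   # hypothetical sample tasks
    queue_list.put(item)

def handle(task):
    print("processing:", task)

pool = ThreadPoolExecutor(max_workers=20)
while queue_list.qsize() > 0:
    pool.submit(handle, queue_list.get())
pool.shutdown(wait=True)   # block until every submitted task has finished
```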

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    response = requests.post(url=url, headers=header, data=data)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0"
                }
                # print(data_2)
                queue_list.put(data_2)


def handle_caipu_list(data):
    print("Current ingredient:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        if caipu_item["type"] == 13:
            caipu_info["user_name"] = caipu_item["r"]["an"]
            caipu_info["shicai_id"] = caipu_item["r"]["id"]
            caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
            caipu_info["caipu_name"] = caipu_item["r"]["n"]
            caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
            # print(caipu_info)
            detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
            detail_data = {
                "client": "4",
                "_session": "1547000257341354730010002552",
                "author_id": "0",
                "_vs": "2803",
                "ext": '{"query": {"kw": "' + data["keyword"] + '", "src": "2803", "idx": "1", "type": "13", "id": ' + str(caipu_info["shicai_id"]) + '}}'
            }
            detail_response = handle_request(detail_url, detail_data)
            detail_response_dic = json.loads(detail_response.text)
            caipu_info["tips"] = detail_response_dic["result"]["recipe"]["tips"]
            caipu_info["cookstep"] = detail_response_dic["result"]["recipe"]["cookstep"]
            # print(json.dumps(caipu_info))
            mongo_info.insert_item(caipu_info)
        else:
            continue


handle_index()
pool = ThreadPoolExecutor(max_workers=20)
while queue_list.qsize() > 0:
    pool.submit(handle_caipu_list, queue_list.get())
```

Hiding the crawler behind proxy IPs

If the app's operations team notices that the same IP keeps hitting their servers, they are very likely to ban it. Routing the requests through a proxy IP hides where the traffic really comes from.

  • Register an account at abuyun.com

    It costs 1 RMB per hour; I bought one hour so we can try it out together.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/11 2:40
# @Author  : Aries
# @Site    :
# @File    : handle_proxy.py
# @Software: PyCharm
# 60.17.177.187 is the IP the server sees when the request goes through the proxy
import requests

url = 'http://ip.hahado.cn/ip'
proxy = {'http': 'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
response = requests.get(url=url, proxies=proxy)
print(response.text)
```

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2019/1/9 11:06
# @Author  : lm
# @Url     : idig8.com
# @Site    :
# @File    : spider_douguomeishi.py
# @Software: PyCharm
import json
import requests
from multiprocessing import Queue
from handle_mongo import mongo_info
from concurrent.futures import ThreadPoolExecutor

# create the task queue
queue_list = Queue()


def handle_request(url, data):
    header = {
        "client": "4",
        "version": "6916.2",
        "device": "SM-G955N",
        "sdk": "22,5.1.1",
        "imei": "354730010002552",
        "channel": "zhuzhan",
        "mac": "00:FF:E2:A2:7B:58",
        "resolution": "1440*900",
        "dpi": "2.0",
        "android-id": "bcdaf527105cc26f",
        "pseudo-id": "354730010002552",
        "brand": "samsung",
        "scale": "2.0",
        "timezone": "28800",
        "language": "zh",
        "cns": "3",
        "carrier": "Android",
        # "imsi": "310260000000000",
        "user-agent": "Mozilla/5.0 (Linux; Android 5.1.1; SM-G955N Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/39.0.0.0 Mobile Safari/537.36",
        "lon": "105.566938",
        "lat": "29.99831",
        "cid": "512000",
        "Content-Type": "application/x-www-form-urlencoded; charset=utf-8",
        "Accept-Encoding": "gzip, deflate",
        "Connection": "Keep-Alive",
        # "Cookie": "duid=58349118",
        "Host": "api.douguo.net",
        # "Content-Length": "65"
    }
    # route every request through the Abuyun HTTP proxy
    proxy = {'http': 'http://H79623F667Q3936C:84F1527F3EE09817@http-cla.abuyun.com:9030'}
    response = requests.post(url=url, headers=header, data=data, proxies=proxy)
    return response


def handle_index():
    url = "http://api.douguo.net/recipe/flatcatalogs"
    # client=4&_session=1547000257341354730010002552&v=1503650468&_vs=0
    data = {
        "client": "4",
        "_session": "1547000257341354730010002552",
        "v": "1503650468",
        "_vs": "0"
    }
    response = handle_request(url, data)
    # print(response.text)
    index_response_dic = json.loads(response.text)
    for item_index in index_response_dic["result"]["cs"]:
        # print(item_index)
        for item_index_cs in item_index["cs"]:
            # print(item_index_cs)
            for item in item_index_cs["cs"]:
                # print(item)
                data_2 = {
                    "client": "4",
                    # "_session": "1547000257341354730010002552",
                    "keyword": item["name"],
                    "_vs": "400",
                    "order": "0"
                }
                # print(data_2)
                queue_list.put(data_2)


def handle_caipu_list(data):
    print("Current ingredient:", data["keyword"])
    caipu_list_url = "http://api.douguo.net/recipe/s/0/20"
    caipu_response = handle_request(caipu_list_url, data)
    caipu_response_dict = json.loads(caipu_response.text)
    for caipu_item in caipu_response_dict["result"]["list"]:
        caipu_info = {}
        caipu_info["shicai"] = data["keyword"]
        if caipu_item["type"] == 13:
            caipu_info["user_name"] = caipu_item["r"]["an"]
            caipu_info["shicai_id"] = caipu_item["r"]["id"]
            caipu_info["describe"] = caipu_item["r"]["cookstory"].replace("\n", "").replace(" ", "")
            caipu_info["caipu_name"] = caipu_item["r"]["n"]
            caipu_info["zuoliao_list"] = caipu_item["r"]["major"]
            # print(caipu_info)
            detail_url = "http://api.douguo.net/recipe/detail/" + str(caipu_info["shicai_id"])
            detail_data = {
                "client": "4",
                "_session": "1547000257341354730010002552",
                "author_id": "0",
                "_vs": "2803",
                "ext": '{"query": {"kw": "' + data["keyword"] + '", "src": "2803", "idx": "1", "type": "13", "id": ' + str(caipu_info["shicai_id"]) + '}}'
            }
            detail_response = handle_request(detail_url, detail_data)
            detail_response_dic = json.loads(detail_response.text)
            caipu_info["tips"] = detail_response_dic["result"]["recipe"]["tips"]
            caipu_info["cookstep"] = detail_response_dic["result"]["recipe"]["cookstep"]
            # print(json.dumps(caipu_info))
            mongo_info.insert_item(caipu_info)
        else:
            continue


handle_index()
pool = ThreadPoolExecutor(max_workers=2)
while queue_list.qsize() > 0:
    pool.submit(handle_caipu_list, queue_list.get())
```

PS: This was an introduction to scraping app data. First, point the emulator's proxy at the local machine running Fiddler so that Fiddler can capture the app's traffic. Analyzing the capture to find the right URL takes some experience; once you have the URL, half of the crawler is already written. Next, build the request headers from what Fiddler captured. There are many header fields, so try to trim them down to the minimum; some values, such as cookies, make the request easier to identify as a crawler, although some crawlers do need cookies to work. Use a proxy IP so the same address is not seen hitting one endpoint over and over. The queue was introduced so that the thread pool can conveniently pull tasks from it, and the results are stored in MongoDB. With that, the multithreaded app scraper is complete.
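
As a rough illustration of the header-trimming idea above, one can drop the captured fields one at a time and see whether the API still answers. This is only a sketch, under the assumption that a 200 response with a JSON body means the field was optional; probe and find_optional_fields are hypothetical helpers, not part of the original scripts.

```python
# Sketch: find which captured header fields the API will still accept requests without.
import requests

def probe(header):
    # One test request against the category endpoint used earlier in this article.
    resp = requests.post("http://api.douguo.net/recipe/flatcatalogs",
                         headers=header,
                         data={"client": "4", "_vs": "0"})
    return resp.status_code == 200 and resp.text.strip().startswith("{")

def find_optional_fields(base_header):
    optional = []
    for key in list(base_header):
        trimmed = dict(base_header)
        trimmed.pop(key)
        if probe(trimmed):          # the request still works without this field
            optional.append(key)
    return optional
```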
