如何用python爬取知乎话题?
发表于:2025-02-19 作者:千家信息网编辑
千家信息网最后更新 2025年02月19日,因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把他给爬下来,搞了半天最终还是妥妥的搞定了,代码是python写的,不懂得麻烦自学哈!懂得直接看代码,绝对可用#coding:utf-8"""@a
千家信息网最后更新 2025年02月19日如何用python爬取知乎话题?
因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把它给爬下来。搞了半天,最终还是妥妥地搞定了。代码是用 Python 写的,不懂的麻烦自学哈!懂的直接看代码,绝对可用。
#coding:utf-8"""@author:haoning@create time:2015.8.5"""from __future__ import division # 精确除法from Queue import Queuefrom __builtin__ import Falseimport jsonimport osimport reimport platformimport uuidimport urllibimport urllib2import sysimport timeimport MySQLdb as mdbfrom bs4 import BeautifulSoupreload(sys)sys.setdefaultencoding( "utf-8" )headers = { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'X-Requested-With':'XMLHttpRequest', 'Referer':'https://www.zhihu.com/topics', 'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'}DB_HOST = '127.0.0.1'DB_USER = 'root'DB_PASS = 'root'queue= Queue() #接收队列nodeSet=set()keywordSet=set()stop=0offset=-20level=0maxLevel=7counter=0base=""conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')conn.autocommit(False)curr = conn.cursor()def get_html(url): try: req = urllib2.Request(url) response = urllib2.urlopen(req,None,3) #在这里应该加入代理 html = response.read() return html except: pass return Nonedef getTopics(): url = 'https://www.zhihu.com/topics' print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞� html = response.read().decode('utf-8') print html soup = BeautifulSoup(html) lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'}) for li in lis: data_id=li.get('data-id') name=li.text curr.execute('select id from classify_new where name=%s',(name)) y= curr.fetchone() if not y: curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name)) conn.commit() except Exception as 
e: print "get topic error",edef get_extension(name): where=name.rfind('.') if where!=-1: return name[where:len(name)] return Nonedef which_platform(): sys_str = platform.system() return sys_strdef GetDateString(): when=time.strftime('%Y-%m-%d',time.localtime(time.time())) foldername = str(when) return foldername def makeDateFolder(par,classify): try: if os.path.isdir(par): newFolderName=par + '//' + GetDateString() + '//' +str(classify) if which_platform()=="Linux": newFolderName=par + '/' + GetDateString() + "/" +str(classify) if not os.path.isdir( newFolderName ): os.makedirs( newFolderName ) return newFolderName else: return None except Exception,e: print "kk",e return None def download_img(url,classify): try: extention=get_extension(url) if(extention is None): return None req = urllib2.Request(url) resp = urllib2.urlopen(req,None,3) dataimg=resp.read() name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention top="E://topic_pic" folder=makeDateFolder(top, classify) filename=None if folder is not None: filename =folder+"//"+name try: if "e82bab09c_m" in str(url): return True if not os.path.exists(filename): file_object = open(filename,'w+b') file_object.write(dataimg) file_object.close() return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name else: print "file exist" return None except IOError,e1: print "e1=",e1 pass except Exception as e: print "eee",e pass return None #如果没有下载下来就利用原来网站的链接def getChildren(node,name): global queue,nodeSet try: url="https://www.zhihu.com/topic/"+str(node)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) p_ch='父话题' node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text topic_cla=soup.find('div', {'class' : 'child-topic'}) if topic_cla is not None: try: p_ch=str(topic_cla.text) aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点 if u'子话题' in p_ch: for a in aList: token=a.get('data-token') a=str(a).replace('\n','').replace('\t','').replace('\r','') 
start=str(a).find('>') end=str(a).rfind('') new_node=str(str(a)[start+1:end]) curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同 y= curr.fetchone() if not y: print "y=",y,"new_node=",new_node,"token=",token queue.put((token,new_node,node_name)) except Exception as e: print "add queue error",e except Exception as e: print "get html error",edef getContent(n,name,p,top_id): try: global counter curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同 y= curr.fetchone() print "exist?? ",y,"n=",n if not y: url="https://www.zhihu.com/topic/"+str(n)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) title=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src') description=soup.find('div',{'class':'zm-editable-content'}) if description is not None: description=description.text if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环 description=None tag_path=download_img(pic_path,top_id) print "tag_path=",tag_path if (tag_path is not None) or tag_path==True: if tag_path==True: tag_path=None father_id=2 #默认为杂谈 curr.execute('select id from rooms where name=%s',(p)) results = curr.fetchall() for r in results: father_id=r[0] name=title curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同 y= curr.fetchone() print "store see..",y if not y: friends_num=0 temp = time.time() x = time.localtime(float(temp)) create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now create_time creater_id=None room_avatar=tag_path is_pass=1 has_index=0 reason_id=None #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id ######################有资格入库的内容 counter=counter+1 curr.execute("INSERT INTO 
rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)) conn.commit() #必须时时进入数据库,不然找不到父节点 if counter % 200==0: print "current node",name,"num",counter except Exception as e: print "get content error",e def work(): global queue curr.execute('select id,node,parent,name from classify where status=1') results = curr.fetchall() for r in results: top_id=r[0] node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #首先放入队列 while queue.qsize() >0: n,p=queue.get() #顶节点出队 getContent(n,p,top_id) getChildren(n,name) #出队内容的子节点 conn.commit() except Exception as e: print "what's wrong",e def new_work(): global queue curr.execute('select id,data_id,name from classify_new_copy where status=1') results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: passdef get_topis(data_id,name,top_id): global queue url = 'https://www.zhihu.com/node/TopicsPlazzaListV2' isGet = True; offset = -20; data_id=str(data_id) while isGet: offset = offset + 20 values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'} try: msg=None try: data = urllib.urlencode(values) request = urllib2.Request(url,data,headers) response = urllib2.urlopen(request,None,5) html=response.read().decode('utf-8') json_str = json.loads(html) ms=json_str['msg'] if len(ms) <5: break msg=ms[0] except Exception as e: print "eeeee",e #print msg if msg is not None: soup = BeautifulSoup(str(msg)) blks = soup.find_all('div', {'class' : 'blk'}) for blk in blks: page=blk.find('a').get('href') if page is not None: node=page.replace("/topic/","") #将更多的种子入库 parent=name ne=blk.find('strong').text try: queue.put((node,ne,parent)) #首先放入队列 while queue.qsize() >0: n,name,p=queue.get() #顶节点出队 size=queue.qsize() if size > 0: print size 
getContent(n,name,p,top_id) getChildren(n,name) #出队内容的子节点 conn.commit() except Exception as e: print "what's wrong",e except urllib2.URLError, e: print "error is",e pass if __name__ == '__main__': i=0 while i<400: new_work() i=i+1
说下数据库的问题:我这里就不传附件了,请对照代码里的字段自己建表,因为这确实太简单了。我用的是 MySQL,你根据自己的需求自己建。
有什么不懂的,麻烦去转盘网找我,因为这个也是我开发的,上面会及时更新 QQ 群号;这里不留 QQ 号啥的,以免被系统给 K 了。
节点
话题
相同
内容
名字
绝不
保证
代码
数据
数据库
观点
队列
麻烦
utf-8
妥妥
精确
办法
字段
屋子
杂谈
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
打印服务器更换打印机
中国网络安全与应急救援协会
软件实施数据库操作
2005开发版数据库安装
软件开发需要交印花税吗
软件开发0税率增值税发票
大医集团软件开发怎么样
工业数据采集软件开发
永伯乐网络技术
舟山市网络安全和信息化
软件开发 结构
用友872登陆不了服务器怎么办
我与网络安全作文范文四年级
五个厂家的数据库
山东卫星授时服务器云主机
迅闪网络技术
数据库中的文字怎样转换
共享共享单车软件开发商
河南正规软件开发价格优惠
德阳网络安全监管局
夜晚滤镜软件开发
镇江丹橙科技网络技术
网易云音乐 服务器错误
服务器模式 多维
智慧芽专利数据库中公司树
数据库查询分类型排序
登录系统转移怎么重新连接数据库
ai合成国际网络安全大会
如何将shp文件导入数据库
网络技术大赛视频