如何用python爬取知乎话题?
发表于:2024-11-11 作者:千家信息网编辑
千家信息网最后更新 2024年11月11日,因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把他给爬下来,搞了半天最终还是妥妥的搞定了,代码是python写的,不懂的麻烦自学哈!懂的直接看代码,绝对可用。
千家信息网最后更新 2024年11月11日如何用python爬取知乎话题?
因为要做观点,观点的屋子类似于知乎的话题,所以得想办法把他给爬下来,搞了半天最终还是妥妥的搞定了,代码是python写的,不懂得麻烦自学哈!懂得直接看代码,绝对可用
#coding:utf-8"""@author:haoning@create time:2015.8.5"""from __future__ import division # 精确除法from Queue import Queuefrom __builtin__ import Falseimport jsonimport osimport reimport platformimport uuidimport urllibimport urllib2import sysimport timeimport MySQLdb as mdbfrom bs4 import BeautifulSoupreload(sys)sys.setdefaultencoding( "utf-8" )headers = { 'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'X-Requested-With':'XMLHttpRequest', 'Referer':'https://www.zhihu.com/topics', 'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'}DB_HOST = '127.0.0.1'DB_USER = 'root'DB_PASS = 'root'queue= Queue() #接收队列nodeSet=set()keywordSet=set()stop=0offset=-20level=0maxLevel=7counter=0base=""conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')conn.autocommit(False)curr = conn.cursor()def get_html(url): try: req = urllib2.Request(url) response = urllib2.urlopen(req,None,3) #在这里应该加入代理 html = response.read() return html except: pass return Nonedef getTopics(): url = 'https://www.zhihu.com/topics' print url try: req = urllib2.Request(url) response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞� html = response.read().decode('utf-8') print html soup = BeautifulSoup(html) lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'}) for li in lis: data_id=li.get('data-id') name=li.text curr.execute('select id from classify_new where name=%s',(name)) y= curr.fetchone() if not y: curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name)) conn.commit() except Exception as 
e: print "get topic error",edef get_extension(name): where=name.rfind('.') if where!=-1: return name[where:len(name)] return Nonedef which_platform(): sys_str = platform.system() return sys_strdef GetDateString(): when=time.strftime('%Y-%m-%d',time.localtime(time.time())) foldername = str(when) return foldername def makeDateFolder(par,classify): try: if os.path.isdir(par): newFolderName=par + '//' + GetDateString() + '//' +str(classify) if which_platform()=="Linux": newFolderName=par + '/' + GetDateString() + "/" +str(classify) if not os.path.isdir( newFolderName ): os.makedirs( newFolderName ) return newFolderName else: return None except Exception,e: print "kk",e return None def download_img(url,classify): try: extention=get_extension(url) if(extention is None): return None req = urllib2.Request(url) resp = urllib2.urlopen(req,None,3) dataimg=resp.read() name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention top="E://topic_pic" folder=makeDateFolder(top, classify) filename=None if folder is not None: filename =folder+"//"+name try: if "e82bab09c_m" in str(url): return True if not os.path.exists(filename): file_object = open(filename,'w+b') file_object.write(dataimg) file_object.close() return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name else: print "file exist" return None except IOError,e1: print "e1=",e1 pass except Exception as e: print "eee",e pass return None #如果没有下载下来就利用原来网站的链接def getChildren(node,name): global queue,nodeSet try: url="https://www.zhihu.com/topic/"+str(node)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) p_ch='父话题' node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text topic_cla=soup.find('div', {'class' : 'child-topic'}) if topic_cla is not None: try: p_ch=str(topic_cla.text) aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点 if u'子话题' in p_ch: for a in aList: token=a.get('data-token') a=str(a).replace('\n','').replace('\t','').replace('\r','') 
start=str(a).find('>') end=str(a).rfind('') new_node=str(str(a)[start+1:end]) curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同 y= curr.fetchone() if not y: print "y=",y,"new_node=",new_node,"token=",token queue.put((token,new_node,node_name)) except Exception as e: print "add queue error",e except Exception as e: print "get html error",edef getContent(n,name,p,top_id): try: global counter curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同 y= curr.fetchone() print "exist?? ",y,"n=",n if not y: url="https://www.zhihu.com/topic/"+str(n)+"/hot" html=get_html(url) if html is None: return soup = BeautifulSoup(html) title=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src') description=soup.find('div',{'class':'zm-editable-content'}) if description is not None: description=description.text if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环 description=None tag_path=download_img(pic_path,top_id) print "tag_path=",tag_path if (tag_path is not None) or tag_path==True: if tag_path==True: tag_path=None father_id=2 #默认为杂谈 curr.execute('select id from rooms where name=%s',(p)) results = curr.fetchall() for r in results: father_id=r[0] name=title curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同 y= curr.fetchone() print "store see..",y if not y: friends_num=0 temp = time.time() x = time.localtime(float(temp)) create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now create_time creater_id=None room_avatar=tag_path is_pass=1 has_index=0 reason_id=None #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id ######################有资格入库的内容 counter=counter+1 curr.execute("INSERT INTO 
rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)) conn.commit() #必须时时进入数据库,不然找不到父节点 if counter % 200==0: print "current node",name,"num",counter except Exception as e: print "get content error",e def work(): global queue curr.execute('select id,node,parent,name from classify where status=1') results = curr.fetchall() for r in results: top_id=r[0] node=r[1] parent=r[2] name=r[3] try: queue.put((node,name,parent)) #首先放入队列 while queue.qsize() >0: n,p=queue.get() #顶节点出队 getContent(n,p,top_id) getChildren(n,name) #出队内容的子节点 conn.commit() except Exception as e: print "what's wrong",e def new_work(): global queue curr.execute('select id,data_id,name from classify_new_copy where status=1') results = curr.fetchall() for r in results: top_id=r[0] data_id=r[1] name=r[2] try: get_topis(data_id,name,top_id) except: passdef get_topis(data_id,name,top_id): global queue url = 'https://www.zhihu.com/node/TopicsPlazzaListV2' isGet = True; offset = -20; data_id=str(data_id) while isGet: offset = offset + 20 values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'} try: msg=None try: data = urllib.urlencode(values) request = urllib2.Request(url,data,headers) response = urllib2.urlopen(request,None,5) html=response.read().decode('utf-8') json_str = json.loads(html) ms=json_str['msg'] if len(ms) <5: break msg=ms[0] except Exception as e: print "eeeee",e #print msg if msg is not None: soup = BeautifulSoup(str(msg)) blks = soup.find_all('div', {'class' : 'blk'}) for blk in blks: page=blk.find('a').get('href') if page is not None: node=page.replace("/topic/","") #将更多的种子入库 parent=name ne=blk.find('strong').text try: queue.put((node,ne,parent)) #首先放入队列 while queue.qsize() >0: n,name,p=queue.get() #顶节点出队 size=queue.qsize() if size > 0: print size 
getContent(n,name,p,top_id) getChildren(n,name) #出队内容的子节点 conn.commit() except Exception as e: print "what's wrong",e except urllib2.URLError, e: print "error is",e pass if __name__ == '__main__': i=0 while i<400: new_work() i=i+1
说下数据库的问题,我这里就不传附件了,看字段自己建立,因为这确实太简单了,我是用的mysql,你看自己的需求自己建。
有什么不懂得麻烦去去转盘网找我,因为这个也是我开发的,上面会及时更新qq群号,这里不留qq号啥的,以免被系统给K了。
节点
话题
相同
内容
名字
绝不
保证
代码
数据
数据库
观点
队列
麻烦
utf-8
妥妥
精确
办法
字段
屋子
杂谈
数据库的安全要保护哪些东西
数据库安全各自的含义是什么
生产安全数据库录入
数据库的安全性及管理
数据库安全策略包含哪些
海淀数据库安全审计系统
建立农村房屋安全信息数据库
易用的数据库客户端支持安全管理
连接数据库失败ssl安全错误
数据库的锁怎样保障安全
简单的档案数据库
廊坊网络技术产品介绍
北京存储服务器虚拟化价格
怎么给路由器添加ftp服务器
金青网络技术
三级网络技术四级网络工程师
在哪里学习软件开发开发课程
预防网络安全简报
服务器配置教程
馆陶企业网络推广需要服务器吗
联想服务器市场
农安品质网络技术推荐咨询
千千画网络安全手抄报的教程
网络安全漫画图片非常漂亮
如何描述网络技术部
初级中学网络安全讲话稿
软件开发生产率计算公式
新沂明谷软件开发
网络安全主题怎么做
宿迁idc服务器厂商
东西湖网络安全学院在哪里
mysql数据库瞬段丢数据
插入判断数据库是否
方舟服务器搜不了
设置无线网络安全
trip 数据库
停车场系统数据库数据太大
大专软件开发学什么专业
杭州网络技术研究院
win8怎么加密服务器