
How to crawl Zhihu topics with Python?

Published: 2024-11-11  Author: 千家信息网 editor

I'm building Guandian (观点), whose "rooms" are a lot like Zhihu's topics, so I had to find a way to crawl them. After half a day of tinkering I finally got it working. The code is written in Python; if you don't know Python, please go learn it on your own. If you do, just read the code below. It definitely works.

#coding:utf-8"""@author:haoning@create time:2015.8.5"""from __future__ import division  # 精确除法from Queue import Queuefrom __builtin__ import Falseimport jsonimport osimport reimport platformimport uuidimport urllibimport urllib2import sysimport timeimport MySQLdb as mdbfrom bs4 import BeautifulSoupreload(sys)sys.setdefaultencoding( "utf-8" )headers = {   'User-Agent' : 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:35.0) Gecko/20100101 Firefox/35.0',   'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',   'X-Requested-With':'XMLHttpRequest',   'Referer':'https://www.zhihu.com/topics',   'Cookie':'__utma=51854390.517069884.1416212035.1416212035.1416212035.1; q_c1=c02bf44d00d240798bfabcfc95baeb56|1455778173000|1416205243000; _za=b1c8ae35-f986-46a2-b24a-cb9359dc6b2a; aliyungf_tc=AQAAAJ1m71jL1woArKqF22VFnL/wRy6C; _xsrf=9d494558f9271340ab24598d85b2a3c8; cap_id="MDNiMjcwM2U0MTRhNDVmYjgxZWVhOWI0NTA2OGU5OTg=|1455864276|2a4ce8247ebd3c0df5393bb5661713ad9eec01dd"; n_c=1; _alicdn_sec=56c6ba4d556557d27a0f8c876f563d12a285f33a'}DB_HOST = '127.0.0.1'DB_USER = 'root'DB_PASS = 'root'queue= Queue() #接收队列nodeSet=set()keywordSet=set()stop=0offset=-20level=0maxLevel=7counter=0base=""conn = mdb.connect(DB_HOST, DB_USER, DB_PASS, 'zhihu', charset='utf8')conn.autocommit(False)curr = conn.cursor()def get_html(url):    try:        req = urllib2.Request(url)        response = urllib2.urlopen(req,None,3) #在这里应该加入代理        html = response.read()        return html    except:        pass    return Nonedef getTopics():    url = 'https://www.zhihu.com/topics'    print url    try:        req = urllib2.Request(url)        response = urllib2.urlopen(req) #鍦ㄨ繖閲屽簲璇ュ姞鍏ヤ唬鐞�        html = response.read().decode('utf-8')        print html        soup = BeautifulSoup(html)        lis = soup.find_all('li', {'class' : 'zm-topic-cat-item'})        for li in lis:            data_id=li.get('data-id')            name=li.text            curr.execute('select id from classify_new where name=%s',(name))            y= curr.fetchone()            if not y:                curr.execute('INSERT INTO classify_new(data_id,name)VALUES(%s,%s)',(data_id,name))        conn.commit()    except Exception as e:        print "get topic error",edef get_extension(name):      where=name.rfind('.')    if where!=-1:        return name[where:len(name)]    return Nonedef which_platform():    sys_str = platform.system()    return sys_strdef GetDateString():    when=time.strftime('%Y-%m-%d',time.localtime(time.time()))    foldername = str(when)    return foldername def makeDateFolder(par,classify):    try:        if os.path.isdir(par):            newFolderName=par + '//' + GetDateString() + '//'  +str(classify)            if which_platform()=="Linux":                newFolderName=par + '/' + GetDateString() + "/" +str(classify)            if not os.path.isdir( newFolderName ):                os.makedirs( newFolderName )            return newFolderName        else:            return None     except Exception,e:        print "kk",e    return None def download_img(url,classify):    try:        extention=get_extension(url)        if(extention is None):            return None        req = urllib2.Request(url)        resp = urllib2.urlopen(req,None,3)        dataimg=resp.read()        name=str(uuid.uuid1()).replace("-","")+"_www.guandn.com"+extention        top="E://topic_pic"        folder=makeDateFolder(top, classify)        filename=None        if folder is not None:            filename  =folder+"//"+name        try:            if "e82bab09c_m" in str(url):  
              return True            if not os.path.exists(filename):                file_object = open(filename,'w+b')                file_object.write(dataimg)                file_object.close()                return '/room/default/'+GetDateString()+'/'+str(classify)+"/"+name            else:                print "file exist"                return None        except IOError,e1:            print "e1=",e1            pass    except Exception as e:        print "eee",e        pass    return None #如果没有下载下来就利用原来网站的链接def getChildren(node,name):    global queue,nodeSet    try:        url="https://www.zhihu.com/topic/"+str(node)+"/hot"        html=get_html(url)        if html is None:            return        soup = BeautifulSoup(html)        p_ch='父话题'        node_name=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text        topic_cla=soup.find('div', {'class' : 'child-topic'})        if topic_cla is not None:            try:                p_ch=str(topic_cla.text)                aList = soup.find_all('a', {'class' : 'zm-item-tag'}) #获取所有子节点                if u'子话题' in p_ch:                    for a in aList:                        token=a.get('data-token')                        a=str(a).replace('\n','').replace('\t','').replace('\r','')                        start=str(a).find('>')                        end=str(a).rfind('')                        new_node=str(str(a)[start+1:end])                        curr.execute('select id from rooms where name=%s',(new_node)) #先保证名字绝不相同                        y= curr.fetchone()                        if not y:                            print "y=",y,"new_node=",new_node,"token=",token                            queue.put((token,new_node,node_name))            except Exception as e:                print "add queue error",e    except Exception as e:        print "get html error",edef getContent(n,name,p,top_id):    try:        global counter        curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同        y= curr.fetchone()        print "exist?? 
",y,"n=",n        if not y:            url="https://www.zhihu.com/topic/"+str(n)+"/hot"            html=get_html(url)            if html is None:                return            soup = BeautifulSoup(html)            title=soup.find('div', {'id' : 'zh-topic-title'}).find('h2').text            pic_path=soup.find('a',{'id':'zh-avartar-edit-form'}).find('img').get('src')            description=soup.find('div',{'class':'zm-editable-content'})            if description is not None:                description=description.text            if (u"未归类" in title or u"根话题" in title): #允许入库,避免死循环                description=None            tag_path=download_img(pic_path,top_id)            print "tag_path=",tag_path            if (tag_path is not None) or tag_path==True:                if tag_path==True:                    tag_path=None                father_id=2 #默认为杂谈                curr.execute('select id from rooms where name=%s',(p))                results = curr.fetchall()                for r in results:                    father_id=r[0]                name=title                curr.execute('select id from rooms where name=%s',(name)) #先保证名字绝不相同                y= curr.fetchone()                print "store see..",y                if not y:                    friends_num=0                    temp = time.time()                    x = time.localtime(float(temp))                    create_time = time.strftime("%Y-%m-%d %H:%M:%S",x) # get time now                    create_time                    creater_id=None                    room_avatar=tag_path                    is_pass=1                    has_index=0                    reason_id=None                      #print father_id,name,friends_num,create_time,creater_id,room_avatar,is_pass,has_index,reason_id                    ######################有资格入库的内容                    counter=counter+1                    curr.execute("INSERT INTO rooms(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id)VALUES(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",(father_id,name,friends_num,description,create_time,creater_id,room_avatar,is_pass,has_index,reason_id))                    conn.commit() #必须时时进入数据库,不然找不到父节点                    if counter % 200==0:                        print "current node",name,"num",counter    except Exception as e:        print "get content error",e       def work():    global queue    curr.execute('select id,node,parent,name from classify where status=1')    results = curr.fetchall()    for r in results:        top_id=r[0]        node=r[1]        parent=r[2]        name=r[3]        try:            queue.put((node,name,parent)) #首先放入队列            while queue.qsize() >0:                n,p=queue.get() #顶节点出队                getContent(n,p,top_id)                getChildren(n,name) #出队内容的子节点            conn.commit()        except Exception as e:            print "what's wrong",e  def new_work():    global queue    curr.execute('select id,data_id,name from classify_new_copy where status=1')    results = curr.fetchall()    for r in results:        top_id=r[0]        data_id=r[1]        name=r[2]        try:            get_topis(data_id,name,top_id)        except:            passdef get_topis(data_id,name,top_id):    global queue    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'    isGet = True;    offset = -20;    data_id=str(data_id)    while isGet:        offset = offset + 20        values = {'method': 'next', 'params': '{"topic_id":'+data_id+',"offset":'+str(offset)+',"hash_id":""}'}        try:     
       msg=None            try:                data = urllib.urlencode(values)                request = urllib2.Request(url,data,headers)                response = urllib2.urlopen(request,None,5)                html=response.read().decode('utf-8')                json_str = json.loads(html)                ms=json_str['msg']                if len(ms) <5:                    break                msg=ms[0]            except Exception as e:                print "eeeee",e            #print msg            if msg is not None:                soup = BeautifulSoup(str(msg))                blks = soup.find_all('div', {'class' : 'blk'})                for blk in blks:                    page=blk.find('a').get('href')                    if page is not None:                        node=page.replace("/topic/","") #将更多的种子入库                        parent=name                        ne=blk.find('strong').text                        try:                            queue.put((node,ne,parent)) #首先放入队列                            while queue.qsize() >0:                                n,name,p=queue.get() #顶节点出队                                size=queue.qsize()                                if size > 0:                                    print size                                getContent(n,name,p,top_id)                                getChildren(n,name) #出队内容的子节点                            conn.commit()                        except Exception as e:                            print "what's wrong",e          except urllib2.URLError, e:            print "error is",e            pass if __name__ == '__main__':    i=0    while i<400:        new_work()        i=i+1
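
For readers on Python 3, here is a minimal sketch of the same TopicsPlazzaListV2 pagination, using the third-party requests library instead of urllib2. It is not the author's code: the endpoint, the form fields (method, params with topic_id/offset/hash_id) and the 'blk' markup are taken from the script above and date from 2015, so they may no longer match today's Zhihu, and a logged-in Cookie header (omitted here) is normally needed as well.

# Minimal Python 3 sketch of the TopicsPlazzaListV2 pagination used above.
# Assumes requests and beautifulsoup4 are installed; the endpoint and markup
# come from the 2015-era script and may have changed since.
import json

import requests
from bs4 import BeautifulSoup

HEADERS = {
    'User-Agent': 'Mozilla/5.0',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zhihu.com/topics',
    # A logged-in Cookie header is normally required here as well.
}


def iter_topics(data_id, max_pages=5):
    """Yield (token, name) pairs for one top-level category, 20 per page."""
    url = 'https://www.zhihu.com/node/TopicsPlazzaListV2'
    offset = 0
    for _ in range(max_pages):
        payload = {
            'method': 'next',
            'params': json.dumps({'topic_id': int(data_id),
                                  'offset': offset,
                                  'hash_id': ''}),
        }
        resp = requests.post(url, data=payload, headers=HEADERS, timeout=5)
        chunks = resp.json().get('msg', [])
        if len(chunks) < 5:  # same stop condition as the original
            break
        for chunk in chunks:
            soup = BeautifulSoup(chunk, 'html.parser')
            for blk in soup.find_all('div', {'class': 'blk'}):
                link = blk.find('a')
                strong = blk.find('strong')
                if link is None or strong is None:
                    continue
                token = link.get('href', '').replace('/topic/', '')
                yield token, strong.text
        offset += 20


if __name__ == '__main__':
    # 253 is a placeholder: use a data_id collected by getTopics() above.
    for token, name in iter_topics(253, max_pages=2):
        print(token, name)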

A quick note on the database: I'm not attaching a dump here. Just look at the fields used in the code and create the tables yourself; it really is that simple. I used MySQL, so build the schema to fit your own needs (see the sketch below).
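
The post does not give the actual schema, but the column names can be read off the SELECT and INSERT statements in the crawler. Below is a rough sketch that creates the two tables the script writes to, assuming MySQL and guessing the column types, lengths, and defaults (none of these are stated in the original); the classify and classify_new_copy lookup tables that new_work()/work() read from are not reproduced here.

# Hedged sketch of the table layout implied by the crawler's SQL statements.
# Column types are guesses; adjust them to your own needs. Uses the same
# MySQLdb connection settings as the crawler above.
import MySQLdb as mdb

DDL = [
    """CREATE TABLE IF NOT EXISTS classify_new (
        id      INT AUTO_INCREMENT PRIMARY KEY,
        data_id VARCHAR(32),
        name    VARCHAR(255),
        status  TINYINT DEFAULT 1
    ) DEFAULT CHARSET=utf8""",
    # new_work() actually reads from a copy of this table named
    # classify_new_copy with the same columns and a status flag.
    """CREATE TABLE IF NOT EXISTS rooms (
        id          INT AUTO_INCREMENT PRIMARY KEY,
        father_id   INT,
        name        VARCHAR(255),
        friends_num INT,
        description TEXT,
        create_time DATETIME,
        creater_id  INT,
        room_avatar VARCHAR(255),
        is_pass     TINYINT,
        has_index   TINYINT,
        reason_id   INT
    ) DEFAULT CHARSET=utf8""",
]

conn = mdb.connect('127.0.0.1', 'root', 'root', 'zhihu', charset='utf8')
cur = conn.cursor()
for stmt in DDL:
    cur.execute(stmt)
conn.commit()
conn.close()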

If anything is unclear, come find me on Quzhuanpan (去转盘网), which I also built; the current QQ group number is kept up to date there. I'm not posting a QQ number here in case the post gets blocked by the system.
