記和朋友學習的一段爬蟲

我的人生寶庫 2020-02-12

展開全文

記和朋友學習的一段爬蟲

目標

效果

記和朋友學習的一段爬蟲

進行訪問

爬取目標網址為 https://s.weibo.com/

首先建立headers之類的

記和朋友學習的一段爬蟲

在 network 標籤裡面我們可以找到 query string 參數，統統都放進代碼裡，將這些參數 encode 後拼成完整 url，對其進行訪問，便可以得到目標網頁的 html

def get_Research(research_Words, page, timeout=10):
    """Fetch one page of Weibo search results and return its HTML.

    Args:
        research_Words: Search keyword, sent as the ``q`` query parameter.
        page: Page number; converted to ``str`` for the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None``
        when the request fails or the server answers with another status.
    """
    params = {
        'q': research_Words,
        'Refer': 'index',
        'page': str(page),
    }
    url = 'https://s.weibo.com/weibo?' + urlencode(params)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors and timeouts alike; callers get None.
        return None
    if response.status_code == 200:
        return response.text
    return None

信息提取

記和朋友學習的一段爬蟲

對網頁進行分析可以看到，轉發評論之類的都在div節點class='card-act'的裡面，而內容則在class = 'card-feed' 的div裡面，二者同屬class = 'card'的div裡，因此先定位到card，再具體找到需要找的信息所對應的子節點，將其文本保存到一個字典中，將所有的字典組成一個字典列表作為函數返回值。

def get_Information(research_Words, page):
    """Scrape one page of search results into a list of per-post dicts.

    Args:
        research_Words: Search keyword passed through to ``get_Research``.
        page: Page number passed through to ``get_Research``.

    Returns:
        A list with one dict per ``div[class="card"]`` element. Each dict
        holds the nickname, the post text, the time/device text and, when
        the card has a ``.card-act`` list, the forward/comment/like counts.
        The list is empty when the page could not be fetched.
    """
    res = []
    html = get_Research(research_Words, page)
    if not html:
        # get_Research returns None on failure; pq(None) and f.write(None)
        # would both raise, so stop here with no records.
        return res
    doc = pq(html)
    # Dump the raw page next to the script so selectors can be checked.
    with open(current_Path + 'test.txt', 'w+', encoding='utf8') as f:
        f.write(html)

    # Position of each <li> inside .card-act -> key it is stored under.
    # Position 0 is ignored.
    count_Keys = {1: '轉(zhuǎn)發(fā)', 2: '評(píng)論', 3: '點(diǎn)贊'}

    # Selectors use double quotes outside so the attribute values can keep
    # their single quotes; nesting single quotes is a syntax error.
    for li in doc("div[class='card']").items():
        temp_Info_Dict = {}

        # Nickname: the nick-name attribute of the .name element.
        info = li.find('div')('.name')
        temp_Info_Dict['博主id'] = info.attr('nick-name')

        # Post text: only the text of the <a> children is collected. The
        # expanded paragraph is tried first, then the collapsed one.
        text = li("p[node-type='feed_list_content_full']>a")
        temp_Info_Dict['微博正文'] = text.text()
        if temp_Info_Dict['微博正文'] == '':
            text = li("p[node-type='feed_list_content']>a")
            temp_Info_Dict['微博正文'] = text.text()

        # Time and device.
        temp_Info_Dict['發(fā)布時(shí)間'] = li("p[class='from']>a").text()

        # Forward / comment / like counts.
        for i, forward in enumerate(li('.card-act li').items()):
            # Strip everything that is not a digit; no digits means zero.
            digits = re.sub(r'\D', '', forward.text())
            num = int(digits) if digits else 0
            if i in count_Keys:
                temp_Info_Dict[count_Keys[i]] = num

        res.append(temp_Info_Dict)
    return res

保存到本地

利用pandas的函數把字典列表保存即可

# Export to Excel
def export_excel(export):
    """Write the scraped records to ``name.xlsx`` under ``current_Path``.

    Args:
        export: Iterable of dicts, one per post, keyed by the column names
            listed in ``order`` below.

    The sheet has the columns in a fixed order, no index column, and a
    single space in every cell for which a record had no value.
    """
    pf = pd.DataFrame(list(export))
    # Fixed column order for the sheet.
    order = ['博主id', '微博正文', '轉(zhuǎn)發(fā)', '評(píng)論', '點(diǎn)贊', '發(fā)布時(shí)間']
    # reindex instead of pf[order]: a column that no record supplied is
    # created empty rather than raising KeyError (e.g. an empty result).
    pf = pf.reindex(columns=order)
    pf = pf.fillna(' ')
    # ExcelWriter.save() and to_excel(encoding=...) were removed in
    # pandas 2.0; the context manager saves and closes the file on exit.
    with pd.ExcelWriter(current_Path + 'name.xlsx') as writer:
        pf.to_excel(writer, index=False)

進行爬取

調用函數進行爬取就好，但是實際操作的時候發現問題就是只能爬第一頁的內容，反反覆覆debug看了好幾遍網頁結構也沒發現第二頁及之後為啥爬不出來，最後終於發現是因為如果沒有登錄微博就看不了第二頁，而程序中沒有進行模擬登錄。

所以想要簡單的再爬好多頁的話還需要加入模擬登錄或者加入一個保有登錄信息的cookies

完整代碼

"""Scrape one page of Weibo search results and export it to Excel."""
from urllib.parse import urlencode
import requests
from pyquery import PyQuery as pq
import os
import re
import xlwt
import pandas as pd

# Directory of this script, with a trailing separator so file names can be
# appended directly. os.sep keeps this correct outside Windows as well.
current_Path = os.path.dirname(os.path.abspath(__file__)) + os.sep
base_url = 'https://s.weibo.com/'
# NOTE(review): this dict is never passed to requests.get below, its Host
# does not match the s.weibo.com URL being fetched, and 'Refer' is probably
# meant to be 'Referer' - confirm before wiring it in.
headers = {
    'Host': 'm.weibo.cn',
    'Refer': 'https://weibo.com/zzk1996?is_all=1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 Edg/80.0.361.48'
}


# Search
def get_Research(research_Words, page, timeout=10):
    """Fetch one page of Weibo search results and return its HTML.

    Args:
        research_Words: Search keyword, sent as the ``q`` query parameter.
        page: Page number; converted to ``str`` for the query string.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the status code is 200; ``None``
        when the request fails or the server answers with another status.
    """
    params = {
        'q': research_Words,
        'Refer': 'index',
        'page': str(page),
    }
    url = 'https://s.weibo.com/weibo?' + urlencode(params)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Covers connection errors and timeouts alike; callers get None.
        return None
    if response.status_code == 200:
        return response.text
    return None


def get_Information(research_Words, page):
    """Scrape one page of search results into a list of per-post dicts.

    Args:
        research_Words: Search keyword passed through to ``get_Research``.
        page: Page number passed through to ``get_Research``.

    Returns:
        A list with one dict per ``div[class="card"]`` element. Each dict
        holds the nickname, the post text, the time/device text and, when
        the card has a ``.card-act`` list, the forward/comment/like counts.
        The list is empty when the page could not be fetched.
    """
    res = []
    html = get_Research(research_Words, page)
    if not html:
        # get_Research returns None on failure; pq(None) and f.write(None)
        # would both raise, so stop here with no records.
        return res
    doc = pq(html)
    # Dump the raw page next to the script so selectors can be checked.
    with open(current_Path + 'test.txt', 'w+', encoding='utf8') as f:
        f.write(html)

    # Position of each <li> inside .card-act -> key it is stored under.
    # Position 0 is ignored.
    count_Keys = {1: '轉(zhuǎn)發(fā)', 2: '評(píng)論', 3: '點(diǎn)贊'}

    # Selectors use double quotes outside so the attribute values can keep
    # their single quotes; nesting single quotes is a syntax error.
    for li in doc("div[class='card']").items():
        temp_Info_Dict = {}

        # Nickname: the nick-name attribute of the .name element.
        info = li.find('div')('.name')
        temp_Info_Dict['博主id'] = info.attr('nick-name')

        # Post text: only the text of the <a> children is collected. The
        # expanded paragraph is tried first, then the collapsed one.
        text = li("p[node-type='feed_list_content_full']>a")
        temp_Info_Dict['微博正文'] = text.text()
        if temp_Info_Dict['微博正文'] == '':
            text = li("p[node-type='feed_list_content']>a")
            temp_Info_Dict['微博正文'] = text.text()

        # Time and device.
        temp_Info_Dict['發(fā)布時(shí)間'] = li("p[class='from']>a").text()

        # Forward / comment / like counts.
        for i, forward in enumerate(li('.card-act li').items()):
            # Strip everything that is not a digit; no digits means zero.
            digits = re.sub(r'\D', '', forward.text())
            num = int(digits) if digits else 0
            if i in count_Keys:
                temp_Info_Dict[count_Keys[i]] = num

        res.append(temp_Info_Dict)
    return res


# Export to Excel
def export_excel(export):
    """Write the scraped records to ``name.xlsx`` under ``current_Path``.

    Args:
        export: Iterable of dicts, one per post, keyed by the column names
            listed in ``order`` below.

    The sheet has the columns in a fixed order, no index column, and a
    single space in every cell for which a record had no value.
    """
    pf = pd.DataFrame(list(export))
    # Fixed column order for the sheet.
    order = ['博主id', '微博正文', '轉(zhuǎn)發(fā)', '評(píng)論', '點(diǎn)贊', '發(fā)布時(shí)間']
    # reindex instead of pf[order]: a column that no record supplied is
    # created empty rather than raising KeyError (e.g. an empty result).
    pf = pf.reindex(columns=order)
    pf = pf.fillna(' ')
    # ExcelWriter.save() and to_excel(encoding=...) were removed in
    # pandas 2.0; the context manager saves and closes the file on exit.
    with pd.ExcelWriter(current_Path + 'name.xlsx') as writer:
        pf.to_excel(writer, index=False)


def main():
    """Scrape the first result page for one keyword and export it."""
    lis = []
    # Only page 1 is requested: the site does not serve later pages to
    # visitors who are not logged in, and no login or cookie is sent here.
    # NOTE(review): the '(...)' fragments inside this keyword and inside the
    # dictionary keys above look like text-extraction artifacts - confirm
    # the intended strings before relying on the results.
    lis += get_Information('#尼日利亞爆發(fā)不明疾病#', 1)
    export_excel(lis)


if __name__ == '__main__':
    main()

本站是提供個人知識管理的網絡存儲空間，所有內容均由用戶發布，不代表本站觀點。請注意甄別內容中的聯繫方式、誘導購買等信息，謹防詐騙。如發現有害或侵權內容，請點擊一鍵舉報。

轉(zhuǎn)藏 分享

QQ空間 QQ好友新浪微博微信

獻(xiàn)花（0） +1

來自：我的人生寶庫 > 《電腦(軟件應(yīng)用)》

舉報(bào)/認(rèn)領(lǐng)