目標效果
進行訪問
爬取目標網址為 https://s.weibo.com/ 首先建立headers之類的 在network標簽里面我們可以找到query string參數,統統都放進代碼里,將這些參數encode后拼成完整url,對其進行訪問,便可以得到目標網頁的html
def get_Research(research_Words,page):
    params = {
        'q': research_Words,
        'Refer': 'index',
        'page': str(page)
    }
    url = 'https://s.weibo.com/weibo?' + urlencode(params)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.ConnectionError:
        return None
信息提取
對網頁進行分析可以看到,轉發評論之類的都在div節點class='card-act'的里面,而內容則在class = 'card-feed' 的div里面,二者同屬class = 'card'的div里,因此先定位到card,再具體找到需要找的信息所對應的子節點,將其文本保存到一個字典中,將所有的字典組成一個字典列表作為函數返回值。
保存到本地
利用pandas的函數把字典列表保存即可
##導出excel
def export_excel(export):
    pf = pd.DataFrame(list(export))
    #指定字段順序
    order = ['博主id','微博正文','轉發','評論','點贊','發布時間']
    pf = pf[order]
    file_path = pd.ExcelWriter(current_Path + 'name.xlsx')
    pf.fillna(' ',inplace = True)
    #輸出
    pf.to_excel(file_path,encoding = 'utf-8',index = False)
    #保存表格
    file_path.save()
進行爬取
調用函數進行爬取就好,但是實際操作的時候發現問題就是只能爬第一頁的內容,反反復復debug看了好幾遍網頁結構也沒發現第二頁及之后為啥爬不出來,最后終于發現是因為如果沒有登錄微博就看不了第二頁,而程序中沒有進行模擬登錄。 所以想要簡單的再爬好多頁的話還需要加入模擬登錄或者加入一個保有登錄信息的cookies
"""Complete code: scrape Weibo search results and export them to Excel.

Fetches one page of ``https://s.weibo.com/weibo`` search results for a query,
extracts per-post fields with PyQuery, and writes them to ``name.xlsx`` in the
directory that contains this file.
"""
import os
import re
from urllib.parse import urlencode

import pandas as pd
import requests
import xlwt  # noqa: F401  (imported by the original script; not referenced below)
from pyquery import PyQuery as pq

# Directory containing this script, WITH a trailing path separator so existing
# ``current_Path + 'file'`` concatenations keep working on every OS.
current_Path = os.path.join(os.path.dirname(os.path.abspath(__file__)), '')
base_url = 'https://s.weibo.com/'

# NOTE(review): this dict is defined but never sent with any request. Before
# wiring it into requests.get(), check it: 'Host' does not match the
# s.weibo.com endpoint used below, and 'Refer' is presumably meant to be the
# standard 'Referer' header.
headers = {
    'Host': 'm.weibo.cn',
    'Refer': 'https://weibo.com/zzk1996?is_all=1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 Edg/80.0.361.48'
}

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10

# Column names double as the dict keys produced by get_Information() and as
# the Excel header row, so they are defined exactly once here.
COLUMN_ORDER = ['博主id', '微博正文', '轉發', '評論', '點贊', '發布時間']

# Position of each counter inside the '.card-act li' list -> output column.
# Position 0 is skipped, exactly as in the original if/elif chain
# (presumably the "favourite" button - verify against the live page).
_COUNT_FIELDS = {1: '轉發', 2: '評論', 3: '點贊'}


def get_Research(research_Words, page):
    """Fetch the HTML of one Weibo search-result page.

    Args:
        research_Words: Search query text.
        page: Result page number (converted with ``str``).

    Returns:
        The response body as text when the server answers 200; ``None`` on a
        connection error, a timeout, or any other status code.
    """
    params = {
        'q': research_Words,
        'Refer': 'index',
        'page': str(page),
    }
    url = base_url + 'weibo?' + urlencode(params)
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except (requests.ConnectionError, requests.Timeout):
        return None
    if response.status_code == 200:
        return response.text
    return None


def _parse_count(label):
    """Return the digits contained in *label* as an int, or 0 if there are none."""
    digits = re.sub(r'\D', '', label)
    return int(digits) if digits else 0


def get_Information(research_Words, page):
    """Scrape one search-result page into a list of per-post dicts.

    Args:
        research_Words: Search query text.
        page: Result page number.

    Returns:
        A list of dicts keyed by the names in ``COLUMN_ORDER``. A counter key
        is absent when the card has no list item at that position. Returns an
        empty list when the page could not be fetched.
    """
    html = get_Research(research_Words, page)
    if not html:
        # The fetch failed or returned nothing: there is nothing to parse.
        return []

    # Debug dump of the raw page, kept from the original script.
    with open(os.path.join(current_Path, 'test.txt'), 'w+', encoding='utf8') as f:
        f.write(html)

    doc = pq(html)
    res = []
    for card in doc('div[class="card"]').items():
        post = {}

        # Nickname of the poster.
        post['博主id'] = card.find('div')('.name').attr('nick-name')

        # Post text: prefer the expanded node, fall back to the collapsed one.
        text = card('p[node-type="feed_list_content_full"]>a').text()
        if text == '':
            text = card('p[node-type="feed_list_content"]>a').text()
        post['微博正文'] = text

        # Text of the links inside the "from" paragraph (time and device).
        post['發布時間'] = card('p[class="from"]>a').text()

        # Forward / comment / like counters.
        for position, item in enumerate(card('.card-act li').items()):
            field = _COUNT_FIELDS.get(position)
            if field is not None:
                post[field] = _parse_count(item.text())

        res.append(post)
    return res


def export_excel(export):
    """Write the scraped rows to ``name.xlsx`` in the script's directory.

    Args:
        export: Iterable of dicts as produced by ``get_Information``.
    """
    pf = pd.DataFrame(list(export))
    # reindex() fixes the column order and creates any column that no row
    # supplied, where plain ``pf[order]`` would raise KeyError (for example
    # on an empty result list).
    pf = pf.reindex(columns=COLUMN_ORDER)
    pf = pf.fillna(' ')
    # Passing the path directly lets pandas open, write and close the workbook;
    # ExcelWriter.save() and the ``encoding`` argument no longer exist in
    # pandas 2.x.
    pf.to_excel(os.path.join(current_Path, 'name.xlsx'), index=False)


def main():
    """Scrape the first result page for the sample topic and export it."""
    lis = []
    # Only page 1 is fetched: per the accompanying write-up, later pages
    # require a logged-in session (cookies), which this script does not have.
    lis += get_Information('#尼日利亞爆發不明疾病#', 1)
    export_excel(lis)


if __name__ == '__main__':
    main()
|