Hello everyone, I'm 皮皮.
一、Preface
A few days ago in the Python Silver exchange group, 【Ming】 shared a Python crawler that scrapes post content from the Eastmoney Guba (東方財(cái)富股吧) stock forum. It is shared here for everyone.
二、Implementation
The code is shown below.
# -*- coding: utf-8 -*-
# @Time : 2023/2/11 21:27
# @Author : Euclid-Jie
# @File : main_class.py
import os
import sys
import time
import pandas as pd
import pymongo
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
import logging
from retrying import retry
from datetime import datetime
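# Third-party dependencies used above: requests, beautifulsoup4 (with lxml), pandas, pymongo, tqdm, retrying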
class guba_comments:
    """
    This class fetches comments from Eastmoney Guba. Two list modes can be selected in get_data():
    1. all: https://guba.eastmoney.com/list,600519_1.html, secCode: 600519, page: 1
    2. hot: https://guba.eastmoney.com/list,600519,99_1.html, secCode: 600519, page: 1
    Because of the site's IP-based rate limiting, a proxy pool needs to be set;
    using proxies from https://www./usercenter/overview/ can solve this problem.
    Program characteristics:
    1. Data is written to MongoDB when the class is initialized with "MongoDB=True"; otherwise it is written to a csv file
    2. A retry mechanism is used: once an error is raised, the program restarts at the last page and num (each page has 80 items)
    """
    def __init__(self, secCode, pages_end, pages_start=1, num_start=0, MongoDB=False, collectionName=None, save_path=None):
        # init
        self.collectionName = collectionName
        self.num_start = num_start
        self.secCode = secCode
        self.pages_end = pages_end
        self.pages_start = pages_start
        # default settings; header and proxies can be assigned after init
        self.header = None
        self.proxies = None
        if save_path is None:
            self.SaveFolderPath = os.getcwd()
        else:
            self.SaveFolderPath = save_path
        if self.collectionName is None:
            self.collectionName = self.secCode
        self.FilePath = secCode + f"_{str(datetime.today().date())}_start{self.pages_start}_end{self.pages_end}" + '.csv'
        self.DBName = 'guba'
        # choose one save method:
        # 1. MongoDB (MongoDB=True)
        # 2. csv (default)
        if MongoDB:
            self.col = self.MongoClient()
        else:
            self.col = None
        # log setting
        log_format = '%(levelname)s %(asctime)s %(filename)s %(lineno)d %(message)s'
        logging.basicConfig(
            filename='test.log',
            format=log_format,
            level=logging.DEBUG
        )
    @staticmethod
    def clear_str(str_raw):
        # strip whitespace and line-break characters; str.strip returns a new string, so reassign it
        for pat in ['\n', ' ', ' ', '\r', '\xa0', '\n\r\n']:
            str_raw = str_raw.strip(pat)
        return str_raw
    @retry(stop_max_attempt_number=10)  # retry up to 10 times
    def get_soup_form_url(self, url: str) -> BeautifulSoup:
        """
        get the html content with requests.get
        :param url:
        :return: BeautifulSoup
        """
        response = requests.get(url, headers=self.header, timeout=60, proxies=self.proxies)  # fetch the page with requests
        html = response.content.decode('utf-8', 'ignore')  # decode the page source to text
        soup = BeautifulSoup(html, features="lxml")  # build the soup object with the "lxml" parser
        return soup
    def get_full_text(self, data_json):
        """
        the href of each item has one of two parent paths:
        1. https://caifuhao
        2. http://guba.eastmoney.com
        :param data_json: the json data without the full text yet
        :return: the data json with full text added
        """
        if 'caifuhao' in data_json['href']:
            url = 'https:' + data_json['href']
            soup = self.get_soup_form_url(url)
            try:
                data_json['full_text'] = soup.find('div', 'article-body').get_text()
                return data_json
            except AttributeError:
                logging.debug('{} got null content'.format(data_json['href']))
                return data_json
        elif '/new' in data_json['href']:
            url = 'http://guba.eastmoney.com' + data_json['href']
            soup = self.get_soup_form_url(url)
            try:
                data_json['full_text'] = self.clear_str(soup.find('div', {'id': 'post_content'}).text)
                return data_json
            except AttributeError:
                logging.debug('{} got null content'.format(data_json['href']))
        else:
            logging.info('{} does not match a known href pattern'.format(data_json['href']))
        return data_json
    def save_data(self, data_df):
        """
        Helper for persisting data; appends to the csv file if it already exists
        :param data_df: the data to save
        :return:
        """
        # join the folder path and the file name
        FileFullPath = os.path.join(self.SaveFolderPath, self.FilePath)
        if os.path.isfile(FileFullPath):
            data_df.to_csv(FileFullPath, mode='a', header=False, index=False, encoding='utf_8_sig')
        else:
            data_df.to_csv(FileFullPath, mode='w', header=True, index=False, encoding='utf_8_sig')
    def get_data_json(self, item):
        """
        get the relevant fields from one list item; in this project
        the keys are "閱讀" (reads), "評論" (comments), and so on;
        get_full_text is then used so the returned json also contains full_text
        :param item:
        :return: json data containing full_text
        """
        spans = item.find_all('span')
        # print(spans)
        data_json = {
            '閱讀': spans[0].text,
            '評論': spans[1].text,
            '標(biāo)題': spans[2].a['title'],
            'href': spans[2].a['href'],
            '作者': spans[3].a.text,
            '最后更新': spans[4].text
        }
        return self.get_full_text(data_json)
    def get_data(self, page):
        """
        process a single page's data
        :param page: the page to be processed
        :return:
        """
        # hot-list mode: Url = 'http://guba.eastmoney.com/list,{},99_{}.html'.format(self.secCode, page)
        Url = 'http://guba.eastmoney.com/list,{}_{}.html'.format(self.secCode, page)
        # print(Url)
        soup = self.get_soup_form_url(Url)
        data_list = soup.find_all('div', 'articleh')
        error_num = 0
        if self.col:
            # save to MongoDB
            for item in data_list[self.num_start:]:
                try:
                    data_json = self.get_data_json(item)
                    self.col.insert_one(data_json)
                    self.t.set_postfix({"狀態(tài)": "已寫num:{}".format(self.num_start)})  # info shown on the right of the progress bar
                    error_num = 0
                except Exception:
                    logging.error('item get_data getting fail')
                    error_num += 1
                    if error_num >= 5:
                        sys.exit()
                finally:
                    self.num_start += 1
        elif self.FilePath:
            # save to csv
            out_df = pd.DataFrame()
            for item in data_list[self.num_start:]:
                try:
                    data_json = self.get_data_json(item)
                    out_df = pd.concat([out_df, pd.DataFrame(data_json, index=[0])])
                    self.t.set_postfix({"狀態(tài)": "已寫入page:{} num:{}".format(page, self.num_start)})  # info shown on the right of the progress bar
                    error_num = 0
                except Exception:
                    logging.error('item get_data getting fail')
                    error_num += 1
                    if error_num >= 5:
                        sys.exit()
                finally:
                    self.num_start += 1
            self.save_data(out_df)
        else:
            raise ValueError("please set at least one method to save data")
    def MongoClient(self):
        # connect to the local MongoDB instance
        myclient = pymongo.MongoClient("mongodb://localhost:27017/")
        mydb = myclient[self.DBName]  # database name
        mycol = mydb[self.collectionName]  # collection (table)
        return mycol
    @retry(stop_max_attempt_number=10)  # retry up to 10 times
    def main(self):
        with tqdm(range(self.pages_start, self.pages_end)) as self.t:
            for page in self.t:
                self.t.set_description("page:{}".format(page))  # info shown on the left of the progress bar
                self.get_data(page)
                time.sleep(5)
                # page finished: reset num_start and move the resume point forward for the retry mechanism
                self.num_start = 0
                self.pages_start += 1
                time.sleep(4)
if __name__ == '__main__':
    # init
    demo = guba_comments('600519', pages_start=1, pages_end=3, save_path=r"C:\Users\pdcfi\Desktop\tmp")
    # settings
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.13; rv:61.0) Gecko/20100101 Firefox/61.0',
    }
    demo.header = header
    # proxies = {'http': 'http://y889.:15818', 'https': 'http://y889.:15818'}
    # demo.proxies = proxies
    # run and get data
    demo.main()
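The demo above writes to a csv file because MongoDB is not enabled. For readers who want the MongoDB path instead, a minimal sketch of the call, assuming a local MongoDB instance listening on the default port 27017 (as hard-coded in MongoClient()), might look like this:

    # A minimal sketch, not part of the original script: switch the storage backend to MongoDB.
    # Assumes a local mongod at mongodb://localhost:27017/; records go to database 'guba',
    # collection '600519' (collectionName defaults to the security code when not given).
    demo_db = guba_comments('600519', pages_start=1, pages_end=3, MongoDB=True)
    demo_db.header = header
    demo_db.main()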
The full script runs to more than 230 lines and is very well put together in every respect; hats off to the author.
Once the run finishes, you get the comment data; here Moutai (600519) is taken as an example.
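The original write-up shows a screenshot of the scraped comments at this point. As a stand-in, here is a minimal sketch for loading and inspecting the csv output of the demo run above with pandas; the file name is hypothetical but follows the pattern built in __init__ (secCode, run date, page range):

import pandas as pd

# Hypothetical output path for the demo run (save_path plus FilePath as built in __init__);
# adjust the date part to the day the script was actually run.
df = pd.read_csv(r"C:\Users\pdcfi\Desktop\tmp\600519_2023-02-11_start1_end3.csv", encoding='utf_8_sig')
print(df.columns.tolist())  # expected columns: 閱讀, 評論, 標(biāo)題, href, 作者, 最后更新, full_text
print(df.head())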
三、Summary
Hello everyone, I'm 皮皮. This article walked through the code of a Python web crawler, giving a concrete explanation and code implementation for the problem and helping the group member solve it smoothly.
Finally, thanks to 【Ming】 for sharing, and to 【eric】 and the others who took part in the discussion.
【A note on asking questions】A friendly reminder: when asking questions in the group, please keep the following in mind. If large data files are involved, anonymize the data first and post a small demo sample, then paste some code (in copyable text form) and remember to include a full screenshot of the error. If the code is short, posting it directly as text is fine; if it is longer than about 50 lines, send a .py file instead.