#===>Part 1: Basic configuration<===
#1. Project name; used to build the default USER_AGENT and as the name of the logger
BOT_NAME = 'Amazon'
#2. Where the spider modules live
SPIDER_MODULES = ['Amazon.spiders']
NEWSPIDER_MODULE = 'Amazon.spiders'
#3. Client User-Agent request header
#USER_AGENT = 'Amazon (+http://www.)'
#4. Whether to obey the robots.txt rules
ROBOTSTXT_OBEY = False
#5. Whether cookies are enabled; the built-in middleware tracks them with cookiejars. Enabled by default (see the sketch below)
#COOKIES_ENABLED = False
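# A minimal sketch of juggling several cookie sessions in one spider (illustrative;
# 'cookiejar' is the meta key understood by the built-in CookiesMiddleware, and this
# code would live in a spider class, not in this settings file):
# def start_requests(self):
#     for i, url in enumerate(self.start_urls):
#         yield scrapy.Request(url, meta={'cookiejar': i}, callback=self.parse)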
#6. The Telnet console lets you inspect and control the running crawler: connect with telnet <ip> <port>, then drive it with commands (see the example session below)
#TELNETCONSOLE_ENABLED = False
#TELNETCONSOLE_HOST = '127.0.0.1'
#TELNETCONSOLE_PORT = [6023,]
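# An example session (a sketch assuming the defaults above and that no console
# password is required; est() and the engine object are provided by the built-in
# telnet console):
#   $ telnet 127.0.0.1 6023
#   >>> est()             # print a report of the engine status
#   >>> engine.pause()    # pause the crawl
#   >>> engine.unpause()  # resume it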
#7. Default headers Scrapy sends with its HTTP requests
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
#===>Part 2: Concurrency and delays<===
#1. Maximum number of concurrent requests the downloader processes in total (default: 16)
#CONCURRENT_REQUESTS = 32
#2. Maximum number of concurrent requests per domain (default: 8)
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#3. Maximum number of concurrent requests per IP (default: 0, meaning unlimited). Two caveats:
#   I. If non-zero, CONCURRENT_REQUESTS_PER_DOMAIN is ignored: concurrency is limited per IP rather than per domain
#   II. It also affects DOWNLOAD_DELAY: if non-zero, the download delay is enforced per IP rather than per domain
#CONCURRENT_REQUESTS_PER_IP = 16
#4. With auto-throttling disabled, this is a fixed delay in seconds between consecutive requests to the same site
#DOWNLOAD_DELAY = 3
#===>Part 3: Auto-throttling: the AutoThrottle extension<===
#I: Introduction
# from scrapy.extensions.throttle import AutoThrottle  # http://scrapy./en/latest/topics/autothrottle.html#topics-autothrottle
# Design goals:
# 1. Be nicer to sites than the fixed default download delay
# 2. Automatically adjust Scrapy to the optimal crawl speed, so the user never has to tune the download delay by hand.
#    The user only sets the maximum allowed concurrency; the extension takes care of the rest.
#II: How does it work?
# In Scrapy, download latency is measured as the time between establishing the TCP connection and receiving the
# HTTP headers. Note that such latencies are hard to measure accurately in a cooperative multitasking environment,
# because Scrapy may be busy processing spider callbacks or unable to download at that moment. Still, the latency
# is a reasonable measure of how busy Scrapy (and ultimately the server) is, and this extension is built on that premise.
#III: Throttling algorithm
# AutoThrottle adjusts the download delay according to the rules below (see the sketch after this list):
#1. Spiders start with a download delay of AUTOTHROTTLE_START_DELAY
#2. When a response is received, the target delay for the site = response latency / AUTOTHROTTLE_TARGET_CONCURRENCY
#3. The download delay for the next request is set to the average of the previous download delay and the target delay
#4. Latencies of non-200 responses are not allowed to decrease the delay
#5. The download delay cannot go below DOWNLOAD_DELAY or above AUTOTHROTTLE_MAX_DELAY
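# A minimal sketch of the update rule above (illustrative only; the function and
# variable names are assumptions, not Scrapy internals):
# def next_delay(prev_delay, latency, target_concurrency, min_delay, max_delay):
#     target_delay = latency / target_concurrency       # rule 2
#     new_delay = (prev_delay + target_delay) / 2.0     # rule 3
#     return max(min_delay, min(new_delay, max_delay))  # rule 5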
#IV: Configuration
# Set to True to enable (default: False)
AUTOTHROTTLE_ENABLED = True
# Initial delay
AUTOTHROTTLE_START_DELAY = 5
# Minimum delay (the floor for AutoThrottle)
DOWNLOAD_DELAY = 3
# Maximum delay
AUTOTHROTTLE_MAX_DELAY = 10
# Average number of requests Scrapy should send in parallel to the remote site. Effective concurrency is still
# capped by CONCURRENT_REQUESTS_PER_DOMAIN and CONCURRENT_REQUESTS_PER_IP. Raising this increases throughput and
# hits the target site harder; lowering it makes the crawler more "polite".
# At any given moment the actual concurrency may be higher or lower than this value; it is a target the crawler
# tries to approach, not a hard limit.
AUTOTHROTTLE_TARGET_CONCURRENCY = 16.0
# Debugging: log throttling stats for every response received
AUTOTHROTTLE_DEBUG = True
CONCURRENT_REQUESTS_PER_DOMAIN = 16
CONCURRENT_REQUESTS_PER_IP = 16
#===>Part 4: Crawl depth and crawl order<===
#1. Maximum depth the crawler may reach; the current depth can be read from response.meta (see the sketch below). 0 means no limit
# DEPTH_LIMIT = 3
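# A sketch of reading the depth inside a spider callback (the 'depth' meta key is
# set by the built-in DepthMiddleware; this code would live in a spider class):
# def parse(self, response):
#     self.logger.info("crawled %s at depth %s", response.url, response.meta.get('depth', 0))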
#2. Crawl order: DEPTH_PRIORITY = 0 means depth-first (LIFO, the default); 1 means breadth-first (FIFO)
# Last in, first out: depth-first
# DEPTH_PRIORITY = 0
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleLifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.LifoMemoryQueue'
# First in, first out: breadth-first
# DEPTH_PRIORITY = 1
# SCHEDULER_DISK_QUEUE = 'scrapy.squeues.PickleFifoDiskQueue'
# SCHEDULER_MEMORY_QUEUE = 'scrapy.squeues.FifoMemoryQueue'
#3. Scheduler queue
# SCHEDULER = 'scrapy.core.scheduler.Scheduler'
# from scrapy.core.scheduler import Scheduler
#4. URL deduplication (a custom dupefilter sketch follows the setting below)
# DUPEFILTER_CLASS = 'step8_king.duplication.RepeatUrl'
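# A minimal custom dupefilter sketch matching the example path above (the module
# 'step8_king.duplication' and the class name RepeatUrl come from the commented
# setting and are hypothetical; only the interface is Scrapy's):
# from scrapy.dupefilters import BaseDupeFilter
#
# class RepeatUrl(BaseDupeFilter):
#     def __init__(self):
#         self.visited = set()  # naive in-memory record of seen URLs
#     def request_seen(self, request):
#         if request.url in self.visited:
#             return True       # True tells the scheduler to drop the request
#         self.visited.add(request.url)
#         return False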
#===>Part 5: Middlewares, pipelines, extensions<===
#1. Enable or disable spider middlewares
# See http://scrapy./en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'Amazon.middlewares.AmazonSpiderMiddleware': 543,
#}
#2. Enable or disable downloader middlewares
# See http://scrapy./en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
# 'Amazon.middlewares.DownMiddleware1': 543,  # proxy middleware (see the sketch below)
}
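# A minimal proxy middleware sketch matching the commented entry above (DownMiddleware1
# is hypothetical and would live in Amazon/middlewares.py; the proxy URL is a placeholder):
# class DownMiddleware1:
#     def process_request(self, request, spider):
#         # 'proxy' is the standard meta key honored by the built-in HttpProxyMiddleware
#         request.meta['proxy'] = 'http://127.0.0.1:8888'
#         return None  # let the request continue through the remaining middlewares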
#3. Enable or disable extensions
# See http://scrapy./en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
#4. Configure item pipelines (a sketch follows the dict below)
# See http://scrapy./en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
# 'Amazon.pipelines.CustomPipeline': 200,
}
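# A minimal pipeline sketch matching the commented entry above (CustomPipeline is
# hypothetical and would live in Amazon/pipelines.py):
# from scrapy.exceptions import DropItem
#
# class CustomPipeline:
#     def process_item(self, item, spider):
#         if not item:
#             raise DropItem('empty item')  # discard bad items
#         return item                       # pass the item on to the next pipeline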
#===>Part 6: Caching<===
"""
1. 啟用緩存
目的用于將已經(jīng)發(fā)送的請(qǐng)求或相應(yīng)緩存下來(lái),以便以后使用
from scrapy.downloadermiddlewares.httpcache import HttpCacheMiddleware
from scrapy.extensions.httpcache import DummyPolicy
from scrapy.extensions.httpcache import FilesystemCacheStorage
"""
# Whether to enable the HTTP cache
# HTTPCACHE_ENABLED = True
# Cache policy: cache every request; subsequent requests for the same page are served straight from the cache
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.DummyPolicy"
# Cache policy: RFC 2616 aware, caching according to HTTP response headers such as Cache-Control and Last-Modified
# HTTPCACHE_POLICY = "scrapy.extensions.httpcache.RFC2616Policy"
# Cache expiration time in seconds (0 means cached pages never expire)
# HTTPCACHE_EXPIRATION_SECS = 0
# Directory where the cache is stored
# HTTPCACHE_DIR = 'httpcache'
# HTTP status codes that should never be cached
# HTTPCACHE_IGNORE_HTTP_CODES = []
# Cache storage backend
# HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
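# The same settings can also be applied to a single spider via custom_settings
# (a sketch; MySpider is hypothetical and would live in the spider module):
# class MySpider(scrapy.Spider):
#     name = 'example'
#     custom_settings = {'HTTPCACHE_ENABLED': True}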