<input type="hidden" name="lsd" value="AVpI6yeG" autocomplete="off" />
<input type="hidden" name="lgnrnd" value="171524_eEMy" />
<input type="hidden" id="lgnjs" name="lgnjs" value="n" /></span>
使用正則語句將value值提取出即可,Cookie值使用cookiejar存儲即可,方便下一步的使用。代碼如下:
def login_first_step(self):
    """Fetch the login page and prepare the POST body for the second step.

    The page is opened through ``self.opener``.  The hidden form fields
    ``lsd``, ``lgndim``, ``lgnrnd`` and ``lgnjs`` are pulled out of the HTML
    with regular expressions; a field that is not found is sent empty.  The
    resulting form body is stored in ``self.login_post_values``.

    ``self.email`` and ``self.password`` are inserted verbatim, so they must
    already be URL-encoded.
    """
    sent_url = 'https://www.'
    request = urllib2.Request(url=sent_url, headers=self.facebook_header)
    content = self.opener.open(request)
    try:
        html = content.read()
    finally:
        # The body is all we need; release the connection right away.
        content.close()

    # (field name, pattern) -- each pattern captures the field's value.
    hidden_field_patterns = (
        ('lsd', r'<input type="hidden" name="lsd" value="([A-Za-z0-9]*)" autocomplete="off" />'),
        # lgndim may legitimately be empty.
        ('lgndim', r'<input type="hidden" autocomplete="off" name="lgndim" value="([A-Za-z0-9]*)"'),
        ('lgnrnd', r'<input type="hidden" name="lgnrnd" value="([A-Za-z0-9]*_[A-Za-z0-9]*)" />'),
        ('lgnjs', r'<input type="hidden" id="lgnjs" name="lgnjs" value="([A-Za-z0-9]*)" />'),
    )
    fields = {}
    for field_name, pattern in hidden_field_patterns:
        found = re.search(pattern, html)
        fields[field_name] = found.group(1) if found else ''

    # Values to POST in the second step.  The field order is kept exactly as
    # in the original request string.
    self.login_post_values = ''.join([
        'lsd=', fields['lsd'],
        '&email=', self.email,
        '&pass=', self.password,
        '&persistent=&default_persistent=1&timezone=',
        '&lgndim=', fields['lgndim'],
        '&lgnrnd=', fields['lgnrnd'],
        '&lgnjs=', fields['lgnjs'],
        '&locale=zh_CN&next=https%3A%2F%2Fwww.%2F',
    ])

    # Debug output.  The single-argument print(...) form produces the same
    # text under the Python 2 print statement.
    print('-------------------------------------------')
    print('lsd: %s' % fields['lsd'])
    print('lgndim: %s' % fields['lgndim'])
    print('lgnjs: %s' % fields['lgnjs'])
    print('lgnrnd: %s' % fields['lgnrnd'])
    print(self.cj)
    for cookie in self.cj:
        print('%s : %s' % (cookie.name, cookie.value))
    print('-------------------------------------------')
step 2
進入https://www./login.php?login_attempt=1&lwv=110,在header的cookies中提交fr的值,在post中提交下面十一個值:lsd、email、pass、persistent、default_persistent、timezone、lgndim、lgnrnd、lgnjs、locale、next。這些值必須按照指定順序提交,這個按順序提交糾結了我一天。。。。其中email是登陸的用戶名,pass是密碼。locale是地區,比如“zh_CN”。next為https://www./。注意發送前使用urllib.urlencode(values)函數對post的值進行url編碼。其輸入是一個字典,輸出是一段字符串。Python的urllib2包中處理header數據時,是以字典類型作為輸入的,所以不需要進行url編碼,這個小問題也需要注意,否則會浪費很多時間。
def login_second_step(self):
sent_url = 'https://www./login.php?login_attempt=1&lwv=110'
request = urllib2.Request(url=sent_url,headers=self.facebook_header,data=self.login_post_values)
content=self.opener.open(request)
print '-------------------------------------------'
for key in self.cj:
print key.name,':',key.value
print '-------------------------------------------'</span> 這一步的response是302重定向報(bào)文,python會自動(dòng)向新目標(biāo)https://www./發(fā)送一個(gè)新的請求,但是這個(gè)請求沒有帶上我們獲取的cookie值。為了使得重定向報(bào)文得到新的cookie值,需要自己編寫http_error_302()方法,參考了下面這篇博客http://www./blog/2013/08/13/python-urllib2-%E9%87%8D%E5%AE%9A%E5%90%91%E6%97%B6%E8%8E%B7%E5%8F%96cookie/
首先我們要自己編寫一個遇到302重定向時的解決辦法,在這里我的做法比較粗暴,手工處理cookie值,并將新cookie值添加到請求的頭部,代碼如下:
class RedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that copies cookies from a 302 response onto the request.

    When a 302 arrives, the cookies named in its ``Set-Cookie`` header are
    written into a ``Cookie`` header on the request before the standard
    redirect handling runs.
    """

    # Attribute names that can follow a cookie's name=value pair.  They are
    # compared lower-cased, because servers send e.g. both "Path" and "path".
    _COOKIE_ATTRIBUTES = frozenset(['domain', 'expires', 'path', 'max-age', 'samesite'])

    def _parse_set_cookie(self, setcookie):
        """Return a ``{name: value}`` dict of the cookies in a Set-Cookie string.

        The string may hold several cookies separated by commas (this is the
        assumption the comma handling below is built on).  Attributes such as
        ``Path`` or ``Expires`` and flag-only tokens such as ``secure`` are
        dropped.
        """
        cookiemap = {}
        for token in setcookie.split(';'):
            token = token.strip()
            if token.lower().startswith('expires='):
                # The date itself contains one comma ("Wed, 01-Jan-..."), so
                # a following cookie can only start after the second comma.
                pieces = token.split(',', 2)
                if len(pieces) > 2:
                    token = pieces[2].strip()
            else:
                # "secure, next=value": the next cookie starts after the comma.
                pieces = token.split(',', 1)
                if len(pieces) > 1:
                    token = pieces[1].strip()
            name, separator, value = token.partition('=')
            if separator and name.lower() not in self._COOKIE_ATTRIBUTES:
                cookiemap[name] = value
        return cookiemap

    def http_error_302(self, req, fp, code, msg, headers):
        """Add the response's cookies to ``req`` and continue the redirect.

        A response without a ``Set-Cookie`` header, or one that yields no
        cookies, leaves the request's headers untouched.  The return value is
        whatever the base class ``http_error_302`` returns.
        """
        print('############ GOT 302 ###############')
        setcookie = headers.get('Set-Cookie')
        cookiemap = {}
        if setcookie is not None:
            cookiemap = self._parse_set_cookie(str(setcookie))
        print(cookiemap)
        str_cookie = '; '.join('%s=%s' % pair for pair in cookiemap.items())
        print(str_cookie)
        if str_cookie:
            # Set the new cookie value on the request.
            req.add_header('Cookie', str_cookie)
        return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
然后,需要在urllib2包中提交請求的opener中添加我們的302處理方法,代碼如下:
# RedirectHandler is passed as a class (build_opener accepts handler classes
# as well as instances); the cookie processor stores cookies in self.cj.
opener = urllib2.build_opener(httpHandler, httpsHandler, RedirectHandler,
                              urllib2.HTTPCookieProcessor(self.cj))
至此,302重定向問題就解決完了,當python檢測到收到重定向頁面后,會執行我們編寫的出錯處理方法。
step 3
有了前面的鋪墊,第三步就簡單了很多,直接訪問facebook的個人主頁即可。
在這里不得不提一下cookiejar這個小工具,cookiejar來源于cookielib包。這個小工具能夠自動的存儲已經獲取的cookie值(即response頭部的set-cookie值),并在下一次訪問時添加上這些cookie值。當然,我們也能夠自己增刪改查cookiejar中的內容,使用起來很方便。否則我們需要自己動手寫處理cookie的代碼,非常浪費時間。
def login_third_step(self):
sent_url = 'https://www.'
request = urllib2.Request(url=sent_url,headers=self.facebook_header)
content=self.opener.open(request)
print '-------------------------------------------'
print content.read()
print '-------------------------------------------'</span> 到這里,我們已經(jīng)登錄到了facebook,但是只是到了歡迎頁面,并查看不了“朋友圈”,因?yàn)榇藭r(shí),cookie中少了一個(gè)值,“datr”,這個(gè)值在step 3的response的數(shù)據(jù)中,使用相同的辦法找到并提交即可,所以需要對step 3進(jìn)行修改:
def login_third_step(self):
sent_url = 'https://www.'
request = urllib2.Request(url=sent_url,headers=self.facebook_header)
content=self.opener.open(request)
# print content.read()
tmp_html = content.read()
#查找datr
reg = r'"_js_datr","([A-Za-z0-9]*)"'
m = re.compile(reg)
search = re.search(m,tmp_html)
datr = ''
if search:
datr = search.group(1)
print '-------------------------------------------'
print 'datr: ',datr
self.cj.set_cookie(cookielib.Cookie(
version=0,
name='datr',
value=datr,
port=None,
port_specified=False,
domain=".",
domain_specified=True,
domain_initial_dot=False,
path="/",
path_specified=True,
secure=False,
expires=None,
discard=False,
comment=None,
comment_url=None,
rest=None
)) 然后把數(shù)據(jù)再次提交即可:
def login_fourth_step(self):
    """Fetch the home page again and keep its HTML in ``self.html``."""
    sent_url = 'https://www.'
    request = urllib2.Request(url=sent_url, headers=self.facebook_header)
    content = self.opener.open(request)
    print('-------------------------------------------')
    print(' getting html ')
    try:
        self.html = content.read()
    finally:
        # Release the connection even if reading fails.
        content.close()
    print('-------------------------------------------')
至此,模擬登陸就完成了,至于解析數據,放在下一篇文章再討論。整個模擬登陸代碼如下:
# -*- coding:gb2312 -*-
__author__ = 'HYDT'
import urllib2
import re
import cookielib
class RedirectHandler(urllib2.HTTPRedirectHandler):
    """Redirect handler that copies cookies from a 302 response onto the request.

    When a 302 arrives, the cookies named in its ``Set-Cookie`` header are
    written into a ``Cookie`` header on the request before the standard
    redirect handling runs.
    """

    # Attribute names that can follow a cookie's name=value pair.  They are
    # compared lower-cased, because servers send e.g. both "Path" and "path".
    _COOKIE_ATTRIBUTES = frozenset(['domain', 'expires', 'path', 'max-age', 'samesite'])

    def _parse_set_cookie(self, setcookie):
        """Return a ``{name: value}`` dict of the cookies in a Set-Cookie string.

        The string may hold several cookies separated by commas (this is the
        assumption the comma handling below is built on).  Attributes such as
        ``Path`` or ``Expires`` and flag-only tokens such as ``secure`` are
        dropped.
        """
        cookiemap = {}
        for token in setcookie.split(';'):
            token = token.strip()
            if token.lower().startswith('expires='):
                # The date itself contains one comma ("Wed, 01-Jan-..."), so
                # a following cookie can only start after the second comma.
                pieces = token.split(',', 2)
                if len(pieces) > 2:
                    token = pieces[2].strip()
            else:
                # "secure, next=value": the next cookie starts after the comma.
                pieces = token.split(',', 1)
                if len(pieces) > 1:
                    token = pieces[1].strip()
            name, separator, value = token.partition('=')
            if separator and name.lower() not in self._COOKIE_ATTRIBUTES:
                cookiemap[name] = value
        return cookiemap

    def http_error_302(self, req, fp, code, msg, headers):
        """Add the response's cookies to ``req`` and continue the redirect.

        A response without a ``Set-Cookie`` header, or one that yields no
        cookies, leaves the request's headers untouched.  The return value is
        whatever the base class ``http_error_302`` returns.
        """
        print('############ GOT 302 ###############')
        setcookie = headers.get('Set-Cookie')
        cookiemap = {}
        if setcookie is not None:
            cookiemap = self._parse_set_cookie(str(setcookie))
        print(cookiemap)
        str_cookie = '; '.join('%s=%s' % pair for pair in cookiemap.items())
        print(str_cookie)
        if str_cookie:
            req.add_header('Cookie', str_cookie)
        return urllib2.HTTPRedirectHandler.http_error_302(self, req, fp, code, msg, headers)
class get_html():
    """Log in to the site by replaying the browser's HTTP exchange.

    Creating an instance immediately runs the four login steps over the
    network (see ``__init__``).  Afterwards the page fetched by the last
    step is in ``self.html`` and the cookies collected on the way are in
    ``self.cj``.
    """

    # Login name, already URL-encoded: "+86185xxxxxxxx" with "+" written as "%2B".
    email = '%2B86185xxxxxxxx'
    # Password; it is inserted into the POST body verbatim.
    password = ''
    # Class-level defaults; __init__ replaces cj and opener on every instance.
    cj = cookielib.CookieJar()
    login_post_values = ''
    html = ''
    facebook_header = {
        "Connection":"close",
        "Cache-Control":"max-age=0",
        "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Origin":"https://www.",
        "Upgrade-Insecure-Requests":" 1",
        "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                     "Chrome/45.0.2454.101 Safari/537.36",
        "Content-Type":"application/x-www-form-urlencoded",
        "Referer":"https://www./",
        "Accept-Language":"zh-CN,zh;q=0.8"
    }
    opener = urllib2.build_opener()

    def __init__(self):
        """Build a fresh cookie jar and opener, then run all login steps."""
        self.get_proxy()
        self.cj = cookielib.CookieJar()
        self.opener = self.get_opener()
        self.login_first_step()
        self.login_second_step()
        self.login_third_step()
        self.login_fourth_step()

    def get_opener(self):
        """Return an opener with debug output, RedirectHandler and ``self.cj``.

        RedirectHandler is passed as a class; build_opener accepts handler
        classes as well as instances.
        """
        httpHandler = urllib2.HTTPHandler(debuglevel=1)
        httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
        return urllib2.build_opener(httpHandler, httpsHandler, RedirectHandler,
                                    urllib2.HTTPCookieProcessor(self.cj))

    def get_proxy(self):
        """Install a global urllib2 opener that proxies http:// via 127.0.0.1:1080.

        NOTE(review): install_opener only affects ``urllib2.urlopen``.  The
        login steps go through ``self.opener`` (built in ``get_opener``
        without this ProxyHandler) and use https:// URLs, so this proxy does
        not appear to apply to them -- confirm what was intended.
        """
        proxy = {'http':'http://127.0.0.1:1080'}
        proxy_support = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(proxy_support)
        urllib2.install_opener(opener)
        print("#########Open Proxy!##########")

    def _read_response(self, request):
        """Open ``request`` with ``self.opener``, return the body, close the response."""
        content = self.opener.open(request)
        try:
            return content.read()
        finally:
            content.close()

    def _first_group(self, pattern, text):
        """Return group 1 of the first match of ``pattern`` in ``text``, or ''."""
        found = re.search(pattern, text)
        return found.group(1) if found else ''

    def _build_login_post_values(self, fields):
        """Return the login form body for the hidden-field values in ``fields``.

        ``fields`` maps 'lsd', 'lgndim', 'lgnrnd' and 'lgnjs' to their values.
        The field order is kept exactly as in the original request string.
        ``self.email`` and ``self.password`` are inserted verbatim.
        """
        return ''.join([
            'lsd=', fields['lsd'],
            '&email=', self.email,
            '&pass=', self.password,
            '&persistent=&default_persistent=1&timezone=',
            '&lgndim=', fields['lgndim'],
            '&lgnrnd=', fields['lgnrnd'],
            '&lgnjs=', fields['lgnjs'],
            '&locale=zh_CN&next=https%3A%2F%2Fwww.%2F',
        ])

    def _print_cookies(self):
        """Print every cookie in ``self.cj`` as ``name : value``."""
        for cookie in self.cj:
            print('%s : %s' % (cookie.name, cookie.value))

    def login_first_step(self):
        """Fetch the login page and prepare the POST body for the second step.

        The hidden form fields are pulled out of the HTML with regular
        expressions; a field that is not found is sent empty.  The result is
        stored in ``self.login_post_values``.
        """
        sent_url = 'https://www.'
        request = urllib2.Request(url=sent_url, headers=self.facebook_header)
        html = self._read_response(request)
        fields = {
            'lsd': self._first_group(
                r'<input type="hidden" name="lsd" value="([A-Za-z0-9]*)" autocomplete="off" />', html),
            # lgndim may legitimately be empty.
            'lgndim': self._first_group(
                r'<input type="hidden" autocomplete="off" name="lgndim" value="([A-Za-z0-9]*)"', html),
            'lgnrnd': self._first_group(
                r'<input type="hidden" name="lgnrnd" value="([A-Za-z0-9]*_[A-Za-z0-9]*)" />', html),
            'lgnjs': self._first_group(
                r'<input type="hidden" id="lgnjs" name="lgnjs" value="([A-Za-z0-9]*)" />', html),
        }
        # Values to POST in the second step.
        self.login_post_values = self._build_login_post_values(fields)
        # Debug output.  The single-argument print(...) form produces the
        # same text under the Python 2 print statement.
        print('-------------------------------------------')
        print('lsd: %s' % fields['lsd'])
        print('lgndim: %s' % fields['lgndim'])
        print('lgnjs: %s' % fields['lgnjs'])
        print('lgnrnd: %s' % fields['lgnrnd'])
        print(self.cj)
        self._print_cookies()
        print('-------------------------------------------')

    def login_second_step(self):
        """POST the form body prepared by ``login_first_step`` to the login URL.

        The response body is not needed, so the response is closed right away.
        """
        sent_url = 'https://www./login.php?login_attempt=1&lwv=110'
        request = urllib2.Request(url=sent_url, headers=self.facebook_header,
                                  data=self.login_post_values)
        content = self.opener.open(request)
        content.close()
        print('-------------------------------------------')
        self._print_cookies()
        print('-------------------------------------------')

    def login_third_step(self):
        """Fetch the home page and add the ``datr`` value it contains to ``self.cj``.

        When the page holds no ``"_js_datr","<value>"`` pair (or an empty
        value) the cookie jar is left untouched instead of receiving an empty
        ``datr`` cookie.
        """
        sent_url = 'https://www.'
        request = urllib2.Request(url=sent_url, headers=self.facebook_header)
        tmp_html = self._read_response(request)
        # Look for datr.
        datr = self._first_group(r'"_js_datr","([A-Za-z0-9]*)"', tmp_html)
        print('-------------------------------------------')
        print('datr:  %s' % datr)
        if not datr:
            return
        self.cj.set_cookie(cookielib.Cookie(
            version=0,
            name='datr',
            value=datr,
            port=None,
            port_specified=False,
            domain=".",
            domain_specified=True,
            domain_initial_dot=False,
            path="/",
            path_specified=True,
            secure=False,
            expires=None,
            discard=False,
            comment=None,
            comment_url=None,
            rest=None
        ))

    def login_fourth_step(self):
        """Fetch the home page again and keep its HTML in ``self.html``."""
        sent_url = 'https://www.'
        request = urllib2.Request(url=sent_url, headers=self.facebook_header)
        content = self.opener.open(request)
        print('-------------------------------------------')
        print(' getting html ')
        try:
            self.html = content.read()
        finally:
            content.close()
        print('-------------------------------------------')
# get_html()
模擬手工操作的方法
這種方法就十分簡單了,直接上代碼。
def get_cookies(self):
    """Log in through a real Firefox window and return the browser's cookies.

    Types ``self.email`` and ``self.password`` into the login form, submits
    it, follows the logo link, and reads the cookies.  The browser window is
    closed afterwards, also when one of the steps fails.

    Returns:
        Whatever ``browser.get_cookies()`` returns.
    """
    print("==============================")
    print(" Geting Cookies! ")
    print("==============================")
    # Choose the browser.
    browser = webdriver.Firefox()
    try:
        # Open the site.
        browser.get('https://www./')
        # Enter account name and password.
        email_box = browser.find_element_by_id('email')
        email_box.clear()
        email_box.send_keys(self.email)
        password_box = browser.find_element_by_id('pass')
        password_box.clear()
        password_box.send_keys(self.password)
        # Press the login button; the page has been seen with two different
        # markups, so fall back to the second locator when the first fails.
        try:
            browser.find_element_by_xpath('//button[@id="loginbutton"]').send_keys(Keys.ENTER)
        except Exception:
            browser.find_element_by_xpath('//input[@tabindex="4"]').send_keys(Keys.ENTER)
        browser.find_element_by_xpath('//a[@href="https://www./?ref=logo"]').send_keys(Keys.ENTER)
        # Collect the cookies.
        return browser.get_cookies()
    finally:
        # Close the browser.
        browser.close()
總結
在這篇博文中,介紹了兩種模擬登陸的方法,一種是模擬數據交互過程,該方法的優點在于執行速度快,相對較為穩定,缺點是開發時間長。模擬手工操作的方法正好相反,運行速度較慢,但開發速度極快。根據獲取的cookie值可以看出來,一次模擬登陸的cookie值,有效期是三個月。所以,該程序三個月運行一次即可,那么運行速度慢點完全可以接受。所以,在現實的工程中,到底使用哪種方法,可以依據情況而定。
|