一区二区三区日韩精品-日韩经典一区二区三区-五月激情综合丁香婷婷-欧美精品中文字幕专区

分享

用python實現的抓取騰訊視頻所有電影的爬蟲

 free_light 2014-04-30

1. [代碼]用python實現的抓取騰訊視頻所有電影的爬蟲     跳至 [1] [全屏預覽]

001# -*- coding: utf-8 -*-
002# by awakenjoys. my site: www.dianying.at
003import re
004import urllib2
005from bs4 import BeautifulSoup
006import string, time
007import pymongo
008 
# Module-level state shared by the crawler functions below.
NUM = 0          # running count of movies handled so far
m_type = u''     # category (type) recorded with each stored movie
m_site = u'qq'   # identifier of the site the movies are scraped from
012 
def gethtml(url):
    """Fetch ``url`` and return the raw response body.

    The response is closed in a ``finally`` clause so that the underlying
    socket is released even when reading fails; the crawler calls this
    once per page, so leaked responses would otherwise accumulate.

    Errors raised by ``urllib2`` while opening or reading propagate to the
    caller unchanged.
    """
    response = urllib2.urlopen(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()
019 
def gettags(html):
    """Extract the movie categories from the category listing page.

    Parameters:
        html: markup of the category listing page.

    Returns:
        A dict mapping each category title to its URL (both decoded from
        UTF-8). The dict is empty when no category could be found; in that
        case "Not Find" is printed.

    Side effect:
        The module-level ``m_type`` is rebound to every title in turn, so
        after the call it holds the last category that was parsed.
    """
    global m_type

    # Bound up front so the "nothing found" path can still return a dict.
    tags_url = {}

    soup = BeautifulSoup(html)
    # Target container: <ul class="clearfix _group" gname="mi_type" gtype="1">
    tags_all = soup.find_all('ul', {'class': 'clearfix _group', 'gname': 'mi_type'})

    # Target entries:
    # <a _hot="tag.sub" class="_gtag _hotkey" href="..." title="..." tvalue="0">...</a>
    re_tags = r'<a _hot=\"tag\.sub\" class=\"_gtag _hotkey\" href=\"(.+?)\" title=\"(.+?)\" tvalue=\"(.+?)\">.+?</a>'
    p = re.compile(re_tags, re.DOTALL)

    # Only the first matching <ul> is inspected; a page without one simply
    # yields no tags instead of raising IndexError.
    tags = p.findall(str(tags_all[0])) if tags_all else []
    if not tags:
        print("Not Find")
        return tags_url

    for tag in tags:
        tag_url = tag[0].decode('utf-8')
        m_type = tag[1].decode('utf-8')
        tags_url[m_type] = tag_url
    return tags_url
047 
def get_pages(tag_url):
    """Return the number of listing pages for one category.

    Parameters:
        tag_url: URL of the category's first listing page.

    Returns:
        The text of the second-to-last pager link (a string) when the pager
        has more than one link, otherwise the integer 1. Callers are
        expected to pass the result through ``int()``.
    """
    soup = BeautifulSoup(gethtml(tag_url))

    # Pager container: <div class="mod_pagenav" id="pager">
    div_page = soup.find_all('div', {'class': 'mod_pagenav', 'id': 'pager'})
    if not div_page:
        # No pager on the page at all: treat the category as a single page
        # rather than failing with IndexError.
        return 1

    # Pager entries: <a class="c_txt6" title="25"><span>25</span></a>
    re_pages = r'<a class=.+?><span>(.+?)</span></a>'
    pages = re.compile(re_pages, re.DOTALL).findall(str(div_page[0]))

    # NOTE(review): the last link is presumably "next page", which is why the
    # second-to-last entry is used as the page count -- confirm against the
    # live markup.
    if len(pages) > 1:
        return pages[-2]
    return 1
067     
068 
def getmovielist(html):
    """Pass every movie list block of a listing page to ``getmovie``.

    Each ``<ul class="mod_list_pic_130">`` element is serialised, stripped
    of newline characters and handed over for extraction.
    """
    listing = BeautifulSoup(html)
    for movie_list in listing.find_all('ul', {'class': 'mod_list_pic_130'}):
        flattened = str(movie_list).replace('\n', '')
        getmovie(flattened)
079 
080 
def getmovie(html):
    """Extract the movies from one list block and store them in MongoDB.

    Parameters:
        html: serialised ``<ul class="mod_list_pic_130">`` markup with the
            newlines removed.

    For every ``<li>`` poster entry found, a document holding the movie
    title, its URL, the site (``m_site``) and the current category
    (``m_type``) is printed and inserted into the ``playlinks`` collection
    of the ``dianying`` database on localhost:27017. The module-level
    ``NUM`` counter is advanced once per movie. Nothing happens when the
    markup contains no movie entry.
    """
    global NUM

    # Each entry looks like:
    # <li><a class="mod_poster_130" href="URL" target="_blank" title="TITLE"><img ...</li>
    re_movie = r'<li><a class=\"mod_poster_130\" href=\"(.+?)\" target=\"_blank\" title=\"(.+?)\"><img.+?</li>'
    movies = re.compile(re_movie, re.DOTALL).findall(html)
    if not movies:
        return

    conn = pymongo.Connection('localhost', 27017)
    try:
        playlinks = conn.dianying.playlinks
        for movie_url, movie_title in movies:
            # Count each movie exactly once so NUM is the real movie count.
            NUM += 1
            print("%s : %d" % ("=" * 70, NUM))
            values = dict(
                movie_title=movie_title,
                movie_url=movie_url,
                movie_site=m_site,
                movie_type=m_type,
            )
            print(values)
            playlinks.insert(values)
            print("_" * 70)
    finally:
        # This function runs once per list block; release the connection
        # instead of leaving one open for every call.
        conn.close()
111 
112    #else:
113    #   print "Not Find"
114 
def getmovieinfo(url):
    """Return the (href, title) pairs found on a movie's album page.

    Parameters:
        url: address of the movie page to download.

    Returns:
        A list of ``(href, title)`` tuples taken from the first
        ``<div class="pack pack_album album_cover">`` element. The list is
        empty, and "Not find movie info" is printed, when the element or
        the links inside it are missing.
    """
    soup = BeautifulSoup(gethtml(url))
    divs = soup.find_all('div', {'class': 'pack pack_album album_cover'})

    # Links of interest: <a href="..." target="new" title="..." wl="1"> </a>
    re_info = r'<a href=\"(.+?)\" target=\"new\" title=\"(.+?)\" wl=\".+?\"> </a>'
    p_info = re.compile(re_info, re.DOTALL)

    # A page without the album container yields an empty result instead of
    # raising IndexError.
    m_info = p_info.findall(str(divs[0])) if divs else []
    if not m_info:
        print("Not find movie info")
    return m_info
133 
134 
def insertdb(movieinfo):
    """Insert ``movieinfo`` into the ``movies`` collection of ``dianying_at``.

    The function relies on a module-level ``conn``. Nothing visible in this
    file ever assigns it, so the connection is created here on first use
    (same host and port as ``getmovie``) instead of failing with NameError.

    NOTE(review): ``getmovie`` writes to the ``dianying`` database while this
    function writes to ``dianying_at`` -- confirm which one is intended.
    """
    global conn
    if globals().get('conn') is None:
        conn = pymongo.Connection('localhost', 27017)
    conn.dianying_at.movies.insert(movieinfo)
140 
if __name__ == "__main__":
    # Entry point: read the category list, then walk every listing page of
    # every category and store the movies found on it.
    tags_url = "http://v.qq.com/list/1_-1_-1_-1_1_0_0_20_0_-1_0.html"
    tag_urls = gettags(gethtml(tags_url))

    for tag_name, tag_url in tag_urls.items():
        # getmovie() stamps each record with the module-level m_type, so it
        # has to follow the category being crawled; otherwise every movie
        # would carry the last category parsed by gettags().
        m_type = tag_name

        tag_url = tag_url.encode('utf-8')
        print(tag_url)
        maxpage = int(get_pages(tag_url))
        print(maxpage)

        # Listing URLs look like
        # http://v.qq.com/list/1_0_-1_-1_1_0_0_20_0_-1_0.html
        # Strip the fixed tail once, then re-append it with the page index
        # substituted for the leading 0.
        m_url = tag_url.replace('0_20_0_-1_0.html', '')
        for x in range(0, maxpage):
            movie_url = "%s%d_20_0_-1_0.html" % (m_url, x)
            print(movie_url)
            getmovielist(gethtml(movie_url))
            # Short pause between requests.
            time.sleep(0.1)

    本站是提供個人知識管理的網絡存儲空間,所有內容均由用戶發布,不代表本站觀點。請注意甄別內容中的聯系方式、誘導購買等信息,謹防詐騙。如發現有害或侵權內容,請點擊一鍵舉報。
    轉藏 分享 獻花(0)

    0條評論

    發表

    請遵守用戶 評論公約

    類似文章 更多

    精品国产91亚洲一区二区三区| 日韩国产亚洲欧美激情| 国产一级内射麻豆91| 国产传媒欧美日韩成人精品| 黄色美女日本的美女日人| 精品少妇人妻av免费看| 欧美一区二区不卡专区| 久久综合日韩精品免费观看| 自拍偷女厕所拍偷区亚洲综合| 九九热精品视频在线观看| 美国黑人一级黄色大片| 蜜桃传媒视频麻豆第一区| 最近中文字幕高清中文字幕无| 午夜视频成人在线免费| 国产乱人伦精品一区二区三区四区| 欧美欧美日韩综合一区| 亚洲国产av国产av| av国产熟妇露脸在线观看| 亚洲中文字幕在线观看黑人| 国产日韩久久精品一区| 国产亚洲视频香蕉一区| 国产av一二三区在线观看| 国产中文另类天堂二区| 99久久精品免费精品国产| 国产麻豆精品福利在线| 久久99青青精品免费观看| 国产精品久久男人的天堂| 亚洲av首页免费在线观看| 国产一区二区三区香蕉av| 亚洲成人精品免费在线观看| 欧美亚洲另类久久久精品| 欧美不卡午夜中文字幕| 91亚洲国产成人久久| 91偷拍与自偷拍精品| 91蜜臀精品一区二区三区| 精品熟女少妇一区二区三区| 一区中文字幕人妻少妇| 麻豆视传媒短视频免费观看| 91亚洲国产日韩在线| 国产又长又粗又爽免费视频| 久久国内午夜福利直播|