之前一直使用 flvcd 下载优酷视频,这周看到老师讲的爬虫,手痒痒了。于是捣腾了一下:先从优酷页面爬取视频页面地址,再访问 flvcd 抓取下载地址,最后生成迅雷下载列表。
通过写这个爬虫,对正则有了更深入的理解。优酷的页面地址分2种:
http://v\.youku\.com 开头的为视频播放页面,例如:
http://v.youku.com/v_show/id_XNDkyODQwNjgw.html 《楚汉传奇 第40集》,用于单集下载地址的抓取。
http://www\.youku\.com/show_page 开头的为剧集页面,例如:
http://www.youku.com/show_page/id_z70902150919c11e0a046.html 《楚汉传奇》,可以批量抓取剧集的下载地址哦!
欢迎大家拍砖,有哪些好的建议请指出!
#coding="utf-8"
import re
import urllib2
import threading
import Queue
class youku(threading.Thread):
    """Producer thread that scrapes Youku links from one page into a queue.

    The thread starts itself at the end of ``__init__``. When it runs, it
    downloads ``baseUrl`` and puts every distinct matching link found in the
    page body into ``youkuQueue``: video-play (``v_show``) links when
    ``baseUrl`` is itself a ``v_show`` URL, series (``show_page``) links
    otherwise.
    """

    def __init__(self, baseUrl, youkuQueue):
        """Store the start URL and the output queue, then start the thread.

        baseUrl    -- page to scrape; ``http://`` is prepended when the value
                      carries no http/https scheme.
        youkuQueue -- queue-like object; ``run`` calls ``put`` on it once per
                      link found.
        """
        threading.Thread.__init__(self)
        if self.hasHttp().match(baseUrl):
            self.baseUrl = baseUrl
        else:
            self.baseUrl = 'http://' + baseUrl
        self.youkuQueue = youkuQueue
        self.vshow = self.youku_v_show()
        self.showpage = self.youku_show_page()
        # Constructing the object launches the thread; existing callers rely
        # on this, so it is kept.
        self.start()

    def hasHttp(self):
        """Return a pattern matching a leading ``http://`` or ``https://``."""
        # https is accepted so that such a URL is not mangled into
        # 'http://https://...'.
        return re.compile(r"^https?://")

    def youku_v_show(self):
        """Return the pattern for a single video-play page URL."""
        # The id part is non-greedy and excludes whitespace, quotes and angle
        # brackets so that two links on one HTML line are not merged into a
        # single match by findall.
        return re.compile(r'https?://v\.youku\.com/v_show/id_[^\s"\'<>]+?\.html')

    def youku_show_page(self):
        """Return the pattern for a series (show_page) URL."""
        return re.compile(r'https?://www\.youku\.com/show_page/id_[^\s"\'<>]+?\.html')

    def parseUrl(self):
        """Download ``baseUrl`` and return the set of matching links in it."""
        resp = urllib2.urlopen(self.baseUrl)
        try:
            content = resp.read()
        finally:
            # Release the connection even if read() fails.
            resp.close()
        if self.vshow.match(self.baseUrl):
            pattern = self.vshow
        else:
            pattern = self.showpage
        return set(pattern.findall(content))

    def run(self):
        """Thread body: scrape the page and enqueue every link found."""
        print("start parse Youku ")
        matches = self.parseUrl()
        print(matches)
        for link in matches:
            print('matches:%s' % link)
            self.youkuQueue.put(link)
# Disabled demo entry point: everything between the triple quotes below is a
# bare string literal, so none of it executes. Remove the quotes (and restore
# the indentation) to run the youku scraper against Base_Url.
"""
if __name__=='__main__':
Base_Url="http://www.youku.com/show_page/id_z70902150919c11e0a046.html"
#Base_Url="http://tv.youku.com/search/"
#Base_Url="http://v.youku.com/v_show/id_XNDkyODUyMTQ0.html"
#Base_Url="http://www.baidu.com"
youkuQueue=Queue.Queue()
youku=youku(Base_Url,youkuQueue)
"""
复制代码
#coding="utf-8"
import re
import urllib
import urllib2
import threading
import Queue
class flcvd(threading.Thread):
    """Worker thread that resolves Youku page URLs into download URLs.

    Each URL taken from ``youkuQueue`` is submitted to www.flvcd.com's
    ``parse.php``; every ``getFlvPath`` download address found in the response
    is appended to ``downloadList``. The thread starts itself at the end of
    ``__init__`` and exits once it finds the queue empty after finishing an
    item.
    """

    # Endpoint the query string is appended to (attribute name kept as-is
    # for compatibility).
    Bae_Url = "http://www.flvcd.com/parse.php?"

    def __init__(self, worknum, form, downloadList, youkuQueue):
        """Store the configuration and start the thread.

        worknum      -- integer worker id, used only in log output.
        form         -- value sent as the ``format`` query parameter; a falsy
                        value sends an empty ``format``.
        downloadList -- list that receives the extracted download URLs.
        youkuQueue   -- queue of Youku page URLs to resolve.
        """
        threading.Thread.__init__(self)
        self.showpage = self.hidentFileUrl()
        self.vshow = self.youku_v_show()
        self.herfshow = self.herfFileUrl()
        self.form = form
        self.downloadList = downloadList
        self.youkuQueue = youkuQueue
        self.worknum = worknum
        # Constructing the object launches the thread; existing callers rely
        # on this, so it is kept.
        self.start()

    def hidentFileUrl(self):
        """Return the pattern for bare ``getFlvPath`` URLs in the response.

        Each match runs to the end of its line and may include one trailing
        whitespace character; ``run`` strips it.
        """
        # The literal must stay on one line: a line break inside it is a
        # syntax error.
        return re.compile(r'(http://f.youku.com/player/getFlvPath.+\s?)')

    def herfFileUrl(self):
        """Return the pattern for ``getFlvPath`` URLs inside ``<a href="...">``."""
        return re.compile(r'<a href=\"(http://f.youku.com/player/getFlvPath[^\"]+)')

    def youku_v_show(self):
        """Return the pattern identifying a single video-play page URL."""
        return re.compile(r'https?://v\.youku\.com/v_show/id_.+\.html')

    def headers(self, url):
        """Return the HTTP request headers, using ``url`` as the Referer."""
        return {
            'Host': 'www.flvcd.com',
            'Referer': url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.97 ',
        }

    def compileUrl(self, youkuUrl):
        """Build the parse.php request URL for ``youkuUrl``."""
        query = urllib.urlencode({'kw': youkuUrl})
        fmt = "&format=%s" % (self.form or '')
        return "%s%s%s" % (self.Bae_Url, query, fmt)

    def parseUrl(self, youkuUrl):
        """Query flvcd for ``youkuUrl`` and return the download URLs found.

        A ``v_show`` URL is parsed with the ``<a href>`` pattern; any other
        URL is parsed with the bare-URL pattern.
        """
        targetUrl = self.compileUrl(youkuUrl)
        req = urllib2.Request(url=targetUrl, headers=self.headers(targetUrl))
        resp = urllib2.urlopen(req)
        try:
            content = resp.read()
        finally:
            # Release the connection even if read() fails.
            resp.close()
        if self.vshow.match(youkuUrl):
            return self.herfshow.findall(content)
        return self.showpage.findall(content)

    def run(self):
        """Thread body: resolve queued URLs until the queue is found empty."""
        while True:
            # NOTE(review): if youkuQueue is a Queue.Queue, get() blocks with
            # no timeout, so a worker that arrives after another worker took
            # the last item will wait here forever.
            youkuUrl = self.youkuQueue.get()
            print('[No:%d] flvcd robot youku url>>%s' % (self.worknum, youkuUrl))
            for item in self.parseUrl(youkuUrl):
                # Matches may carry trailing whitespace from the pattern.
                self.downloadList.append(item.rstrip())
            if self.youkuQueue.empty():
                print('[No:%d] flcvd no work ,bye bye...' % self.worknum)
                break
""" """
if __name__ == '__main__':
    # Demo: resolve one video-play page through a single flcvd worker.
    youkuQueue = Queue.Queue()
    youkuQueue.put('http://v.youku.com/v_show/id_XNDkzNTk2NDQw.html')
    # Alternative input: a series page.
    #youkuQueue.put('http://www.youku.com/show_page/id_z70902150919c11e0a046.html')
    downloadList = []
    # Bound to a separate name so the flcvd class is not replaced by its
    # instance; the constructor starts the worker thread itself.
    worker = flcvd(1, 'super', downloadList, youkuQueue)
复制代码