]>
Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/yahoo.py
4b3aec9d1f5a8ce91f676d585e5f98dd3ba126aa
6 from .common
import InfoExtractor
, SearchInfoExtractor
class YahooIE(InfoExtractor):
    """Information extractor for screen.yahoo.com."""
    _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P<id>\d*?)\.html'

    def _real_extract(self, url):
        """Extract the metadata and stream location of one video page.

        Two strategies are used, selected by whether the downloaded page
        defines a ``YUI.namespace("Media").CONTENT_ID`` value:

        * no CONTENT_ID: query the ``cosmos.bcst.yahoo.com`` MRSS endpoints
          with the numeric id taken from the URL;
        * CONTENT_ID present: query the YQL ``yahoo.media.video.streams``
          table with that id and parse the JSONP answer.

        Returns a dict describing the video.
        Raises ExtractorError when the URL or one of the downloaded
        documents does not have the expected shape.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P<new_id>.+?)";', webpage)

        if m_id is None:
            # TODO: Check which url parameters are required
            info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage')
            # Compiled with re.VERBOSE below: unescaped whitespace in the
            # pattern is ignored, which is why literal spaces are written '\ '.
            info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
                        <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
                        <media:pubStart><!\[CDATA\[(?P<date>.*?)\ .*\]\]></media:pubStart>.*
                        <media:content\ medium="image"\ url="(?P<thumb>.*?)"\ name="LARGETHUMB"
                        '''
            self.report_extraction(video_id)
            m_info = re.search(info_re, webpage, re.VERBOSE|re.DOTALL)
            if m_info is None:
                raise ExtractorError(u'Unable to extract video info')
            video_title = m_info.group('title')
            video_description = m_info.group('description')
            video_thumb = m_info.group('thumb')
            video_date = m_info.group('date')
            # The feed gives the date as MM/DD/YYYY; convert it to YYYYMMDD.
            video_date = datetime.datetime.strptime(video_date, '%m/%d/%Y').strftime('%Y%m%d')

            # TODO: Find a way to get mp4 videos
            rest_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;element=stream;outputformat=mrss;id=%s;lmsoverride=1;bw=375;dynamicstream=1;cb=83521105;tech=flv,mp4;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id
            webpage = self._download_webpage(rest_url, video_id, u'Downloading video url webpage')
            m_rest = re.search(r'<media:content url="(?P<url>.*?)" path="(?P<path>.*?)"', webpage)
            # The match must be validated before its groups are read;
            # otherwise a failed search raises AttributeError on None
            # and the ExtractorError below can never be reached.
            if m_rest is None:
                raise ExtractorError(u'Unable to extract video url')
            video_url = m_rest.group('url')
            video_path = m_rest.group('path')

        else: # We have to use a different method if another id is defined
            long_id = m_id.group('new_id')
            info_url = 'http://video.query.yahoo.com/v1/public/yql?q=SELECT%20*%20FROM%20yahoo.media.video.streams%20WHERE%20id%3D%22' + long_id + '%22%20AND%20format%3D%22mp4%2Cflv%22%20AND%20protocol%3D%22rtmp%2Chttp%22%20AND%20plrs%3D%2286Gj0vCaSzV_Iuf6hNylf2%22%20AND%20acctid%3D%22389%22%20AND%20plidl%3D%22%22%20AND%20pspid%3D%22792700001%22%20AND%20offnetwork%3D%22false%22%20AND%20site%3D%22ivy%22%20AND%20lang%3D%22en-US%22%20AND%20region%3D%22US%22%20AND%20override%3D%22none%22%3B&env=prod&format=json&callback=YUI.Env.JSONP.yui_3_8_1_1_1368368376830_335'
            webpage = self._download_webpage(info_url, video_id, u'Downloading info json')
            # The answer is JSONP: strip the callback wrapper to get the JSON.
            m_json = re.search(r'YUI.Env.JSONP.yui.*?\((.*?)\);', webpage)
            if m_json is None:
                raise ExtractorError(u'Unable to extract info json')
            json_str = m_json.group(1)
            info = json.loads(json_str)
            res = info[u'query'][u'results'][u'mediaObj'][0]
            stream = res[u'streams'][0]
            video_path = stream[u'path']
            video_url = stream[u'host']
            # NOTE(review): the assignment of ``meta`` was not visible in the
            # reviewed copy; presumably it is the 'meta' entry of the media
            # object -- confirm against the upstream file.
            meta = res[u'meta']
            video_title = meta[u'title']
            video_description = meta[u'description']
            video_thumb = meta[u'thumbnail']
            video_date = None # I can't find it

        # NOTE(review): only 'play_path', 'description', 'thumbnail' and
        # 'upload_date' were visible in the reviewed copy; the 'id', 'url',
        # 'title' and 'ext' entries are reconstructed -- confirm.
        info_dict = {
            'id': video_id,
            'url': video_url,
            'play_path': video_path,
            'title': video_title,
            'description': video_description,
            'thumbnail': video_thumb,
            'upload_date': video_date,
            'ext': 'flv',
        }
        return info_dict
81 class YahooSearchIE(SearchInfoExtractor
):
82 """Information Extractor for Yahoo! Video search queries."""
85 IE_NAME
= u
'screen.yahoo:search'
86 _SEARCH_KEY
= 'yvsearch'
88 def _get_n_results(self
, query
, n
):
89 """Get a specified number of results for a query"""
96 for pagenum
in itertools
.count(0):
97 result_url
= u
'http://video.search.yahoo.com/search/?p=%s&fr=screen&o=js&gs=0&b=%d' % (compat_urllib_parse
.quote_plus(query
), pagenum
* 30)
98 webpage
= self
._download
_webpage
(result_url
, query
,
99 note
='Downloading results page '+str(pagenum
+1))
100 info
= json
.loads(webpage
)
102 results
= info
[u
'results']
104 for (i
, r
) in enumerate(results
):
105 if (pagenum
* 30) +i
>= n
:
107 mobj
= re
.search(r
'(?P<url>screen\.yahoo\.com/.*?-\d*?\.html)"', r
)
108 e
= self
.url_result('http://' + mobj
.group('url'), 'Yahoo')
109 res
['entries'].append(e
)
110 if (pagenum
* 30 +i
>= n
) or (m
[u
'last'] >= (m
[u
'total'] -1 )):