Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/escapist.py
Imported Upstream version 2015.02.28
[youtubedl] / youtube_dl / extractor / escapist.py
1 from __future__ import unicode_literals
2
3 from .common import InfoExtractor
4 from ..compat import (
5 compat_urllib_parse,
6 compat_urllib_request,
7 )
8 from ..utils import (
9 ExtractorError,
10 js_to_json,
11 parse_duration,
12 )
13
14
class EscapistIE(InfoExtractor):
    """Information extractor for video pages on escapistmagazine.com.

    The video page carries a player configuration URL in its flashvars.
    That configuration is downloaded (normal quality, then an optional
    ``hq=1`` variant) and its playlist entries become the formats of the
    returned result.
    """

    _VALID_URL = r'https?://?(www\.)?escapistmagazine\.com/videos/view/[^/?#]+/(?P<id>[0-9]+)-[^/?#]*(?:$|[?#])'
    # Sent with the page request and the configuration requests, and attached
    # to every extracted format through its 'http_headers' entry.
    _USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko'
    _TEST = {
        'url': 'http://www.escapistmagazine.com/videos/view/the-escapist-presents/6618-Breaking-Down-Baldurs-Gate',
        'md5': 'ab3a706c681efca53f0a35f1415cf0d1',
        'info_dict': {
            'id': '6618',
            'ext': 'mp4',
            'description': "Baldur's Gate: Original, Modded or Enhanced Edition? I'll break down what you can expect from the new Baldur's Gate: Enhanced Edition.",
            'uploader_id': 'the-escapist-presents',
            'uploader': 'The Escapist Presents',
            'title': "Breaking Down Baldur's Gate",
            'thumbnail': r're:^https?://.*\.jpg$',
            'duration': 264,
        }
    }

    def _real_extract(self, url):
        """Extract the video (and optionally its postroll ad) found at *url*.

        Returns an info dict for the video. When the downloader parameter
        'include_ads' is set and postroll formats were found, a playlist
        containing the video followed by the ad is returned instead.

        Raises ExtractorError when the last format after sorting points
        into '/escapist/sales-marketing/', which the site serves to blocked
        IP addresses.
        """
        video_id = self._match_id(url)
        webpage_req = compat_urllib_request.Request(url)
        webpage_req.add_header('User-Agent', self._USER_AGENT)
        webpage = self._download_webpage(webpage_req, video_id)

        uploader_id = self._html_search_regex(
            r"<h1\s+class='headline'>\s*<a\s+href='/videos/view/(.*?)'",
            webpage, 'uploader ID', fatal=False)
        uploader = self._html_search_regex(
            r"<h1\s+class='headline'>(.*?)</a>",
            webpage, 'uploader', fatal=False)
        description = self._html_search_meta('description', webpage)
        duration = parse_duration(self._html_search_meta('duration', webpage))

        raw_title = self._html_search_meta('title', webpage, fatal=True)
        # The text after ' : ' is used as the video title. Fall back to the
        # whole meta title when the separator is missing rather than
        # producing an empty title.
        title = raw_title.partition(' : ')[2] or raw_title

        config_url = compat_urllib_parse.unquote(self._html_search_regex(
            r'''(?x)
            (?:
                <param\s+name="flashvars".*?\s+value="config=|
                flashvars=&quot;config=
            )
            (https?://[^"&]+)
            ''',
            webpage, 'config URL'))

        formats = []
        ad_formats = []

        def _add_format(name, cfg_url, quality):
            """Download the configuration at *cfg_url* and collect its formats.

            Playlist entries whose 'eventCategory' is 'Video' are appended to
            ``formats``, 'Video Postroll' entries to ``ad_formats``; entries
            of any other category are ignored.
            """
            cfg_req = compat_urllib_request.Request(cfg_url)
            cfg_req.add_header('User-Agent', self._USER_AGENT)
            config = self._download_json(
                cfg_req, video_id,
                'Downloading ' + name + ' configuration',
                'Unable to download ' + name + ' configuration',
                transform_source=js_to_json)

            playlist = config['playlist']
            for p in playlist:
                if p.get('eventCategory') == 'Video':
                    ar = formats
                elif p.get('eventCategory') == 'Video Postroll':
                    ar = ad_formats
                else:
                    continue

                ar.append({
                    'url': p['url'],
                    'format_id': name,
                    'quality': quality,
                    'http_headers': {
                        'User-Agent': self._USER_AGENT,
                    },
                })

        _add_format('normal', config_url, quality=0)
        # Append hq=1 exactly once, using the separator that matches whether
        # the configuration URL already has a query string.
        hq_url = config_url + ('&' if '?' in config_url else '?') + 'hq=1'
        try:
            _add_format('hq', hq_url, quality=1)
        except ExtractorError:
            pass  # That's fine, we'll just use normal quality
        self._sort_formats(formats)

        if '/escapist/sales-marketing/' in formats[-1]['url']:
            raise ExtractorError('This IP address has been blocked by The Escapist', expected=True)

        res = {
            'id': video_id,
            'formats': formats,
            'uploader': uploader,
            'uploader_id': uploader_id,
            'title': title,
            'thumbnail': self._og_search_thumbnail(webpage),
            'description': description,
            'duration': duration,
        }

        # Only wrap the result in a playlist when ads were requested and a
        # postroll was actually present in one of the configurations.
        if self._downloader.params.get('include_ads') and ad_formats:
            self._sort_formats(ad_formats)
            ad_res = {
                'id': '%s-ad' % video_id,
                'title': '%s (Postroll)' % title,
                'formats': ad_formats,
            }
            return {
                '_type': 'playlist',
                'entries': [res, ad_res],
                'title': title,
                'id': video_id,
            }

        return res