]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py
6123e12564b7934032ed619b672b6277a75bace0
[youtubedl] / youtube_dl / extractor / youtube.py
1 # coding: utf-8
2
3 import collections
4 import errno
5 import io
6 import itertools
7 import json
8 import os.path
9 import re
10 import struct
11 import traceback
12 import zlib
13
14 from .common import InfoExtractor, SearchInfoExtractor
15 from .subtitles import SubtitlesInfoExtractor
16 from ..jsinterp import JSInterpreter
17 from ..utils import (
18 compat_chr,
19 compat_parse_qs,
20 compat_urllib_parse,
21 compat_urllib_request,
22 compat_urlparse,
23 compat_str,
24
25 clean_html,
26 get_cachedir,
27 get_element_by_id,
28 get_element_by_attribute,
29 ExtractorError,
30 int_or_none,
31 PagedList,
32 unescapeHTML,
33 unified_strdate,
34 orderedSet,
35 write_json_file,
36 uppercase_escape,
37 )
38
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""
    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def _set_language(self):
        """Request the language-setting URL.

        Returns True when the (non-fatal) download yields a truthy page,
        False otherwise.
        """
        return bool(self._download_webpage(
            self._LANG_URL, None,
            note=u'Setting language', errnote='unable to set language',
            fatal=False))

    def _login(self):
        """Log in with the configured credentials.

        Returns True when the login succeeded and False in every other case
        (no credentials, login page unavailable, login request failed, or
        the login form was served again).

        Raises ExtractorError when no username is available although
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        login_page = self._download_webpage(
            self._LOGIN_URL, None,
            note=u'Downloading login page',
            errnote=u'unable to fetch login page', fatal=False)
        if login_page is False:
            # Return an explicit False (not None) so that every failure path
            # of this method reports the same value.
            return False

        galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
                                  login_page, u'Login GALX parameter')

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode
        login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k, v in login_form_strs.items())
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')

        req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        login_results = self._download_webpage(
            req, None,
            note=u'Logging in', errnote=u'unable to log in', fatal=False)
        if login_results is False:
            return False
        # Being served the login form again means the credentials were rejected
        if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
            self._downloader.report_warning(u'unable to log in: bad username or password')
            return False
        return True

    def _confirm_age(self):
        """POST the age confirmation form to _AGE_URL; always returns True."""
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        req = compat_urllib_request.Request(
            self._AGE_URL,
            compat_urllib_parse.urlencode(age_form).encode('ascii'))

        self._download_webpage(
            req, None,
            note=u'Confirming age', errnote=u'Unable to confirm age')
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
130
131
132 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
133 IE_DESC = u'YouTube.com'
134 _VALID_URL = r"""(?x)^
135 (
136 (?:https?://|//)? # http(s):// or protocol-independent URL (optional)
137 (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
138 (?:www\.)?deturl\.com/www\.youtube\.com/|
139 (?:www\.)?pwnyoutube\.com/|
140 (?:www\.)?yourepeat\.com/|
141 tube\.majestyc\.net/|
142 youtube\.googleapis\.com/) # the various hostnames, with wildcard subdomains
143 (?:.*?\#/)? # handle anchor (#/) redirect urls
144 (?: # the various things that can precede the ID:
145 (?:(?:v|embed|e)/) # v/ or embed/ or e/
146 |(?: # or the v= param in all its forms
147 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
148 (?:\?|\#!?) # the params delimiter ? or # or #!
149 (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx)
150 v=
151 )
152 ))
153 |youtu\.be/ # just youtu.be/xxxx
154 |https?://(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
155 )
156 )? # all until now is optional -> you can pass the naked ID
157 ([0-9A-Za-z_-]{11}) # here is it! the YouTube video ID
158 (?(1).+)? # if we found the ID, everything can follow
159 $"""
160 _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
161 _formats = {
162 '5': {'ext': 'flv', 'width': 400, 'height': 240},
163 '6': {'ext': 'flv', 'width': 450, 'height': 270},
164 '13': {'ext': '3gp'},
165 '17': {'ext': '3gp', 'width': 176, 'height': 144},
166 '18': {'ext': 'mp4', 'width': 640, 'height': 360},
167 '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
168 '34': {'ext': 'flv', 'width': 640, 'height': 360},
169 '35': {'ext': 'flv', 'width': 854, 'height': 480},
170 '36': {'ext': '3gp', 'width': 320, 'height': 240},
171 '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
172 '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
173 '43': {'ext': 'webm', 'width': 640, 'height': 360},
174 '44': {'ext': 'webm', 'width': 854, 'height': 480},
175 '45': {'ext': 'webm', 'width': 1280, 'height': 720},
176 '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
177
178
179 # 3d videos
180 '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
181 '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
182 '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
183 '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
184 '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
185 '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
186 '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
187
188 # Apple HTTP Live Streaming
189 '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
190 '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
191 '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
192 '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
193 '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
194 '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
195 '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
196
197 # DASH mp4 video
198 '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
199 '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
200 '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
201 '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
202 '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
203 '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
204 '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
205 '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
206
207 # Dash mp4 audio
208 '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
209 '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
210 '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
211
212 # Dash webm
213 '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
214 '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
215 '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
216 '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
217 '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
218 '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
219 '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
220 '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
221 '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
222 '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
223 '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
224 '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
225 '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
226 '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
227 '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
228
229 # Dash webm audio
230 '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 48, 'preference': -50},
231 '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
232
233 # RTMP (unnamed)
234 '_rtmp': {'protocol': 'rtmp'},
235 }
236
237 IE_NAME = u'youtube'
238 _TESTS = [
239 {
240 u"url": u"http://www.youtube.com/watch?v=BaW_jenozKc",
241 u"file": u"BaW_jenozKc.mp4",
242 u"info_dict": {
243 u"title": u"youtube-dl test video \"'/\\ä↭𝕐",
244 u"uploader": u"Philipp Hagemeister",
245 u"uploader_id": u"phihag",
246 u"upload_date": u"20121002",
247 u"description": u"test chars: \"'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .",
248 u"categories": [u'Science & Technology'],
249 }
250 },
251 {
252 u"url": u"http://www.youtube.com/watch?v=UxxajLWwzqY",
253 u"file": u"UxxajLWwzqY.mp4",
254 u"note": u"Test generic use_cipher_signature video (#897)",
255 u"info_dict": {
256 u"upload_date": u"20120506",
257 u"title": u"Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]",
258 u"description": u"md5:fea86fda2d5a5784273df5c7cc994d9f",
259 u"uploader": u"Icona Pop",
260 u"uploader_id": u"IconaPop"
261 }
262 },
263 {
264 u"url": u"https://www.youtube.com/watch?v=07FYdnEawAQ",
265 u"file": u"07FYdnEawAQ.mp4",
266 u"note": u"Test VEVO video with age protection (#956)",
267 u"info_dict": {
268 u"upload_date": u"20130703",
269 u"title": u"Justin Timberlake - Tunnel Vision (Explicit)",
270 u"description": u"md5:64249768eec3bc4276236606ea996373",
271 u"uploader": u"justintimberlakeVEVO",
272 u"uploader_id": u"justintimberlakeVEVO"
273 }
274 },
275 {
276 u"url": u"//www.YouTube.com/watch?v=yZIXLfi8CZQ",
277 u"file": u"yZIXLfi8CZQ.mp4",
278 u"note": u"Embed-only video (#1746)",
279 u"info_dict": {
280 u"upload_date": u"20120608",
281 u"title": u"Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012",
282 u"description": u"md5:09b78bd971f1e3e289601dfba15ca4f7",
283 u"uploader": u"SET India",
284 u"uploader_id": u"setindia"
285 }
286 },
287 {
288 u"url": u"http://www.youtube.com/watch?v=a9LDPn-MO4I",
289 u"file": u"a9LDPn-MO4I.m4a",
290 u"note": u"256k DASH audio (format 141) via DASH manifest",
291 u"info_dict": {
292 u"upload_date": "20121002",
293 u"uploader_id": "8KVIDEO",
294 u"description": "No description available.",
295 u"uploader": "8KVIDEO",
296 u"title": "UHDTV TEST 8K VIDEO.mp4"
297 },
298 u"params": {
299 u"youtube_include_dash_manifest": True,
300 u"format": "141",
301 },
302 },
303 # DASH manifest with encrypted signature
304 {
305 u'url': u'https://www.youtube.com/watch?v=IB3lcPjvWLA',
306 u'info_dict': {
307 u'id': u'IB3lcPjvWLA',
308 u'ext': u'm4a',
309 u'title': u'Afrojack - The Spark ft. Spree Wilson',
310 u'description': u'md5:9717375db5a9a3992be4668bbf3bc0a8',
311 u'uploader': u'AfrojackVEVO',
312 u'uploader_id': u'AfrojackVEVO',
313 u'upload_date': u'20131011',
314 },
315 u"params": {
316 u'youtube_include_dash_manifest': True,
317 u'format': '141',
318 },
319 },
320 ]
321
322
@classmethod
def suitable(cls, url):
    """Return True if this IE should handle *url*.

    URLs accepted by YoutubePlaylistIE are rejected here; otherwise the
    result is whether the URL matches _VALID_URL.
    """
    handled_by_playlist = YoutubePlaylistIE.suitable(url)
    if handled_by_playlist:
        return False
    matched = re.match(cls._VALID_URL, url)
    return matched is not None
328
def __init__(self, *args, **kwargs):
    """Initialise the base class and start with an empty player cache."""
    super(YoutubeIE, self).__init__(*args, **kwargs)
    # Filled by _decrypt_signature, keyed by (player_url, signature length)
    self._player_cache = dict()
332
def report_video_info_webpage_download(self, video_id):
    """Print a note that the video info webpage is being downloaded."""
    message = u'%s: Downloading video info webpage' % video_id
    self.to_screen(message)
336
def report_information_extraction(self, video_id):
    """Print a note that video information is being extracted."""
    message = u'%s: Extracting video information' % video_id
    self.to_screen(message)
340
def report_unavailable_format(self, video_id, format):
    """Print a note that the given format is not available for the video."""
    message = u'%s: Format %s not available' % (video_id, format)
    self.to_screen(message)
344
def report_rtmp_download(self):
    """Print a note that the download will use the RTMP protocol."""
    message = u'RTMP download detected'
    self.to_screen(message)
348
def _extract_signature_function(self, video_id, player_url, slen):
    """Return a callable that deciphers signatures of length *slen*.

    The player type ("js" or "swf") and player id are taken from the file
    name part of *player_url*.  When a cache directory is configured, the
    function is first looked up in (and afterwards stored to) a JSON file
    holding the character permutation; otherwise the player is downloaded
    and parsed.

    video_id is only passed on to the download helpers.

    Raises ExtractorError when the player URL cannot be parsed or names an
    unsupported player type.
    """
    id_m = re.match(r'.*-(?P<id>[a-zA-Z0-9_-]+)\.(?P<ext>[a-z]+)$',
                    player_url)
    if id_m is None:
        # Previously this crashed with AttributeError on None
        raise ExtractorError(u'Cannot identify player %r' % player_url)
    player_type = id_m.group('ext')
    player_id = id_m.group('id')

    # Read from filesystem cache
    func_id = '%s_%s_%d' % (player_type, player_id, slen)
    # Sanity check: the id is used as a file name and must not contain
    # any path component
    assert os.path.basename(func_id) == func_id
    cache_dir = get_cachedir(self._downloader.params)

    cache_enabled = cache_dir is not None
    if cache_enabled:
        cache_fn = os.path.join(os.path.expanduser(cache_dir),
                                u'youtube-sigfuncs',
                                func_id + '.json')
        try:
            with io.open(cache_fn, 'r', encoding='utf-8') as cachef:
                cache_spec = json.load(cachef)
        except IOError:
            pass  # No cache available
        except ValueError:
            # Corrupt cache file: fall through, re-extract the function
            # and overwrite the file below
            pass
        else:
            return lambda s: u''.join(s[i] for i in cache_spec)

    if player_type == 'js':
        code = self._download_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        res = self._parse_sig_js(code)
    elif player_type == 'swf':
        urlh = self._request_webpage(
            player_url, video_id,
            note=u'Downloading %s player %s' % (player_type, player_id),
            errnote=u'Download of %s failed' % player_url)
        code = urlh.read()
        res = self._parse_sig_swf(code)
    else:
        # An explicit raise (instead of ``assert False``) also works when
        # Python runs with -O, where asserts are stripped
        raise ExtractorError(u'Invalid player type %r' % player_type)

    if cache_enabled:
        try:
            # Run the function on a string whose i-th character has code
            # point i; the code points of the result are the permutation
            test_string = u''.join(map(compat_chr, range(slen)))
            cache_res = res(test_string)
            cache_spec = [ord(c) for c in cache_res]
            try:
                os.makedirs(os.path.dirname(cache_fn))
            except OSError as ose:
                if ose.errno != errno.EEXIST:
                    raise
            write_json_file(cache_spec, cache_fn)
        except Exception:
            # Caching is best-effort: warn and still return the function
            tb = traceback.format_exc()
            self._downloader.report_warning(
                u'Writing cache to %r failed: %s' % (cache_fn, tb))

    return res
405
def _print_sig_code(self, func, slen):
    """Print Python code equivalent to *func* for signatures of length *slen*.

    *func* is applied to a test string whose i-th character has code point
    i; the code points of the result give, for every output position, the
    input index it was taken from.  That index list is rendered as a sum of
    ``s[i]`` terms and slices and shown via ``to_screen``.
    """
    def gen_sig_code(idxs):
        # Yield one expression per single index or per run of indices that
        # advance by a constant step of +1 or -1.
        # NOTE(review): with fewer than two indices the loop below never
        # binds ``i`` and the final yield fails - confirm slen is always >= 2
        def _genslice(start, end, step):
            # Render the run start..end (inclusive) as slice syntax on ``s``
            starts = u'' if start == 0 else str(start)
            ends = (u':%d' % (end+step)) if end + step >= 0 else u':'
            steps = u'' if step == 1 else (u':%d' % step)
            return u's[%s%s%s]' % (starts, ends, steps)

        step = None
        start = '(Never used)'  # Squelch pyflakes warnings - start will be
                                # set as soon as step is set
        for i, prev in zip(idxs[1:], idxs[:-1]):
            if step is not None:
                # Inside a run: extend it, or emit it once the step changes
                if i - prev == step:
                    continue
                yield _genslice(start, prev, step)
                step = None
                continue
            if i - prev in [-1, 1]:
                # A new run starts at prev
                step = i - prev
                start = prev
                continue
            else:
                yield u's[%d]' % prev
        # Emit whatever is still pending after the last pair
        if step is None:
            yield u's[%d]' % i
        else:
            yield _genslice(start, i, step)

    test_string = u''.join(map(compat_chr, range(slen)))
    cache_res = func(test_string)
    cache_spec = [ord(c) for c in cache_res]
    expr_code = u' + '.join(gen_sig_code(cache_spec))
    code = u'if len(s) == %d:\n return %s\n' % (slen, expr_code)
    self.to_screen(u'Extracted signature function:\n' + code)
441
def _parse_sig_js(self, jscode):
    """Return a callable deciphering a signature with the JS player code.

    The name of the signature function is searched in *jscode* and the
    function is extracted through JSInterpreter; the returned callable
    passes the signature as its single argument.
    """
    signature_func_name = self._search_regex(
        r'signature=([$a-zA-Z]+)', jscode,
        u'Initial JS player signature function name')

    interpreter = JSInterpreter(jscode)
    js_function = interpreter.extract_function(signature_func_name)

    def decipher(s):
        return js_function([s])

    return decipher
450
def _parse_sig_swf(self, file_contents):
    """Return a callable deciphering a signature with the SWF player code.

    *file_contents* are the raw bytes of a zlib-compressed SWF file (first
    byte ``C``).  The first tag with code 82 is parsed as ABC (AVM2
    bytecode): constant pool, method signatures, metadata, classes, scripts
    and method bodies.  The methods of the class named ``SignatureDecipher``
    are then run by a small interpreter supporting only the opcodes listed
    in ``resfunc``; the returned callable invokes its ``decipher`` method.

    Raises ExtractorError for a non-SWF header, an unsupported trait kind
    or a missing function, NotImplementedError for other compression
    formats and for unsupported opcodes or properties, and ValueError (from
    ``list.index``) when no multiname equals the target class name.
    """
    if file_contents[1:3] != b'WS':
        raise ExtractorError(
            u'Not an SWF file; header is %r' % file_contents[:3])
    if file_contents[:1] == b'C':
        content = zlib.decompress(file_contents[8:])
    else:
        raise NotImplementedError(u'Unsupported compression format %r' %
                                  file_contents[:1])

    def extract_tags(content):
        # Yield (tag_code, tag_body) pairs: a 16 bit header holds the code
        # in the upper 10 bits and the length in the lower 6; a length of
        # 0x3f means the real length follows as a 32 bit integer
        pos = 0
        while pos < len(content):
            header16 = struct.unpack('<H', content[pos:pos+2])[0]
            pos += 2
            tag_code = header16 >> 6
            tag_len = header16 & 0x3f
            if tag_len == 0x3f:
                tag_len = struct.unpack('<I', content[pos:pos+4])[0]
                pos += 4
            assert pos+tag_len <= len(content)
            yield (tag_code, content[pos:pos+tag_len])
            pos += tag_len

    # Tag code 82 carries the bytecode (presumably the SWF "DoABC" tag -
    # TODO confirm against the SWF specification)
    code_tag = next(tag
                    for tag_code, tag in extract_tags(content)
                    if tag_code == 82)
    # Skip the first 4 bytes and the NUL-terminated byte string after them
    p = code_tag.index(b'\0', 4) + 1
    code_reader = io.BytesIO(code_tag[p:])

    # Parse ABC (AVM2 ByteCode)
    def read_int(reader=None):
        # Variable-length integer: 7 payload bits per byte, least
        # significant group first, high bit set while more bytes follow
        # (at most 5 bytes)
        if reader is None:
            reader = code_reader
        res = 0
        shift = 0
        for _ in range(5):
            buf = reader.read(1)
            assert len(buf) == 1
            b = struct.unpack('<B', buf)[0]
            res = res | ((b & 0x7f) << shift)
            if b & 0x80 == 0:
                break
            shift += 7
        return res

    def u30(reader=None):
        # Like read_int, but the top 4 bits of the 32 bit value must be 0
        res = read_int(reader)
        assert res & 0xf0000000 == 0
        return res
    u32 = read_int

    def s32(reader=None):
        # Interpret the 32 bit value as two's complement
        v = read_int(reader)
        if v & 0x80000000 != 0:
            v = - ((v ^ 0xffffffff) + 1)
        return v

    def read_string(reader=None):
        # u30 length followed by that many bytes of UTF-8
        if reader is None:
            reader = code_reader
        slen = u30(reader)
        resb = reader.read(slen)
        assert len(resb) == slen
        return resb.decode('utf-8')

    def read_bytes(count, reader=None):
        if reader is None:
            reader = code_reader
        resb = reader.read(count)
        assert len(resb) == count
        return resb

    def read_byte(reader=None):
        resb = read_bytes(1, reader=reader)
        res = struct.unpack('<B', resb)[0]
        return res

    # minor_version + major_version
    read_bytes(2 + 2)

    # Constant pool
    # Every pool is read with range(1, count): entry 0 is not stored in
    # the file.  Only strings and multinames are kept; the rest is skipped.
    int_count = u30()
    for _c in range(1, int_count):
        s32()
    uint_count = u30()
    for _c in range(1, uint_count):
        u32()
    double_count = u30()
    # NOTE(review): a double_count of 0 gives a negative byte count here -
    # confirm how read_bytes/BytesIO.read behave for that input
    read_bytes((double_count-1) * 8)
    string_count = u30()
    constant_strings = [u'']
    for _c in range(1, string_count):
        s = read_string()
        constant_strings.append(s)
    namespace_count = u30()
    for _c in range(1, namespace_count):
        read_bytes(1)  # kind
        u30()  # name
    ns_set_count = u30()
    for _c in range(1, ns_set_count):
        count = u30()
        for _c2 in range(count):
            u30()
    multiname_count = u30()
    # Number of u30 fields following the kind, per multiname kind
    MULTINAME_SIZES = {
        0x07: 2,  # QName
        0x0d: 2,  # QNameA
        0x0f: 1,  # RTQName
        0x10: 1,  # RTQNameA
        0x11: 0,  # RTQNameL
        0x12: 0,  # RTQNameLA
        0x09: 2,  # Multiname
        0x0e: 2,  # MultinameA
        0x1b: 1,  # MultinameL
        0x1c: 1,  # MultinameLA
    }
    multinames = [u'']
    for _c in range(1, multiname_count):
        kind = u30()
        assert kind in MULTINAME_SIZES, u'Invalid multiname kind %r' % kind
        if kind == 0x07:
            # Only QNames are resolved to their name string
            u30()  # namespace_idx
            name_idx = u30()
            multinames.append(constant_strings[name_idx])
        else:
            # Other kinds get a placeholder; their fields are skipped
            multinames.append('[MULTINAME kind: %d]' % kind)
            for _c2 in range(MULTINAME_SIZES[kind]):
                u30()

    # Methods
    method_count = u30()
    MethodInfo = collections.namedtuple(
        'MethodInfo',
        ['NEED_ARGUMENTS', 'NEED_REST'])
    method_infos = []
    for method_id in range(method_count):
        param_count = u30()
        u30()  # return type
        for _ in range(param_count):
            u30()  # param type
        u30()  # name index (always 0 for youtube)
        flags = read_byte()
        if flags & 0x08 != 0:
            # Options present
            option_count = u30()
            for c in range(option_count):
                u30()  # val
                read_bytes(1)  # kind
        if flags & 0x80 != 0:
            # Param names present
            for _ in range(param_count):
                u30()  # param name
        mi = MethodInfo(flags & 0x01 != 0, flags & 0x04 != 0)
        method_infos.append(mi)

    # Metadata
    metadata_count = u30()
    for _c in range(metadata_count):
        u30()  # name
        item_count = u30()
        for _c2 in range(item_count):
            u30()  # key
            u30()  # value

    def parse_traits_info():
        # Consume one trait entry from code_reader and return a dict with
        # the method it declares (empty for slot, const and class traits)
        trait_name_idx = u30()
        kind_full = read_byte()
        kind = kind_full & 0x0f
        attrs = kind_full >> 4
        methods = {}
        if kind in [0x00, 0x06]:  # Slot or Const
            u30()  # Slot id
            u30()  # type_name_idx
            vindex = u30()
            if vindex != 0:
                read_byte()  # vkind
        elif kind in [0x01, 0x02, 0x03]:  # Method / Getter / Setter
            u30()  # disp_id
            method_idx = u30()
            methods[multinames[trait_name_idx]] = method_idx
        elif kind == 0x04:  # Class
            u30()  # slot_id
            u30()  # classi
        elif kind == 0x05:  # Function
            u30()  # slot_id
            function_idx = u30()
            # NOTE(review): this maps index -> name, the reverse of the
            # Method branch above - confirm that this is intended
            methods[function_idx] = multinames[trait_name_idx]
        else:
            raise ExtractorError(u'Unsupported trait kind %d' % kind)

        if attrs & 0x4 != 0:  # Metadata present
            metadata_count = u30()
            for _c3 in range(metadata_count):
                u30()  # metadata index

        return methods

    # Classes
    TARGET_CLASSNAME = u'SignatureDecipher'
    searched_idx = multinames.index(TARGET_CLASSNAME)
    searched_class_id = None
    class_count = u30()
    # First pass over the classes: find the id of the target class
    for class_id in range(class_count):
        name_idx = u30()
        if name_idx == searched_idx:
            # We found the class we're looking for!
            searched_class_id = class_id
        u30()  # super_name idx
        flags = read_byte()
        if flags & 0x08 != 0:  # Protected namespace is present
            u30()  # protected_ns_idx
        intrf_count = u30()
        for _c2 in range(intrf_count):
            u30()
        u30()  # iinit
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    if searched_class_id is None:
        raise ExtractorError(u'Target class %r not found' %
                             TARGET_CLASSNAME)

    # Second pass over the classes: collect the methods of the target
    # class, both as name -> index and as index -> name
    method_names = {}
    method_idxs = {}
    for class_id in range(class_count):
        u30()  # cinit
        trait_count = u30()
        for _c2 in range(trait_count):
            trait_methods = parse_traits_info()
            if class_id == searched_class_id:
                method_names.update(trait_methods.items())
                method_idxs.update(dict(
                    (idx, name)
                    for name, idx in trait_methods.items()))

    # Scripts
    script_count = u30()
    for _c in range(script_count):
        u30()  # init
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # Method bodies
    method_body_count = u30()
    Method = collections.namedtuple('Method', ['code', 'local_count'])
    methods = {}
    for _c in range(method_body_count):
        method_idx = u30()
        u30()  # max_stack
        local_count = u30()
        u30()  # init_scope_depth
        u30()  # max_scope_depth
        code_length = u30()
        code = read_bytes(code_length)
        # Keep only the bodies of the target class' methods, keyed by name
        if method_idx in method_idxs:
            m = Method(code, local_count)
            methods[method_idxs[method_idx]] = m
        exception_count = u30()
        for _c2 in range(exception_count):
            u30()  # from
            u30()  # to
            u30()  # target
            u30()  # exc_type
            u30()  # var_name
        trait_count = u30()
        for _c2 in range(trait_count):
            parse_traits_info()

    # The whole tag must have been consumed and every method of the target
    # class must have a body
    assert p + code_reader.tell() == len(code_tag)
    assert len(methods) == len(method_idxs)

    # Interpreted functions by name; filled lazily by extract_function
    method_pyfunctions = {}

    def extract_function(func_name):
        # Return (and memoise) a Python callable interpreting the bytecode
        # of the named method; the callable takes the argument list
        if func_name in method_pyfunctions:
            return method_pyfunctions[func_name]
        if func_name not in methods:
            raise ExtractorError(u'Cannot find function %r' % func_name)
        m = methods[func_name]

        def resfunc(args):
            # Register 0 stands for "this", followed by the arguments and
            # then the locals
            registers = ['(this)'] + list(args) + [None] * m.local_count
            stack = []
            coder = io.BytesIO(m.code)
            while True:
                opcode = struct.unpack('!B', coder.read(1))[0]
                if opcode == 36:  # pushbyte
                    v = struct.unpack('!B', coder.read(1))[0]
                    stack.append(v)
                elif opcode == 44:  # pushstring
                    idx = u30(coder)
                    stack.append(constant_strings[idx])
                elif opcode == 48:  # pushscope
                    # We don't implement the scope register, so we'll just
                    # ignore the popped value
                    stack.pop()
                elif opcode == 70:  # callproperty
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    # Arguments were pushed in order, so popping yields
                    # them reversed
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'split':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, compat_str)
                        if args[0] == u'':
                            # An empty separator splits into characters
                            res = list(obj)
                        else:
                            res = obj.split(args[0])
                        stack.append(res)
                    elif mname == u'slice':
                        assert len(args) == 1
                        assert isinstance(args[0], int)
                        assert isinstance(obj, list)
                        res = obj[args[0]:]
                        stack.append(res)
                    elif mname == u'join':
                        assert len(args) == 1
                        assert isinstance(args[0], compat_str)
                        assert isinstance(obj, list)
                        res = args[0].join(obj)
                        stack.append(res)
                    elif mname in method_pyfunctions:
                        # Call to another interpreted method of the class
                        stack.append(method_pyfunctions[mname](args))
                    else:
                        raise NotImplementedError(
                            u'Unsupported property %r on %r'
                            % (mname, obj))
                elif opcode == 72:  # returnvalue
                    res = stack.pop()
                    return res
                elif opcode == 79:  # callpropvoid
                    index = u30(coder)
                    mname = multinames[index]
                    arg_count = u30(coder)
                    args = list(reversed(
                        [stack.pop() for _ in range(arg_count)]))
                    obj = stack.pop()
                    if mname == u'reverse':
                        assert isinstance(obj, list)
                        obj.reverse()
                    else:
                        raise NotImplementedError(
                            u'Unsupported (void) property %r on %r'
                            % (mname, obj))
                elif opcode == 93:  # findpropstrict
                    index = u30(coder)
                    mname = multinames[index]
                    # Resolving the name also registers the function in
                    # method_pyfunctions for a later callproperty
                    res = extract_function(mname)
                    stack.append(res)
                elif opcode == 97:  # setproperty
                    index = u30(coder)
                    value = stack.pop()
                    idx = stack.pop()
                    obj = stack.pop()
                    assert isinstance(obj, list)
                    assert isinstance(idx, int)
                    obj[idx] = value
                elif opcode == 98:  # getlocal
                    index = u30(coder)
                    stack.append(registers[index])
                elif opcode == 99:  # setlocal
                    index = u30(coder)
                    value = stack.pop()
                    registers[index] = value
                elif opcode == 102:  # getproperty
                    index = u30(coder)
                    pname = multinames[index]
                    if pname == u'length':
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(len(obj))
                    else:  # Assume attribute access
                        idx = stack.pop()
                        assert isinstance(idx, int)
                        obj = stack.pop()
                        assert isinstance(obj, list)
                        stack.append(obj[idx])
                elif opcode == 128:  # coerce
                    u30(coder)
                elif opcode == 133:  # coerce_s
                    assert isinstance(stack[-1], (type(None), compat_str))
                elif opcode == 164:  # modulo
                    value2 = stack.pop()
                    value1 = stack.pop()
                    res = value1 % value2
                    stack.append(res)
                elif opcode == 208:  # getlocal_0
                    stack.append(registers[0])
                elif opcode == 209:  # getlocal_1
                    stack.append(registers[1])
                elif opcode == 210:  # getlocal_2
                    stack.append(registers[2])
                elif opcode == 211:  # getlocal_3
                    stack.append(registers[3])
                elif opcode == 214:  # setlocal_2
                    registers[2] = stack.pop()
                elif opcode == 215:  # setlocal_3
                    registers[3] = stack.pop()
                else:
                    raise NotImplementedError(
                        u'Unsupported opcode %d' % opcode)

        method_pyfunctions[func_name] = resfunc
        return resfunc

    initial_function = extract_function(u'decipher')
    return lambda s: initial_function([s])
864
865 def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
866 """Turn the encrypted s field into a working signature"""
867
868 if player_url is None:
869 raise ExtractorError(u'Cannot decrypt signature without player_url')
870
871 if player_url.startswith(u'//'):
872 player_url = u'https:' + player_url
873 try:
874 player_id = (player_url, len(s))
875 if player_id not in self._player_cache:
876 func = self._extract_signature_function(
877 video_id, player_url, len(s)
878 )
879 self._player_cache[player_id] = func
880 func = self._player_cache[player_id]
881 if self._downloader.params.get('youtube_print_sig_code'):
882 self._print_sig_code(func, len(s))
883 return func(s)
884 except Exception as e:
885 tb = traceback.format_exc()
886 raise ExtractorError(
887 u'Automatic signature extraction failed: ' + tb, cause=e)
888
def _get_available_subtitles(self, video_id, webpage):
    """Return a dict mapping language code to subtitle URL for the video.

    The track list is downloaded from the timedtext service; *webpage* is
    not used.  When the list cannot be downloaded or contains no track, a
    warning is reported and an empty dict is returned.
    """
    list_url = 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id
    try:
        sub_list = self._download_webpage(list_url, video_id, note=False)
    except ExtractorError as err:
        self._downloader.report_warning(u'unable to download video subtitles: %s' % compat_str(err))
        return {}

    sub_lang_list = {}
    tracks = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
    for track_name, lang in tracks:
        query = compat_urllib_parse.urlencode({
            'lang': lang,
            'v': video_id,
            'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
            'name': unescapeHTML(track_name).encode('utf-8'),
        })
        # A later track with the same language code replaces an earlier one
        sub_lang_list[lang] = u'https://www.youtube.com/api/timedtext?' + query

    if sub_lang_list:
        return sub_lang_list
    self._downloader.report_warning(u'video doesn\'t have subtitles')
    return {}
914
def _get_available_automatic_caption(self, video_id, webpage):
    """Return a dict mapping target language code to automatic caption URL.

    We need the webpage for getting the captions url, pass it as an
    argument to speed up the process.

    An empty dict is returned (after a warning) when the player
    configuration cannot be found or parsed, when it lacks the caption
    arguments, or when the first listed track is not an 'asr' track.
    """
    sub_format = self._downloader.params.get('subtitlesformat', 'srt')
    self.to_screen(u'%s: Looking for automatic captions' % video_id)
    err_msg = u'Couldn\'t find automatic captions for %s' % video_id
    # Same pattern as the one _real_extract uses for the player config:
    # literal dot, optional whitespace around '='.
    mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', webpage)
    if mobj is None:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        # The non-greedy match can yield truncated or otherwise invalid
        # JSON; captions are optional, so warn instead of aborting.
        player_config = json.loads(uppercase_escape(mobj.group(1)))
    except ValueError:
        self._downloader.report_warning(err_msg)
        return {}
    try:
        args = player_config[u'args']
        caption_url = args[u'ttsurl']
        timestamp = args[u'timestamp']
        # We get the available subtitles
        list_params = compat_urllib_parse.urlencode({
            'type': 'list',
            'tlangs': 1,
            'asrs': 1,
        })
        list_url = caption_url + '&' + list_params
        caption_list = self._download_xml(list_url, video_id)
        original_lang_node = caption_list.find('track')
        if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr':
            self._downloader.report_warning(u'Video doesn\'t have automatic captions')
            return {}
        original_lang = original_lang_node.attrib['lang_code']

        sub_lang_list = {}
        for lang_node in caption_list.findall('target'):
            sub_lang = lang_node.attrib['lang_code']
            params = compat_urllib_parse.urlencode({
                'lang': original_lang,
                'tlang': sub_lang,
                'fmt': sub_format,
                'ts': timestamp,
                'kind': 'asr',
            })
            sub_lang_list[sub_lang] = caption_url + '&' + params
        return sub_lang_list
    # An extractor error can be raised by the download process if there are
    # no automatic captions but there are subtitles
    except (KeyError, ExtractorError):
        self._downloader.report_warning(err_msg)
        return {}
961
@classmethod
def extract_id(cls, url):
    """Return the video id captured by group 2 of ``_VALID_URL``.

    Raises ExtractorError when ``url`` does not match the pattern.
    """
    match = re.match(cls._VALID_URL, url, re.VERBOSE)
    if match is not None:
        return match.group(2)
    raise ExtractorError(u'Invalid URL: %s' % url)
969
def _extract_from_m3u8(self, manifest_url, video_id):
    """Download an m3u8 manifest and return a dict mapping itag -> stream URL.

    Every non-empty manifest line that does not start with '#' is taken
    to be a stream URL; its itag is read from the 'itag/<digits>/' path
    component.
    """
    manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
    url_map = {}
    for line in manifest.split('\n'):
        # Skip blank lines and '#' directive/comment lines
        if not line or line.startswith('#'):
            continue
        itag = self._search_regex(r'itag/(\d+?)/', line, 'itag')
        url_map[itag] = line
    return url_map
983
def _extract_annotations(self, video_id):
    """Download and return the raw annotations document for ``video_id``."""
    annotations_url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
    return self._download_webpage(
        annotations_url, video_id,
        note=u'Searching for annotations.',
        errnote=u'Unable to download video annotations.')
987
def _real_extract(self, url):
    """Extract metadata and the list of formats for a single video.

    Downloads the watch page and the get_video_info data, collects the
    metadata fields, then builds the format list from (in order of
    preference) an rtmp 'conn' entry, the url_encoded_fmt_stream_map /
    adaptive_fmts maps, or an HLS manifest.  When the
    'youtube_include_dash_manifest' option is set, the DASH manifest is
    merged in on a best-effort basis.

    Returns the info dict, or None when the 'listsubtitles' option is set
    (the subtitles are listed and nothing else is extracted).
    Raises ExtractorError when the video info carries no token, for rental
    videos, when the uploader is missing, for rtmpe streams and when no
    format information is found.
    """
    proto = (
        u'http' if self._downloader.params.get('prefer_insecure', False)
        else u'https')

    # Extract original video URL from URL with redirection, like age verification, using next_url parameter
    mobj = re.search(self._NEXT_URL_RE, url)
    if mobj:
        url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
    video_id = self.extract_id(url)

    # Get video webpage
    url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
    video_webpage = self._download_webpage(url, video_id)

    # Attempt to extract SWF player URL
    mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
    if mobj is not None:
        player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
    else:
        player_url = None

    # Get video info
    self.report_video_info_webpage_download(video_id)
    if re.search(r'player-age-gate-content">', video_webpage) is not None:
        self.report_age_confirmation()
        age_gate = True
        # We simulate the access to the video from www.youtube.com/v/{video_id}
        # this can be viewed without login into Youtube
        data = compat_urllib_parse.urlencode({
            'video_id': video_id,
            'el': 'player_embedded',
            'gl': 'US',
            'hl': 'en',
            'eurl': 'https://youtube.googleapis.com/v/' + video_id,
            'asv': 3,
            'sts': '1588',
        })
        video_info_url = proto + '://www.youtube.com/get_video_info?' + data
        video_info_webpage = self._download_webpage(
            video_info_url, video_id,
            note=False,
            errnote='unable to download video info webpage')
        video_info = compat_parse_qs(video_info_webpage)
    else:
        age_gate = False
        # Try the different 'el' variants until one answers with a token
        for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
            video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
                              % (video_id, el_type))
            video_info_webpage = self._download_webpage(
                video_info_url, video_id,
                note=False,
                errnote='unable to download video info webpage')
            video_info = compat_parse_qs(video_info_webpage)
            if 'token' in video_info:
                break
    if 'token' not in video_info:
        if 'reason' in video_info:
            raise ExtractorError(
                u'YouTube said: %s' % video_info['reason'][0],
                expected=True, video_id=video_id)
        else:
            raise ExtractorError(
                u'"token" parameter not in video info for unknown reason',
                video_id=video_id)

    if 'view_count' in video_info:
        view_count = int(video_info['view_count'][0])
    else:
        view_count = None

    # Check for "rental" videos
    if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
        raise ExtractorError(u'"rental" videos not supported')

    # Start extracting information
    self.report_information_extraction(video_id)

    # uploader
    if 'author' not in video_info:
        raise ExtractorError(u'Unable to extract uploader name')
    video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])

    # uploader_id
    video_uploader_id = None
    mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
    if mobj is not None:
        video_uploader_id = mobj.group(1)
    else:
        self._downloader.report_warning(u'unable to extract uploader nickname')

    # title
    if 'title' in video_info:
        video_title = video_info['title'][0]
    else:
        self._downloader.report_warning(u'Unable to extract video title')
        video_title = u'_'

    # thumbnail image
    # We try first to get a high quality image:
    m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
                        video_webpage, re.DOTALL)
    if m_thumb is not None:
        video_thumbnail = m_thumb.group(1)
    elif 'thumbnail_url' not in video_info:
        self._downloader.report_warning(u'unable to extract video thumbnail')
        video_thumbnail = None
    else:   # don't panic if we can't find it
        video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])

    # upload date
    upload_date = None
    mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
    if mobj is None:
        mobj = re.search(
            r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
            video_webpage)
    if mobj is not None:
        upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
        upload_date = unified_strdate(upload_date)

    m_cat_container = get_element_by_id("eow-category", video_webpage)
    if m_cat_container:
        category = self._html_search_regex(
            r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
            default=None)
        video_categories = None if category is None else [category]
    else:
        video_categories = None

    # description
    video_description = get_element_by_id("eow-description", video_webpage)
    if video_description:
        # Replace redirect links by the target given in their title attribute
        video_description = re.sub(r'''(?x)
            <a\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                title="([^"]+)"\s+
                (?:[a-zA-Z-]+="[^"]+"\s+)*?
                class="yt-uix-redirect-link"\s*>
            [^<]+
            </a>
        ''', r'\1', video_description)
        video_description = clean_html(video_description)
    else:
        fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
        if fd_mobj:
            video_description = unescapeHTML(fd_mobj.group(1))
        else:
            video_description = u''

    def _extract_count(klass):
        # Read the integer shown in the <span> with the given class, if any
        count = self._search_regex(
            r'class="%s">([\d,]+)</span>' % re.escape(klass),
            video_webpage, klass, default=None)
        if count is not None:
            return int(count.replace(',', ''))
        return None
    like_count = _extract_count(u'likes-count')
    dislike_count = _extract_count(u'dislikes-count')

    # subtitles
    video_subtitles = self.extract_subtitles(video_id, video_webpage)

    if self._downloader.params.get('listsubtitles', False):
        self._list_available_subtitles(video_id, video_webpage)
        return

    if 'length_seconds' not in video_info:
        self._downloader.report_warning(u'unable to extract video duration')
        video_duration = None
    else:
        video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))

    # annotations
    video_annotations = None
    if self._downloader.params.get('writeannotations', False):
        video_annotations = self._extract_annotations(video_id)

    # Decide which formats to download
    # ytplayer_config stays None when the watch page carries no usable
    # player config; the DASH code below relies on the name being bound.
    ytplayer_config = None
    try:
        mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
        if not mobj:
            raise ValueError('Could not find ytplayer.config')  # caught below
        json_code = uppercase_escape(mobj.group(1))
        ytplayer_config = json.loads(json_code)
        args = ytplayer_config['args']
        # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
        # this signatures are encrypted
        if 'url_encoded_fmt_stream_map' not in args:
            raise ValueError(u'No stream_map present')  # caught below
        re_signature = re.compile(r'[&,]s=')
        m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
        if m_s is not None:
            self.to_screen(u'%s: Encrypted signatures detected.' % video_id)
            video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
        m_s = re_signature.search(args.get('adaptive_fmts', u''))
        if m_s is not None:
            if 'adaptive_fmts' in video_info:
                video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
            else:
                video_info['adaptive_fmts'] = [args['adaptive_fmts']]
    except (ValueError, KeyError):
        # No (valid) player config or no 'args' in it: keep using the
        # data from get_video_info
        pass

    def _map_to_format_list(urlmap):
        # Turn an itag -> URL mapping into a list of format dicts, adding
        # the known properties of each itag
        formats = []
        for itag, video_real_url in urlmap.items():
            dct = {
                'format_id': itag,
                'url': video_real_url,
                'player_url': player_url,
            }
            if itag in self._formats:
                dct.update(self._formats[itag])
            formats.append(dct)
        return formats

    if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
        self.report_rtmp_download()
        formats = [{
            'format_id': '_rtmp',
            'protocol': 'rtmp',
            'url': video_info['conn'][0],
            'player_url': player_url,
        }]
    elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
        encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts', [''])[0]
        if 'rtmpe%3Dyes' in encoded_url_map:
            raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
        url_map = {}
        for url_data_str in encoded_url_map.split(','):
            url_data = compat_parse_qs(url_data_str)
            if 'itag' in url_data and 'url' in url_data:
                url = url_data['url'][0]
                if 'sig' in url_data:
                    url += '&signature=' + url_data['sig'][0]
                elif 's' in url_data:
                    encrypted_sig = url_data['s'][0]
                    if self._downloader.params.get('verbose'):
                        if age_gate:
                            if player_url is None:
                                player_version = 'unknown'
                            else:
                                player_version = self._search_regex(
                                    r'-(.+)\.swf$', player_url,
                                    u'flash player', fatal=False)
                            player_desc = 'flash player %s' % player_version
                        else:
                            player_version = self._search_regex(
                                r'html5player-(.+?)\.js', video_webpage,
                                'html5 player', fatal=False)
                            player_desc = u'html5 player %s' % player_version

                        parts_sizes = u'.'.join(compat_str(len(part)) for part in encrypted_sig.split('.'))
                        self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
                                       (len(encrypted_sig), parts_sizes, url_data['itag'][0], player_desc))

                    if not age_gate:
                        jsplayer_url_json = self._search_regex(
                            r'"assets":.+?"js":\s*("[^"]+")',
                            video_webpage, u'JS player URL')
                        player_url = json.loads(jsplayer_url_json)

                    signature = self._decrypt_signature(
                        encrypted_sig, video_id, player_url, age_gate)
                    url += '&signature=' + signature
                if 'ratebypass' not in url:
                    url += '&ratebypass=yes'
                url_map[url_data['itag'][0]] = url
        formats = _map_to_format_list(url_map)
    elif video_info.get('hlsvp'):
        manifest_url = video_info['hlsvp'][0]
        url_map = self._extract_from_m3u8(manifest_url, video_id)
        formats = _map_to_format_list(url_map)
    else:
        raise ExtractorError(u'no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')

    # Look for the DASH manifest
    if (self._downloader.params.get('youtube_include_dash_manifest', False)):
        try:
            # The DASH manifest used needs to be the one from the original video_webpage.
            # The one found in get_video_info seems to be using different signatures.
            # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
            # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
            # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
            if age_gate:
                # Subscript (not .get()) so that a missing key is a KeyError,
                # which is reported as a warning below
                dash_manifest_url = video_info['dashmpd'][0]
            else:
                # A missing player config likewise ends up as a KeyError
                dash_manifest_url = (ytplayer_config or {})['args']['dashmpd']

            def decrypt_sig(mobj):
                s = mobj.group(1)
                dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
                return '/signature/%s' % dec_s
            dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
            dash_doc = self._download_xml(
                dash_manifest_url, video_id,
                note=u'Downloading DASH manifest',
                errnote=u'Could not download DASH manifest')
            for r in dash_doc.findall(u'.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
                url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
                if url_el is None:
                    continue
                format_id = r.attrib['id']
                video_url = url_el.text
                filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
                f = {
                    'format_id': format_id,
                    'url': video_url,
                    'width': int_or_none(r.attrib.get('width')),
                    'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
                    'asr': int_or_none(r.attrib.get('audioSamplingRate')),
                    'filesize': filesize,
                }
                # Update the format already found under this id, or add a new one
                try:
                    existing_format = next(
                        fo for fo in formats
                        if fo['format_id'] == format_id)
                except StopIteration:
                    f.update(self._formats.get(format_id, {}))
                    formats.append(f)
                else:
                    existing_format.update(f)

        except (ExtractorError, KeyError) as e:
            self.report_warning(u'Skipping DASH manifest: %s' % e, video_id)

    self._sort_formats(formats)

    return {
        'id': video_id,
        'uploader': video_uploader,
        'uploader_id': video_uploader_id,
        'upload_date': upload_date,
        'title': video_title,
        'thumbnail': video_thumbnail,
        'description': video_description,
        'categories': video_categories,
        'subtitles': video_subtitles,
        'duration': video_duration,
        'age_limit': 18 if age_gate else 0,
        'annotations': video_annotations,
        'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
        'view_count': view_count,
        'like_count': like_count,
        'dislike_count': dislike_count,
        'formats': formats,
    }
1332
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlist URLs and bare playlist ids, including mixes."""
    IE_NAME = u'youtube:playlist'
    IE_DESC = u'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'

    def _real_initialize(self):
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id into a url_result handled by the 'Youtube' extractor."""
        results = []
        for video_id in ids:
            results.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return results

    def _extract_mix(self, playlist_id):
        """Extract a mix, whose id is 'RD' followed by the id of its seed video."""
        # The watch page of the seed video (last 11 characters of the id)
        # lists the videos of the mix
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(url, playlist_id, u'Downloading Youtube mix')

        # Use the first class name that yields a non-empty element
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_re = r'''(?x)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_re, webpage, flags=re.DOTALL))
        return self.playlist_result(self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        # Extract playlist id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        playlist_id = mobj.group(1) or mobj.group(2)

        # A 'v' parameter means the URL also points at one particular video
        query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen(u'Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(u'For downloading YouTube.com top lists, use '
                u'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                u'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Walk the playlist pages: the first one is the HTML page itself,
        # the following ones come from the "load more" JSON responses
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # Duplicates are dropped per page, as is the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, u'title')

        return self.playlist_result(self._ids_to_results(ids), playlist_id, playlist_title)
1443
1444
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = u'youtube:toplist'
    IE_DESC = (u'YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        u' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'

    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        channel, title = mobj.group('chann'), mobj.group('title')

        # Find, on the channel page, the link whose URL carries the
        # url-encoded list title
        encoded_title = compat_urllib_parse.urlencode({'title': title})
        playlist_re = 'href="([^"]+?%s.*?)"' % re.escape(encoded_title)
        channel_page = self._download_webpage('https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(playlist_re, channel_page, u'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        # sometimes the webpage doesn't contain the videos
        # retry until we get them
        attempt = 0
        while True:
            note = u'Downloading Youtube mix'
            if attempt > 0:
                note += ', retry #%d' % attempt
            webpage = self._download_webpage(url, title, note)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
            attempt += 1

        return self.playlist_result(self._ids_to_results(ids), playlist_title=title)
1475
1476
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning the videos of a channel as a playlist."""
    IE_NAME = u'youtube:channel'
    IE_DESC = u'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'

    def extract_videos_from_page(self, page):
        """Return the video ids linked from ``page``, in order of first appearance."""
        unique_ids = []
        seen = set()
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in seen:
                continue
            seen.add(video_id)
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        # Extract channel id
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        # Download channel page
        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id, channel_id)
        autogenerated_mobj = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page)

        if autogenerated_mobj is not None:
            # The videos are contained in a single page
            # the ajax pages can't be used, they are empty
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Download all channel pages using the json-based channel_ajax query
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id),
                    channel_id, note=u'Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)
                video_ids.extend(self.extract_videos_from_page(page['content_html']))
                # The last page carries no "load more" widget
                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1531
1532
class YoutubeUserIE(InfoExtractor):
    """Extractor returning the uploads of a user as a lazily paged playlist."""
    IE_DESC = u'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = u'youtube:user'

    @classmethod
    def suitable(cls, url):
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and would match those too.
        other_ies = iter(klass for (name, klass) in globals().items() if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError(u'Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield a url entry for each video of the (0-based) page ``pagenum``."""
            # The API start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Inclusive index of the last video requested on this page
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                u'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err))
            # A feed without entries marks the end of the uploads
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = PagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1593
1594
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor for the 'ytsearch' keyword, backed by the GData API."""
    IE_NAME = u'youtube:search'
    IE_DESC = u'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = u'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""

        PAGE_SIZE = 50
        video_ids = []
        limit = n
        # Number of results requested so far; the API start-index is 1-based
        offset = 0

        while offset < limit:
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                offset + 1)
            data_json = self._download_webpage(
                result_url, video_id=u'query "%s"' % query,
                note=u'Downloading page %s' % (offset // PAGE_SIZE + 1),
                errnote=u'Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    u'[youtube] No video results', expected=True)

            video_ids.extend(video['id'] for video in api_response['items'])

            # Never ask for more than the API says exists
            limit = min(n, api_response['totalItems'])
            offset += PAGE_SIZE

        if len(video_ids) > n:
            video_ids = video_ids[:n]
        videos = []
        for video_id in video_ids:
            videos.append(self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1636
1637
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds orderby=published."""
    IE_DESC = u'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1643
1644
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor turning a search results URL into a playlist of its hits."""
    IE_NAME = u'youtube:search_url'
    IE_DESC = u'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'

    def _real_extract(self, url):
        query = compat_urllib_parse.unquote_plus(
            re.match(self._VALID_URL, url).group('query'))

        webpage = self._download_webpage(url, query)
        # Only the first result list of the page is looked at
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, u'result HTML')

        entries = []
        # One <h3> title block per search hit
        for part_code in re.findall(r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code):
            # A missing title is tolerated, a missing link is not
            part_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            part_url_snippet = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            entry = {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', part_url_snippet),
                'title': part_title,
            }
            entries.append(entry)

        playlist = {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
        return playlist
1679
1680
class YoutubeShowIE(InfoExtractor):
    """Extractor returning one playlist reference per season of a show."""
    IE_NAME = u'youtube:show'
    IE_DESC = u'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(.*)'

    def _real_extract(self, url):
        show_name = re.match(self._VALID_URL, url).group(1)
        webpage = self._download_webpage(url, show_name, u'Downloading show webpage')
        # There's one playlist for each season of the show
        season_paths = [
            m.group(1)
            for m in re.finditer(r'href="(/playlist\?list=.*?)"', webpage)]
        self.to_screen(u'%s: Found %s seasons' % (show_name, len(season_paths)))
        results = []
        for path in season_paths:
            results.append(self.url_result('https://www.youtube.com' + path, 'YoutubePlaylist'))
        return results
1694
1695
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common implementation for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    Subclasses have to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, request action_load_personal_feed rather than action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        # The doubled %% leaves a %s placeholder for the paging value
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        return u'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        feed_entries = []
        paging = 0
        page_number = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                u'%s feed' % self._FEED_NAME,
                u'Downloading page %s' % page_number)
            feed_html = info.get('feed_html') or info.get('content_html')
            video_ids = orderedSet(
                m.group(1)
                for m in re.finditer(r'"/watch\?v=(.*?)["&]', feed_html))
            for video_id in video_ids:
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))
            # The "load more" link carries the paging value of the next page
            next_mobj = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                feed_html)
            if next_mobj is None:
                break
            paging = next_mobj.group('paging')
            page_number += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1740
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    # Matches either the feed URL or the ':ytsubs' / ':ytsubscriptions' keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
1746
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
    # Matches either the feed URL or the ':ytrec' / ':ytrecommended' keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
1752
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the personal 'watch_later' feed."""
    _FEED_NAME = 'watch_later'
    # Loaded through action_load_personal_feed
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
    # Matches either the feed URL or the ':ytwatchlater' keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
1759
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string, like the sibling extractors: in a plain literal "\." is an
    # invalid string escape that only works by accident and triggers a
    # warning on current Python versions.
    # Accepts the feed page URL as well as the ":ythistory" keyword.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True  # selects the personal-feed loader action
    _PLAYLIST_TITLE = u'Youtube Watch History'
1766
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    # Accepts the favourites page URL as well as the ":ytfav",
    # ":ytfavorites" and ":ytfavourites" keywords.
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the playlist id on the favourites page and delegate to it.

        ``url`` is not used: the favourites page address is fixed.
        """
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The first "list=" parameter found in the page is taken as the id.
        favourites_list = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(favourites_list, 'YoutubePlaylist')
1777
1778
class YoutubeTruncatedURLIE(InfoExtractor):
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list

    # Two shapes of URL are matched:
    #  * a watch URL whose query string is empty or holds only a "feature"
    #    or "annotation_id" parameter, i.e. one without a video id;
    #  * an attribution_link URL holding only its "a" parameter.
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
            'only_matching': True,
        },
        {
            'url': 'http://www.youtube.com/watch?',
            'only_matching': True,
        },
    ]

    def _real_extract(self, url):
        """Always fail, with a hint about quoting the URL in the shell."""
        quoting_hint = (
            u'Did you forget to quote the URL? Remember that & is a meta '
            u'character in most shells, so you want to put the URL in quotes, '
            u'like  youtube-dl '
            u'"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            u' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(quoting_hint, expected=True)