Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     OnDemandPagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36 class YoutubeBaseInfoExtractor(InfoExtractor):
  37     """Provide base functions for Youtube extractors"""
  38     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  39     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  40     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  41     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         return bool(self._download_webpage(
  48             self._LANG_URL, None,
  49             note='Setting language', errnote='unable to set language',
  50             fatal=False))
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note='Downloading login page',
  70             errnote='unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80                 'Email': username,
  81                 'GALX': galx,
  82                 'Passwd': password,
  83
  84                 'PersistentCookie': 'yes',
  85                 '_utf8': '霱',
  86                 'bgresponse': 'js_disabled',
  87                 'checkConnection': '',
  88                 'checkedDomains': 'youtube',
  89                 'dnConn': '',
  90                 'pstMsg': '0',
  91                 'rmShown': '1',
  92                 'secTok': '',
  93                 'signIn': 'Sign in',
  94                 'timeStmp': '',
  95                 'service': 'youtube',
  96                 'uilel': '3',
  97                 'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note='Logging in', errnote='unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning('unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning('unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _confirm_age(self):
 179         age_form = {
 180             'next_url': '/',
 181             'action_confirm': 'Confirm',
 182         }
 183         req = compat_urllib_request.Request(self._AGE_URL,
 184             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 185
 186         self._download_webpage(
 187             req, None,
 188             note='Confirming age', errnote='Unable to confirm age',
 189             fatal=False)
 190
 191     def _real_initialize(self):
 192         if self._downloader is None:
 193             return
 194         if self._get_login_info()[0] is not None:
 195             if not self._set_language():
 196                 return
 197         if not self._login():
 198             return
 199         self._confirm_age()
 200
 201
 202 class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
 203     IE_DESC = 'YouTube.com'
 204     _VALID_URL = r"""(?x)^
 205                      (
 206                          (?:https?://|//)                                    # http(s):// or protocol-independent URL
 207                          (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
 208                             (?:www\.)?deturl\.com/www\.youtube\.com/|
 209                             (?:www\.)?pwnyoutube\.com/|
 210                             (?:www\.)?yourepeat\.com/|
 211                             tube\.majestyc\.net/|
 212                             youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
 213                          (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
 214                          (?:                                                  # the various things that can precede the ID:
 215                              (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
 216                              |(?:                                             # or the v= param in all its forms
 217                                  (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
 218                                  (?:\?|\#!?)                                  # the params delimiter ? or # or #!
 219                                  (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
 220                                  v=
 221                              )
 222                          ))
 223                          |youtu\.be/                                          # just youtu.be/xxxx
 224                          |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
 225                          )
 226                      )?                                                       # all until now is optional -> you can pass the naked ID
 227                      ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
 228                      (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
 229                      (?(1).+)?                                                # if we found the ID, everything can follow
 230                      $"""
 231     _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
 232     _formats = {
 233         '5': {'ext': 'flv', 'width': 400, 'height': 240},
 234         '6': {'ext': 'flv', 'width': 450, 'height': 270},
 235         '13': {'ext': '3gp'},
 236         '17': {'ext': '3gp', 'width': 176, 'height': 144},
 237         '18': {'ext': 'mp4', 'width': 640, 'height': 360},
 238         '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
 239         '34': {'ext': 'flv', 'width': 640, 'height': 360},
 240         '35': {'ext': 'flv', 'width': 854, 'height': 480},
 241         '36': {'ext': '3gp', 'width': 320, 'height': 240},
 242         '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
 243         '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
 244         '43': {'ext': 'webm', 'width': 640, 'height': 360},
 245         '44': {'ext': 'webm', 'width': 854, 'height': 480},
 246         '45': {'ext': 'webm', 'width': 1280, 'height': 720},
 247         '46': {'ext': 'webm', 'width': 1920, 'height': 1080},
 248
 249
 250         # 3d videos
 251         '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
 252         '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
 253         '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
 254         '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
 255         '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
 256         '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
 257         '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},
 258
 259         # Apple HTTP Live Streaming
 260         '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 261         '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
 262         '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
 263         '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
 264         '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
 265         '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
 266         '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},
 267
 268         # DASH mp4 video
 269         '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 270         '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 271         '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 272         '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 273         '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 274         '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 275         '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 276         '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 277         '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 278         '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
 279         '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},
 280
 281         # Dash mp4 audio
 282         '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
 283         '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
 284         '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},
 285
 286         # Dash webm
 287         '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 288         '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 289         '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 290         '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 291         '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 292         '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
 293         '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
 294         '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 295         '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 296         '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 297         '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 298         '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 299         '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 300         '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 301         '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 302         '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
 303         '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 304         '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
 305
 306         # Dash webm audio
 307         '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
 308         '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
 309
 310         # Dash webm audio with opus inside
 311         '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
 312         '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
 313         '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
 314
 315         # RTMP (unnamed)
 316         '_rtmp': {'protocol': 'rtmp'},
 317     }
 318
 319     IE_NAME = 'youtube'
 320     _TESTS = [
 321         {
 322             'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
 323             'info_dict': {
 324                 'id': 'BaW_jenozKc',
 325                 'ext': 'mp4',
 326                 'title': 'youtube-dl test video "\'/\\ä↭𝕐',
 327                 'uploader': 'Philipp Hagemeister',
 328                 'uploader_id': 'phihag',
 329                 'upload_date': '20121002',
 330                 'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
 331                 'categories': ['Science & Technology'],
 332                 'like_count': int,
 333                 'dislike_count': int,
 334             }
 335         },
 336         {
 337             'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
 338             'note': 'Test generic use_cipher_signature video (#897)',
 339             'info_dict': {
 340                 'id': 'UxxajLWwzqY',
 341                 'ext': 'mp4',
 342                 'upload_date': '20120506',
 343                 'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
 344                 'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
 345                 'uploader': 'Icona Pop',
 346                 'uploader_id': 'IconaPop',
 347             }
 348         },
 349         {
 350             'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
 351             'note': 'Test VEVO video with age protection (#956)',
 352             'info_dict': {
 353                 'id': '07FYdnEawAQ',
 354                 'ext': 'mp4',
 355                 'upload_date': '20130703',
 356                 'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
 357                 'description': 'md5:64249768eec3bc4276236606ea996373',
 358                 'uploader': 'justintimberlakeVEVO',
 359                 'uploader_id': 'justintimberlakeVEVO',
 360             }
 361         },
 362         {
 363             'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
 364             'note': 'Embed-only video (#1746)',
 365             'info_dict': {
 366                 'id': 'yZIXLfi8CZQ',
 367                 'ext': 'mp4',
 368                 'upload_date': '20120608',
 369                 'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
 370                 'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
 371                 'uploader': 'SET India',
 372                 'uploader_id': 'setindia'
 373             }
 374         },
 375         {
 376             'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
 377             'note': '256k DASH audio (format 141) via DASH manifest',
 378             'info_dict': {
 379                 'id': 'a9LDPn-MO4I',
 380                 'ext': 'm4a',
 381                 'upload_date': '20121002',
 382                 'uploader_id': '8KVIDEO',
 383                 'description': '',
 384                 'uploader': '8KVIDEO',
 385                 'title': 'UHDTV TEST 8K VIDEO.mp4'
 386             },
 387             'params': {
 388                 'youtube_include_dash_manifest': True,
 389                 'format': '141',
 390             },
 391         },
 392         # DASH manifest with encrypted signature
 393         {
 394             'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
 395             'info_dict': {
 396                 'id': 'IB3lcPjvWLA',
 397                 'ext': 'm4a',
 398                 'title': 'Afrojack - The Spark ft. Spree Wilson',
 399                 'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
 400                 'uploader': 'AfrojackVEVO',
 401                 'uploader_id': 'AfrojackVEVO',
 402                 'upload_date': '20131011',
 403             },
 404             'params': {
 405                 'youtube_include_dash_manifest': True,
 406                 'format': '141',
 407             },
 408         },
 409     ]
 410
 411     def __init__(self, *args, **kwargs):
 412         super(YoutubeIE, self).__init__(*args, **kwargs)
 413         self._player_cache = {}
 414
 415     def report_video_info_webpage_download(self, video_id):
 416         """Report attempt to download video info webpage."""
 417         self.to_screen('%s: Downloading video info webpage' % video_id)
 418
 419     def report_information_extraction(self, video_id):
 420         """Report attempt to extract video information."""
 421         self.to_screen('%s: Extracting video information' % video_id)
 422
 423     def report_unavailable_format(self, video_id, format):
 424         """Report extracted video URL."""
 425         self.to_screen('%s: Format %s not available' % (video_id, format))
 426
 427     def report_rtmp_download(self):
 428         """Indicate the download will use the RTMP protocol."""
 429         self.to_screen('RTMP download detected')
 430
 431     def _signature_cache_id(self, example_sig):
 432         """ Return a string representation of a signature """
 433         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 434
 435     def _extract_signature_function(self, video_id, player_url, example_sig):
 436         id_m = re.match(
 437             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 438             player_url)
 439         if not id_m:
 440             raise ExtractorError('Cannot identify player %r' % player_url)
 441         player_type = id_m.group('ext')
 442         player_id = id_m.group('id')
 443
 444         # Read from filesystem cache
 445         func_id = '%s_%s_%s' % (
 446             player_type, player_id, self._signature_cache_id(example_sig))
 447         assert os.path.basename(func_id) == func_id
 448
 449         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 450         if cache_spec is not None:
 451             return lambda s: ''.join(s[i] for i in cache_spec)
 452
 453         if player_type == 'js':
 454             code = self._download_webpage(
 455                 player_url, video_id,
 456                 note='Downloading %s player %s' % (player_type, player_id),
 457                 errnote='Download of %s failed' % player_url)
 458             res = self._parse_sig_js(code)
 459         elif player_type == 'swf':
 460             urlh = self._request_webpage(
 461                 player_url, video_id,
 462                 note='Downloading %s player %s' % (player_type, player_id),
 463                 errnote='Download of %s failed' % player_url)
 464             code = urlh.read()
 465             res = self._parse_sig_swf(code)
 466         else:
 467             assert False, 'Invalid player type %r' % player_type
 468
 469         if cache_spec is None:
 470             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 471             cache_res = res(test_string)
 472             cache_spec = [ord(c) for c in cache_res]
 473
 474         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 475         return res
 476
 477     def _print_sig_code(self, func, example_sig):
 478         def gen_sig_code(idxs):
 479             def _genslice(start, end, step):
 480                 starts = '' if start == 0 else str(start)
 481                 ends = (':%d' % (end+step)) if end + step >= 0 else ':'
 482                 steps = '' if step == 1 else (':%d' % step)
 483                 return 's[%s%s%s]' % (starts, ends, steps)
 484
 485             step = None
 486             start = '(Never used)'  # Quelch pyflakes warnings - start will be
 487                                     # set as soon as step is set
 488             for i, prev in zip(idxs[1:], idxs[:-1]):
 489                 if step is not None:
 490                     if i - prev == step:
 491                         continue
 492                     yield _genslice(start, prev, step)
 493                     step = None
 494                     continue
 495                 if i - prev in [-1, 1]:
 496                     step = i - prev
 497                     start = prev
 498                     continue
 499                 else:
 500                     yield 's[%d]' % prev
 501             if step is None:
 502                 yield 's[%d]' % i
 503             else:
 504                 yield _genslice(start, i, step)
 505
 506         test_string = ''.join(map(compat_chr, range(len(example_sig))))
 507         cache_res = func(test_string)
 508         cache_spec = [ord(c) for c in cache_res]
 509         expr_code = ' + '.join(gen_sig_code(cache_spec))
 510         signature_id_tuple = '(%s)' % (
 511             ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
 512         code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
 513                 '    return %s\n') % (signature_id_tuple, expr_code)
 514         self.to_screen('Extracted signature function:\n' + code)
 515
 516     def _parse_sig_js(self, jscode):
 517         funcname = self._search_regex(
 518             r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
 519              'Initial JS player signature function name')
 520
 521         jsi = JSInterpreter(jscode)
 522         initial_function = jsi.extract_function(funcname)
 523         return lambda s: initial_function([s])
 524
 525     def _parse_sig_swf(self, file_contents):
 526         swfi = SWFInterpreter(file_contents)
 527         TARGET_CLASSNAME = 'SignatureDecipher'
 528         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 529         initial_function = swfi.extract_function(searched_class, 'decipher')
 530         return lambda s: initial_function([s])
 531
 532     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 533         """Turn the encrypted s field into a working signature"""
 534
 535         if player_url is None:
 536             raise ExtractorError('Cannot decrypt signature without player_url')
 537
 538         if player_url.startswith('//'):
 539             player_url = 'https:' + player_url
 540         try:
 541             player_id = (player_url, self._signature_cache_id(s))
 542             if player_id not in self._player_cache:
 543                 func = self._extract_signature_function(
 544                     video_id, player_url, s
 545                 )
 546                 self._player_cache[player_id] = func
 547             func = self._player_cache[player_id]
 548             if self._downloader.params.get('youtube_print_sig_code'):
 549                 self._print_sig_code(func, s)
 550             return func(s)
 551         except Exception as e:
 552             tb = traceback.format_exc()
 553             raise ExtractorError(
 554                 'Signature extraction failed: ' + tb, cause=e)
 555
 556     def _get_available_subtitles(self, video_id, webpage):
 557         try:
 558             sub_list = self._download_webpage(
 559                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 560                 video_id, note=False)
 561         except ExtractorError as err:
 562             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 563             return {}
 564         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 565
 566         sub_lang_list = {}
 567         for l in lang_list:
 568             lang = l[1]
 569             if lang in sub_lang_list:
 570                 continue
 571             params = compat_urllib_parse.urlencode({
 572                 'lang': lang,
 573                 'v': video_id,
 574                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 575                 'name': unescapeHTML(l[0]).encode('utf-8'),
 576             })
 577             url = 'https://www.youtube.com/api/timedtext?' + params
 578             sub_lang_list[lang] = url
 579         if not sub_lang_list:
 580             self._downloader.report_warning('video doesn\'t have subtitles')
 581             return {}
 582         return sub_lang_list
 583
 584     def _get_available_automatic_caption(self, video_id, webpage):
 585         """We need the webpage for getting the captions url, pass it as an
 586            argument to speed up the process."""
 587         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 588         self.to_screen('%s: Looking for automatic captions' % video_id)
 589         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 590         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 591         if mobj is None:
 592             self._downloader.report_warning(err_msg)
 593             return {}
 594         player_config = json.loads(mobj.group(1))
 595         try:
 596             args = player_config[u'args']
 597             caption_url = args[u'ttsurl']
 598             timestamp = args[u'timestamp']
 599             # We get the available subtitles
 600             list_params = compat_urllib_parse.urlencode({
 601                 'type': 'list',
 602                 'tlangs': 1,
 603                 'asrs': 1,
 604             })
 605             list_url = caption_url + '&' + list_params
 606             caption_list = self._download_xml(list_url, video_id)
 607             original_lang_node = caption_list.find('track')
 608             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 609                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 610                 return {}
 611             original_lang = original_lang_node.attrib['lang_code']
 612
 613             sub_lang_list = {}
 614             for lang_node in caption_list.findall('target'):
 615                 sub_lang = lang_node.attrib['lang_code']
 616                 params = compat_urllib_parse.urlencode({
 617                     'lang': original_lang,
 618                     'tlang': sub_lang,
 619                     'fmt': sub_format,
 620                     'ts': timestamp,
 621                     'kind': 'asr',
 622                 })
 623                 sub_lang_list[sub_lang] = caption_url + '&' + params
 624             return sub_lang_list
 625         # An extractor error can be raise by the download process if there are
 626         # no automatic captions but there are subtitles
 627         except (KeyError, ExtractorError):
 628             self._downloader.report_warning(err_msg)
 629             return {}
 630
 631     @classmethod
 632     def extract_id(cls, url):
 633         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 634         if mobj is None:
 635             raise ExtractorError('Invalid URL: %s' % url)
 636         video_id = mobj.group(2)
 637         return video_id
 638
 639     def _extract_from_m3u8(self, manifest_url, video_id):
 640         url_map = {}
 641         def _get_urls(_manifest):
 642             lines = _manifest.split('\n')
 643             urls = filter(lambda l: l and not l.startswith('#'),
 644                             lines)
 645             return urls
 646         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 647         formats_urls = _get_urls(manifest)
 648         for format_url in formats_urls:
 649             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 650             url_map[itag] = format_url
 651         return url_map
 652
 653     def _extract_annotations(self, video_id):
 654         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 655         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 656
 657     def _real_extract(self, url):
 658         proto = (
 659             'http' if self._downloader.params.get('prefer_insecure', False)
 660             else 'https')
 661
 662         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 663         mobj = re.search(self._NEXT_URL_RE, url)
 664         if mobj:
 665             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 666         video_id = self.extract_id(url)
 667
 668         # Get video webpage
 669         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id
 670         pref_cookies = [
 671             c for c in self._downloader.cookiejar
 672             if c.domain == '.youtube.com' and c.name == 'PREF']
 673         for pc in pref_cookies:
 674             if 'hl=' in pc.value:
 675                 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
 676             else:
 677                 if pc.value:
 678                     pc.value += '&'
 679                 pc.value += 'hl=en'
 680         video_webpage = self._download_webpage(url, video_id)
 681
 682         # Attempt to extract SWF player URL
 683         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 684         if mobj is not None:
 685             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 686         else:
 687             player_url = None
 688
 689         # Get video info
 690         self.report_video_info_webpage_download(video_id)
 691         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 692             age_gate = True
 693             # We simulate the access to the video from www.youtube.com/v/{video_id}
 694             # this can be viewed without login into Youtube
 695             data = compat_urllib_parse.urlencode({
 696                 'video_id': video_id,
 697                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 698                 'sts': self._search_regex(
 699                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
 700             })
 701             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 702             video_info_webpage = self._download_webpage(
 703                 video_info_url, video_id,
 704                 note='Refetching age-gated info webpage',
 705                 errnote='unable to download video info webpage')
 706             video_info = compat_parse_qs(video_info_webpage)
 707         else:
 708             age_gate = False
 709             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 710                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 711                         % (video_id, el_type))
 712                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 713                                         note=False,
 714                                         errnote='unable to download video info webpage')
 715                 video_info = compat_parse_qs(video_info_webpage)
 716                 if 'token' in video_info:
 717                     break
 718         if 'token' not in video_info:
 719             if 'reason' in video_info:
 720                 raise ExtractorError(
 721                     'YouTube said: %s' % video_info['reason'][0],
 722                     expected=True, video_id=video_id)
 723             else:
 724                 raise ExtractorError(
 725                     '"token" parameter not in video info for unknown reason',
 726                     video_id=video_id)
 727
 728         if 'view_count' in video_info:
 729             view_count = int(video_info['view_count'][0])
 730         else:
 731             view_count = None
 732
 733         # Check for "rental" videos
 734         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 735             raise ExtractorError('"rental" videos not supported')
 736
 737         # Start extracting information
 738         self.report_information_extraction(video_id)
 739
 740         # uploader
 741         if 'author' not in video_info:
 742             raise ExtractorError('Unable to extract uploader name')
 743         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 744
 745         # uploader_id
 746         video_uploader_id = None
 747         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 748         if mobj is not None:
 749             video_uploader_id = mobj.group(1)
 750         else:
 751             self._downloader.report_warning('unable to extract uploader nickname')
 752
 753         # title
 754         if 'title' in video_info:
 755             video_title = video_info['title'][0]
 756         else:
 757             self._downloader.report_warning('Unable to extract video title')
 758             video_title = '_'
 759
 760         # thumbnail image
 761         # We try first to get a high quality image:
 762         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 763                             video_webpage, re.DOTALL)
 764         if m_thumb is not None:
 765             video_thumbnail = m_thumb.group(1)
 766         elif 'thumbnail_url' not in video_info:
 767             self._downloader.report_warning('unable to extract video thumbnail')
 768             video_thumbnail = None
 769         else:   # don't panic if we can't find it
 770             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 771
 772         # upload date
 773         upload_date = None
 774         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 775         if mobj is None:
 776             mobj = re.search(
 777                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 778                 video_webpage)
 779         if mobj is not None:
 780             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 781             upload_date = unified_strdate(upload_date)
 782
 783         m_cat_container = self._search_regex(
 784             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 785             video_webpage, 'categories', fatal=False)
 786         if m_cat_container:
 787             category = self._html_search_regex(
 788                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 789                 default=None)
 790             video_categories = None if category is None else [category]
 791         else:
 792             video_categories = None
 793
 794         # description
 795         video_description = get_element_by_id("eow-description", video_webpage)
 796         if video_description:
 797             video_description = re.sub(r'''(?x)
 798                 <a\s+
 799                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 800                     title="([^"]+)"\s+
 801                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 802                     class="yt-uix-redirect-link"\s*>
 803                 [^<]+
 804                 </a>
 805             ''', r'\1', video_description)
 806             video_description = clean_html(video_description)
 807         else:
 808             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 809             if fd_mobj:
 810                 video_description = unescapeHTML(fd_mobj.group(1))
 811             else:
 812                 video_description = ''
 813
 814         def _extract_count(count_name):
 815             count = self._search_regex(
 816                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 817                 video_webpage, count_name, default=None)
 818             if count is not None:
 819                 return int(count.replace(',', ''))
 820             return None
 821         like_count = _extract_count('like')
 822         dislike_count = _extract_count('dislike')
 823
 824         # subtitles
 825         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 826
 827         if self._downloader.params.get('listsubtitles', False):
 828             self._list_available_subtitles(video_id, video_webpage)
 829             return
 830
 831         if 'length_seconds' not in video_info:
 832             self._downloader.report_warning('unable to extract video duration')
 833             video_duration = None
 834         else:
 835             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 836
 837         # annotations
 838         video_annotations = None
 839         if self._downloader.params.get('writeannotations', False):
 840                 video_annotations = self._extract_annotations(video_id)
 841
 842         # Decide which formats to download
 843         try:
 844             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 845             if not mobj:
 846                 raise ValueError('Could not find vevo ID')
 847             json_code = uppercase_escape(mobj.group(1))
 848             ytplayer_config = json.loads(json_code)
 849             args = ytplayer_config['args']
 850             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 851             # this signatures are encrypted
 852             if 'url_encoded_fmt_stream_map' not in args:
 853                 raise ValueError('No stream_map present')  # caught below
 854             re_signature = re.compile(r'[&,]s=')
 855             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 856             if m_s is not None:
 857                 self.to_screen('%s: Encrypted signatures detected.' % video_id)
 858                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 859             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 860             if m_s is not None:
 861                 if 'adaptive_fmts' in video_info:
 862                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 863                 else:
 864                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 865         except ValueError:
 866             pass
 867
 868         def _map_to_format_list(urlmap):
 869             formats = []
 870             for itag, video_real_url in urlmap.items():
 871                 dct = {
 872                     'format_id': itag,
 873                     'url': video_real_url,
 874                     'player_url': player_url,
 875                 }
 876                 if itag in self._formats:
 877                     dct.update(self._formats[itag])
 878                 formats.append(dct)
 879             return formats
 880
 881         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 882             self.report_rtmp_download()
 883             formats = [{
 884                 'format_id': '_rtmp',
 885                 'protocol': 'rtmp',
 886                 'url': video_info['conn'][0],
 887                 'player_url': player_url,
 888             }]
 889         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 890             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
 891             if 'rtmpe%3Dyes' in encoded_url_map:
 892                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 893             url_map = {}
 894             for url_data_str in encoded_url_map.split(','):
 895                 url_data = compat_parse_qs(url_data_str)
 896                 if 'itag' not in url_data or 'url' not in url_data:
 897                     continue
 898                 format_id = url_data['itag'][0]
 899                 url = url_data['url'][0]
 900
 901                 if 'sig' in url_data:
 902                     url += '&signature=' + url_data['sig'][0]
 903                 elif 's' in url_data:
 904                     encrypted_sig = url_data['s'][0]
 905
 906                     if not age_gate:
 907                         jsplayer_url_json = self._search_regex(
 908                             r'"assets":.+?"js":\s*("[^"]+")',
 909                             video_webpage, 'JS player URL')
 910                         player_url = json.loads(jsplayer_url_json)
 911                     if player_url is None:
 912                         player_url_json = self._search_regex(
 913                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 914                             video_webpage, 'age gate player URL')
 915                         player_url = json.loads(player_url_json)
 916
 917                     if self._downloader.params.get('verbose'):
 918                         if player_url is None:
 919                             player_version = 'unknown'
 920                             player_desc = 'unknown'
 921                         else:
 922                             if player_url.endswith('swf'):
 923                                 player_version = self._search_regex(
 924                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 925                                     'flash player', fatal=False)
 926                                 player_desc = 'flash player %s' % player_version
 927                             else:
 928                                 player_version = self._search_regex(
 929                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 930                                     player_url,
 931                                     'html5 player', fatal=False)
 932                                 player_desc = 'html5 player %s' % player_version
 933
 934                         parts_sizes = self._signature_cache_id(encrypted_sig)
 935                         self.to_screen('{%s} signature length %s, %s' %
 936                             (format_id, parts_sizes, player_desc))
 937
 938                     signature = self._decrypt_signature(
 939                         encrypted_sig, video_id, player_url, age_gate)
 940                     url += '&signature=' + signature
 941                 if 'ratebypass' not in url:
 942                     url += '&ratebypass=yes'
 943                 url_map[format_id] = url
 944             formats = _map_to_format_list(url_map)
 945         elif video_info.get('hlsvp'):
 946             manifest_url = video_info['hlsvp'][0]
 947             url_map = self._extract_from_m3u8(manifest_url, video_id)
 948             formats = _map_to_format_list(url_map)
 949         else:
 950             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 951
 952         # Look for the DASH manifest
 953         if self._downloader.params.get('youtube_include_dash_manifest', True):
 954             try:
 955                 # The DASH manifest used needs to be the one from the original video_webpage.
 956                 # The one found in get_video_info seems to be using different signatures.
 957                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 958                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 959                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 960                 if age_gate:
 961                     dash_manifest_url = video_info.get('dashmpd')[0]
 962                 else:
 963                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 964                 def decrypt_sig(mobj):
 965                     s = mobj.group(1)
 966                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 967                     return '/signature/%s' % dec_s
 968                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 969                 dash_doc = self._download_xml(
 970                     dash_manifest_url, video_id,
 971                     note='Downloading DASH manifest',
 972                     errnote='Could not download DASH manifest')
 973                 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 974                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 975                     if url_el is None:
 976                         continue
 977                     format_id = r.attrib['id']
 978                     video_url = url_el.text
 979                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 980                     f = {
 981                         'format_id': format_id,
 982                         'url': video_url,
 983                         'width': int_or_none(r.attrib.get('width')),
 984                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 985                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 986                         'filesize': filesize,
 987                     }
 988                     try:
 989                         existing_format = next(
 990                             fo for fo in formats
 991                             if fo['format_id'] == format_id)
 992                     except StopIteration:
 993                         f.update(self._formats.get(format_id, {}))
 994                         formats.append(f)
 995                     else:
 996                         existing_format.update(f)
 997
 998             except (ExtractorError, KeyError) as e:
 999                 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1000
1001         self._sort_formats(formats)
1002
1003         return {
1004             'id':           video_id,
1005             'uploader':     video_uploader,
1006             'uploader_id':  video_uploader_id,
1007             'upload_date':  upload_date,
1008             'title':        video_title,
1009             'thumbnail':    video_thumbnail,
1010             'description':  video_description,
1011             'categories':   video_categories,
1012             'subtitles':    video_subtitles,
1013             'duration':     video_duration,
1014             'age_limit':    18 if age_gate else 0,
1015             'annotations':  video_annotations,
1016             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1017             'view_count':   view_count,
1018             'like_count': like_count,
1019             'dislike_count': dislike_count,
1020             'formats':      formats,
1021         }
1022
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for playlists, including the 'RD' mixes."""
    IE_NAME = 'youtube:playlist'
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Log in before any extraction takes place."""
        self._login()

    def _ids_to_results(self, ids):
        """Wrap every video id into a url_result handled by the 'Youtube' extractor."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result for a mix (a playlist id starting with 'RD')."""
        # The mixes are generated from a single video
        # the id of the playlist is just 'RD' + video_id
        seed_video_id = playlist_id[-11:]
        url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading Youtube mix')

        # The title is the first non-empty element among these classes.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        found_ids = re.findall(
            r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id),
            webpage)
        entries = self._ids_to_results(orderedSet(found_ids))

        return self.playlist_result(entries, playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result (or a single video result) for url.

        With the 'noplaylist' option and a URL carrying a 'v' parameter,
        only that video is returned.  Raises ExtractorError for invalid
        URLs, for 'TL' top lists and for missing or private playlists.
        """
        # Extract playlist id
        match = re.match(self._VALID_URL, url)
        if not match:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = match.group(1) or match.group(2)

        # Check if it's a video-specific URL
        query_string = compat_urlparse.urlparse(url).query
        query_dict = compat_urlparse.parse_qs(query_string)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError('For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"', expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        alert = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if alert is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages; the first page
        # serves both as content and as the "load more" widget.
        content_html = page
        more_widget_html = page
        ids = []
        for page_num in itertools.count(1):
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                m.group('id')
                for m in re.finditer(self._VIDEO_RE, content_html)
                if m.group('index') != '0'))

            more_mobj = re.search(r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_mobj:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_mobj.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1193
1194
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the "yttoplist:{channel}:{list title}" pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on the number of times the list page is downloaded, so
    # that a page which never contains videos cannot loop forever.
    _MAX_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result for the top list named in url.

        The channel page is downloaded, the link of the module whose title
        matches is followed, and the video ids are read from that page.

        Raises ExtractorError when no video id is found after
        _MAX_DOWNLOAD_ATTEMPTS downloads of the list page.
        """
        mobj = re.match(self._VALID_URL, url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry (a bounded number of times) until we get them
        for attempt in range(self._MAX_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube top list'
            if attempt > 0:
                msg += ', retry #%d' % attempt

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any video in the top list "%s" after %d attempts'
                % (title, self._MAX_DOWNLOAD_ATTEMPTS))
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1236
1237
class YoutubeChannelIE(InfoExtractor):
    """Turn a YouTube channel URL into a playlist of the channel's videos."""
    IE_NAME = 'youtube:channel'
    IE_DESC = 'YouTube.com channels'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    # Marker whose presence in the "load more" widget means another page exists
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    # Filled with (page number, channel id)
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the video ids linked from *page*, without duplicates.

        Ids are kept in the order in which they first appear.
        """
        found = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in found:
                continue
            found.append(video_id)
        return found

    def _real_extract(self, url):
        """Collect all video ids of the channel and return them as a playlist.

        Raises ExtractorError when *url* does not match _VALID_URL.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = mobj.group(1)

        channel_page = self._download_webpage(
            'https://www.youtube.com/channel/%s/videos' % channel_id,
            channel_id)
        autogenerated_re = r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"'''

        if re.search(autogenerated_re, channel_page) is not None:
            # Autogenerated channels keep all their videos on this single
            # page; their ajax pages are empty and therefore useless.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Regular channels: walk the json-based channel_ajax pages until
            # the "load more" widget no longer advertises a further page.
            video_ids = []
            pagenum = 1
            while True:
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break
                pagenum += 1

        self._downloader.to_screen(
            '[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1297
1298
class YoutubeUserIE(InfoExtractor):
    """Extract the uploads of a YouTube user as a playlist.

    Accepts user page URLs as well as the "ytuser:NAME" keyword. The uploads
    are listed through the GData uploads feed, _GDATA_PAGE_SIZE entries per
    request.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    # Filled with (username, page size, 1-based index of the first entry)
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor of this module takes *url*.

        Every other module-level object whose name ends in "IE" is asked
        first; this class only claims the URL when all of them decline and
        its own _VALID_URL check (done by the parent class) accepts it.
        """
        # Don't return True if the url can be extracted with another youtube
        # extractor: the regex is too permissive and it would match as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist, titled with the username, of the user's uploads.

        Raises ExtractorError when *url* does not match _VALID_URL or when a
        feed page is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type result per entry of the 0-based page *pagenum*."""
            # The feed's start-index is 1-based
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Inclusive index of the last entry requested; without the -1
            # the note would read "from 1 to 51" for a 50-entry page.
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (
                username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError(
                    'Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' key has nothing (more) to offer
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            entries = response['feed']['entry']
            for entry in entries:
                title = entry['title']['$t']
                # The video id is the last path component of the entry id
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': title,
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1370
1371
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed ("ytsearch" key)."""
    IE_NAME = 'youtube:search'
    IE_DESC = 'YouTube.com searches'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    # Filled with (url-quoted query, 1-based index of the first result)
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Return a playlist holding at most *n* results for *query*.

        Result pages of 50 items are requested until either *n* ids were
        gathered or the total reported by the API is exhausted. Raises an
        (expected) ExtractorError when a response carries no 'items'.
        """
        PAGE_SIZE = 50

        collected = []
        limit = n
        pagenum = 0
        while limit > PAGE_SIZE * pagenum:
            first_index = PAGE_SIZE * pagenum + 1
            result_url = self._API_URL % (
                compat_urllib_parse.quote_plus(query.encode('utf-8')),
                first_index)
            data_json = self._download_webpage(
                result_url, video_id='query "%s"' % query,
                note='Downloading page %s' % (pagenum + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            page_ids = [video['id'] for video in api_response['items']]
            collected.extend(page_ids)

            # Never ask for more than the API says is available
            limit = min(n, api_response['totalItems'])
            pagenum += 1

        # The last page may have pushed the count beyond n
        videos = []
        for video_id in collected[:n]:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1413
1414
class YoutubeSearchDateIE(YoutubeSearchIE):
    """Variant of YoutubeSearchIE whose API query adds orderby=published."""
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1420
1421
class YoutubeSearchURLIE(InfoExtractor):
    """Extract the entries listed on a youtube.com/results search page."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist, titled with the decoded query, of the results."""
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        # Restrict the scan to the markup captured from the item-section list
        results_html = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        entries = []
        title_blocks = re.findall(
            r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', results_html)
        for title_block in title_blocks:
            # A missing title is tolerated (fatal=False); the URL lookup
            # below is made without that flag.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], title_block, 'item title', fatal=False)
            item_href = self._html_search_regex(
                r'(?s)href="([^"]+)"', title_block, 'item URL')
            entries.append({
                '_type': 'url',
                'url': compat_urlparse.urljoin(
                    'https://www.youtube.com/', item_href),
                'title': item_title,
            })

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1463
1464
class YoutubeShowIE(InfoExtractor):
    """Extract a show page as a playlist of its season playlists."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one YoutubePlaylist result per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')
        # Each season of the show has a playlist of its own
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen(
            '%s: Found %s seasons' % (playlist_id, len(season_paths)))
        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))
        title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': title,
            'entries': entries,
        }
1499
1500
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Common machinery for the extractors that read their videos from
    http://www.youtube.com/feed_ajax
    A subclass has to provide the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # When True, action_load_personal_feed is requested instead of
    # action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL of the feed with a single '%s' left open for the paging value."""
        action = (
            'action_load_personal_feed' if self._PERSONAL_FEED
            else 'action_load_system_feed')
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Page through the feed and return all of its videos as a playlist."""
        feed_entries = []
        paging = 0
        page_num = 1
        while True:
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a dedicated widget the feed markup itself is searched
            # for the link to the next page.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html
            ids = orderedSet(re.findall(r'"/watch\?v=(.*?)["&]', feed_html))
            feed_entries += [
                self.url_result(video_id, 'Youtube', video_id=video_id)
                for video_id in ids]
            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                break
            paging = next_page.group('paging')
            page_num += 1
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1546
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1552
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed, loaded as a personal feed."""
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1558     _PERSONAL_FEED = True
1559
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'history' feed, loaded as a personal feed."""
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: in a plain literal "\." is an invalid escape sequence that
    # newer Python versions warn about. The resulting pattern is unchanged.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1566
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Return a YoutubePlaylist result for the list id found on the page."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites',
            'Youtube Favourites videos')
        # The playlist id is taken from the first "list=" parameter matched
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
1577
1578
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extract the subscriptions feed as a playlist of videos."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Follow the feed's "load more" links and return every video found."""
        title = 'Youtube Subscriptions'
        page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # Works like the playlist extraction, except that the video id
        # regex carries no index. The first round scans the feed page itself
        # both for videos and for the "load more" link.
        content_html = page
        more_widget_html = page
        ids = []
        page_num = 1

        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"',
                more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        return {
            '_type': 'playlist',
            'title': title,
            'entries': self._ids_to_results(ids),
        }
1615
1616
class YoutubeTruncatedURLIE(InfoExtractor):
    """Catch watch/attribution URLs that carry no video id and explain why.

    The matched URLs are missing everything after the first query parameter;
    extraction always fails with a hint about quoting the URL.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an (expected) ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? Remember that & is a meta '
            'character in most shells, so you want to put the URL in quotes, '
            'like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)