Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/youtube.py

   1 # coding: utf-8
   2
   3 from __future__ import unicode_literals
   4
   5
   6 import itertools
   7 import json
   8 import os.path
   9 import re
  10 import traceback
  11
  12 from .common import InfoExtractor, SearchInfoExtractor
  13 from .subtitles import SubtitlesInfoExtractor
  14 from ..jsinterp import JSInterpreter
  15 from ..swfinterp import SWFInterpreter
  16 from ..utils import (
  17     compat_chr,
  18     compat_parse_qs,
  19     compat_urllib_parse,
  20     compat_urllib_request,
  21     compat_urlparse,
  22     compat_str,
  23
  24     clean_html,
  25     get_element_by_id,
  26     get_element_by_attribute,
  27     ExtractorError,
  28     int_or_none,
  29     OnDemandPagedList,
  30     unescapeHTML,
  31     unified_strdate,
  32     orderedSet,
  33     uppercase_escape,
  34 )
  35
  36 class YoutubeBaseInfoExtractor(InfoExtractor):
  37     """Provide base functions for Youtube extractors"""
  38     _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
  39     _TWOFACTOR_URL = 'https://accounts.google.com/SecondFactor'
  40     _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
  41     _AGE_URL = 'https://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
  42     _NETRC_MACHINE = 'youtube'
  43     # If True it will raise an error if no login info is provided
  44     _LOGIN_REQUIRED = False
  45
  46     def _set_language(self):
  47         return bool(self._download_webpage(
  48             self._LANG_URL, None,
  49             note='Setting language', errnote='unable to set language',
  50             fatal=False))
  51
  52     def _login(self):
  53         """
  54         Attempt to log in to YouTube.
  55         True is returned if successful or skipped.
  56         False is returned if login failed.
  57
  58         If _LOGIN_REQUIRED is set and no authentication was provided, an error is raised.
  59         """
  60         (username, password) = self._get_login_info()
  61         # No authentication to be performed
  62         if username is None:
  63             if self._LOGIN_REQUIRED:
  64                 raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True)
  65             return True
  66
  67         login_page = self._download_webpage(
  68             self._LOGIN_URL, None,
  69             note='Downloading login page',
  70             errnote='unable to fetch login page', fatal=False)
  71         if login_page is False:
  72             return
  73
  74         galx = self._search_regex(r'(?s)<input.+?name="GALX".+?value="(.+?)"',
  75                                   login_page, 'Login GALX parameter')
  76
  77         # Log in
  78         login_form_strs = {
  79                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
  80                 'Email': username,
  81                 'GALX': galx,
  82                 'Passwd': password,
  83
  84                 'PersistentCookie': 'yes',
  85                 '_utf8': '霱',
  86                 'bgresponse': 'js_disabled',
  87                 'checkConnection': '',
  88                 'checkedDomains': 'youtube',
  89                 'dnConn': '',
  90                 'pstMsg': '0',
  91                 'rmShown': '1',
  92                 'secTok': '',
  93                 'signIn': 'Sign in',
  94                 'timeStmp': '',
  95                 'service': 'youtube',
  96                 'uilel': '3',
  97                 'hl': 'en_US',
  98         }
  99
 100         # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
 101         # chokes on unicode
 102         login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
 103         login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
 104
 105         req = compat_urllib_request.Request(self._LOGIN_URL, login_data)
 106         login_results = self._download_webpage(
 107             req, None,
 108             note='Logging in', errnote='unable to log in', fatal=False)
 109         if login_results is False:
 110             return False
 111
 112         if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
 113             raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
 114
 115         # Two-Factor
 116         # TODO add SMS and phone call support - these require making a request and then prompting the user
 117
 118         if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', login_results) is not None:
 119             tfa_code = self._get_tfa_info()
 120
 121             if tfa_code is None:
 122                 self._downloader.report_warning('Two-factor authentication required. Provide it with --twofactor <code>')
 123                 self._downloader.report_warning('(Note that only TOTP (Google Authenticator App) codes work at this time.)')
 124                 return False
 125
 126             # Unlike the first login form, secTok and timeStmp are both required for the TFA form
 127
 128             match = re.search(r'id="secTok"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 129             if match is None:
 130                 self._downloader.report_warning('Failed to get secTok - did the page structure change?')
 131             secTok = match.group(1)
 132             match = re.search(r'id="timeStmp"\n\s+value=\'(.+)\'/>', login_results, re.M | re.U)
 133             if match is None:
 134                 self._downloader.report_warning('Failed to get timeStmp - did the page structure change?')
 135             timeStmp = match.group(1)
 136
 137             tfa_form_strs = {
 138                 'continue': 'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
 139                 'smsToken': '',
 140                 'smsUserPin': tfa_code,
 141                 'smsVerifyPin': 'Verify',
 142
 143                 'PersistentCookie': 'yes',
 144                 'checkConnection': '',
 145                 'checkedDomains': 'youtube',
 146                 'pstMsg': '1',
 147                 'secTok': secTok,
 148                 'timeStmp': timeStmp,
 149                 'service': 'youtube',
 150                 'hl': 'en_US',
 151             }
 152             tfa_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in tfa_form_strs.items())
 153             tfa_data = compat_urllib_parse.urlencode(tfa_form).encode('ascii')
 154
 155             tfa_req = compat_urllib_request.Request(self._TWOFACTOR_URL, tfa_data)
 156             tfa_results = self._download_webpage(
 157                 tfa_req, None,
 158                 note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
 159
 160             if tfa_results is False:
 161                 return False
 162
 163             if re.search(r'(?i)<form[^>]* id="gaia_secondfactorform"', tfa_results) is not None:
 164                 self._downloader.report_warning('Two-factor code expired. Please try again, or use a one-use backup code instead.')
 165                 return False
 166             if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None:
 167                 self._downloader.report_warning('unable to log in - did the page structure change?')
 168                 return False
 169             if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
 170                 self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
 171                 return False
 172
 173         if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
 174             self._downloader.report_warning('unable to log in: bad username or password')
 175             return False
 176         return True
 177
 178     def _confirm_age(self):
 179         age_form = {
 180             'next_url': '/',
 181             'action_confirm': 'Confirm',
 182         }
 183         req = compat_urllib_request.Request(self._AGE_URL,
 184             compat_urllib_parse.urlencode(age_form).encode('ascii'))
 185
 186         self._download_webpage(
 187             req, None,
 188             note='Confirming age', errnote='Unable to confirm age',
 189             fatal=False)
 190
 191     def _real_initialize(self):
 192         if self._downloader is None:
 193             return
 194         if self._get_login_info()[0] is not None:
 195             if not self._set_language():
 196                 return
 197         if not self._login():
 198             return
 199         self._confirm_age()
 200
 201
class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor):
    """Information extractor for single YouTube videos, given as a URL or a bare 11-character id."""
    IE_DESC = 'YouTube.com'
    # Verbose-mode regex: group 1 is the optional URL prefix (scheme, host and
    # whatever precedes the id), group 2 is the 11-character video id.
    # URLs that also carry a "&list=" parameter are deliberately rejected.
    _VALID_URL = r"""(?x)^
                     (
                         (?:https?://|//)                                    # http(s):// or protocol-independent URL
                         (?:(?:(?:(?:\w+\.)?[yY][oO][uU][tT][uU][bB][eE](?:-nocookie)?\.com/|
                            (?:www\.)?deturl\.com/www\.youtube\.com/|
                            (?:www\.)?pwnyoutube\.com/|
                            (?:www\.)?yourepeat\.com/|
                            tube\.majestyc\.net/|
                            youtube\.googleapis\.com/)                        # the various hostnames, with wildcard subdomains
                         (?:.*?\#/)?                                          # handle anchor (#/) redirect urls
                         (?:                                                  # the various things that can precede the ID:
                             (?:(?:v|embed|e)/(?!videoseries))                # v/ or embed/ or e/
                             |(?:                                             # or the v= param in all its forms
                                 (?:(?:watch|movie)(?:_popup)?(?:\.php)?/?)?  # preceding watch(_popup|.php) or nothing (like /?v=xxxx)
                                 (?:\?|\#!?)                                  # the params delimiter ? or # or #!
                                 (?:.*?&)?                                    # any other preceding param (like /?s=tuff&v=xxxx)
                                 v=
                             )
                         ))
                         |youtu\.be/                                          # just youtu.be/xxxx
                         |(?:www\.)?cleanvideosearch\.com/media/action/yt/watch\?videoId=
                         )
                     )?                                                       # all until now is optional -> you can pass the naked ID
                     ([0-9A-Za-z_-]{11})                                      # here is it! the YouTube video ID
                     (?!.*?&list=)                                            # combined list/video URLs are handled by the playlist IE
                     (?(1).+)?                                                # if we found the ID, everything can follow
                     $"""
    # Captures the value of a "next_url" query parameter
    _NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
    # Static metadata for known formats, keyed by format id (as a string).
    # Each entry lists what is known up front about that format: file
    # extension, dimensions, codecs, audio bitrate ('abr'), frame rate, a
    # free-form 'format_note' and a relative 'preference'.
    # NOTE(review): the keys are presumably YouTube itag values - confirm.
    _formats = {
        '5': {'ext': 'flv', 'width': 400, 'height': 240},
        '6': {'ext': 'flv', 'width': 450, 'height': 270},
        '13': {'ext': '3gp'},
        '17': {'ext': '3gp', 'width': 176, 'height': 144},
        '18': {'ext': 'mp4', 'width': 640, 'height': 360},
        '22': {'ext': 'mp4', 'width': 1280, 'height': 720},
        '34': {'ext': 'flv', 'width': 640, 'height': 360},
        '35': {'ext': 'flv', 'width': 854, 'height': 480},
        '36': {'ext': '3gp', 'width': 320, 'height': 240},
        '37': {'ext': 'mp4', 'width': 1920, 'height': 1080},
        '38': {'ext': 'mp4', 'width': 4096, 'height': 3072},
        '43': {'ext': 'webm', 'width': 640, 'height': 360},
        '44': {'ext': 'webm', 'width': 854, 'height': 480},
        '45': {'ext': 'webm', 'width': 1280, 'height': 720},
        '46': {'ext': 'webm', 'width': 1920, 'height': 1080},


        # 3d videos
        '82': {'ext': 'mp4', 'height': 360, 'format_note': '3D', 'preference': -20},
        '83': {'ext': 'mp4', 'height': 480, 'format_note': '3D', 'preference': -20},
        '84': {'ext': 'mp4', 'height': 720, 'format_note': '3D', 'preference': -20},
        '85': {'ext': 'mp4', 'height': 1080, 'format_note': '3D', 'preference': -20},
        '100': {'ext': 'webm', 'height': 360, 'format_note': '3D', 'preference': -20},
        '101': {'ext': 'webm', 'height': 480, 'format_note': '3D', 'preference': -20},
        '102': {'ext': 'webm', 'height': 720, 'format_note': '3D', 'preference': -20},

        # Apple HTTP Live Streaming
        '92': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '93': {'ext': 'mp4', 'height': 360, 'format_note': 'HLS', 'preference': -10},
        '94': {'ext': 'mp4', 'height': 480, 'format_note': 'HLS', 'preference': -10},
        '95': {'ext': 'mp4', 'height': 720, 'format_note': 'HLS', 'preference': -10},
        '96': {'ext': 'mp4', 'height': 1080, 'format_note': 'HLS', 'preference': -10},
        '132': {'ext': 'mp4', 'height': 240, 'format_note': 'HLS', 'preference': -10},
        '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'preference': -10},

        # DASH mp4 video
        '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '138': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'h264'},
        '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'vcodec': 'h264'},

        # Dash mp4 audio
        '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 48, 'preference': -50},
        '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 128, 'preference': -50},
        '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'vcodec': 'none', 'abr': 256, 'preference': -50},

        # Dash webm
        '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'acodec': 'none', 'container': 'webm', 'vcodec': 'VP8', 'preference': -40},
        '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'container': 'webm', 'vcodec': 'VP9'},
        '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40},
        '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},
        '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'acodec': 'none', 'preference': -40, 'fps': 60, 'vcodec': 'VP9'},

        # Dash webm audio
        '171': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
        '172': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},

        # Dash webm audio with opus inside
        '249': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
        '250': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
        '251': {'ext': 'webm', 'vcodec': 'none', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},

        # RTMP (unnamed)
        '_rtmp': {'protocol': 'rtmp'},
    }
 318
    # Name of this extractor (also interpolated into the "No login info
    # available" error message of the base class).
    IE_NAME = 'youtube'
    # Test cases. Every entry has a 'url' and the expected 'info_dict'
    # fields; some add a descriptive 'note' and/or downloader 'params'.
    # A type (e.g. int) as an expected value presumably means "any value of
    # that type" and an 'md5:' prefix a hash of the field - both are
    # interpreted by the test harness, which is not visible here.
    _TESTS = [
        {
            'url': 'http://www.youtube.com/watch?v=BaW_jenozKc',
            'info_dict': {
                'id': 'BaW_jenozKc',
                'ext': 'mp4',
                'title': 'youtube-dl test video "\'/\\ä↭𝕐',
                'uploader': 'Philipp Hagemeister',
                'uploader_id': 'phihag',
                'upload_date': '20121002',
                'description': 'test chars:  "\'/\\ä↭𝕐\ntest URL: https://github.com/rg3/youtube-dl/issues/1892\n\nThis is a test video for youtube-dl.\n\nFor more information, contact phihag@phihag.de .',
                'categories': ['Science & Technology'],
                'like_count': int,
                'dislike_count': int,
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=UxxajLWwzqY',
            'note': 'Test generic use_cipher_signature video (#897)',
            'info_dict': {
                'id': 'UxxajLWwzqY',
                'ext': 'mp4',
                'upload_date': '20120506',
                'title': 'Icona Pop - I Love It (feat. Charli XCX) [OFFICIAL VIDEO]',
                'description': 'md5:fea86fda2d5a5784273df5c7cc994d9f',
                'uploader': 'Icona Pop',
                'uploader_id': 'IconaPop',
            }
        },
        {
            'url': 'https://www.youtube.com/watch?v=07FYdnEawAQ',
            'note': 'Test VEVO video with age protection (#956)',
            'info_dict': {
                'id': '07FYdnEawAQ',
                'ext': 'mp4',
                'upload_date': '20130703',
                'title': 'Justin Timberlake - Tunnel Vision (Explicit)',
                'description': 'md5:64249768eec3bc4276236606ea996373',
                'uploader': 'justintimberlakeVEVO',
                'uploader_id': 'justintimberlakeVEVO',
            }
        },
        {
            'url': '//www.YouTube.com/watch?v=yZIXLfi8CZQ',
            'note': 'Embed-only video (#1746)',
            'info_dict': {
                'id': 'yZIXLfi8CZQ',
                'ext': 'mp4',
                'upload_date': '20120608',
                'title': 'Principal Sexually Assaults A Teacher - Episode 117 - 8th June 2012',
                'description': 'md5:09b78bd971f1e3e289601dfba15ca4f7',
                'uploader': 'SET India',
                'uploader_id': 'setindia'
            }
        },
        {
            'url': 'http://www.youtube.com/watch?v=a9LDPn-MO4I',
            'note': '256k DASH audio (format 141) via DASH manifest',
            'info_dict': {
                'id': 'a9LDPn-MO4I',
                'ext': 'm4a',
                'upload_date': '20121002',
                'uploader_id': '8KVIDEO',
                'description': '',
                'uploader': '8KVIDEO',
                'title': 'UHDTV TEST 8K VIDEO.mp4'
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # DASH manifest with encrypted signature
        {
            'url': 'https://www.youtube.com/watch?v=IB3lcPjvWLA',
            'info_dict': {
                'id': 'IB3lcPjvWLA',
                'ext': 'm4a',
                'title': 'Afrojack - The Spark ft. Spree Wilson',
                'description': 'md5:9717375db5a9a3992be4668bbf3bc0a8',
                'uploader': 'AfrojackVEVO',
                'uploader_id': 'AfrojackVEVO',
                'upload_date': '20131011',
            },
            'params': {
                'youtube_include_dash_manifest': True,
                'format': '141',
            },
        },
        # Controversy video
        {
            'url': 'https://www.youtube.com/watch?v=T4XJQO3qol8',
            'info_dict': {
                'id': 'T4XJQO3qol8',
                'ext': 'mp4',
                'upload_date': '20100909',
                'uploader': 'The Amazing Atheist',
                'uploader_id': 'TheAmazingAtheist',
                'title': 'Burning Everyone\'s Koran',
                'description': 'SUBSCRIBE: http://www.youtube.com/saturninefilms\n\nEven Obama has taken a stand against freedom on this issue: http://www.huffingtonpost.com/2010/09/09/obama-gma-interview-quran_n_710282.html',
            }
        }
    ]
 423
 424     def __init__(self, *args, **kwargs):
 425         super(YoutubeIE, self).__init__(*args, **kwargs)
 426         self._player_cache = {}
 427
 428     def report_video_info_webpage_download(self, video_id):
 429         """Report attempt to download video info webpage."""
 430         self.to_screen('%s: Downloading video info webpage' % video_id)
 431
 432     def report_information_extraction(self, video_id):
 433         """Report attempt to extract video information."""
 434         self.to_screen('%s: Extracting video information' % video_id)
 435
 436     def report_unavailable_format(self, video_id, format):
 437         """Report extracted video URL."""
 438         self.to_screen('%s: Format %s not available' % (video_id, format))
 439
 440     def report_rtmp_download(self):
 441         """Indicate the download will use the RTMP protocol."""
 442         self.to_screen('RTMP download detected')
 443
 444     def _signature_cache_id(self, example_sig):
 445         """ Return a string representation of a signature """
 446         return '.'.join(compat_str(len(part)) for part in example_sig.split('.'))
 447
 448     def _extract_signature_function(self, video_id, player_url, example_sig):
 449         id_m = re.match(
 450             r'.*-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player)?\.(?P<ext>[a-z]+)$',
 451             player_url)
 452         if not id_m:
 453             raise ExtractorError('Cannot identify player %r' % player_url)
 454         player_type = id_m.group('ext')
 455         player_id = id_m.group('id')
 456
 457         # Read from filesystem cache
 458         func_id = '%s_%s_%s' % (
 459             player_type, player_id, self._signature_cache_id(example_sig))
 460         assert os.path.basename(func_id) == func_id
 461
 462         cache_spec = self._downloader.cache.load('youtube-sigfuncs', func_id)
 463         if cache_spec is not None:
 464             return lambda s: ''.join(s[i] for i in cache_spec)
 465
 466         if player_type == 'js':
 467             code = self._download_webpage(
 468                 player_url, video_id,
 469                 note='Downloading %s player %s' % (player_type, player_id),
 470                 errnote='Download of %s failed' % player_url)
 471             res = self._parse_sig_js(code)
 472         elif player_type == 'swf':
 473             urlh = self._request_webpage(
 474                 player_url, video_id,
 475                 note='Downloading %s player %s' % (player_type, player_id),
 476                 errnote='Download of %s failed' % player_url)
 477             code = urlh.read()
 478             res = self._parse_sig_swf(code)
 479         else:
 480             assert False, 'Invalid player type %r' % player_type
 481
 482         if cache_spec is None:
 483             test_string = ''.join(map(compat_chr, range(len(example_sig))))
 484             cache_res = res(test_string)
 485             cache_spec = [ord(c) for c in cache_res]
 486
 487         self._downloader.cache.store('youtube-sigfuncs', func_id, cache_spec)
 488         return res
 489
    def _print_sig_code(self, func, example_sig):
        """
        Print Python source equivalent to the signature function func.

        func is run on a probe string as long as example_sig whose
        characters encode their own position. The output is turned into a
        list of source indices, compressed into index/slice expressions on
        a variable named s, and shown via to_screen.
        Assumes func only selects/rearranges characters of its input.
        """
        def gen_sig_code(idxs):
            # Yields expression fragments ('s[3]', 's[5:9]', 's[7:2:-1]', ...)
            # that, concatenated, pick the characters listed in idxs.
            # NOTE(review): with fewer than two indices the loop below never
            # binds i, so the final yield would fail on an unbound name -
            # confirm signatures are always longer than that.
            def _genslice(start, end, step):
                # Text of a slice covering start..end inclusive with the
                # given step. The exclusive bound is end+step; once that
                # would be negative (descending run reaching index 0) the
                # bound is left open.
                starts = '' if start == 0 else str(start)
                ends = (':%d' % (end+step)) if end + step >= 0 else ':'
                steps = '' if step == 1 else (':%d' % step)
                return 's[%s%s%s]' % (starts, ends, steps)

            # step is None while no run of consecutive indices is open
            step = None
            start = '(Never used)'  # Squelch pyflakes warnings - start will be
                                    # set as soon as step is set
            # Walk over neighbouring pairs (prev, i) of the index list
            for i, prev in zip(idxs[1:], idxs[:-1]):
                if step is not None:
                    # Inside a run: keep going while the stride is unchanged
                    if i - prev == step:
                        continue
                    # Run ended at prev: emit it and start over from i
                    yield _genslice(start, prev, step)
                    step = None
                    continue
                if i - prev in [-1, 1]:
                    # Two adjacent indices open an ascending/descending run
                    step = i - prev
                    start = prev
                    continue
                else:
                    yield 's[%d]' % prev
            # Flush whatever is left: the last single index or the open run
            if step is None:
                yield 's[%d]' % i
            else:
                yield _genslice(start, i, step)

        # Probe string: character k has code point k
        test_string = ''.join(map(compat_chr, range(len(example_sig))))
        cache_res = func(test_string)
        # Code point of each output character = index it was taken from
        cache_spec = [ord(c) for c in cache_res]
        expr_code = ' + '.join(gen_sig_code(cache_spec))
        # Lengths of the dot-separated parts, rendered as a tuple literal
        signature_id_tuple = '(%s)' % (
            ', '.join(compat_str(len(p)) for p in example_sig.split('.')))
        code = ('if tuple(len(p) for p in s.split(\'.\')) == %s:\n'
                '    return %s\n') % (signature_id_tuple, expr_code)
        self.to_screen('Extracted signature function:\n' + code)
 528
 529     def _parse_sig_js(self, jscode):
 530         funcname = self._search_regex(
 531             r'\.sig\|\|([a-zA-Z0-9]+)\(', jscode,
 532              'Initial JS player signature function name')
 533
 534         jsi = JSInterpreter(jscode)
 535         initial_function = jsi.extract_function(funcname)
 536         return lambda s: initial_function([s])
 537
 538     def _parse_sig_swf(self, file_contents):
 539         swfi = SWFInterpreter(file_contents)
 540         TARGET_CLASSNAME = 'SignatureDecipher'
 541         searched_class = swfi.extract_class(TARGET_CLASSNAME)
 542         initial_function = swfi.extract_function(searched_class, 'decipher')
 543         return lambda s: initial_function([s])
 544
 545     def _decrypt_signature(self, s, video_id, player_url, age_gate=False):
 546         """Turn the encrypted s field into a working signature"""
 547
 548         if player_url is None:
 549             raise ExtractorError('Cannot decrypt signature without player_url')
 550
 551         if player_url.startswith('//'):
 552             player_url = 'https:' + player_url
 553         try:
 554             player_id = (player_url, self._signature_cache_id(s))
 555             if player_id not in self._player_cache:
 556                 func = self._extract_signature_function(
 557                     video_id, player_url, s
 558                 )
 559                 self._player_cache[player_id] = func
 560             func = self._player_cache[player_id]
 561             if self._downloader.params.get('youtube_print_sig_code'):
 562                 self._print_sig_code(func, s)
 563             return func(s)
 564         except Exception as e:
 565             tb = traceback.format_exc()
 566             raise ExtractorError(
 567                 'Signature extraction failed: ' + tb, cause=e)
 568
 569     def _get_available_subtitles(self, video_id, webpage):
 570         try:
 571             sub_list = self._download_webpage(
 572                 'https://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id,
 573                 video_id, note=False)
 574         except ExtractorError as err:
 575             self._downloader.report_warning('unable to download video subtitles: %s' % compat_str(err))
 576             return {}
 577         lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list)
 578
 579         sub_lang_list = {}
 580         for l in lang_list:
 581             lang = l[1]
 582             if lang in sub_lang_list:
 583                 continue
 584             params = compat_urllib_parse.urlencode({
 585                 'lang': lang,
 586                 'v': video_id,
 587                 'fmt': self._downloader.params.get('subtitlesformat', 'srt'),
 588                 'name': unescapeHTML(l[0]).encode('utf-8'),
 589             })
 590             url = 'https://www.youtube.com/api/timedtext?' + params
 591             sub_lang_list[lang] = url
 592         if not sub_lang_list:
 593             self._downloader.report_warning('video doesn\'t have subtitles')
 594             return {}
 595         return sub_lang_list
 596
 597     def _get_available_automatic_caption(self, video_id, webpage):
 598         """We need the webpage for getting the captions url, pass it as an
 599            argument to speed up the process."""
 600         sub_format = self._downloader.params.get('subtitlesformat', 'srt')
 601         self.to_screen('%s: Looking for automatic captions' % video_id)
 602         mobj = re.search(r';ytplayer.config = ({.*?});', webpage)
 603         err_msg = 'Couldn\'t find automatic captions for %s' % video_id
 604         if mobj is None:
 605             self._downloader.report_warning(err_msg)
 606             return {}
 607         player_config = json.loads(mobj.group(1))
 608         try:
 609             args = player_config[u'args']
 610             caption_url = args[u'ttsurl']
 611             timestamp = args[u'timestamp']
 612             # We get the available subtitles
 613             list_params = compat_urllib_parse.urlencode({
 614                 'type': 'list',
 615                 'tlangs': 1,
 616                 'asrs': 1,
 617             })
 618             list_url = caption_url + '&' + list_params
 619             caption_list = self._download_xml(list_url, video_id)
 620             original_lang_node = caption_list.find('track')
 621             if original_lang_node is None or original_lang_node.attrib.get('kind') != 'asr' :
 622                 self._downloader.report_warning('Video doesn\'t have automatic captions')
 623                 return {}
 624             original_lang = original_lang_node.attrib['lang_code']
 625
 626             sub_lang_list = {}
 627             for lang_node in caption_list.findall('target'):
 628                 sub_lang = lang_node.attrib['lang_code']
 629                 params = compat_urllib_parse.urlencode({
 630                     'lang': original_lang,
 631                     'tlang': sub_lang,
 632                     'fmt': sub_format,
 633                     'ts': timestamp,
 634                     'kind': 'asr',
 635                 })
 636                 sub_lang_list[sub_lang] = caption_url + '&' + params
 637             return sub_lang_list
 638         # An extractor error can be raise by the download process if there are
 639         # no automatic captions but there are subtitles
 640         except (KeyError, ExtractorError):
 641             self._downloader.report_warning(err_msg)
 642             return {}
 643
 644     @classmethod
 645     def extract_id(cls, url):
 646         mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
 647         if mobj is None:
 648             raise ExtractorError('Invalid URL: %s' % url)
 649         video_id = mobj.group(2)
 650         return video_id
 651
 652     def _extract_from_m3u8(self, manifest_url, video_id):
 653         url_map = {}
 654         def _get_urls(_manifest):
 655             lines = _manifest.split('\n')
 656             urls = filter(lambda l: l and not l.startswith('#'),
 657                             lines)
 658             return urls
 659         manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
 660         formats_urls = _get_urls(manifest)
 661         for format_url in formats_urls:
 662             itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
 663             url_map[itag] = format_url
 664         return url_map
 665
 666     def _extract_annotations(self, video_id):
 667         url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
 668         return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
 669
 670     def _real_extract(self, url):
 671         proto = (
 672             'http' if self._downloader.params.get('prefer_insecure', False)
 673             else 'https')
 674
 675         # Extract original video URL from URL with redirection, like age verification, using next_url parameter
 676         mobj = re.search(self._NEXT_URL_RE, url)
 677         if mobj:
 678             url = proto + '://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/')
 679         video_id = self.extract_id(url)
 680
 681         # Get video webpage
 682         url = proto + '://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1&bpctr=9999999999' % video_id
 683         pref_cookies = [
 684             c for c in self._downloader.cookiejar
 685             if c.domain == '.youtube.com' and c.name == 'PREF']
 686         for pc in pref_cookies:
 687             if 'hl=' in pc.value:
 688                 pc.value = re.sub(r'hl=[^&]+', 'hl=en', pc.value)
 689             else:
 690                 if pc.value:
 691                     pc.value += '&'
 692                 pc.value += 'hl=en'
 693         video_webpage = self._download_webpage(url, video_id)
 694
 695         # Attempt to extract SWF player URL
 696         mobj = re.search(r'swfConfig.*?"(https?:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage)
 697         if mobj is not None:
 698             player_url = re.sub(r'\\(.)', r'\1', mobj.group(1))
 699         else:
 700             player_url = None
 701
 702         # Get video info
 703         self.report_video_info_webpage_download(video_id)
 704         if re.search(r'player-age-gate-content">', video_webpage) is not None:
 705             age_gate = True
 706             # We simulate the access to the video from www.youtube.com/v/{video_id}
 707             # this can be viewed without login into Youtube
 708             data = compat_urllib_parse.urlencode({
 709                 'video_id': video_id,
 710                 'eurl': 'https://youtube.googleapis.com/v/' + video_id,
 711                 'sts': self._search_regex(
 712                     r'"sts"\s*:\s*(\d+)', video_webpage, 'sts', default=''),
 713             })
 714             video_info_url = proto + '://www.youtube.com/get_video_info?' + data
 715             video_info_webpage = self._download_webpage(
 716                 video_info_url, video_id,
 717                 note='Refetching age-gated info webpage',
 718                 errnote='unable to download video info webpage')
 719             video_info = compat_parse_qs(video_info_webpage)
 720         else:
 721             age_gate = False
 722             for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']:
 723                 video_info_url = (proto + '://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en'
 724                         % (video_id, el_type))
 725                 video_info_webpage = self._download_webpage(video_info_url, video_id,
 726                                         note=False,
 727                                         errnote='unable to download video info webpage')
 728                 video_info = compat_parse_qs(video_info_webpage)
 729                 if 'token' in video_info:
 730                     break
 731         if 'token' not in video_info:
 732             if 'reason' in video_info:
 733                 raise ExtractorError(
 734                     'YouTube said: %s' % video_info['reason'][0],
 735                     expected=True, video_id=video_id)
 736             else:
 737                 raise ExtractorError(
 738                     '"token" parameter not in video info for unknown reason',
 739                     video_id=video_id)
 740
 741         if 'view_count' in video_info:
 742             view_count = int(video_info['view_count'][0])
 743         else:
 744             view_count = None
 745
 746         # Check for "rental" videos
 747         if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
 748             raise ExtractorError('"rental" videos not supported')
 749
 750         # Start extracting information
 751         self.report_information_extraction(video_id)
 752
 753         # uploader
 754         if 'author' not in video_info:
 755             raise ExtractorError('Unable to extract uploader name')
 756         video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0])
 757
 758         # uploader_id
 759         video_uploader_id = None
 760         mobj = re.search(r'<link itemprop="url" href="http://www.youtube.com/(?:user|channel)/([^"]+)">', video_webpage)
 761         if mobj is not None:
 762             video_uploader_id = mobj.group(1)
 763         else:
 764             self._downloader.report_warning('unable to extract uploader nickname')
 765
 766         # title
 767         if 'title' in video_info:
 768             video_title = video_info['title'][0]
 769         else:
 770             self._downloader.report_warning('Unable to extract video title')
 771             video_title = '_'
 772
 773         # thumbnail image
 774         # We try first to get a high quality image:
 775         m_thumb = re.search(r'<span itemprop="thumbnail".*?href="(.*?)">',
 776                             video_webpage, re.DOTALL)
 777         if m_thumb is not None:
 778             video_thumbnail = m_thumb.group(1)
 779         elif 'thumbnail_url' not in video_info:
 780             self._downloader.report_warning('unable to extract video thumbnail')
 781             video_thumbnail = None
 782         else:   # don't panic if we can't find it
 783             video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0])
 784
 785         # upload date
 786         upload_date = None
 787         mobj = re.search(r'(?s)id="eow-date.*?>(.*?)</span>', video_webpage)
 788         if mobj is None:
 789             mobj = re.search(
 790                 r'(?s)id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live) on (.*?)</strong>',
 791                 video_webpage)
 792         if mobj is not None:
 793             upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
 794             upload_date = unified_strdate(upload_date)
 795
 796         m_cat_container = self._search_regex(
 797             r'(?s)<h4[^>]*>\s*Category\s*</h4>\s*<ul[^>]*>(.*?)</ul>',
 798             video_webpage, 'categories', fatal=False)
 799         if m_cat_container:
 800             category = self._html_search_regex(
 801                 r'(?s)<a[^<]+>(.*?)</a>', m_cat_container, 'category',
 802                 default=None)
 803             video_categories = None if category is None else [category]
 804         else:
 805             video_categories = None
 806
 807         # description
 808         video_description = get_element_by_id("eow-description", video_webpage)
 809         if video_description:
 810             video_description = re.sub(r'''(?x)
 811                 <a\s+
 812                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 813                     title="([^"]+)"\s+
 814                     (?:[a-zA-Z-]+="[^"]+"\s+)*?
 815                     class="yt-uix-redirect-link"\s*>
 816                 [^<]+
 817                 </a>
 818             ''', r'\1', video_description)
 819             video_description = clean_html(video_description)
 820         else:
 821             fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage)
 822             if fd_mobj:
 823                 video_description = unescapeHTML(fd_mobj.group(1))
 824             else:
 825                 video_description = ''
 826
 827         def _extract_count(count_name):
 828             count = self._search_regex(
 829                 r'id="watch-%s"[^>]*>.*?([\d,]+)\s*</span>' % re.escape(count_name),
 830                 video_webpage, count_name, default=None)
 831             if count is not None:
 832                 return int(count.replace(',', ''))
 833             return None
 834         like_count = _extract_count('like')
 835         dislike_count = _extract_count('dislike')
 836
 837         # subtitles
 838         video_subtitles = self.extract_subtitles(video_id, video_webpage)
 839
 840         if self._downloader.params.get('listsubtitles', False):
 841             self._list_available_subtitles(video_id, video_webpage)
 842             return
 843
 844         if 'length_seconds' not in video_info:
 845             self._downloader.report_warning('unable to extract video duration')
 846             video_duration = None
 847         else:
 848             video_duration = int(compat_urllib_parse.unquote_plus(video_info['length_seconds'][0]))
 849
 850         # annotations
 851         video_annotations = None
 852         if self._downloader.params.get('writeannotations', False):
 853                 video_annotations = self._extract_annotations(video_id)
 854
 855         # Decide which formats to download
 856         try:
 857             mobj = re.search(r';ytplayer\.config\s*=\s*({.*?});', video_webpage)
 858             if not mobj:
 859                 raise ValueError('Could not find vevo ID')
 860             json_code = uppercase_escape(mobj.group(1))
 861             ytplayer_config = json.loads(json_code)
 862             args = ytplayer_config['args']
 863             # Easy way to know if the 's' value is in url_encoded_fmt_stream_map
 864             # this signatures are encrypted
 865             if 'url_encoded_fmt_stream_map' not in args:
 866                 raise ValueError('No stream_map present')  # caught below
 867             re_signature = re.compile(r'[&,]s=')
 868             m_s = re_signature.search(args['url_encoded_fmt_stream_map'])
 869             if m_s is not None:
 870                 self.to_screen('%s: Encrypted signatures detected.' % video_id)
 871                 video_info['url_encoded_fmt_stream_map'] = [args['url_encoded_fmt_stream_map']]
 872             m_s = re_signature.search(args.get('adaptive_fmts', ''))
 873             if m_s is not None:
 874                 if 'adaptive_fmts' in video_info:
 875                     video_info['adaptive_fmts'][0] += ',' + args['adaptive_fmts']
 876                 else:
 877                     video_info['adaptive_fmts'] = [args['adaptive_fmts']]
 878         except ValueError:
 879             pass
 880
 881         def _map_to_format_list(urlmap):
 882             formats = []
 883             for itag, video_real_url in urlmap.items():
 884                 dct = {
 885                     'format_id': itag,
 886                     'url': video_real_url,
 887                     'player_url': player_url,
 888                 }
 889                 if itag in self._formats:
 890                     dct.update(self._formats[itag])
 891                 formats.append(dct)
 892             return formats
 893
 894         if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
 895             self.report_rtmp_download()
 896             formats = [{
 897                 'format_id': '_rtmp',
 898                 'protocol': 'rtmp',
 899                 'url': video_info['conn'][0],
 900                 'player_url': player_url,
 901             }]
 902         elif len(video_info.get('url_encoded_fmt_stream_map', [])) >= 1 or len(video_info.get('adaptive_fmts', [])) >= 1:
 903             encoded_url_map = video_info.get('url_encoded_fmt_stream_map', [''])[0] + ',' + video_info.get('adaptive_fmts',[''])[0]
 904             if 'rtmpe%3Dyes' in encoded_url_map:
 905                 raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
 906             url_map = {}
 907             for url_data_str in encoded_url_map.split(','):
 908                 url_data = compat_parse_qs(url_data_str)
 909                 if 'itag' not in url_data or 'url' not in url_data:
 910                     continue
 911                 format_id = url_data['itag'][0]
 912                 url = url_data['url'][0]
 913
 914                 if 'sig' in url_data:
 915                     url += '&signature=' + url_data['sig'][0]
 916                 elif 's' in url_data:
 917                     encrypted_sig = url_data['s'][0]
 918
 919                     if not age_gate:
 920                         jsplayer_url_json = self._search_regex(
 921                             r'"assets":.+?"js":\s*("[^"]+")',
 922                             video_webpage, 'JS player URL')
 923                         player_url = json.loads(jsplayer_url_json)
 924                     if player_url is None:
 925                         player_url_json = self._search_regex(
 926                             r'ytplayer\.config.*?"url"\s*:\s*("[^"]+")',
 927                             video_webpage, 'age gate player URL')
 928                         player_url = json.loads(player_url_json)
 929
 930                     if self._downloader.params.get('verbose'):
 931                         if player_url is None:
 932                             player_version = 'unknown'
 933                             player_desc = 'unknown'
 934                         else:
 935                             if player_url.endswith('swf'):
 936                                 player_version = self._search_regex(
 937                                     r'-(.+?)(?:/watch_as3)?\.swf$', player_url,
 938                                     'flash player', fatal=False)
 939                                 player_desc = 'flash player %s' % player_version
 940                             else:
 941                                 player_version = self._search_regex(
 942                                     r'html5player-([^/]+?)(?:/html5player)?\.js',
 943                                     player_url,
 944                                     'html5 player', fatal=False)
 945                                 player_desc = 'html5 player %s' % player_version
 946
 947                         parts_sizes = self._signature_cache_id(encrypted_sig)
 948                         self.to_screen('{%s} signature length %s, %s' %
 949                             (format_id, parts_sizes, player_desc))
 950
 951                     signature = self._decrypt_signature(
 952                         encrypted_sig, video_id, player_url, age_gate)
 953                     url += '&signature=' + signature
 954                 if 'ratebypass' not in url:
 955                     url += '&ratebypass=yes'
 956                 url_map[format_id] = url
 957             formats = _map_to_format_list(url_map)
 958         elif video_info.get('hlsvp'):
 959             manifest_url = video_info['hlsvp'][0]
 960             url_map = self._extract_from_m3u8(manifest_url, video_id)
 961             formats = _map_to_format_list(url_map)
 962         else:
 963             raise ExtractorError('no conn, hlsvp or url_encoded_fmt_stream_map information found in video info')
 964
 965         # Look for the DASH manifest
 966         if self._downloader.params.get('youtube_include_dash_manifest', True):
 967             try:
 968                 # The DASH manifest used needs to be the one from the original video_webpage.
 969                 # The one found in get_video_info seems to be using different signatures.
 970                 # However, in the case of an age restriction there won't be any embedded dashmpd in the video_webpage.
 971                 # Luckily, it seems, this case uses some kind of default signature (len == 86), so the
 972                 # combination of get_video_info and the _static_decrypt_signature() decryption fallback will work here.
 973                 if age_gate:
 974                     dash_manifest_url = video_info.get('dashmpd')[0]
 975                 else:
 976                     dash_manifest_url = ytplayer_config['args']['dashmpd']
 977                 def decrypt_sig(mobj):
 978                     s = mobj.group(1)
 979                     dec_s = self._decrypt_signature(s, video_id, player_url, age_gate)
 980                     return '/signature/%s' % dec_s
 981                 dash_manifest_url = re.sub(r'/s/([\w\.]+)', decrypt_sig, dash_manifest_url)
 982                 dash_doc = self._download_xml(
 983                     dash_manifest_url, video_id,
 984                     note='Downloading DASH manifest',
 985                     errnote='Could not download DASH manifest')
 986                 for r in dash_doc.findall('.//{urn:mpeg:DASH:schema:MPD:2011}Representation'):
 987                     url_el = r.find('{urn:mpeg:DASH:schema:MPD:2011}BaseURL')
 988                     if url_el is None:
 989                         continue
 990                     format_id = r.attrib['id']
 991                     video_url = url_el.text
 992                     filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength'))
 993                     f = {
 994                         'format_id': format_id,
 995                         'url': video_url,
 996                         'width': int_or_none(r.attrib.get('width')),
 997                         'tbr': int_or_none(r.attrib.get('bandwidth'), 1000),
 998                         'asr': int_or_none(r.attrib.get('audioSamplingRate')),
 999                         'filesize': filesize,
1000                     }
1001                     try:
1002                         existing_format = next(
1003                             fo for fo in formats
1004                             if fo['format_id'] == format_id)
1005                     except StopIteration:
1006                         f.update(self._formats.get(format_id, {}))
1007                         formats.append(f)
1008                     else:
1009                         existing_format.update(f)
1010
1011             except (ExtractorError, KeyError) as e:
1012                 self.report_warning('Skipping DASH manifest: %r' % e, video_id)
1013
1014         self._sort_formats(formats)
1015
1016         return {
1017             'id':           video_id,
1018             'uploader':     video_uploader,
1019             'uploader_id':  video_uploader_id,
1020             'upload_date':  upload_date,
1021             'title':        video_title,
1022             'thumbnail':    video_thumbnail,
1023             'description':  video_description,
1024             'categories':   video_categories,
1025             'subtitles':    video_subtitles,
1026             'duration':     video_duration,
1027             'age_limit':    18 if age_gate else 0,
1028             'annotations':  video_annotations,
1029             'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
1030             'view_count':   view_count,
1031             'like_count': like_count,
1032             'dislike_count': dislike_count,
1033             'formats':      formats,
1034         }
1035
class YoutubePlaylistIE(YoutubeBaseInfoExtractor):
    """Extractor for YouTube playlists, including mixes ('RD' playlists)."""
    IE_DESC = 'YouTube.com playlists'
    _VALID_URL = r"""(?x)(?:
                        (?:https?://)?
                        (?:\w+\.)?
                        youtube\.com/
                        (?:
                           (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries)
                           \? (?:.*?&)*? (?:p|a|list)=
                        |  p/
                        )
                        (
                            (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,}
                            # Top tracks, they can also include dots
                            |(?:MC)[\w\.]*
                        )
                        .*
                     |
                        ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,})
                     )"""
    _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
    _MORE_PAGES_INDICATOR = r'data-link-type="next"'
    _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)'
    IE_NAME = 'youtube:playlist'
    _TESTS = [{
        'url': 'https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        'info_dict': {
            'title': 'ytdl test PL',
            'id': 'PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re',
        },
        'playlist_count': 3,
    }, {
        'url': 'https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx',
        'info_dict': {
            'title': 'YDL_Empty_List',
        },
        'playlist_count': 0,
    }, {
        'note': 'Playlist with deleted videos (#651). As a bonus, the video #51 is also twice in this list.',
        'url': 'https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC',
        'info_dict': {
            'title': '29C3: Not my department',
        },
        'playlist_count': 95,
    }, {
        'note': 'issue #673',
        'url': 'PLBB231211A4F62143',
        'info_dict': {
            'title': '[OLD]Team Fortress 2 (Class-based LP)',
        },
        'playlist_mincount': 26,
    }, {
        'note': 'Large playlist',
        'url': 'https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q',
        'info_dict': {
            'title': 'Uploads from Cauchemar',
        },
        'playlist_mincount': 799,
    }, {
        'url': 'PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl',
        'info_dict': {
            'title': 'YDL_safe_search',
        },
        'playlist_count': 2,
    }, {
        'note': 'embedded',
        'url': 'http://www.youtube.com/embed/videoseries?list=PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA15',
        }
    }, {
        'note': 'Embedded SWF player',
        'url': 'http://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0',
        'playlist_count': 4,
        'info_dict': {
            'title': 'JODA7',
        }
    }]

    def _real_initialize(self):
        """Attempt the login (see ``_login``) before anything is extracted."""
        self._login()

    def _ids_to_results(self, ids):
        """Return one url_result entry, handled by 'Youtube', per video id."""
        results = []
        for vid_id in ids:
            results.append(self.url_result(vid_id, 'Youtube', video_id=vid_id))
        return results

    def _extract_mix(self, playlist_id):
        """Return the playlist result of the mix identified by ``playlist_id``."""
        # A mix is generated from a single video: the playlist id is just
        # 'RD' + video_id, so the seed video is the last 11 characters.
        seed_video_id = playlist_id[-11:]
        mix_url = 'https://youtube.com/watch?v=%s&list=%s' % (seed_video_id, playlist_id)
        webpage = self._download_webpage(
            mix_url, playlist_id, 'Downloading Youtube mix')

        # The title is taken from the first of these classes that yields a
        # non-empty element.
        title_span = None
        for class_name in ('playlist-title', 'title long-title', 'title'):
            title_span = get_element_by_attribute('class', class_name, webpage)
            if title_span:
                break
        title = clean_html(title_span)

        video_link_re = r'''(?xs)data-video-username=".*?".*?
                       href="/watch\?v=([0-9A-Za-z_-]{11})&amp;[^"]*?list=%s''' % re.escape(playlist_id)
        ids = orderedSet(re.findall(video_link_re, webpage))

        return self.playlist_result(self._ids_to_results(ids), playlist_id, title)

    def _real_extract(self, url):
        """Return the playlist result for ``url``.

        When the URL also carries a 'v' parameter and the 'noplaylist'
        option is set, only a url_result for that video is returned.

        Raises ExtractorError for URLs that do not match, for 'TL' ids, and
        when the page reports that the playlist does not exist or is private.
        """
        # Extract playlist id
        id_match = re.match(self._VALID_URL, url)
        if id_match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        playlist_id = id_match.group(1) or id_match.group(2)

        # Check if it's a video-specific URL
        query = compat_urlparse.urlparse(url).query
        query_dict = compat_urlparse.parse_qs(query)
        if 'v' in query_dict:
            video_id = query_dict['v'][0]
            if self._downloader.params.get('noplaylist'):
                self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
                return self.url_result(video_id, 'Youtube', video_id=video_id)
            self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))

        if playlist_id.startswith('RD'):
            # Mixes require a custom extraction process
            return self._extract_mix(playlist_id)
        if playlist_id.startswith('TL'):
            raise ExtractorError(
                'For downloading YouTube.com top lists, use '
                'the "yttoplist" keyword, for example "youtube-dl \'yttoplist:music:Top Tracks\'"',
                expected=True)

        page = self._download_webpage(self._TEMPLATE_URL % playlist_id, playlist_id)

        # Check if the playlist exists or is private
        unavailable = re.search(
            r'<div class="yt-alert-message">[^<]*?(The|This) playlist (does not exist|is private)[^<]*?</div>',
            page)
        if unavailable is not None:
            raise ExtractorError(
                'The playlist doesn\'t exist or is private, use --username or '
                '--netrc to access it.',
                expected=True)

        # Extract the video ids from the playlist pages. The first page
        # serves both as video listing and as "load more" widget; the
        # following ones come as two separate JSON fields.
        ids = []
        content_html = page
        more_widget_html = page
        page_num = 1
        while True:
            # We remove the duplicates and the link with index 0
            # (it's not the first video of the playlist)
            ids.extend(orderedSet(
                link.group('id')
                for link in re.finditer(self._VIDEO_RE, content_html)
                if link.group('index') != '0'))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if not more_link:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), playlist_id,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        playlist_title = self._html_search_regex(
            r'(?s)<h1 class="pl-header-title[^"]*">\s*(.*?)\s*</h1>',
            page, 'title')

        return self.playlist_result(
            self._ids_to_results(ids), playlist_id, playlist_title)
1206
1207
class YoutubeTopListIE(YoutubePlaylistIE):
    """Extractor for the 'yttoplist:{channel}:{list title}' pseudo URLs."""
    IE_NAME = 'youtube:toplist'
    IE_DESC = ('YouTube.com top lists, "yttoplist:{channel}:{list title}"'
        ' (Example: "yttoplist:music:Top Tracks")')
    _VALID_URL = r'yttoplist:(?P<chann>.*?):(?P<title>.*?)$'
    _TESTS = [{
        'url': 'yttoplist:music:Trending',
        'playlist_mincount': 5,
        'skip': 'Only works for logged-in users',
    }]
    # Upper bound on how often the list page is downloaded while waiting for
    # it to contain videos; keeps the retry loop from running forever.
    _MAX_LIST_DOWNLOAD_ATTEMPTS = 10

    def _real_extract(self, url):
        """Return the playlist result of the top list named in ``url``.

        The channel page is searched for the link whose module title
        contains the url-encoded list title; the linked page is then
        downloaded until video ids are found in it.

        Raises ExtractorError when ``url`` does not match ``_VALID_URL`` or
        when no video id is found after ``_MAX_LIST_DOWNLOAD_ATTEMPTS``
        downloads.
        """
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            # Same guard as in the other extractors of this file
            raise ExtractorError('Invalid URL: %s' % url)
        channel = mobj.group('chann')
        title = mobj.group('title')
        query = compat_urllib_parse.urlencode({'title': title})
        channel_page = self._download_webpage(
            'https://www.youtube.com/%s' % channel, title)
        link = self._html_search_regex(
            r'''(?x)
                <a\s+href="([^"]+)".*?>\s*
                <span\s+class="branded-page-module-title-text">\s*
                <span[^>]*>.*?%s.*?</span>''' % re.escape(query),
            channel_page, 'list')
        url = compat_urlparse.urljoin('https://www.youtube.com/', link)

        video_re = r'data-index="\d+".*?data-video-id="([0-9A-Za-z_-]{11})"'
        ids = []
        # sometimes the webpage doesn't contain the videos
        # retry until we get them, but give up after a bounded number of
        # attempts instead of requesting the page forever
        for i in range(self._MAX_LIST_DOWNLOAD_ATTEMPTS):
            msg = 'Downloading Youtube mix'
            if i > 0:
                msg += ', retry #%d' % i

            webpage = self._download_webpage(url, title, msg)
            ids = orderedSet(re.findall(video_re, webpage))
            if ids:
                break
        else:
            raise ExtractorError(
                'Unable to find any video in the top list after %d attempts'
                % self._MAX_LIST_DOWNLOAD_ATTEMPTS)
        url_results = self._ids_to_results(ids)
        return self.playlist_result(url_results, playlist_title=title)
1249
1250
class YoutubeChannelIE(InfoExtractor):
    """Extractor returning the videos of a YouTube channel as a playlist."""
    IE_DESC = 'YouTube.com channels'
    IE_NAME = 'youtube:channel'
    _VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
    _MORE_PAGES_INDICATOR = 'yt-uix-load-more'
    _MORE_PAGES_URL = 'https://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
    _TESTS = [{
        'note': 'paginated channel',
        'url': 'https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w',
        'playlist_mincount': 91,
    }]

    def extract_videos_from_page(self, page):
        """Return the distinct video ids linked from *page*.

        Ids are listed in order of first appearance; repeats are dropped.
        """
        unique_ids = []
        for video_id in re.findall(r'href="/watch\?v=([0-9A-Za-z_-]+)&?', page):
            if video_id in unique_ids:
                continue
            unique_ids.append(video_id)
        return unique_ids

    def _real_extract(self, url):
        """Collect every video id of the channel in *url* into a playlist.

        Raises ExtractorError when *url* does not match ``_VALID_URL``.
        """
        match = re.match(self._VALID_URL, url)
        if match is None:
            raise ExtractorError('Invalid URL: %s' % url)
        channel_id = match.group(1)

        # Download the channel's "videos" page
        videos_url = 'https://www.youtube.com/channel/%s/videos' % channel_id
        channel_page = self._download_webpage(videos_url, channel_id)
        autogenerated = re.search(r'''(?x)
                class="[^"]*?(?:
                    channel-header-autogenerated-label|
                    yt-channel-title-autogenerated
                )[^"]*"''', channel_page) is not None

        if autogenerated:
            # Everything is on this single page; the ajax pages are empty
            # for such channels and can't be used.
            video_ids = self.extract_videos_from_page(channel_page)
        else:
            # Walk the json-based channel_ajax pages until the "load more"
            # indicator is no longer present.
            video_ids = []
            pagenum = 0
            while True:
                pagenum += 1
                page = self._download_json(
                    self._MORE_PAGES_URL % (pagenum, channel_id), channel_id,
                    note='Downloading page #%s' % pagenum,
                    transform_source=uppercase_escape)

                video_ids.extend(
                    self.extract_videos_from_page(page['content_html']))

                if self._MORE_PAGES_INDICATOR not in page['load_more_widget_html']:
                    break

        self._downloader.to_screen('[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids)))

        url_entries = []
        for video_id in video_ids:
            url_entries.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(url_entries, channel_id)
1310
1311
class YoutubeUserIE(InfoExtractor):
    """Extractor returning the uploads of a YouTube user as a playlist.

    Accepts user URLs and the "ytuser:" keyword (see ``_VALID_URL``). The
    video ids are read from the GData uploads feed, ``_GDATA_PAGE_SIZE``
    entries per request.
    """
    IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)'
    _VALID_URL = r'(?:(?:(?:https?://)?(?:\w+\.)?youtube\.com/(?:user/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)([A-Za-z0-9_-]+)'
    _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/users/%s'
    _GDATA_PAGE_SIZE = 50
    _GDATA_URL = 'https://gdata.youtube.com/feeds/api/users/%s/uploads?max-results=%d&start-index=%d&alt=json'
    IE_NAME = 'youtube:user'

    _TESTS = [{
        'url': 'https://www.youtube.com/user/TheLinuxFoundation',
        'playlist_mincount': 320,
        'info_dict': {
            'title': 'TheLinuxFoundation',
        }
    }, {
        'url': 'ytuser:phihag',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        """Return True only if no other extractor in this module accepts *url*."""
        # Don't return True if the url can be extracted with another youtube
        # extractor: this regex is too permissive and would match it as well.
        other_ies = (
            klass for (name, klass) in globals().items()
            if name.endswith('IE') and klass is not cls)
        if any(ie.suitable(url) for ie in other_ies):
            return False
        return super(YoutubeUserIE, cls).suitable(url)

    def _real_extract(self, url):
        """Return a playlist of the uploads of the user named in *url*.

        Raises ExtractorError when *url* does not match ``_VALID_URL`` or
        when an API response is not valid JSON.
        """
        # Extract username
        mobj = re.match(self._VALID_URL, url)
        if mobj is None:
            raise ExtractorError('Invalid URL: %s' % url)

        username = mobj.group(1)

        # Download video ids using YouTube Data API. Result size per
        # query is limited (currently to 50 videos) so we need to query
        # page by page until there are no video ids - it means we got
        # all of them.

        def download_page(pagenum):
            """Yield one url-type entry per video of the 0-based page *pagenum*."""
            # The API's start-index is 1-based.
            start_index = pagenum * self._GDATA_PAGE_SIZE + 1
            # Last index covered by this request (inclusive).
            end_index = start_index + self._GDATA_PAGE_SIZE - 1

            gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
            page = self._download_webpage(
                gdata_url, username,
                'Downloading video ids from %d to %d' % (
                    start_index, end_index))

            try:
                response = json.loads(page)
            except ValueError as err:
                raise ExtractorError('Invalid JSON in API response: ' + compat_str(err))
            # A feed without 'entry' yields nothing for this page.
            if 'entry' not in response['feed']:
                return

            # Extract video identifiers
            for entry in response['feed']['entry']:
                # The video id is the last path component of the entry id.
                video_id = entry['id']['$t'].split('/')[-1]
                yield {
                    '_type': 'url',
                    'url': video_id,
                    'ie_key': 'Youtube',
                    'id': video_id,
                    'title': entry['title']['$t'],
                }
        url_results = OnDemandPagedList(download_page, self._GDATA_PAGE_SIZE)

        return self.playlist_result(url_results, playlist_title=username)
1383
1384
class YoutubeSearchIE(SearchInfoExtractor):
    """Search extractor backed by the GData videos feed (``_API_URL``)."""
    IE_DESC = 'YouTube.com searches'
    IE_NAME = 'youtube:search'
    _SEARCH_KEY = 'ytsearch'
    _MAX_RESULTS = 1000
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query

        Pages of 50 results are requested until *n* ids were requested or
        the total reported by the API is reached. Raises ExtractorError
        when a response carries no 'items'.
        """
        page_size = 50
        collected_ids = []
        wanted = n
        page_index = 0

        while page_size * page_index < wanted:
            # start-index is 1-based
            first_index = page_size * page_index + 1
            quoted_query = compat_urllib_parse.quote_plus(query.encode('utf-8'))
            data_json = self._download_webpage(
                self._API_URL % (quoted_query, first_index),
                video_id='query "%s"' % query,
                note='Downloading page %s' % (page_index + 1),
                errnote='Unable to download API page')
            api_response = json.loads(data_json)['data']

            if 'items' not in api_response:
                raise ExtractorError(
                    '[youtube] No video results', expected=True)

            collected_ids.extend(
                [video['id'] for video in api_response['items']])

            # Never ask for more than the API says exists.
            wanted = min(n, api_response['totalItems'])
            page_index += 1

        # The last page may have pushed us past n.
        collected_ids = collected_ids[:n]
        videos = []
        for video_id in collected_ids:
            videos.append(
                self.url_result(video_id, 'Youtube', video_id=video_id))
        return self.playlist_result(videos, query)
1426
1427
class YoutubeSearchDateIE(YoutubeSearchIE):
    # Same search as the parent class; only the API URL differs, by
    # appending "orderby=published".
    IE_DESC = 'YouTube.com searches, newest videos first'
    IE_NAME = YoutubeSearchIE.IE_NAME + ':date'
    _SEARCH_KEY = 'ytsearchdate'
    _API_URL = 'https://gdata.youtube.com/feeds/api/videos?q=%s&start-index=%i&max-results=50&v=2&alt=jsonc&orderby=published'
1433
1434
class YoutubeSearchURLIE(InfoExtractor):
    """Extractor turning a YouTube results page URL into a playlist."""
    IE_NAME = 'youtube:search_url'
    IE_DESC = 'YouTube.com search URLs'
    _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?search_query=(?P<query>[^&]+)(?:[&]|$)'
    _TESTS = [{
        'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video',
        'playlist_mincount': 5,
        'info_dict': {
            'title': 'youtube-dl test video',
        }
    }]

    def _real_extract(self, url):
        """Scrape the result list of the search page at *url*.

        Returns a playlist dict titled with the decoded search query; each
        entry is a url-type result with the item's title and absolute URL.
        """
        raw_query = re.match(self._VALID_URL, url).group('query')
        query = compat_urllib_parse.unquote_plus(raw_query)

        webpage = self._download_webpage(url, query)
        result_code = self._search_regex(
            r'(?s)<ol class="item-section"(.*?)</ol>', webpage, 'result HTML')

        def build_entry(part_code):
            # The title is optional (fatal=False); the link is not.
            item_title = self._html_search_regex(
                [r'(?s)title="([^"]+)"', r'>([^<]+)</a>'], part_code, 'item title', fatal=False)
            href = self._html_search_regex(
                r'(?s)href="([^"]+)"', part_code, 'item URL')
            return {
                '_type': 'url',
                'url': compat_urlparse.urljoin('https://www.youtube.com/', href),
                'title': item_title,
            }

        # One <h3 class="yt-lockup-title"> element per search result.
        entries = [
            build_entry(part_code)
            for part_code in re.findall(
                r'(?s)<h3 class="yt-lockup-title">(.*?)</h3>', result_code)
        ]

        return {
            '_type': 'playlist',
            'entries': entries,
            'title': query,
        }
1476
1477
class YoutubeShowIE(InfoExtractor):
    """Extractor returning the season playlists linked from a show page."""
    IE_NAME = 'youtube:show'
    IE_DESC = 'YouTube.com (multi-season) shows'
    _VALID_URL = r'https?://www\.youtube\.com/show/(?P<id>[^?#]*)'
    _TESTS = [{
        'url': 'http://www.youtube.com/show/airdisasters',
        'playlist_mincount': 3,
        'info_dict': {
            'id': 'airdisasters',
            'title': 'Air Disasters',
        }
    }]

    def _real_extract(self, url):
        """Return a playlist with one 'YoutubePlaylist' entry per season link."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(
            url, playlist_id, 'Downloading show webpage')

        # There's one playlist for each season of the show
        season_paths = re.findall(r'href="(/playlist\?list=.*?)"', webpage)
        self.to_screen('%s: Found %s seasons' % (playlist_id, len(season_paths)))

        entries = []
        for season_path in season_paths:
            entries.append(self.url_result(
                'https://www.youtube.com' + season_path, 'YoutubePlaylist'))

        # The title is looked up with fatal=False, i.e. it is optional.
        show_title = self._og_search_title(webpage, fatal=False)

        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': show_title,
            'entries': entries,
        }
1512
1513
class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
    """
    Base class for extractors that fetch info from
    http://www.youtube.com/feed_ajax
    Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
    """
    _LOGIN_REQUIRED = True
    # use action_load_personal_feed instead of action_load_system_feed
    _PERSONAL_FEED = False

    @property
    def _FEED_TEMPLATE(self):
        """URL template of the feed; its single %s takes the paging value."""
        if self._PERSONAL_FEED:
            action = 'action_load_personal_feed'
        else:
            action = 'action_load_system_feed'
        return 'https://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)

    @property
    def IE_NAME(self):
        """Extractor name derived from the feed name."""
        return 'youtube:%s' % self._FEED_NAME

    def _real_initialize(self):
        self._login()

    def _real_extract(self, url):
        """Walk the feed pages and return every linked video as a playlist."""
        feed_entries = []
        paging = 0
        page_num = 0
        while True:
            page_num += 1
            info = self._download_json(
                self._FEED_TEMPLATE % paging,
                '%s feed' % self._FEED_NAME,
                'Downloading page %s' % page_num)
            feed_html = info.get('feed_html') or info.get('content_html')
            # Without a separate widget, look for the "load more" link in
            # the feed html itself.
            load_more_widget_html = info.get('load_more_widget_html') or feed_html

            for video_id in orderedSet(
                    re.findall(r'"/watch\?v=(.*?)["&]', feed_html)):
                feed_entries.append(
                    self.url_result(video_id, 'Youtube', video_id=video_id))

            next_page = re.search(
                r'data-uix-load-more-href="/?[^"]+paging=(?P<paging>\d+)',
                load_more_widget_html)
            if next_page is None:
                # No "load more" link: this was the last page.
                break
            paging = next_page.group('paging')
        return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
1559
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "recommended" feed.
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = 'Youtube Recommended videos'
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = 'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
1565
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "watch_later" feed, loaded as a personal feed.
    _FEED_NAME = 'watch_later'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch Later'
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = 'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
1572
class YoutubeHistoryIE(YoutubeFeedsInfoExtractor):
    # Feed extractor for the "history" feed, loaded as a personal feed.
    IE_DESC = 'Youtube watch history, "ythistory" keyword (requires authentication)'
    # Raw string: "\." is a regex escape, not a string escape sequence.
    _VALID_URL = r'https?://www\.youtube\.com/feed/history|:ythistory'
    _FEED_NAME = 'history'
    _PERSONAL_FEED = True
    _PLAYLIST_TITLE = 'Youtube Watch History'
1579
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Extractor redirecting to the playlist behind the "my_favorites" page."""
    IE_NAME = 'youtube:favorites'
    IE_DESC = 'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:ou?rites)?'
    _LOGIN_REQUIRED = True

    def _real_extract(self, url):
        """Find the favourites playlist id and hand it to 'YoutubePlaylist'."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        # The first "list=" parameter on the page is taken as the playlist id.
        favourites_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, 'favourites playlist id')
        return self.url_result(favourites_id, 'YoutubePlaylist')
1590
1591
class YoutubeSubscriptionsIE(YoutubePlaylistIE):
    """Extractor returning the subscriptions feed as a playlist."""
    IE_NAME = 'youtube:subscriptions'
    IE_DESC = 'YouTube.com subscriptions feed, "ytsubs" keyword (requires authentication)'
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    _TESTS = []

    def _real_extract(self, url):
        """Collect the video ids of the subscriptions feed, page by page."""
        title = 'Youtube Subscriptions'
        first_page = self._download_webpage(
            'https://www.youtube.com/feed/subscriptions', title)

        # The extraction process is the same as for playlists, but the regex
        # for the video ids doesn't contain an index
        ids = []
        # On the first page both the content and the "load more" widget are
        # searched for in the full webpage.
        content_html = first_page
        more_widget_html = first_page

        page_num = 1
        while True:
            ids.extend(orderedSet(re.findall(
                r'href="\s*/watch\?v=([0-9A-Za-z_-]{11})', content_html)))

            more_link = re.search(
                r'data-uix-load-more-href="/?(?P<more>[^"]+)"', more_widget_html)
            if more_link is None:
                break

            more = self._download_json(
                'https://youtube.com/%s' % more_link.group('more'), title,
                'Downloading page #%s' % page_num,
                transform_source=uppercase_escape)
            content_html = more['content_html']
            more_widget_html = more['load_more_widget_html']
            page_num += 1

        entries = self._ids_to_results(ids)
        return {
            '_type': 'playlist',
            'title': title,
            'entries': entries,
        }
1628
1629
class YoutubeTruncatedURLIE(InfoExtractor):
    """Pseudo-extractor matching watch/attribution URLs without a video id.

    It never extracts anything: matching URLs only trigger an error that
    suggests quoting the URL on the command line.
    """
    IE_NAME = 'youtube:truncated_url'
    IE_DESC = False  # Do not list
    _VALID_URL = r'''(?x)
        (?:https?://)?[^/]+/watch\?(?:
            feature=[a-z_]+|
            annotation_id=annotation_[^&]+
        )?$|
        (?:https?://)?(?:www\.)?youtube\.com/attribution_link\?a=[^&]+$
    '''

    _TESTS = [{
        'url': 'http://www.youtube.com/watch?annotation_id=annotation_3951667041',
        'only_matching': True,
    }, {
        'url': 'http://www.youtube.com/watch?',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Always raise an expected ExtractorError with a quoting hint."""
        hint = (
            'Did you forget to quote the URL? '
            'Remember that & is a meta character in most shells, '
            'so you want to put the URL in quotes, like  youtube-dl '
            '"http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc" '
            ' or simply  youtube-dl BaW_jenozKc  .')
        raise ExtractorError(hint, expected=True)