Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 from __future__ import unicode_literals
   2
   3 import base64
   4 import datetime
   5 import hashlib
   6 import json
   7 import netrc
   8 import os
   9 import random
  10 import re
  11 import socket
  12 import sys
  13 import time
  14 import math
  15
  16 from ..compat import (
  17     compat_cookiejar,
  18     compat_cookies,
  19     compat_etree_fromstring,
  20     compat_getpass,
  21     compat_http_client,
  22     compat_os_name,
  23     compat_str,
  24     compat_urllib_error,
  25     compat_urllib_parse_unquote,
  26     compat_urllib_parse_urlencode,
  27     compat_urllib_request,
  28     compat_urlparse,
  29 )
  30 from ..downloader.f4m import remove_encrypted_media
  31 from ..utils import (
  32     NO_DEFAULT,
  33     age_restricted,
  34     base_url,
  35     bug_reports_message,
  36     clean_html,
  37     compiled_regex_type,
  38     determine_ext,
  39     error_to_compat_str,
  40     ExtractorError,
  41     fix_xml_ampersands,
  42     float_or_none,
  43     GeoRestrictedError,
  44     GeoUtils,
  45     int_or_none,
  46     js_to_json,
  47     parse_iso8601,
  48     RegexNotFoundError,
  49     sanitize_filename,
  50     sanitized_Request,
  51     unescapeHTML,
  52     unified_strdate,
  53     unified_timestamp,
  54     url_basename,
  55     xpath_element,
  56     xpath_text,
  57     xpath_with_ns,
  58     determine_protocol,
  59     parse_duration,
  60     mimetype2ext,
  61     update_Request,
  62     update_url_query,
  63     parse_m3u8_attributes,
  64     extract_attributes,
  65     parse_codecs,
  66     urljoin,
  67 )
  68
  69
  70 class InfoExtractor(object):
  71     """Information Extractor class.
  72
  73     Information extractors are the classes that, given a URL, extract
  74     information about the video (or videos) the URL refers to. This
  75     information includes the real video URL, the video title, author and
  76     others. The information is stored in a dictionary which is then
  77     passed to the YoutubeDL. The YoutubeDL processes this
  78     information possibly downloading the video to the file system, among
  79     other possible outcomes.
  80
  81     The type field determines the type of the result.
  82     By far the most common value (and the default if _type is missing) is
  83     "video", which indicates a single video.
  84
  85     For a video, the dictionaries must include the following fields:
  86
  87     id:             Video identifier.
  88     title:          Video title, unescaped.
  89
  90     Additionally, it must contain either a formats entry or a url one:
  91
  92     formats:        A list of dictionaries for each format available, ordered
  93                     from worst to best quality.
  94
  95                     Potential fields:
  96                     * url        Mandatory. The URL of the video file
  97                     * manifest_url
  98                                  The URL of the manifest file in case of
  99                                  fragmented media (DASH, hls, hds)
 100                     * ext        Will be calculated from URL if missing
 101                     * format     A human-readable description of the format
 102                                  ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
 104                                  and format_note fields if missing.
 105                     * format_id  A short description of the format
 106                                  ("mp4_h264_opus" or "19").
 107                                 Technically optional, but strongly recommended.
 108                     * format_note Additional info about the format
 109                                  ("3D" or "DASH video")
 110                     * width      Width of the video, if known
 111                     * height     Height of the video, if known
 112                     * resolution Textual description of width and height
 113                     * tbr        Average bitrate of audio and video in KBit/s
 114                     * abr        Average audio bitrate in KBit/s
 115                     * acodec     Name of the audio codec in use
 116                     * asr        Audio sampling rate in Hertz
 117                     * vbr        Average video bitrate in KBit/s
 118                     * fps        Frame rate
 119                     * vcodec     Name of the video codec in use
 120                     * container  Name of the container format
 121                     * filesize   The number of bytes, if known in advance
 122                     * filesize_approx  An estimate for the number of bytes
 123                     * player_url SWF Player URL (used for rtmpdump).
 124                     * protocol   The protocol that will be used for the actual
 125                                  download, lower-case.
 126                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 127                                  "m3u8", "m3u8_native" or "http_dash_segments".
 128                     * fragment_base_url
 129                                  Base URL for fragments. Each fragment's path
 130                                  value (if present) will be relative to
 131                                  this URL.
 132                     * fragments  A list of fragments of a fragmented media.
 133                                  Each fragment entry must contain either an url
 134                                  or a path. If an url is present it should be
 135                                  considered by a client. Otherwise both path and
 136                                  fragment_base_url must be present. Here is
 137                                  the list of all potential fields:
 138                                  * "url" - fragment's URL
 139                                  * "path" - fragment's path relative to
 140                                             fragment_base_url
 141                                  * "duration" (optional, int or float)
 142                                  * "filesize" (optional, int)
 143                     * preference Order number of this format. If this field is
 144                                  present and not None, the formats get sorted
 145                                  by this field, regardless of all other values.
 146                                  -1 for default (order by other properties),
 147                                  -2 or smaller for less than default.
 148                                  < -1000 to hide the format (if there is
 149                                     another one which is strictly better)
 150                     * language   Language code, e.g. "de" or "en-US".
 151                     * language_preference  Is this in the language mentioned in
 152                                  the URL?
 153                                  10 if it's what the URL is about,
 154                                  -1 for default (don't know),
 155                                  -10 otherwise, other values reserved for now.
 156                     * quality    Order number of the video quality of this
 157                                  format, irrespective of the file format.
 158                                  -1 for default (order by other properties),
 159                                  -2 or smaller for less than default.
 160                     * source_preference  Order number for this video source
 161                                   (quality takes higher priority)
 162                                  -1 for default (order by other properties),
 163                                  -2 or smaller for less than default.
 164                     * http_headers  A dictionary of additional HTTP headers
 165                                  to add to the request.
 166                     * stretched_ratio  If given and not 1, indicates that the
 167                                  video's pixels are not square.
 168                                  width : height ratio as float.
 169                     * no_resume  The server does not support resuming the
 170                                  (HTTP or RTMP) download. Boolean.
 171
 172     url:            Final video URL.
 173     ext:            Video filename extension.
 174     format:         The video format, defaults to ext (used for --get-format)
 175     player_url:     SWF Player URL (used for rtmpdump).
 176
 177     The following fields are optional:
 178
 179     alt_title:      A secondary title of the video.
    display_id:     An alternative identifier for the video, not necessarily
 181                     unique, but available before title. Typically, id is
 182                     something like "4234987", title "Dancing naked mole rats",
 183                     and display_id "dancing-naked-mole-rats"
 184     thumbnails:     A list of dictionaries, with the following entries:
 185                         * "id" (optional, string) - Thumbnail format ID
 186                         * "url"
 187                         * "preference" (optional, int) - quality of the image
 188                         * "width" (optional, int)
 189                         * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
 191                                         deprecated)
 192                         * "filesize" (optional, int)
 193     thumbnail:      Full URL to a video thumbnail image.
 194     description:    Full video description.
 195     uploader:       Full name of the video uploader.
 196     license:        License name the video is licensed under.
 197     creator:        The creator of the video.
 198     release_date:   The date (YYYYMMDD) when the video was released.
 199     timestamp:      UNIX timestamp of the moment the video became available.
 200     upload_date:    Video upload date (YYYYMMDD).
 201                     If not explicitly set, calculated from timestamp.
 202     uploader_id:    Nickname or id of the video uploader.
 203     uploader_url:   Full URL to a personal webpage of the video uploader.
 204     location:       Physical location where the video was filmed.
 205     subtitles:      The available subtitles as a dictionary in the format
 206                     {tag: subformats}. "tag" is usually a language code, and
 207                     "subformats" is a list sorted from lower to higher
 208                     preference, each element is a dictionary with the "ext"
 209                     entry and one of:
 210                         * "data": The subtitles file contents
 211                         * "url": A URL pointing to the subtitles file
 212                     "ext" will be calculated from URL if missing
 213     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 214                     automatically generated captions
 215     duration:       Length of the video in seconds, as an integer or float.
 216     view_count:     How many users have watched the video on the platform.
 217     like_count:     Number of positive ratings of the video
 218     dislike_count:  Number of negative ratings of the video
 219     repost_count:   Number of reposts of the video
    average_rating: Average rating given by users, the scale used depends on the webpage
 221     comment_count:  Number of comments on the video
 222     comments:       A list of comments, each with one or more of the following
 223                     properties (all but one of text or html optional):
 224                         * "author" - human-readable name of the comment author
 225                         * "author_id" - user ID of the comment author
 226                         * "id" - Comment ID
 227                         * "html" - Comment as HTML
 228                         * "text" - Plain text of the comment
 229                         * "timestamp" - UNIX timestamp of comment
 230                         * "parent" - ID of the comment this one is replying to.
 231                                      Set to "root" to indicate that this is a
 232                                      comment to the original video.
 233     age_limit:      Age restriction for the video, as an integer (years)
 234     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 235                     should allow to get the same result again. (It will be set
 236                     by YoutubeDL if it's missing)
 237     categories:     A list of categories that the video falls in, for example
 238                     ["Sports", "Berlin"]
 239     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 240     is_live:        True, False, or None (=unknown). Whether this video is a
 241                     live stream that goes on instead of a fixed-length video.
 242     start_time:     Time in seconds where the reproduction should start, as
 243                     specified in the URL.
 244     end_time:       Time in seconds where the reproduction should end, as
 245                     specified in the URL.
 246
 247     The following fields should only be used when the video belongs to some logical
 248     chapter or section:
 249
 250     chapter:        Name or title of the chapter the video belongs to.
 251     chapter_number: Number of the chapter the video belongs to, as an integer.
 252     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 253
 254     The following fields should only be used when the video is an episode of some
 255     series, programme or podcast:
 256
 257     series:         Title of the series or programme the video episode belongs to.
 258     season:         Title of the season the video episode belongs to.
 259     season_number:  Number of the season the video episode belongs to, as an integer.
 260     season_id:      Id of the season the video episode belongs to, as a unicode string.
 261     episode:        Title of the video episode. Unlike mandatory video title field,
 262                     this field should denote the exact title of the video episode
 263                     without any kind of decoration.
 264     episode_number: Number of the video episode within a season, as an integer.
 265     episode_id:     Id of the video episode, as a unicode string.
 266
 267     The following fields should only be used when the media is a track or a part of
 268     a music album:
 269
 270     track:          Title of the track.
 271     track_number:   Number of the track within an album or a disc, as an integer.
 272     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 273                     as a unicode string.
 274     artist:         Artist(s) of the track.
 275     genre:          Genre(s) of the track.
 276     album:          Title of the album the track belongs to.
 277     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 278     album_artist:   List of all artists appeared on the album (e.g.
 279                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 280                     and compilations).
 281     disc_number:    Number of the disc or other physical medium the track belongs to,
 282                     as an integer.
 283     release_year:   Year (YYYY) when the album was released.
 284
 285     Unless mentioned otherwise, the fields should be Unicode strings.
 286
 287     Unless mentioned otherwise, None is equivalent to absence of information.
 288
 289
 290     _type "playlist" indicates multiple videos.
 291     There must be a key "entries", which is a list, an iterable, or a PagedList
 292     object, each element of which is a valid dictionary by this specification.
 293
 294     Additionally, playlists can have "title", "description" and "id" attributes
 295     with the same semantics as videos (see above).
 296
 297
 298     _type "multi_video" indicates that there are multiple videos that
    form a single show, for example multiple acts of an opera or TV episode.
 300     It must have an entries key like a playlist and contain all the keys
 301     required for a video at the same time.
 302
 303
 304     _type "url" indicates that the video must be extracted from another
 305     location, possibly by a different extractor. Its only required key is:
 306     "url" - the next URL to extract.
 307     The key "ie_key" can be set to the class name (minus the trailing "IE",
 308     e.g. "Youtube") if the extractor class is known in advance.
 309     Additionally, the dictionary may have any properties of the resolved entity
 310     known in advance, for example "title" if the title of the referred video is
 311     known ahead of time.
 312
 313
 314     _type "url_transparent" entities have the same specification as "url", but
 315     indicate that the given additional information is more precise than the one
 316     associated with the resolved URL.
 317     This is useful when a site employs a video service that hosts the video and
 318     its technical metadata, but that video service does not embed a useful
 319     title, description etc.
 320
 321
 322     Subclasses of this one should re-define the _real_initialize() and
 323     _real_extract() methods and define a _VALID_URL regexp.
 324     Probably, they should also be added to the list of extractors.
 325
 326     _GEO_BYPASS attribute may be set to False in order to disable
 327     geo restriction bypass mechanisms for a particular extractor.
 328     Though it won't disable explicit geo restriction bypass based on
 329     country code provided with geo_bypass_country. (experimental)
 330
 331     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 332     countries for this extractor. One of these countries will be used by
 333     geo restriction bypass mechanism right away in order to bypass
 334     geo restriction, of course, if the mechanism is not disabled. (experimental)
 335
 336     NB: both these geo attributes are experimental and may change in future
 337     or be completely removed.
 338
 339     Finally, the _WORKING attribute should be set to False for broken IEs
 340     in order to warn the users and skip the tests.
 341     """
 342
    # True once _real_initialize() has been run for the instance
    # (set by initialize())
    _ready = False
    # Downloader this IE is attached to; assigned through set_downloader()
    _downloader = None
    # Fake IP sent as X-Forwarded-For HTTP header by the geo bypass
    # mechanism; None while no fake IP is in use
    _x_forwarded_for_ip = None
    # Set to False to disable geo restriction bypass for an extractor
    _GEO_BYPASS = True
    # List of presumably geo unrestricted countries for an extractor
    _GEO_COUNTRIES = None
    # Set to False for broken IEs in order to warn the users and skip the tests
    _WORKING = True
 349
 350     def __init__(self, downloader=None):
 351         """Constructor. Receives an optional downloader."""
 352         self._ready = False
 353         self._x_forwarded_for_ip = None
 354         self.set_downloader(downloader)
 355
 356     @classmethod
 357     def suitable(cls, url):
 358         """Receives a URL and returns True if suitable for this IE."""
 359
 360         # This does not use has/getattr intentionally - we want to know whether
 361         # we have cached the regexp for *this* class, whereas getattr would also
 362         # match the superclass
 363         if '_VALID_URL_RE' not in cls.__dict__:
 364             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 365         return cls._VALID_URL_RE.match(url) is not None
 366
 367     @classmethod
 368     def _match_id(cls, url):
 369         if '_VALID_URL_RE' not in cls.__dict__:
 370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 371         m = cls._VALID_URL_RE.match(url)
 372         assert m
 373         return m.group('id')
 374
 375     @classmethod
 376     def working(cls):
 377         """Getter method for _WORKING."""
 378         return cls._WORKING
 379
 380     def initialize(self):
 381         """Initializes an instance (authentication, etc)."""
 382         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 383         if not self._ready:
 384             self._real_initialize()
 385             self._ready = True
 386
 387     def _initialize_geo_bypass(self, countries):
 388         """
 389         Initialize geo restriction bypass mechanism.
 390
 391         This method is used to initialize geo bypass mechanism based on faking
 392         X-Forwarded-For HTTP header. A random country from provided country list
 393         is selected and a random IP belonging to this country is generated. This
 394         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 395         HTTP requests.
 396
 397         This method will be used for initial geo bypass mechanism initialization
 398         during the instance initialization with _GEO_COUNTRIES.
 399
 400         You may also manually call it from extractor's code if geo countries
 401         information is not available beforehand (e.g. obtained during
 402         extraction) or due to some another reason.
 403         """
 404         if not self._x_forwarded_for_ip:
 405             country_code = self._downloader.params.get('geo_bypass_country', None)
 406             # If there is no explicit country for geo bypass specified and
 407             # the extractor is known to be geo restricted let's fake IP
 408             # as X-Forwarded-For right away.
 409             if (not country_code and
 410                     self._GEO_BYPASS and
 411                     self._downloader.params.get('geo_bypass', True) and
 412                     countries):
 413                 country_code = random.choice(countries)
 414             if country_code:
 415                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 416                 if self._downloader.params.get('verbose', False):
 417                     self._downloader.to_stdout(
 418                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 419                         % (self._x_forwarded_for_ip, country_code.upper()))
 420
 421     def extract(self, url):
 422         """Extracts URL information and returns it in list of dicts."""
 423         try:
 424             for _ in range(2):
 425                 try:
 426                     self.initialize()
 427                     ie_result = self._real_extract(url)
 428                     if self._x_forwarded_for_ip:
 429                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 430                     return ie_result
 431                 except GeoRestrictedError as e:
 432                     if self.__maybe_fake_ip_and_retry(e.countries):
 433                         continue
 434                     raise
 435         except ExtractorError:
 436             raise
 437         except compat_http_client.IncompleteRead as e:
 438             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 439         except (KeyError, StopIteration) as e:
 440             raise ExtractorError('An extractor error has occurred.', cause=e)
 441
 442     def __maybe_fake_ip_and_retry(self, countries):
 443         if (not self._downloader.params.get('geo_bypass_country', None) and
 444                 self._GEO_BYPASS and
 445                 self._downloader.params.get('geo_bypass', True) and
 446                 not self._x_forwarded_for_ip and
 447                 countries):
 448             country_code = random.choice(countries)
 449             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 450             if self._x_forwarded_for_ip:
 451                 self.report_warning(
 452                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 453                     % (self._x_forwarded_for_ip, country_code.upper()))
 454                 return True
 455         return False
 456
 457     def set_downloader(self, downloader):
 458         """Sets the downloader for this IE."""
 459         self._downloader = downloader
 460
 461     def _real_initialize(self):
 462         """Real initialization process. Redefine in subclasses."""
 463         pass
 464
 465     def _real_extract(self, url):
 466         """Real extraction process. Redefine in subclasses."""
 467         pass
 468
 469     @classmethod
 470     def ie_key(cls):
 471         """A string for getting the InfoExtractor with get_info_extractor"""
 472         return compat_str(cls.__name__[:-2])
 473
 474     @property
 475     def IE_NAME(self):
 476         return compat_str(type(self).__name__[:-2])
 477
 478     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 479         """ Returns the response handle """
 480         if note is None:
 481             self.report_download_webpage(video_id)
 482         elif note is not False:
 483             if video_id is None:
 484                 self.to_screen('%s' % (note,))
 485             else:
 486                 self.to_screen('%s: %s' % (video_id, note))
 487         if isinstance(url_or_request, compat_urllib_request.Request):
 488             url_or_request = update_Request(
 489                 url_or_request, data=data, headers=headers, query=query)
 490         else:
 491             if query:
 492                 url_or_request = update_url_query(url_or_request, query)
 493             if data is not None or headers:
 494                 url_or_request = sanitized_Request(url_or_request, data, headers)
 495         try:
 496             return self._downloader.urlopen(url_or_request)
 497         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 498             if errnote is False:
 499                 return False
 500             if errnote is None:
 501                 errnote = 'Unable to download webpage'
 502
 503             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 504             if fatal:
 505                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 506             else:
 507                 self._downloader.report_warning(errmsg)
 508                 return False
 509
 510     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 511         """ Returns a tuple (page content as string, URL handle) """
 512         # Strip hashes from the URL (#1038)
 513         if isinstance(url_or_request, (compat_str, str)):
 514             url_or_request = url_or_request.partition('#')[0]
 515
 516         # Some sites check X-Forwarded-For HTTP header in order to figure out
 517         # the origin of the client behind proxy. This allows bypassing geo
 518         # restriction by faking this header's value to IP that belongs to some
 519         # geo unrestricted country. We will do so once we encounter any
 520         # geo restriction error.
 521         if self._x_forwarded_for_ip:
 522             if 'X-Forwarded-For' not in headers:
 523                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 524
 525         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 526         if urlh is False:
 527             assert not fatal
 528             return False
 529         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 530         return (content, urlh)
 531
 532     @staticmethod
 533     def _guess_encoding_from_content(content_type, webpage_bytes):
 534         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 535         if m:
 536             encoding = m.group(1)
 537         else:
 538             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 539                           webpage_bytes[:1024])
 540             if m:
 541                 encoding = m.group(1).decode('ascii')
 542             elif webpage_bytes.startswith(b'\xff\xfe'):
 543                 encoding = 'utf-16'
 544             else:
 545                 encoding = 'utf-8'
 546
 547         return encoding
 548
 549     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 550         content_type = urlh.headers.get('Content-Type', '')
 551         webpage_bytes = urlh.read()
 552         if prefix is not None:
 553             webpage_bytes = prefix + webpage_bytes
 554         if not encoding:
 555             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 556         if self._downloader.params.get('dump_intermediate_pages', False):
 557             try:
 558                 url = url_or_request.get_full_url()
 559             except AttributeError:
 560                 url = url_or_request
 561             self.to_screen('Dumping request to ' + url)
 562             dump = base64.b64encode(webpage_bytes).decode('ascii')
 563             self._downloader.to_screen(dump)
 564         if self._downloader.params.get('write_pages', False):
 565             try:
 566                 url = url_or_request.get_full_url()
 567             except AttributeError:
 568                 url = url_or_request
 569             basen = '%s_%s' % (video_id, url)
 570             if len(basen) > 240:
 571                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 572                 basen = basen[:240 - len(h)] + h
 573             raw_filename = basen + '.dump'
 574             filename = sanitize_filename(raw_filename, restricted=True)
 575             self.to_screen('Saving request to ' + filename)
 576             # Working around MAX_PATH limitation on Windows (see
 577             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 578             if compat_os_name == 'nt':
 579                 absfilepath = os.path.abspath(filename)
 580                 if len(absfilepath) > 259:
 581                     filename = '\\\\?\\' + absfilepath
 582             with open(filename, 'wb') as outf:
 583                 outf.write(webpage_bytes)
 584
 585         try:
 586             content = webpage_bytes.decode(encoding, 'replace')
 587         except LookupError:
 588             content = webpage_bytes.decode('utf-8', 'replace')
 589
 590         if ('<title>Access to this site is blocked</title>' in content and
 591                 'Websense' in content[:512]):
 592             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 593             blocked_iframe = self._html_search_regex(
 594                 r'<iframe src="([^"]+)"', content,
 595                 'Websense information URL', default=None)
 596             if blocked_iframe:
 597                 msg += ' Visit %s for more details' % blocked_iframe
 598             raise ExtractorError(msg, expected=True)
 599         if '<title>The URL you requested has been blocked</title>' in content[:512]:
 600             msg = (
 601                 'Access to this webpage has been blocked by Indian censorship. '
 602                 'Use a VPN or proxy server (with --proxy) to route around it.')
 603             block_msg = self._html_search_regex(
 604                 r'</h1><p>(.*?)</p>',
 605                 content, 'block message', default=None)
 606             if block_msg:
 607                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 608             raise ExtractorError(msg, expected=True)
 609
 610         return content
 611
 612     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 613         """ Returns the data of the page as a string """
 614         success = False
 615         try_count = 0
 616         while success is False:
 617             try:
 618                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 619                 success = True
 620             except compat_http_client.IncompleteRead as e:
 621                 try_count += 1
 622                 if try_count >= tries:
 623                     raise e
 624                 self._sleep(timeout, video_id)
 625         if res is False:
 626             return res
 627         else:
 628             content, _ = res
 629             return content
 630
 631     def _download_xml(self, url_or_request, video_id,
 632                       note='Downloading XML', errnote='Unable to download XML',
 633                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 634         """Return the xml as an xml.etree.ElementTree.Element"""
 635         xml_string = self._download_webpage(
 636             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 637         if xml_string is False:
 638             return xml_string
 639         if transform_source:
 640             xml_string = transform_source(xml_string)
 641         return compat_etree_fromstring(xml_string.encode('utf-8'))
 642
 643     def _download_json(self, url_or_request, video_id,
 644                        note='Downloading JSON metadata',
 645                        errnote='Unable to download JSON metadata',
 646                        transform_source=None,
 647                        fatal=True, encoding=None, data=None, headers={}, query={}):
 648         json_string = self._download_webpage(
 649             url_or_request, video_id, note, errnote, fatal=fatal,
 650             encoding=encoding, data=data, headers=headers, query=query)
 651         if (not fatal) and json_string is False:
 652             return None
 653         return self._parse_json(
 654             json_string, video_id, transform_source=transform_source, fatal=fatal)
 655
 656     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 657         if transform_source:
 658             json_string = transform_source(json_string)
 659         try:
 660             return json.loads(json_string)
 661         except ValueError as ve:
 662             errmsg = '%s: Failed to parse JSON ' % video_id
 663             if fatal:
 664                 raise ExtractorError(errmsg, cause=ve)
 665             else:
 666                 self.report_warning(errmsg + str(ve))
 667
 668     def report_warning(self, msg, video_id=None):
 669         idstr = '' if video_id is None else '%s: ' % video_id
 670         self._downloader.report_warning(
 671             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 672
 673     def to_screen(self, msg):
 674         """Print msg to screen, prefixing it with '[ie_name]'"""
 675         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 676
 677     def report_extraction(self, id_or_name):
 678         """Report information extraction."""
 679         self.to_screen('%s: Extracting information' % id_or_name)
 680
 681     def report_download_webpage(self, video_id):
 682         """Report webpage download."""
 683         self.to_screen('%s: Downloading webpage' % video_id)
 684
 685     def report_age_confirmation(self):
 686         """Report attempt to confirm age."""
 687         self.to_screen('Confirming age')
 688
 689     def report_login(self):
 690         """Report attempt to log in."""
 691         self.to_screen('Logging in')
 692
 693     @staticmethod
 694     def raise_login_required(msg='This video is only available for registered users'):
 695         raise ExtractorError(
 696             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 697             expected=True)
 698
 699     @staticmethod
 700     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 701         raise GeoRestrictedError(msg, countries=countries)
 702
 703     # Methods for following #608
 704     @staticmethod
 705     def url_result(url, ie=None, video_id=None, video_title=None):
 706         """Returns a URL that points to a page that should be processed"""
 707         # TODO: ie should be the class used for getting the info
 708         video_info = {'_type': 'url',
 709                       'url': url,
 710                       'ie_key': ie}
 711         if video_id is not None:
 712             video_info['id'] = video_id
 713         if video_title is not None:
 714             video_info['title'] = video_title
 715         return video_info
 716
 717     @staticmethod
 718     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 719         """Returns a playlist"""
 720         video_info = {'_type': 'playlist',
 721                       'entries': entries}
 722         if playlist_id:
 723             video_info['id'] = playlist_id
 724         if playlist_title:
 725             video_info['title'] = playlist_title
 726         if playlist_description:
 727             video_info['description'] = playlist_description
 728         return video_info
 729
 730     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 731         """
 732         Perform a regex search on the given string, using a single or a list of
 733         patterns returning the first matching group.
 734         In case of failure return a default value or raise a WARNING or a
 735         RegexNotFoundError, depending on fatal, specifying the field name.
 736         """
 737         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 738             mobj = re.search(pattern, string, flags)
 739         else:
 740             for p in pattern:
 741                 mobj = re.search(p, string, flags)
 742                 if mobj:
 743                     break
 744
 745         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 746             _name = '\033[0;34m%s\033[0m' % name
 747         else:
 748             _name = name
 749
 750         if mobj:
 751             if group is None:
 752                 # return the first matching group
 753                 return next(g for g in mobj.groups() if g is not None)
 754             else:
 755                 return mobj.group(group)
 756         elif default is not NO_DEFAULT:
 757             return default
 758         elif fatal:
 759             raise RegexNotFoundError('Unable to extract %s' % _name)
 760         else:
 761             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 762             return None
 763
 764     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 765         """
 766         Like _search_regex, but strips HTML tags and unescapes entities.
 767         """
 768         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 769         if res:
 770             return clean_html(res).strip()
 771         else:
 772             return res
 773
 774     def _get_netrc_login_info(self, netrc_machine=None):
 775         username = None
 776         password = None
 777         netrc_machine = netrc_machine or self._NETRC_MACHINE
 778
 779         if self._downloader.params.get('usenetrc', False):
 780             try:
 781                 info = netrc.netrc().authenticators(netrc_machine)
 782                 if info is not None:
 783                     username = info[0]
 784                     password = info[2]
 785                 else:
 786                     raise netrc.NetrcParseError(
 787                         'No authenticators for %s' % netrc_machine)
 788             except (IOError, netrc.NetrcParseError) as err:
 789                 self._downloader.report_warning(
 790                     'parsing .netrc: %s' % error_to_compat_str(err))
 791
 792         return username, password
 793
 794     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 795         """
 796         Get the login info as (username, password)
 797         First look for the manually specified credentials using username_option
 798         and password_option as keys in params dictionary. If no such credentials
 799         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 800         value.
 801         If there's no info available, return (None, None)
 802         """
 803         if self._downloader is None:
 804             return (None, None)
 805
 806         downloader_params = self._downloader.params
 807
 808         # Attempt to use provided username and password or .netrc data
 809         if downloader_params.get(username_option) is not None:
 810             username = downloader_params[username_option]
 811             password = downloader_params[password_option]
 812         else:
 813             username, password = self._get_netrc_login_info(netrc_machine)
 814
 815         return username, password
 816
 817     def _get_tfa_info(self, note='two-factor verification code'):
 818         """
 819         Get the two-factor authentication info
 820         TODO - asking the user will be required for sms/phone verify
 821         currently just uses the command line option
 822         If there's no info available, return None
 823         """
 824         if self._downloader is None:
 825             return None
 826         downloader_params = self._downloader.params
 827
 828         if downloader_params.get('twofactor') is not None:
 829             return downloader_params['twofactor']
 830
 831         return compat_getpass('Type %s and press [Return]: ' % note)
 832
 833     # Helper functions for extracting OpenGraph info
 834     @staticmethod
 835     def _og_regexes(prop):
 836         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 837         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 838                        % {'prop': re.escape(prop)})
 839         template = r'<meta[^>]+?%s[^>]+?%s'
 840         return [
 841             template % (property_re, content_re),
 842             template % (content_re, property_re),
 843         ]
 844
 845     @staticmethod
 846     def _meta_regex(prop):
 847         return r'''(?isx)<meta
 848                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 849                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 850
 851     def _og_search_property(self, prop, html, name=None, **kargs):
 852         if not isinstance(prop, (list, tuple)):
 853             prop = [prop]
 854         if name is None:
 855             name = 'OpenGraph %s' % prop[0]
 856         og_regexes = []
 857         for p in prop:
 858             og_regexes.extend(self._og_regexes(p))
 859         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 860         if escaped is None:
 861             return None
 862         return unescapeHTML(escaped)
 863
 864     def _og_search_thumbnail(self, html, **kargs):
 865         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 866
 867     def _og_search_description(self, html, **kargs):
 868         return self._og_search_property('description', html, fatal=False, **kargs)
 869
 870     def _og_search_title(self, html, **kargs):
 871         return self._og_search_property('title', html, **kargs)
 872
 873     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 874         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 875         if secure:
 876             regexes = self._og_regexes('video:secure_url') + regexes
 877         return self._html_search_regex(regexes, html, name, **kargs)
 878
 879     def _og_search_url(self, html, **kargs):
 880         return self._og_search_property('url', html, **kargs)
 881
 882     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 883         if not isinstance(name, (list, tuple)):
 884             name = [name]
 885         if display_name is None:
 886             display_name = name[0]
 887         return self._html_search_regex(
 888             [self._meta_regex(n) for n in name],
 889             html, display_name, fatal=fatal, group='content', **kwargs)
 890
 891     def _dc_search_uploader(self, html):
 892         return self._html_search_meta('dc.creator', html, 'uploader')
 893
 894     def _rta_search(self, html):
 895         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 896         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 897                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 898                      html):
 899             return 18
 900         return 0
 901
 902     def _media_rating_search(self, html):
 903         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 904         rating = self._html_search_meta('rating', html)
 905
 906         if not rating:
 907             return None
 908
 909         RATING_TABLE = {
 910             'safe for kids': 0,
 911             'general': 8,
 912             '14 years': 14,
 913             'mature': 17,
 914             'restricted': 19,
 915         }
 916         return RATING_TABLE.get(rating.lower())
 917
 918     def _family_friendly_search(self, html):
 919         # See http://schema.org/VideoObject
 920         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 921
 922         if not family_friendly:
 923             return None
 924
 925         RATING_TABLE = {
 926             '1': 0,
 927             'true': 0,
 928             '0': 18,
 929             'false': 18,
 930         }
 931         return RATING_TABLE.get(family_friendly.lower())
 932
 933     def _twitter_search_player(self, html):
 934         return self._html_search_meta('twitter:player', html,
 935                                       'twitter card player')
 936
 937     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 938         json_ld = self._search_regex(
 939             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 940             html, 'JSON-LD', group='json_ld', **kwargs)
 941         default = kwargs.get('default', NO_DEFAULT)
 942         if not json_ld:
 943             return default if default is not NO_DEFAULT else {}
 944         # JSON-LD may be malformed and thus `fatal` should be respected.
 945         # At the same time `default` may be passed that assumes `fatal=False`
 946         # for _search_regex. Let's simulate the same behavior here as well.
 947         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 948         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 949
 950     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 951         if isinstance(json_ld, compat_str):
 952             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 953         if not json_ld:
 954             return {}
 955         info = {}
 956         if not isinstance(json_ld, (list, tuple, dict)):
 957             return info
 958         if isinstance(json_ld, dict):
 959             json_ld = [json_ld]
 960         for e in json_ld:
 961             if e.get('@context') == 'http://schema.org':
 962                 item_type = e.get('@type')
 963                 if expected_type is not None and expected_type != item_type:
 964                     return info
 965                 if item_type == 'TVEpisode':
 966                     info.update({
 967                         'episode': unescapeHTML(e.get('name')),
 968                         'episode_number': int_or_none(e.get('episodeNumber')),
 969                         'description': unescapeHTML(e.get('description')),
 970                     })
 971                     part_of_season = e.get('partOfSeason')
 972                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
 973                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
 974                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
 975                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
 976                         info['series'] = unescapeHTML(part_of_series.get('name'))
 977                 elif item_type == 'Article':
 978                     info.update({
 979                         'timestamp': parse_iso8601(e.get('datePublished')),
 980                         'title': unescapeHTML(e.get('headline')),
 981                         'description': unescapeHTML(e.get('articleBody')),
 982                     })
 983                 elif item_type == 'VideoObject':
 984                     info.update({
 985                         'url': e.get('contentUrl'),
 986                         'title': unescapeHTML(e.get('name')),
 987                         'description': unescapeHTML(e.get('description')),
 988                         'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
 989                         'duration': parse_duration(e.get('duration')),
 990                         'timestamp': unified_timestamp(e.get('uploadDate')),
 991                         'filesize': float_or_none(e.get('contentSize')),
 992                         'tbr': int_or_none(e.get('bitrate')),
 993                         'width': int_or_none(e.get('width')),
 994                         'height': int_or_none(e.get('height')),
 995                     })
 996                 break
 997         return dict((k, v) for k, v in info.items() if v is not None)
 998
 999     @staticmethod
1000     def _hidden_inputs(html):
1001         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1002         hidden_inputs = {}
1003         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1004             attrs = extract_attributes(input)
1005             if not input:
1006                 continue
1007             if attrs.get('type') not in ('hidden', 'submit'):
1008                 continue
1009             name = attrs.get('name') or attrs.get('id')
1010             value = attrs.get('value')
1011             if name and value is not None:
1012                 hidden_inputs[name] = value
1013         return hidden_inputs
1014
1015     def _form_hidden_inputs(self, form_id, html):
1016         form = self._search_regex(
1017             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1018             html, '%s form' % form_id, group='form')
1019         return self._hidden_inputs(form)
1020
1021     def _sort_formats(self, formats, field_preference=None):
1022         if not formats:
1023             raise ExtractorError('No video formats found')
1024
1025         for f in formats:
1026             # Automatically determine tbr when missing based on abr and vbr (improves
1027             # formats sorting in some cases)
1028             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1029                 f['tbr'] = f['abr'] + f['vbr']
1030
1031         def _formats_key(f):
1032             # TODO remove the following workaround
1033             from ..utils import determine_ext
1034             if not f.get('ext') and 'url' in f:
1035                 f['ext'] = determine_ext(f['url'])
1036
1037             if isinstance(field_preference, (list, tuple)):
1038                 return tuple(
1039                     f.get(field)
1040                     if f.get(field) is not None
1041                     else ('' if field == 'format_id' else -1)
1042                     for field in field_preference)
1043
1044             preference = f.get('preference')
1045             if preference is None:
1046                 preference = 0
1047                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1048                     preference -= 0.5
1049
1050             protocol = f.get('protocol') or determine_protocol(f)
1051             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1052
1053             if f.get('vcodec') == 'none':  # audio only
1054                 preference -= 50
1055                 if self._downloader.params.get('prefer_free_formats'):
1056                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1057                 else:
1058                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1059                 ext_preference = 0
1060                 try:
1061                     audio_ext_preference = ORDER.index(f['ext'])
1062                 except ValueError:
1063                     audio_ext_preference = -1
1064             else:
1065                 if f.get('acodec') == 'none':  # video only
1066                     preference -= 40
1067                 if self._downloader.params.get('prefer_free_formats'):
1068                     ORDER = ['flv', 'mp4', 'webm']
1069                 else:
1070                     ORDER = ['webm', 'flv', 'mp4']
1071                 try:
1072                     ext_preference = ORDER.index(f['ext'])
1073                 except ValueError:
1074                     ext_preference = -1
1075                 audio_ext_preference = 0
1076
1077             return (
1078                 preference,
1079                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1080                 f.get('quality') if f.get('quality') is not None else -1,
1081                 f.get('tbr') if f.get('tbr') is not None else -1,
1082                 f.get('filesize') if f.get('filesize') is not None else -1,
1083                 f.get('vbr') if f.get('vbr') is not None else -1,
1084                 f.get('height') if f.get('height') is not None else -1,
1085                 f.get('width') if f.get('width') is not None else -1,
1086                 proto_preference,
1087                 ext_preference,
1088                 f.get('abr') if f.get('abr') is not None else -1,
1089                 audio_ext_preference,
1090                 f.get('fps') if f.get('fps') is not None else -1,
1091                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1092                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1093                 f.get('format_id') if f.get('format_id') is not None else '',
1094             )
1095         formats.sort(key=_formats_key)
1096
1097     def _check_formats(self, formats, video_id):
1098         if formats:
1099             formats[:] = filter(
1100                 lambda f: self._is_valid_url(
1101                     f['url'], video_id,
1102                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1103                 formats)
1104
1105     @staticmethod
1106     def _remove_duplicate_formats(formats):
1107         format_urls = set()
1108         unique_formats = []
1109         for f in formats:
1110             if f['url'] not in format_urls:
1111                 format_urls.add(f['url'])
1112                 unique_formats.append(f)
1113         formats[:] = unique_formats
1114
1115     def _is_valid_url(self, url, video_id, item='video', headers={}):
1116         url = self._proto_relative_url(url, scheme='http:')
1117         # For now assume non HTTP(S) URLs always valid
1118         if not (url.startswith('http://') or url.startswith('https://')):
1119             return True
1120         try:
1121             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1122             return True
1123         except ExtractorError as e:
1124             if isinstance(e.cause, compat_urllib_error.URLError):
1125                 self.to_screen(
1126                     '%s: %s URL is invalid, skipping' % (video_id, item))
1127                 return False
1128             raise
1129
1130     def http_scheme(self):
1131         """ Either "http:" or "https:", depending on the user's preferences """
1132         return (
1133             'http:'
1134             if self._downloader.params.get('prefer_insecure', False)
1135             else 'https:')
1136
1137     def _proto_relative_url(self, url, scheme=None):
1138         if url is None:
1139             return url
1140         if url.startswith('//'):
1141             if scheme is None:
1142                 scheme = self.http_scheme()
1143             return scheme + url
1144         else:
1145             return url
1146
1147     def _sleep(self, timeout, video_id, msg_template=None):
1148         if msg_template is None:
1149             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1150         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1151         self.to_screen(msg)
1152         time.sleep(timeout)
1153
1154     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1155                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1156                              fatal=True, m3u8_id=None):
1157         manifest = self._download_xml(
1158             manifest_url, video_id, 'Downloading f4m manifest',
1159             'Unable to download f4m manifest',
1160             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1161             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1162             transform_source=transform_source,
1163             fatal=fatal)
1164
1165         if manifest is False:
1166             return []
1167
1168         return self._parse_f4m_formats(
1169             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1170             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1171
1172     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1173                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1174                            fatal=True, m3u8_id=None):
1175         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1176         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1177         if akamai_pv is not None and ';' in akamai_pv.text:
1178             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1179             if playerVerificationChallenge.strip() != '':
1180                 return []
1181
1182         formats = []
1183         manifest_version = '1.0'
1184         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1185         if not media_nodes:
1186             manifest_version = '2.0'
1187             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1188         # Remove unsupported DRM protected media from final formats
1189         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1190         media_nodes = remove_encrypted_media(media_nodes)
1191         if not media_nodes:
1192             return formats
1193         base_url = xpath_text(
1194             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1195             'base URL', default=None)
1196         if base_url:
1197             base_url = base_url.strip()
1198
1199         bootstrap_info = xpath_element(
1200             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1201             'bootstrap info', default=None)
1202
1203         vcodec = None
1204         mime_type = xpath_text(
1205             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1206             'base URL', default=None)
1207         if mime_type and mime_type.startswith('audio/'):
1208             vcodec = 'none'
1209
1210         for i, media_el in enumerate(media_nodes):
1211             tbr = int_or_none(media_el.attrib.get('bitrate'))
1212             width = int_or_none(media_el.attrib.get('width'))
1213             height = int_or_none(media_el.attrib.get('height'))
1214             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1215             # If <bootstrapInfo> is present, the specified f4m is a
1216             # stream-level manifest, and only set-level manifests may refer to
1217             # external resources.  See section 11.4 and section 4 of F4M spec
1218             if bootstrap_info is None:
1219                 media_url = None
1220                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1221                 if manifest_version == '2.0':
1222                     media_url = media_el.attrib.get('href')
1223                 if media_url is None:
1224                     media_url = media_el.attrib.get('url')
1225                 if not media_url:
1226                     continue
1227                 manifest_url = (
1228                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1229                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1230                 # If media_url is itself a f4m manifest do the recursive extraction
1231                 # since bitrates in parent manifest (this one) and media_url manifest
1232                 # may differ leading to inability to resolve the format by requested
1233                 # bitrate in f4m downloader
1234                 ext = determine_ext(manifest_url)
1235                 if ext == 'f4m':
1236                     f4m_formats = self._extract_f4m_formats(
1237                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1238                         transform_source=transform_source, fatal=fatal)
1239                     # Sometimes stream-level manifest contains single media entry that
1240                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1241                     # At the same time parent's media entry in set-level manifest may
1242                     # contain it. We will copy it from parent in such cases.
1243                     if len(f4m_formats) == 1:
1244                         f = f4m_formats[0]
1245                         f.update({
1246                             'tbr': f.get('tbr') or tbr,
1247                             'width': f.get('width') or width,
1248                             'height': f.get('height') or height,
1249                             'format_id': f.get('format_id') if not tbr else format_id,
1250                             'vcodec': vcodec,
1251                         })
1252                     formats.extend(f4m_formats)
1253                     continue
1254                 elif ext == 'm3u8':
1255                     formats.extend(self._extract_m3u8_formats(
1256                         manifest_url, video_id, 'mp4', preference=preference,
1257                         m3u8_id=m3u8_id, fatal=fatal))
1258                     continue
1259             formats.append({
1260                 'format_id': format_id,
1261                 'url': manifest_url,
1262                 'manifest_url': manifest_url,
1263                 'ext': 'flv' if bootstrap_info is not None else None,
1264                 'tbr': tbr,
1265                 'width': width,
1266                 'height': height,
1267                 'vcodec': vcodec,
1268                 'preference': preference,
1269             })
1270         return formats
1271
1272     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1273         return {
1274             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1275             'url': m3u8_url,
1276             'ext': ext,
1277             'protocol': 'm3u8',
1278             'preference': preference - 100 if preference else -100,
1279             'resolution': 'multiple',
1280             'format_note': 'Quality selection URL',
1281         }
1282
1283     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1284                               entry_protocol='m3u8', preference=None,
1285                               m3u8_id=None, note=None, errnote=None,
1286                               fatal=True, live=False):
1287
1288         res = self._download_webpage_handle(
1289             m3u8_url, video_id,
1290             note=note or 'Downloading m3u8 information',
1291             errnote=errnote or 'Failed to download m3u8 information',
1292             fatal=fatal)
1293         if res is False:
1294             return []
1295         m3u8_doc, urlh = res
1296         m3u8_url = urlh.geturl()
1297
1298         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1299             return []
1300
1301         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
1302
1303         format_url = lambda u: (
1304             u
1305             if re.match(r'^https?://', u)
1306             else compat_urlparse.urljoin(m3u8_url, u))
1307
1308         # We should try extracting formats only from master playlists [1], i.e.
1309         # playlists that describe available qualities. On the other hand media
1310         # playlists [2] should be returned as is since they contain just the media
1311         # without qualities renditions.
1312         # Fortunately, master playlist can be easily distinguished from media
1313         # playlist based on particular tags availability. As of [1, 2] master
1314         # playlist tags MUST NOT appear in a media playist and vice versa.
1315         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
1316         # and MUST NOT appear in master playlist thus we can clearly detect media
1317         # playlist with this criterion.
1318         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
1319         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
1320         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
1321         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1322             return [{
1323                 'url': m3u8_url,
1324                 'format_id': m3u8_id,
1325                 'ext': ext,
1326                 'protocol': entry_protocol,
1327                 'preference': preference,
1328             }]
1329         audio_in_video_stream = {}
1330         last_info = {}
1331         last_media = {}
1332         for line in m3u8_doc.splitlines():
1333             if line.startswith('#EXT-X-STREAM-INF:'):
1334                 last_info = parse_m3u8_attributes(line)
1335             elif line.startswith('#EXT-X-MEDIA:'):
1336                 media = parse_m3u8_attributes(line)
1337                 media_type = media.get('TYPE')
1338                 if media_type in ('VIDEO', 'AUDIO'):
1339                     group_id = media.get('GROUP-ID')
1340                     media_url = media.get('URI')
1341                     if media_url:
1342                         format_id = []
1343                         for v in (group_id, media.get('NAME')):
1344                             if v:
1345                                 format_id.append(v)
1346                         f = {
1347                             'format_id': '-'.join(format_id),
1348                             'url': format_url(media_url),
1349                             'language': media.get('LANGUAGE'),
1350                             'ext': ext,
1351                             'protocol': entry_protocol,
1352                             'preference': preference,
1353                         }
1354                         if media_type == 'AUDIO':
1355                             f['vcodec'] = 'none'
1356                             if group_id and not audio_in_video_stream.get(group_id):
1357                                 audio_in_video_stream[group_id] = False
1358                         formats.append(f)
1359                     else:
1360                         # When there is no URI in EXT-X-MEDIA let this tag's
1361                         # data be used by regular URI lines below
1362                         last_media = media
1363                         if media_type == 'AUDIO' and group_id:
1364                             audio_in_video_stream[group_id] = True
1365             elif line.startswith('#') or not line.strip():
1366                 continue
1367             else:
1368                 tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
1369                 format_id = []
1370                 if m3u8_id:
1371                     format_id.append(m3u8_id)
1372                 # Despite specification does not mention NAME attribute for
1373                 # EXT-X-STREAM-INF it still sometimes may be present
1374                 stream_name = last_info.get('NAME') or last_media.get('NAME')
1375                 # Bandwidth of live streams may differ over time thus making
1376                 # format_id unpredictable. So it's better to keep provided
1377                 # format_id intact.
1378                 if not live:
1379                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1380                 manifest_url = format_url(line.strip())
1381                 f = {
1382                     'format_id': '-'.join(format_id),
1383                     'url': manifest_url,
1384                     'manifest_url': manifest_url,
1385                     'tbr': tbr,
1386                     'ext': ext,
1387                     'fps': float_or_none(last_info.get('FRAME-RATE')),
1388                     'protocol': entry_protocol,
1389                     'preference': preference,
1390                 }
1391                 resolution = last_info.get('RESOLUTION')
1392                 if resolution:
1393                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1394                     if mobj:
1395                         f['width'] = int(mobj.group('width'))
1396                         f['height'] = int(mobj.group('height'))
1397                 # Unified Streaming Platform
1398                 mobj = re.search(
1399                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1400                 if mobj:
1401                     abr, vbr = mobj.groups()
1402                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1403                     f.update({
1404                         'vbr': vbr,
1405                         'abr': abr,
1406                     })
1407                 f.update(parse_codecs(last_info.get('CODECS')))
1408                 if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
1409                     # TODO: update acodec for audio only formats with the same GROUP-ID
1410                     f['acodec'] = 'none'
1411                 formats.append(f)
1412                 last_info = {}
1413                 last_media = {}
1414         return formats
1415
1416     @staticmethod
1417     def _xpath_ns(path, namespace=None):
1418         if not namespace:
1419             return path
1420         out = []
1421         for c in path.split('/'):
1422             if not c or c == '.':
1423                 out.append(c)
1424             else:
1425                 out.append('{%s}%s' % (namespace, c))
1426         return '/'.join(out)
1427
1428     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1429         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1430
1431         if smil is False:
1432             assert not fatal
1433             return []
1434
1435         namespace = self._parse_smil_namespace(smil)
1436
1437         return self._parse_smil_formats(
1438             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1439
1440     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1441         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1442         if smil is False:
1443             return {}
1444         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1445
1446     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1447         return self._download_xml(
1448             smil_url, video_id, 'Downloading SMIL file',
1449             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1450
1451     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1452         namespace = self._parse_smil_namespace(smil)
1453
1454         formats = self._parse_smil_formats(
1455             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1456         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1457
1458         video_id = os.path.splitext(url_basename(smil_url))[0]
1459         title = None
1460         description = None
1461         upload_date = None
1462         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1463             name = meta.attrib.get('name')
1464             content = meta.attrib.get('content')
1465             if not name or not content:
1466                 continue
1467             if not title and name == 'title':
1468                 title = content
1469             elif not description and name in ('description', 'abstract'):
1470                 description = content
1471             elif not upload_date and name == 'date':
1472                 upload_date = unified_strdate(content)
1473
1474         thumbnails = [{
1475             'id': image.get('type'),
1476             'url': image.get('src'),
1477             'width': int_or_none(image.get('width')),
1478             'height': int_or_none(image.get('height')),
1479         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1480
1481         return {
1482             'id': video_id,
1483             'title': title or video_id,
1484             'description': description,
1485             'upload_date': upload_date,
1486             'thumbnails': thumbnails,
1487             'formats': formats,
1488             'subtitles': subtitles,
1489         }
1490
1491     def _parse_smil_namespace(self, smil):
1492         return self._search_regex(
1493             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1494
1495     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1496         base = smil_url
1497         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1498             b = meta.get('base') or meta.get('httpBase')
1499             if b:
1500                 base = b
1501                 break
1502
1503         formats = []
1504         rtmp_count = 0
1505         http_count = 0
1506         m3u8_count = 0
1507
1508         srcs = []
1509         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1510         for medium in media:
1511             src = medium.get('src')
1512             if not src or src in srcs:
1513                 continue
1514             srcs.append(src)
1515
1516             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1517             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1518             width = int_or_none(medium.get('width'))
1519             height = int_or_none(medium.get('height'))
1520             proto = medium.get('proto')
1521             ext = medium.get('ext')
1522             src_ext = determine_ext(src)
1523             streamer = medium.get('streamer') or base
1524
1525             if proto == 'rtmp' or streamer.startswith('rtmp'):
1526                 rtmp_count += 1
1527                 formats.append({
1528                     'url': streamer,
1529                     'play_path': src,
1530                     'ext': 'flv',
1531                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1532                     'tbr': bitrate,
1533                     'filesize': filesize,
1534                     'width': width,
1535                     'height': height,
1536                 })
1537                 if transform_rtmp_url:
1538                     streamer, src = transform_rtmp_url(streamer, src)
1539                     formats[-1].update({
1540                         'url': streamer,
1541                         'play_path': src,
1542                     })
1543                 continue
1544
1545             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1546             src_url = src_url.strip()
1547
1548             if proto == 'm3u8' or src_ext == 'm3u8':
1549                 m3u8_formats = self._extract_m3u8_formats(
1550                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1551                 if len(m3u8_formats) == 1:
1552                     m3u8_count += 1
1553                     m3u8_formats[0].update({
1554                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1555                         'tbr': bitrate,
1556                         'width': width,
1557                         'height': height,
1558                     })
1559                 formats.extend(m3u8_formats)
1560                 continue
1561
1562             if src_ext == 'f4m':
1563                 f4m_url = src_url
1564                 if not f4m_params:
1565                     f4m_params = {
1566                         'hdcore': '3.2.0',
1567                         'plugin': 'flowplayer-3.2.0.1',
1568                     }
1569                 f4m_url += '&' if '?' in f4m_url else '?'
1570                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1571                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1572                 continue
1573
1574             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1575                 http_count += 1
1576                 formats.append({
1577                     'url': src_url,
1578                     'ext': ext or src_ext or 'flv',
1579                     'format_id': 'http-%d' % (bitrate or http_count),
1580                     'tbr': bitrate,
1581                     'filesize': filesize,
1582                     'width': width,
1583                     'height': height,
1584                 })
1585                 continue
1586
1587         return formats
1588
1589     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1590         urls = []
1591         subtitles = {}
1592         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1593             src = textstream.get('src')
1594             if not src or src in urls:
1595                 continue
1596             urls.append(src)
1597             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1598             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1599             subtitles.setdefault(lang, []).append({
1600                 'url': src,
1601                 'ext': ext,
1602             })
1603         return subtitles
1604
1605     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1606         xspf = self._download_xml(
1607             playlist_url, playlist_id, 'Downloading xpsf playlist',
1608             'Unable to download xspf manifest', fatal=fatal)
1609         if xspf is False:
1610             return []
1611         return self._parse_xspf(xspf, playlist_id)
1612
1613     def _parse_xspf(self, playlist, playlist_id):
1614         NS_MAP = {
1615             'xspf': 'http://xspf.org/ns/0/',
1616             's1': 'http://static.streamone.nl/player/ns/0',
1617         }
1618
1619         entries = []
1620         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1621             title = xpath_text(
1622                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1623             description = xpath_text(
1624                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1625             thumbnail = xpath_text(
1626                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1627             duration = float_or_none(
1628                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1629
1630             formats = [{
1631                 'url': location.text,
1632                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1633                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1634                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1635             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1636             self._sort_formats(formats)
1637
1638             entries.append({
1639                 'id': playlist_id,
1640                 'title': title,
1641                 'description': description,
1642                 'thumbnail': thumbnail,
1643                 'duration': duration,
1644                 'formats': formats,
1645             })
1646         return entries
1647
1648     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1649         res = self._download_webpage_handle(
1650             mpd_url, video_id,
1651             note=note or 'Downloading MPD manifest',
1652             errnote=errnote or 'Failed to download MPD manifest',
1653             fatal=fatal)
1654         if res is False:
1655             return []
1656         mpd, urlh = res
1657         mpd_base_url = base_url(urlh.geturl())
1658
1659         return self._parse_mpd_formats(
1660             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1661             formats_dict=formats_dict, mpd_url=mpd_url)
1662
1663     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1664         """
1665         Parse formats from MPD manifest.
1666         References:
1667          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1668             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1669          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1670         """
1671         if mpd_doc.get('type') == 'dynamic':
1672             return []
1673
1674         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1675
1676         def _add_ns(path):
1677             return self._xpath_ns(path, namespace)
1678
1679         def is_drm_protected(element):
1680             return element.find(_add_ns('ContentProtection')) is not None
1681
1682         def extract_multisegment_info(element, ms_parent_info):
1683             ms_info = ms_parent_info.copy()
1684
1685             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1686             # common attributes and elements.  We will only extract relevant
1687             # for us.
1688             def extract_common(source):
1689                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1690                 if segment_timeline is not None:
1691                     s_e = segment_timeline.findall(_add_ns('S'))
1692                     if s_e:
1693                         ms_info['total_number'] = 0
1694                         ms_info['s'] = []
1695                         for s in s_e:
1696                             r = int(s.get('r', 0))
1697                             ms_info['total_number'] += 1 + r
1698                             ms_info['s'].append({
1699                                 't': int(s.get('t', 0)),
1700                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1701                                 'd': int(s.attrib['d']),
1702                                 'r': r,
1703                             })
1704                 start_number = source.get('startNumber')
1705                 if start_number:
1706                     ms_info['start_number'] = int(start_number)
1707                 timescale = source.get('timescale')
1708                 if timescale:
1709                     ms_info['timescale'] = int(timescale)
1710                 segment_duration = source.get('duration')
1711                 if segment_duration:
1712                     ms_info['segment_duration'] = int(segment_duration)
1713
1714             def extract_Initialization(source):
1715                 initialization = source.find(_add_ns('Initialization'))
1716                 if initialization is not None:
1717                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1718
1719             segment_list = element.find(_add_ns('SegmentList'))
1720             if segment_list is not None:
1721                 extract_common(segment_list)
1722                 extract_Initialization(segment_list)
1723                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1724                 if segment_urls_e:
1725                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1726             else:
1727                 segment_template = element.find(_add_ns('SegmentTemplate'))
1728                 if segment_template is not None:
1729                     extract_common(segment_template)
1730                     media = segment_template.get('media')
1731                     if media:
1732                         ms_info['media'] = media
1733                     initialization = segment_template.get('initialization')
1734                     if initialization:
1735                         ms_info['initialization'] = initialization
1736                     else:
1737                         extract_Initialization(segment_template)
1738             return ms_info
1739
1740         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1741         formats = []
1742         for period in mpd_doc.findall(_add_ns('Period')):
1743             period_duration = parse_duration(period.get('duration')) or mpd_duration
1744             period_ms_info = extract_multisegment_info(period, {
1745                 'start_number': 1,
1746                 'timescale': 1,
1747             })
1748             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1749                 if is_drm_protected(adaptation_set):
1750                     continue
1751                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1752                 for representation in adaptation_set.findall(_add_ns('Representation')):
1753                     if is_drm_protected(representation):
1754                         continue
1755                     representation_attrib = adaptation_set.attrib.copy()
1756                     representation_attrib.update(representation.attrib)
1757                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1758                     mime_type = representation_attrib['mimeType']
1759                     content_type = mime_type.split('/')[0]
1760                     if content_type == 'text':
1761                         # TODO implement WebVTT downloading
1762                         pass
1763                     elif content_type == 'video' or content_type == 'audio':
1764                         base_url = ''
1765                         for element in (representation, adaptation_set, period, mpd_doc):
1766                             base_url_e = element.find(_add_ns('BaseURL'))
1767                             if base_url_e is not None:
1768                                 base_url = base_url_e.text + base_url
1769                                 if re.match(r'^https?://', base_url):
1770                                     break
1771                         if mpd_base_url and not re.match(r'^https?://', base_url):
1772                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1773                                 mpd_base_url += '/'
1774                             base_url = mpd_base_url + base_url
1775                         representation_id = representation_attrib.get('id')
1776                         lang = representation_attrib.get('lang')
1777                         url_el = representation.find(_add_ns('BaseURL'))
1778                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1779                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1780                         f = {
1781                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1782                             'url': base_url,
1783                             'manifest_url': mpd_url,
1784                             'ext': mimetype2ext(mime_type),
1785                             'width': int_or_none(representation_attrib.get('width')),
1786                             'height': int_or_none(representation_attrib.get('height')),
1787                             'tbr': int_or_none(bandwidth, 1000),
1788                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1789                             'fps': int_or_none(representation_attrib.get('frameRate')),
1790                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1791                             'format_note': 'DASH %s' % content_type,
1792                             'filesize': filesize,
1793                         }
1794                         f.update(parse_codecs(representation_attrib.get('codecs')))
1795                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1796
1797                         def prepare_template(template_name, identifiers):
1798                             t = representation_ms_info[template_name]
1799                             t = t.replace('$RepresentationID$', representation_id)
1800                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1801                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1802                             t.replace('$$', '$')
1803                             return t
1804
1805                         # @initialization is a regular template like @media one
1806                         # so it should be handled just the same way (see
1807                         # https://github.com/rg3/youtube-dl/issues/11605)
1808                         if 'initialization' in representation_ms_info:
1809                             initialization_template = prepare_template(
1810                                 'initialization',
1811                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1812                                 # $Time$ shall not be included for @initialization thus
1813                                 # only $Bandwidth$ remains
1814                                 ('Bandwidth', ))
1815                             representation_ms_info['initialization_url'] = initialization_template % {
1816                                 'Bandwidth': bandwidth,
1817                             }
1818
1819                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1820
1821                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1822
1823                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1824                             # can't be used at the same time
1825                             if '%(Number' in media_template and 's' not in representation_ms_info:
1826                                 segment_duration = None
1827                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1828                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1829                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1830                                 representation_ms_info['fragments'] = [{
1831                                     'url': media_template % {
1832                                         'Number': segment_number,
1833                                         'Bandwidth': bandwidth,
1834                                     },
1835                                     'duration': segment_duration,
1836                                 } for segment_number in range(
1837                                     representation_ms_info['start_number'],
1838                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1839                             else:
1840                                 # $Number*$ or $Time$ in media template with S list available
1841                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1842                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1843                                 representation_ms_info['fragments'] = []
1844                                 segment_time = 0
1845                                 segment_d = None
1846                                 segment_number = representation_ms_info['start_number']
1847
1848                                 def add_segment_url():
1849                                     segment_url = media_template % {
1850                                         'Time': segment_time,
1851                                         'Bandwidth': bandwidth,
1852                                         'Number': segment_number,
1853                                     }
1854                                     representation_ms_info['fragments'].append({
1855                                         'url': segment_url,
1856                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1857                                     })
1858
1859                                 for num, s in enumerate(representation_ms_info['s']):
1860                                     segment_time = s.get('t') or segment_time
1861                                     segment_d = s['d']
1862                                     add_segment_url()
1863                                     segment_number += 1
1864                                     for r in range(s.get('r', 0)):
1865                                         segment_time += segment_d
1866                                         add_segment_url()
1867                                         segment_number += 1
1868                                     segment_time += segment_d
1869                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1870                             # No media template
1871                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1872                             # or any YouTube dashsegments video
1873                             fragments = []
1874                             segment_index = 0
1875                             timescale = representation_ms_info['timescale']
1876                             for s in representation_ms_info['s']:
1877                                 duration = float_or_none(s['d'], timescale)
1878                                 for r in range(s.get('r', 0) + 1):
1879                                     fragments.append({
1880                                         'url': representation_ms_info['segment_urls'][segment_index],
1881                                         'duration': duration,
1882                                     })
1883                                     segment_index += 1
1884                             representation_ms_info['fragments'] = fragments
1885                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1886                         # No fragments key is present in this case.
1887                         if 'fragments' in representation_ms_info:
1888                             f.update({
1889                                 'fragments': [],
1890                                 'protocol': 'http_dash_segments',
1891                             })
1892                             if 'initialization_url' in representation_ms_info:
1893                                 initialization_url = representation_ms_info['initialization_url']
1894                                 if not f.get('url'):
1895                                     f['url'] = initialization_url
1896                                 f['fragments'].append({'url': initialization_url})
1897                             f['fragments'].extend(representation_ms_info['fragments'])
1898                             for fragment in f['fragments']:
1899                                 fragment['url'] = urljoin(base_url, fragment['url'])
1900                         try:
1901                             existing_format = next(
1902                                 fo for fo in formats
1903                                 if fo['format_id'] == representation_id)
1904                         except StopIteration:
1905                             full_info = formats_dict.get(representation_id, {}).copy()
1906                             full_info.update(f)
1907                             formats.append(full_info)
1908                         else:
1909                             existing_format.update(f)
1910                     else:
1911                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1912         return formats
1913
1914     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1915         res = self._download_webpage_handle(
1916             ism_url, video_id,
1917             note=note or 'Downloading ISM manifest',
1918             errnote=errnote or 'Failed to download ISM manifest',
1919             fatal=fatal)
1920         if res is False:
1921             return []
1922         ism, urlh = res
1923
1924         return self._parse_ism_formats(
1925             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
1926
1927     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
1928         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
1929             return []
1930
1931         duration = int(ism_doc.attrib['Duration'])
1932         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
1933
1934         formats = []
1935         for stream in ism_doc.findall('StreamIndex'):
1936             stream_type = stream.get('Type')
1937             if stream_type not in ('video', 'audio'):
1938                 continue
1939             url_pattern = stream.attrib['Url']
1940             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
1941             stream_name = stream.get('Name')
1942             for track in stream.findall('QualityLevel'):
1943                 fourcc = track.get('FourCC')
1944                 # TODO: add support for WVC1 and WMAP
1945                 if fourcc not in ('H264', 'AVC1', 'AACL'):
1946                     self.report_warning('%s is not a supported codec' % fourcc)
1947                     continue
1948                 tbr = int(track.attrib['Bitrate']) // 1000
1949                 width = int_or_none(track.get('MaxWidth'))
1950                 height = int_or_none(track.get('MaxHeight'))
1951                 sampling_rate = int_or_none(track.get('SamplingRate'))
1952
1953                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
1954                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
1955
1956                 fragments = []
1957                 fragment_ctx = {
1958                     'time': 0,
1959                 }
1960                 stream_fragments = stream.findall('c')
1961                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
1962                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
1963                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
1964                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
1965                     if not fragment_ctx['duration']:
1966                         try:
1967                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
1968                         except IndexError:
1969                             next_fragment_time = duration
1970                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
1971                     for _ in range(fragment_repeat):
1972                         fragments.append({
1973                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
1974                             'duration': fragment_ctx['duration'] / stream_timescale,
1975                         })
1976                         fragment_ctx['time'] += fragment_ctx['duration']
1977
1978                 format_id = []
1979                 if ism_id:
1980                     format_id.append(ism_id)
1981                 if stream_name:
1982                     format_id.append(stream_name)
1983                 format_id.append(compat_str(tbr))
1984
1985                 formats.append({
1986                     'format_id': '-'.join(format_id),
1987                     'url': ism_url,
1988                     'manifest_url': ism_url,
1989                     'ext': 'ismv' if stream_type == 'video' else 'isma',
1990                     'width': width,
1991                     'height': height,
1992                     'tbr': tbr,
1993                     'asr': sampling_rate,
1994                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
1995                     'acodec': 'none' if stream_type == 'video' else fourcc,
1996                     'protocol': 'ism',
1997                     'fragments': fragments,
1998                     '_download_params': {
1999                         'duration': duration,
2000                         'timescale': stream_timescale,
2001                         'width': width or 0,
2002                         'height': height or 0,
2003                         'fourcc': fourcc,
2004                         'codec_private_data': track.get('CodecPrivateData'),
2005                         'sampling_rate': sampling_rate,
2006                         'channels': int_or_none(track.get('Channels', 2)),
2007                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2008                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2009                     },
2010                 })
2011         return formats
2012
2013     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2014         def absolute_url(video_url):
2015             return compat_urlparse.urljoin(base_url, video_url)
2016
2017         def parse_content_type(content_type):
2018             if not content_type:
2019                 return {}
2020             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2021             if ctr:
2022                 mimetype, codecs = ctr.groups()
2023                 f = parse_codecs(codecs)
2024                 f['ext'] = mimetype2ext(mimetype)
2025                 return f
2026             return {}
2027
2028         def _media_formats(src, cur_media_type):
2029             full_url = absolute_url(src)
2030             ext = determine_ext(full_url)
2031             if ext == 'm3u8':
2032                 is_plain_url = False
2033                 formats = self._extract_m3u8_formats(
2034                     full_url, video_id, ext='mp4',
2035                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2036                     preference=preference)
2037             elif ext == 'mpd':
2038                 is_plain_url = False
2039                 formats = self._extract_mpd_formats(
2040                     full_url, video_id, mpd_id=mpd_id)
2041             else:
2042                 is_plain_url = True
2043                 formats = [{
2044                     'url': full_url,
2045                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2046                 }]
2047             return is_plain_url, formats
2048
2049         entries = []
2050         media_tags = [(media_tag, media_type, '')
2051                       for media_tag, media_type
2052                       in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
2053         media_tags.extend(re.findall(
2054             # We only allow video|audio followed by a whitespace or '>'.
2055             # Allowing more characters may end up in significant slow down (see
2056             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2057             # http://www.porntrex.com/maps/videositemap.xml).
2058             r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2059         for media_tag, media_type, media_content in media_tags:
2060             media_info = {
2061                 'formats': [],
2062                 'subtitles': {},
2063             }
2064             media_attributes = extract_attributes(media_tag)
2065             src = media_attributes.get('src')
2066             if src:
2067                 _, formats = _media_formats(src, media_type)
2068                 media_info['formats'].extend(formats)
2069             media_info['thumbnail'] = media_attributes.get('poster')
2070             if media_content:
2071                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2072                     source_attributes = extract_attributes(source_tag)
2073                     src = source_attributes.get('src')
2074                     if not src:
2075                         continue
2076                     is_plain_url, formats = _media_formats(src, media_type)
2077                     if is_plain_url:
2078                         f = parse_content_type(source_attributes.get('type'))
2079                         f.update(formats[0])
2080                         media_info['formats'].append(f)
2081                     else:
2082                         media_info['formats'].extend(formats)
2083                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2084                     track_attributes = extract_attributes(track_tag)
2085                     kind = track_attributes.get('kind')
2086                     if not kind or kind in ('subtitles', 'captions'):
2087                         src = track_attributes.get('src')
2088                         if not src:
2089                             continue
2090                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2091                         media_info['subtitles'].setdefault(lang, []).append({
2092                             'url': absolute_url(src),
2093                         })
2094             if media_info['formats'] or media_info['subtitles']:
2095                 entries.append(media_info)
2096         return entries
2097
2098     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2099         formats = []
2100         hdcore_sign = 'hdcore=3.7.0'
2101         f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2102         hds_host = hosts.get('hds')
2103         if hds_host:
2104             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2105         if 'hdcore=' not in f4m_url:
2106             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2107         f4m_formats = self._extract_f4m_formats(
2108             f4m_url, video_id, f4m_id='hds', fatal=False)
2109         for entry in f4m_formats:
2110             entry.update({'extra_param_to_segment_url': hdcore_sign})
2111         formats.extend(f4m_formats)
2112         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2113         hls_host = hosts.get('hls')
2114         if hls_host:
2115             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2116         formats.extend(self._extract_m3u8_formats(
2117             m3u8_url, video_id, 'mp4', 'm3u8_native',
2118             m3u8_id='hls', fatal=False))
2119         return formats
2120
2121     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2122         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2123         url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
2124         http_base_url = 'http' + url_base
2125         formats = []
2126         if 'm3u8' not in skip_protocols:
2127             formats.extend(self._extract_m3u8_formats(
2128                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2129                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2130         if 'f4m' not in skip_protocols:
2131             formats.extend(self._extract_f4m_formats(
2132                 http_base_url + '/manifest.f4m',
2133                 video_id, f4m_id='hds', fatal=False))
2134         if 'dash' not in skip_protocols:
2135             formats.extend(self._extract_mpd_formats(
2136                 http_base_url + '/manifest.mpd',
2137                 video_id, mpd_id='dash', fatal=False))
2138         if re.search(r'(?:/smil:|\.smil)', url_base):
2139             if 'smil' not in skip_protocols:
2140                 rtmp_formats = self._extract_smil_formats(
2141                     http_base_url + '/jwplayer.smil',
2142                     video_id, fatal=False)
2143                 for rtmp_format in rtmp_formats:
2144                     rtsp_format = rtmp_format.copy()
2145                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2146                     del rtsp_format['play_path']
2147                     del rtsp_format['ext']
2148                     rtsp_format.update({
2149                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2150                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2151                         'protocol': 'rtsp',
2152                     })
2153                     formats.extend([rtmp_format, rtsp_format])
2154         else:
2155             for protocol in ('rtmp', 'rtsp'):
2156                 if protocol not in skip_protocols:
2157                     formats.append({
2158                         'url': protocol + url_base,
2159                         'format_id': protocol,
2160                         'protocol': protocol,
2161                     })
2162         return formats
2163
2164     @staticmethod
2165     def _find_jwplayer_data(webpage):
2166         mobj = re.search(
2167             r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
2168             webpage)
2169         if mobj:
2170             return mobj.group('options')
2171
2172     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2173         jwplayer_data = self._parse_json(
2174             self._find_jwplayer_data(webpage), video_id,
2175             transform_source=js_to_json)
2176         return self._parse_jwplayer_data(
2177             jwplayer_data, video_id, *args, **kwargs)
2178
2179     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2180                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2181         # JWPlayer backward compatibility: flattened playlists
2182         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2183         if 'playlist' not in jwplayer_data:
2184             jwplayer_data = {'playlist': [jwplayer_data]}
2185
2186         entries = []
2187
2188         # JWPlayer backward compatibility: single playlist item
2189         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2190         if not isinstance(jwplayer_data['playlist'], list):
2191             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2192
2193         for video_data in jwplayer_data['playlist']:
2194             # JWPlayer backward compatibility: flattened sources
2195             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2196             if 'sources' not in video_data:
2197                 video_data['sources'] = [video_data]
2198
2199             this_video_id = video_id or video_data['mediaid']
2200
2201             formats = self._parse_jwplayer_formats(
2202                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2203                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2204             self._sort_formats(formats)
2205
2206             subtitles = {}
2207             tracks = video_data.get('tracks')
2208             if tracks and isinstance(tracks, list):
2209                 for track in tracks:
2210                     if track.get('kind') != 'captions':
2211                         continue
2212                     track_url = urljoin(base_url, track.get('file'))
2213                     if not track_url:
2214                         continue
2215                     subtitles.setdefault(track.get('label') or 'en', []).append({
2216                         'url': self._proto_relative_url(track_url)
2217                     })
2218
2219             entries.append({
2220                 'id': this_video_id,
2221                 'title': video_data['title'] if require_title else video_data.get('title'),
2222                 'description': video_data.get('description'),
2223                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2224                 'timestamp': int_or_none(video_data.get('pubdate')),
2225                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2226                 'subtitles': subtitles,
2227                 'formats': formats,
2228             })
2229         if len(entries) == 1:
2230             return entries[0]
2231         else:
2232             return self.playlist_result(entries)
2233
2234     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2235                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2236         formats = []
2237         for source in jwplayer_sources_data:
2238             source_url = self._proto_relative_url(source['file'])
2239             if base_url:
2240                 source_url = compat_urlparse.urljoin(base_url, source_url)
2241             source_type = source.get('type') or ''
2242             ext = mimetype2ext(source_type) or determine_ext(source_url)
2243             if source_type == 'hls' or ext == 'm3u8':
2244                 formats.extend(self._extract_m3u8_formats(
2245                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2246                     m3u8_id=m3u8_id, fatal=False))
2247             elif ext == 'mpd':
2248                 formats.extend(self._extract_mpd_formats(
2249                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2250             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2251             elif source_type.startswith('audio') or ext in (
2252                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2253                 formats.append({
2254                     'url': source_url,
2255                     'vcodec': 'none',
2256                     'ext': ext,
2257                 })
2258             else:
2259                 height = int_or_none(source.get('height'))
2260                 if height is None:
2261                     # Often no height is provided but there is a label in
2262                     # format like "1080p", "720p SD", or 1080.
2263                     height = int_or_none(self._search_regex(
2264                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2265                         'height', default=None))
2266                 a_format = {
2267                     'url': source_url,
2268                     'width': int_or_none(source.get('width')),
2269                     'height': height,
2270                     'tbr': int_or_none(source.get('bitrate')),
2271                     'ext': ext,
2272                 }
2273                 if source_url.startswith('rtmp'):
2274                     a_format['ext'] = 'flv'
2275                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2276                     # of jwplayer.flash.swf
2277                     rtmp_url_parts = re.split(
2278                         r'((?:mp4|mp3|flv):)', source_url, 1)
2279                     if len(rtmp_url_parts) == 3:
2280                         rtmp_url, prefix, play_path = rtmp_url_parts
2281                         a_format.update({
2282                             'url': rtmp_url,
2283                             'play_path': prefix + play_path,
2284                         })
2285                     if rtmp_params:
2286                         a_format.update(rtmp_params)
2287                 formats.append(a_format)
2288         return formats
2289
2290     def _live_title(self, name):
2291         """ Generate the title for a live video """
2292         now = datetime.datetime.now()
2293         now_str = now.strftime('%Y-%m-%d %H:%M')
2294         return name + ' ' + now_str
2295
2296     def _int(self, v, name, fatal=False, **kwargs):
2297         res = int_or_none(v, **kwargs)
2298         if 'get_attr' in kwargs:
2299             print(getattr(v, kwargs['get_attr']))
2300         if res is None:
2301             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2302             if fatal:
2303                 raise ExtractorError(msg)
2304             else:
2305                 self._downloader.report_warning(msg)
2306         return res
2307
2308     def _float(self, v, name, fatal=False, **kwargs):
2309         res = float_or_none(v, **kwargs)
2310         if res is None:
2311             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2312             if fatal:
2313                 raise ExtractorError(msg)
2314             else:
2315                 self._downloader.report_warning(msg)
2316         return res
2317
2318     def _set_cookie(self, domain, name, value, expire_time=None):
2319         cookie = compat_cookiejar.Cookie(
2320             0, name, value, None, None, domain, None,
2321             None, '/', True, False, expire_time, '', None, None, None)
2322         self._downloader.cookiejar.set_cookie(cookie)
2323
2324     def _get_cookies(self, url):
2325         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2326         req = sanitized_Request(url)
2327         self._downloader.cookiejar.add_cookie_header(req)
2328         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2329
2330     def get_testcases(self, include_onlymatching=False):
2331         t = getattr(self, '_TEST', None)
2332         if t:
2333             assert not hasattr(self, '_TESTS'), \
2334                 '%s has _TEST and _TESTS' % type(self).__name__
2335             tests = [t]
2336         else:
2337             tests = getattr(self, '_TESTS', [])
2338         for t in tests:
2339             if not include_onlymatching and t.get('only_matching', False):
2340                 continue
2341             t['name'] = type(self).__name__[:-len('IE')]
2342             yield t
2343
2344     def is_suitable(self, age_limit):
2345         """ Test whether the extractor is generally suitable for the given
2346         age limit (i.e. pornographic sites are not, all others usually are) """
2347
2348         any_restricted = False
2349         for tc in self.get_testcases(include_onlymatching=False):
2350             if tc.get('playlist', []):
2351                 tc = tc['playlist'][0]
2352             is_restricted = age_restricted(
2353                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2354             if not is_restricted:
2355                 return True
2356             any_restricted = any_restricted or is_restricted
2357         return not any_restricted
2358
2359     def extract_subtitles(self, *args, **kwargs):
2360         if (self._downloader.params.get('writesubtitles', False) or
2361                 self._downloader.params.get('listsubtitles')):
2362             return self._get_subtitles(*args, **kwargs)
2363         return {}
2364
2365     def _get_subtitles(self, *args, **kwargs):
2366         raise NotImplementedError('This method must be implemented by subclasses')
2367
2368     @staticmethod
2369     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2370         """ Merge subtitle items for one language. Items with duplicated URLs
2371         will be dropped. """
2372         list1_urls = set([item['url'] for item in subtitle_list1])
2373         ret = list(subtitle_list1)
2374         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2375         return ret
2376
2377     @classmethod
2378     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2379         """ Merge two subtitle dictionaries, language by language. """
2380         ret = dict(subtitle_dict1)
2381         for lang in subtitle_dict2:
2382             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2383         return ret
2384
2385     def extract_automatic_captions(self, *args, **kwargs):
2386         if (self._downloader.params.get('writeautomaticsub', False) or
2387                 self._downloader.params.get('listsubtitles')):
2388             return self._get_automatic_captions(*args, **kwargs)
2389         return {}
2390
2391     def _get_automatic_captions(self, *args, **kwargs):
2392         raise NotImplementedError('This method must be implemented by subclasses')
2393
2394     def mark_watched(self, *args, **kwargs):
2395         if (self._downloader.params.get('mark_watched', False) and
2396                 (self._get_login_info()[0] is not None or
2397                     self._downloader.params.get('cookiefile') is not None)):
2398             self._mark_watched(*args, **kwargs)
2399
2400     def _mark_watched(self, *args, **kwargs):
2401         raise NotImplementedError('This method must be implemented by subclasses')
2402
2403     def geo_verification_headers(self):
2404         headers = {}
2405         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2406         if geo_verification_proxy:
2407             headers['Ytdl-request-proxy'] = geo_verification_proxy
2408         return headers
2409
2410     def _generic_id(self, url):
2411         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2412
2413     def _generic_title(self, url):
2414         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2415
2416
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept queries of the form _SEARCH_KEY<prefix>:<query> where the
    prefix is empty (one result), a positive number or 'all'.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex matching the queries handled by this class"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor"""
        return bool(re.match(cls._make_valid_url(), url))

    def _real_extract(self, query):
        """Work out the number of wanted results and fetch them"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, search_terms = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(search_terms, 1)
        if prefix == 'all':
            return self._get_n_results(search_terms, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (wanted, search_terms))
        if wanted > self._MAX_RESULTS:
            # Cap the request at what the search backend can deliver
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(search_terms, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor"""
        return self._SEARCH_KEY