Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height.
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177                     * downloader_options  A dictionary of downloader options as
 178                                  described in FileDownloader
 179
 180     url:            Final video URL.
 181     ext:            Video filename extension.
 182     format:         The video format, defaults to ext (used for --get-format)
 183     player_url:     SWF Player URL (used for rtmpdump).
 184
 185     The following fields are optional:
 186
 187     alt_title:      A secondary title of the video.
 188     display_id      An alternative identifier for the video, not necessarily
 189                     unique, but available before title. Typically, id is
 190                     something like "4234987", title "Dancing naked mole rats",
 191                     and display_id "dancing-naked-mole-rats"
 192     thumbnails:     A list of dictionaries, with the following entries:
 193                         * "id" (optional, string) - Thumbnail format ID
 194                         * "url"
 195                         * "preference" (optional, int) - quality of the image
 196                         * "width" (optional, int)
 197                         * "height" (optional, int)
 198                         * "resolution" (optional, string "{width}x{height}",
 199                                         deprecated)
 200                         * "filesize" (optional, int)
 201     thumbnail:      Full URL to a video thumbnail image.
 202     description:    Full video description.
 203     uploader:       Full name of the video uploader.
 204     license:        License name the video is licensed under.
 205     creator:        The creator of the video.
 206     release_date:   The date (YYYYMMDD) when the video was released.
 207     timestamp:      UNIX timestamp of the moment the video became available.
 208     upload_date:    Video upload date (YYYYMMDD).
 209                     If not explicitly set, calculated from timestamp.
 210     uploader_id:    Nickname or id of the video uploader.
 211     uploader_url:   Full URL to a personal webpage of the video uploader.
 212     location:       Physical location where the video was filmed.
 213     subtitles:      The available subtitles as a dictionary in the format
 214                     {tag: subformats}. "tag" is usually a language code, and
 215                     "subformats" is a list sorted from lower to higher
 216                     preference, each element is a dictionary with the "ext"
 217                     entry and one of:
 218                         * "data": The subtitles file contents
 219                         * "url": A URL pointing to the subtitles file
 220                     "ext" will be calculated from URL if missing
 221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 222                     automatically generated captions
 223     duration:       Length of the video in seconds, as an integer or float.
 224     view_count:     How many users have watched the video on the platform.
 225     like_count:     Number of positive ratings of the video
 226     dislike_count:  Number of negative ratings of the video
 227     repost_count:   Number of reposts of the video
 228     average_rating: Average rating given by users, the scale used depends on the webpage
 229     comment_count:  Number of comments on the video
 230     comments:       A list of comments, each with one or more of the following
 231                     properties (all but one of text or html optional):
 232                         * "author" - human-readable name of the comment author
 233                         * "author_id" - user ID of the comment author
 234                         * "id" - Comment ID
 235                         * "html" - Comment as HTML
 236                         * "text" - Plain text of the comment
 237                         * "timestamp" - UNIX timestamp of comment
 238                         * "parent" - ID of the comment this one is replying to.
 239                                      Set to "root" to indicate that this is a
 240                                      comment to the original video.
 241     age_limit:      Age restriction for the video, as an integer (years)
 242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 243                     should allow to get the same result again. (It will be set
 244                     by YoutubeDL if it's missing)
 245     categories:     A list of categories that the video falls in, for example
 246                     ["Sports", "Berlin"]
 247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 248     is_live:        True, False, or None (=unknown). Whether this video is a
 249                     live stream that goes on instead of a fixed-length video.
 250     start_time:     Time in seconds where the reproduction should start, as
 251                     specified in the URL.
 252     end_time:       Time in seconds where the reproduction should end, as
 253                     specified in the URL.
 254     chapters:       A list of dictionaries, with the following entries:
 255                         * "start_time" - The start time of the chapter in seconds
 256                         * "end_time" - The end time of the chapter in seconds
 257                         * "title" (optional, string)
 258
 259     The following fields should only be used when the video belongs to some logical
 260     chapter or section:
 261
 262     chapter:        Name or title of the chapter the video belongs to.
 263     chapter_number: Number of the chapter the video belongs to, as an integer.
 264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 265
 266     The following fields should only be used when the video is an episode of some
 267     series, programme or podcast:
 268
 269     series:         Title of the series or programme the video episode belongs to.
 270     season:         Title of the season the video episode belongs to.
 271     season_number:  Number of the season the video episode belongs to, as an integer.
 272     season_id:      Id of the season the video episode belongs to, as a unicode string.
 273     episode:        Title of the video episode. Unlike mandatory video title field,
 274                     this field should denote the exact title of the video episode
 275                     without any kind of decoration.
 276     episode_number: Number of the video episode within a season, as an integer.
 277     episode_id:     Id of the video episode, as a unicode string.
 278
 279     The following fields should only be used when the media is a track or a part of
 280     a music album:
 281
 282     track:          Title of the track.
 283     track_number:   Number of the track within an album or a disc, as an integer.
 284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 285                     as a unicode string.
 286     artist:         Artist(s) of the track.
 287     genre:          Genre(s) of the track.
 288     album:          Title of the album the track belongs to.
 289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 290     album_artist:   List of all artists appeared on the album (e.g.
 291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 292                     and compilations).
 293     disc_number:    Number of the disc or other physical medium the track belongs to,
 294                     as an integer.
 295     release_year:   Year (YYYY) when the album was released.
 296
 297     Unless mentioned otherwise, the fields should be Unicode strings.
 298
 299     Unless mentioned otherwise, None is equivalent to absence of information.
 300
 301
 302     _type "playlist" indicates multiple videos.
 303     There must be a key "entries", which is a list, an iterable, or a PagedList
 304     object, each element of which is a valid dictionary by this specification.
 305
 306     Additionally, playlists can have "id", "title", "description", "uploader",
 307     "uploader_id", "uploader_url" attributes with the same semantics as videos
 308     (see above).
 309
 310
 311     _type "multi_video" indicates that there are multiple videos that
 312     form a single show, for example multiple acts of an opera or TV episode.
 313     It must have an entries key like a playlist and contain all the keys
 314     required for a video at the same time.
 315
 316
 317     _type "url" indicates that the video must be extracted from another
 318     location, possibly by a different extractor. Its only required key is:
 319     "url" - the next URL to extract.
 320     The key "ie_key" can be set to the class name (minus the trailing "IE",
 321     e.g. "Youtube") if the extractor class is known in advance.
 322     Additionally, the dictionary may have any properties of the resolved entity
 323     known in advance, for example "title" if the title of the referred video is
 324     known ahead of time.
 325
 326
 327     _type "url_transparent" entities have the same specification as "url", but
 328     indicate that the given additional information is more precise than the one
 329     associated with the resolved URL.
 330     This is useful when a site employs a video service that hosts the video and
 331     its technical metadata, but that video service does not embed a useful
 332     title, description etc.
 333
 334
 335     Subclasses of this one should re-define the _real_initialize() and
 336     _real_extract() methods and define a _VALID_URL regexp.
 337     Probably, they should also be added to the list of extractors.
 338
 339     _GEO_BYPASS attribute may be set to False in order to disable
 340     geo restriction bypass mechanisms for a particular extractor.
 341     Though it won't disable explicit geo restriction bypass based on
 342     country code provided with geo_bypass_country.
 343
 344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 345     countries for this extractor. One of these countries will be used by
 346     geo restriction bypass mechanism right away in order to bypass
 347     geo restriction, of course, if the mechanism is not disabled.
 348
 349     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted
 350     IP blocks in CIDR notation for this extractor. One of these IP blocks
 351     will be used by geo restriction bypass mechanism similarly
 352     to _GEO_COUNTRIES.
 353
 354     Finally, the _WORKING attribute should be set to False for broken IEs
 355     in order to warn the users and skip the tests.
 356     """
 357
 358     _ready = False
 359     _downloader = None
 360     _x_forwarded_for_ip = None
 361     _GEO_BYPASS = True
 362     _GEO_COUNTRIES = None
 363     _GEO_IP_BLOCKS = None
 364     _WORKING = True
 365
 366     def __init__(self, downloader=None):
 367         """Constructor. Receives an optional downloader."""
 368         self._ready = False
 369         self._x_forwarded_for_ip = None
 370         self.set_downloader(downloader)
 371
 372     @classmethod
 373     def suitable(cls, url):
 374         """Receives a URL and returns True if suitable for this IE."""
 375
 376         # This does not use has/getattr intentionally - we want to know whether
 377         # we have cached the regexp for *this* class, whereas getattr would also
 378         # match the superclass
 379         if '_VALID_URL_RE' not in cls.__dict__:
 380             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 381         return cls._VALID_URL_RE.match(url) is not None
 382
 383     @classmethod
 384     def _match_id(cls, url):
 385         if '_VALID_URL_RE' not in cls.__dict__:
 386             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 387         m = cls._VALID_URL_RE.match(url)
 388         assert m
 389         return compat_str(m.group('id'))
 390
 391     @classmethod
 392     def working(cls):
 393         """Getter method for _WORKING."""
 394         return cls._WORKING
 395
 396     def initialize(self):
 397         """Initializes an instance (authentication, etc)."""
 398         self._initialize_geo_bypass({
 399             'countries': self._GEO_COUNTRIES,
 400             'ip_blocks': self._GEO_IP_BLOCKS,
 401         })
 402         if not self._ready:
 403             self._real_initialize()
 404             self._ready = True
 405
 406     def _initialize_geo_bypass(self, geo_bypass_context):
 407         """
 408         Initialize geo restriction bypass mechanism.
 409
 410         This method is used to initialize geo bypass mechanism based on faking
 411         X-Forwarded-For HTTP header. A random country from provided country list
 412         is selected and a random IP belonging to this country is generated. This
 413         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 414         HTTP requests.
 415
 416         This method will be used for initial geo bypass mechanism initialization
 417         during the instance initialization with _GEO_COUNTRIES and
 418         _GEO_IP_BLOCKS.
 419
 420         You may also manually call it from extractor's code if geo bypass
 421         information is not available beforehand (e.g. obtained during
 422         extraction) or due to some other reason. In this case you should pass
 423         this information in geo bypass context passed as first argument. It may
 424         contain following fields:
 425
 426         countries:  List of geo unrestricted countries (similar
 427                     to _GEO_COUNTRIES)
 428         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation
 429                     (similar to _GEO_IP_BLOCKS)
 430
 431         """
 432         if not self._x_forwarded_for_ip:
 433
 434             # Geo bypass mechanism is explicitly disabled by user
 435             if not self._downloader.params.get('geo_bypass', True):
 436                 return
 437
 438             if not geo_bypass_context:
 439                 geo_bypass_context = {}
 440
 441             # Backward compatibility: previously _initialize_geo_bypass
 442             # expected a list of countries, some 3rd party code may still use
 443             # it this way
 444             if isinstance(geo_bypass_context, (list, tuple)):
 445                 geo_bypass_context = {
 446                     'countries': geo_bypass_context,
 447                 }
 448
 449             # The whole point of geo bypass mechanism is to fake IP
 450             # as X-Forwarded-For HTTP header based on some IP block or
 451             # country code.
 452
 453             # Path 1: bypassing based on IP block in CIDR notation
 454
 455             # Explicit IP block specified by user, use it right away
 456             # regardless of whether extractor is geo bypassable or not
 457             ip_block = self._downloader.params.get('geo_bypass_ip_block', None)
 458
 459             # Otherwise use random IP block from geo bypass context but only
 460             # if extractor is known as geo bypassable
 461             if not ip_block:
 462                 ip_blocks = geo_bypass_context.get('ip_blocks')
 463                 if self._GEO_BYPASS and ip_blocks:
 464                     ip_block = random.choice(ip_blocks)
 465
 466             if ip_block:
 467                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(ip_block)
 468                 if self._downloader.params.get('verbose', False):
 469                     self._downloader.to_screen(
 470                         '[debug] Using fake IP %s as X-Forwarded-For.'
 471                         % self._x_forwarded_for_ip)
 472                 return
 473
 474             # Path 2: bypassing based on country code
 475
 476             # Explicit country code specified by user, use it right away
 477             # regardless of whether extractor is geo bypassable or not
 478             country = self._downloader.params.get('geo_bypass_country', None)
 479
 480             # Otherwise use random country code from geo bypass context but
 481             # only if extractor is known as geo bypassable
 482             if not country:
 483                 countries = geo_bypass_context.get('countries')
 484                 if self._GEO_BYPASS and countries:
 485                     country = random.choice(countries)
 486
 487             if country:
 488                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country)
 489                 if self._downloader.params.get('verbose', False):
 490                     self._downloader.to_screen(
 491                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 492                         % (self._x_forwarded_for_ip, country.upper()))
 493
 494     def extract(self, url):
 495         """Extracts URL information and returns it in list of dicts."""
 496         try:
 497             for _ in range(2):
 498                 try:
 499                     self.initialize()
 500                     ie_result = self._real_extract(url)
 501                     if self._x_forwarded_for_ip:
 502                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 503                     return ie_result
 504                 except GeoRestrictedError as e:
 505                     if self.__maybe_fake_ip_and_retry(e.countries):
 506                         continue
 507                     raise
 508         except ExtractorError:
 509             raise
 510         except compat_http_client.IncompleteRead as e:
 511             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 512         except (KeyError, StopIteration) as e:
 513             raise ExtractorError('An extractor error has occurred.', cause=e)
 514
 515     def __maybe_fake_ip_and_retry(self, countries):
 516         if (not self._downloader.params.get('geo_bypass_country', None) and
 517                 self._GEO_BYPASS and
 518                 self._downloader.params.get('geo_bypass', True) and
 519                 not self._x_forwarded_for_ip and
 520                 countries):
 521             country_code = random.choice(countries)
 522             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 523             if self._x_forwarded_for_ip:
 524                 self.report_warning(
 525                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 526                     % (self._x_forwarded_for_ip, country_code.upper()))
 527                 return True
 528         return False
 529
 530     def set_downloader(self, downloader):
 531         """Sets the downloader for this IE."""
 532         self._downloader = downloader
 533
 534     def _real_initialize(self):
 535         """Real initialization process. Redefine in subclasses."""
 536         pass
 537
 538     def _real_extract(self, url):
 539         """Real extraction process. Redefine in subclasses."""
 540         pass
 541
 542     @classmethod
 543     def ie_key(cls):
 544         """A string for getting the InfoExtractor with get_info_extractor"""
 545         return compat_str(cls.__name__[:-2])
 546
 547     @property
 548     def IE_NAME(self):
 549         return compat_str(type(self).__name__[:-2])
 550
 551     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 552         """ Returns the response handle """
 553         if note is None:
 554             self.report_download_webpage(video_id)
 555         elif note is not False:
 556             if video_id is None:
 557                 self.to_screen('%s' % (note,))
 558             else:
 559                 self.to_screen('%s: %s' % (video_id, note))
 560
 561         # Some sites check X-Forwarded-For HTTP header in order to figure out
 562         # the origin of the client behind proxy. This allows bypassing geo
 563         # restriction by faking this header's value to IP that belongs to some
 564         # geo unrestricted country. We will do so once we encounter any
 565         # geo restriction error.
 566         if self._x_forwarded_for_ip:
 567             if 'X-Forwarded-For' not in headers:
 568                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 569
 570         if isinstance(url_or_request, compat_urllib_request.Request):
 571             url_or_request = update_Request(
 572                 url_or_request, data=data, headers=headers, query=query)
 573         else:
 574             if query:
 575                 url_or_request = update_url_query(url_or_request, query)
 576             if data is not None or headers:
 577                 url_or_request = sanitized_Request(url_or_request, data, headers)
 578         try:
 579             return self._downloader.urlopen(url_or_request)
 580         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 581             if errnote is False:
 582                 return False
 583             if errnote is None:
 584                 errnote = 'Unable to download webpage'
 585
 586             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 587             if fatal:
 588                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 589             else:
 590                 self._downloader.report_warning(errmsg)
 591                 return False
 592
 593     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 594         """ Returns a tuple (page content as string, URL handle) """
 595         # Strip hashes from the URL (#1038)
 596         if isinstance(url_or_request, (compat_str, str)):
 597             url_or_request = url_or_request.partition('#')[0]
 598
 599         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 600         if urlh is False:
 601             assert not fatal
 602             return False
 603         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 604         return (content, urlh)
 605
 606     @staticmethod
 607     def _guess_encoding_from_content(content_type, webpage_bytes):
 608         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 609         if m:
 610             encoding = m.group(1)
 611         else:
 612             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 613                           webpage_bytes[:1024])
 614             if m:
 615                 encoding = m.group(1).decode('ascii')
 616             elif webpage_bytes.startswith(b'\xff\xfe'):
 617                 encoding = 'utf-16'
 618             else:
 619                 encoding = 'utf-8'
 620
 621         return encoding
 622
 623     def __check_blocked(self, content):
 624         first_block = content[:512]
 625         if ('<title>Access to this site is blocked</title>' in content and
 626                 'Websense' in first_block):
 627             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 628             blocked_iframe = self._html_search_regex(
 629                 r'<iframe src="([^"]+)"', content,
 630                 'Websense information URL', default=None)
 631             if blocked_iframe:
 632                 msg += ' Visit %s for more details' % blocked_iframe
 633             raise ExtractorError(msg, expected=True)
 634         if '<title>The URL you requested has been blocked</title>' in first_block:
 635             msg = (
 636                 'Access to this webpage has been blocked by Indian censorship. '
 637                 'Use a VPN or proxy server (with --proxy) to route around it.')
 638             block_msg = self._html_search_regex(
 639                 r'</h1><p>(.*?)</p>',
 640                 content, 'block message', default=None)
 641             if block_msg:
 642                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 643             raise ExtractorError(msg, expected=True)
 644         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 645                 'blocklist.rkn.gov.ru' in content):
 646             raise ExtractorError(
 647                 'Access to this webpage has been blocked by decision of the Russian government. '
 648                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 649                 expected=True)
 650
 651     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 652         content_type = urlh.headers.get('Content-Type', '')
 653         webpage_bytes = urlh.read()
 654         if prefix is not None:
 655             webpage_bytes = prefix + webpage_bytes
 656         if not encoding:
 657             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 658         if self._downloader.params.get('dump_intermediate_pages', False):
 659             self.to_screen('Dumping request to ' + urlh.geturl())
 660             dump = base64.b64encode(webpage_bytes).decode('ascii')
 661             self._downloader.to_screen(dump)
 662         if self._downloader.params.get('write_pages', False):
 663             basen = '%s_%s' % (video_id, urlh.geturl())
 664             if len(basen) > 240:
 665                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 666                 basen = basen[:240 - len(h)] + h
 667             raw_filename = basen + '.dump'
 668             filename = sanitize_filename(raw_filename, restricted=True)
 669             self.to_screen('Saving request to ' + filename)
 670             # Working around MAX_PATH limitation on Windows (see
 671             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 672             if compat_os_name == 'nt':
 673                 absfilepath = os.path.abspath(filename)
 674                 if len(absfilepath) > 259:
 675                     filename = '\\\\?\\' + absfilepath
 676             with open(filename, 'wb') as outf:
 677                 outf.write(webpage_bytes)
 678
 679         try:
 680             content = webpage_bytes.decode(encoding, 'replace')
 681         except LookupError:
 682             content = webpage_bytes.decode('utf-8', 'replace')
 683
 684         self.__check_blocked(content)
 685
 686         return content
 687
 688     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 689         """ Returns the data of the page as a string """
 690         success = False
 691         try_count = 0
 692         while success is False:
 693             try:
 694                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 695                 success = True
 696             except compat_http_client.IncompleteRead as e:
 697                 try_count += 1
 698                 if try_count >= tries:
 699                     raise e
 700                 self._sleep(timeout, video_id)
 701         if res is False:
 702             return res
 703         else:
 704             content, _ = res
 705             return content
 706
 707     def _download_xml_handle(
 708             self, url_or_request, video_id, note='Downloading XML',
 709             errnote='Unable to download XML', transform_source=None,
 710             fatal=True, encoding=None, data=None, headers={}, query={}):
 711         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
 712         res = self._download_webpage_handle(
 713             url_or_request, video_id, note, errnote, fatal=fatal,
 714             encoding=encoding, data=data, headers=headers, query=query)
 715         if res is False:
 716             return res
 717         xml_string, urlh = res
 718         return self._parse_xml(
 719             xml_string, video_id, transform_source=transform_source,
 720             fatal=fatal), urlh
 721
 722     def _download_xml(self, url_or_request, video_id,
 723                       note='Downloading XML', errnote='Unable to download XML',
 724                       transform_source=None, fatal=True, encoding=None,
 725                       data=None, headers={}, query={}):
 726         """Return the xml as an xml.etree.ElementTree.Element"""
 727         res = self._download_xml_handle(
 728             url_or_request, video_id, note=note, errnote=errnote,
 729             transform_source=transform_source, fatal=fatal, encoding=encoding,
 730             data=data, headers=headers, query=query)
 731         return res if res is False else res[0]
 732
 733     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 734         if transform_source:
 735             xml_string = transform_source(xml_string)
 736         try:
 737             return compat_etree_fromstring(xml_string.encode('utf-8'))
 738         except compat_xml_parse_error as ve:
 739             errmsg = '%s: Failed to parse XML ' % video_id
 740             if fatal:
 741                 raise ExtractorError(errmsg, cause=ve)
 742             else:
 743                 self.report_warning(errmsg + str(ve))
 744
 745     def _download_json_handle(
 746             self, url_or_request, video_id, note='Downloading JSON metadata',
 747             errnote='Unable to download JSON metadata', transform_source=None,
 748             fatal=True, encoding=None, data=None, headers={}, query={}):
 749         """Return a tuple (JSON object, URL handle)"""
 750         res = self._download_webpage_handle(
 751             url_or_request, video_id, note, errnote, fatal=fatal,
 752             encoding=encoding, data=data, headers=headers, query=query)
 753         if res is False:
 754             return res
 755         json_string, urlh = res
 756         return self._parse_json(
 757             json_string, video_id, transform_source=transform_source,
 758             fatal=fatal), urlh
 759
 760     def _download_json(
 761             self, url_or_request, video_id, note='Downloading JSON metadata',
 762             errnote='Unable to download JSON metadata', transform_source=None,
 763             fatal=True, encoding=None, data=None, headers={}, query={}):
 764         res = self._download_json_handle(
 765             url_or_request, video_id, note=note, errnote=errnote,
 766             transform_source=transform_source, fatal=fatal, encoding=encoding,
 767             data=data, headers=headers, query=query)
 768         return res if res is False else res[0]
 769
 770     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 771         if transform_source:
 772             json_string = transform_source(json_string)
 773         try:
 774             return json.loads(json_string)
 775         except ValueError as ve:
 776             errmsg = '%s: Failed to parse JSON ' % video_id
 777             if fatal:
 778                 raise ExtractorError(errmsg, cause=ve)
 779             else:
 780                 self.report_warning(errmsg + str(ve))
 781
 782     def report_warning(self, msg, video_id=None):
 783         idstr = '' if video_id is None else '%s: ' % video_id
 784         self._downloader.report_warning(
 785             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 786
 787     def to_screen(self, msg):
 788         """Print msg to screen, prefixing it with '[ie_name]'"""
 789         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 790
 791     def report_extraction(self, id_or_name):
 792         """Report information extraction."""
 793         self.to_screen('%s: Extracting information' % id_or_name)
 794
 795     def report_download_webpage(self, video_id):
 796         """Report webpage download."""
 797         self.to_screen('%s: Downloading webpage' % video_id)
 798
 799     def report_age_confirmation(self):
 800         """Report attempt to confirm age."""
 801         self.to_screen('Confirming age')
 802
 803     def report_login(self):
 804         """Report attempt to log in."""
 805         self.to_screen('Logging in')
 806
 807     @staticmethod
 808     def raise_login_required(msg='This video is only available for registered users'):
 809         raise ExtractorError(
 810             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 811             expected=True)
 812
 813     @staticmethod
 814     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 815         raise GeoRestrictedError(msg, countries=countries)
 816
 817     # Methods for following #608
 818     @staticmethod
 819     def url_result(url, ie=None, video_id=None, video_title=None):
 820         """Returns a URL that points to a page that should be processed"""
 821         # TODO: ie should be the class used for getting the info
 822         video_info = {'_type': 'url',
 823                       'url': url,
 824                       'ie_key': ie}
 825         if video_id is not None:
 826             video_info['id'] = video_id
 827         if video_title is not None:
 828             video_info['title'] = video_title
 829         return video_info
 830
 831     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 832         urls = orderedSet(
 833             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 834             for m in matches)
 835         return self.playlist_result(
 836             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 837
 838     @staticmethod
 839     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 840         """Returns a playlist"""
 841         video_info = {'_type': 'playlist',
 842                       'entries': entries}
 843         if playlist_id:
 844             video_info['id'] = playlist_id
 845         if playlist_title:
 846             video_info['title'] = playlist_title
 847         if playlist_description:
 848             video_info['description'] = playlist_description
 849         return video_info
 850
 851     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 852         """
 853         Perform a regex search on the given string, using a single or a list of
 854         patterns returning the first matching group.
 855         In case of failure return a default value or raise a WARNING or a
 856         RegexNotFoundError, depending on fatal, specifying the field name.
 857         """
 858         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 859             mobj = re.search(pattern, string, flags)
 860         else:
 861             for p in pattern:
 862                 mobj = re.search(p, string, flags)
 863                 if mobj:
 864                     break
 865
 866         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 867             _name = '\033[0;34m%s\033[0m' % name
 868         else:
 869             _name = name
 870
 871         if mobj:
 872             if group is None:
 873                 # return the first matching group
 874                 return next(g for g in mobj.groups() if g is not None)
 875             else:
 876                 return mobj.group(group)
 877         elif default is not NO_DEFAULT:
 878             return default
 879         elif fatal:
 880             raise RegexNotFoundError('Unable to extract %s' % _name)
 881         else:
 882             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 883             return None
 884
 885     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 886         """
 887         Like _search_regex, but strips HTML tags and unescapes entities.
 888         """
 889         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 890         if res:
 891             return clean_html(res).strip()
 892         else:
 893             return res
 894
 895     def _get_netrc_login_info(self, netrc_machine=None):
 896         username = None
 897         password = None
 898         netrc_machine = netrc_machine or self._NETRC_MACHINE
 899
 900         if self._downloader.params.get('usenetrc', False):
 901             try:
 902                 info = netrc.netrc().authenticators(netrc_machine)
 903                 if info is not None:
 904                     username = info[0]
 905                     password = info[2]
 906                 else:
 907                     raise netrc.NetrcParseError(
 908                         'No authenticators for %s' % netrc_machine)
 909             except (IOError, netrc.NetrcParseError) as err:
 910                 self._downloader.report_warning(
 911                     'parsing .netrc: %s' % error_to_compat_str(err))
 912
 913         return username, password
 914
 915     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 916         """
 917         Get the login info as (username, password)
 918         First look for the manually specified credentials using username_option
 919         and password_option as keys in params dictionary. If no such credentials
 920         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 921         value.
 922         If there's no info available, return (None, None)
 923         """
 924         if self._downloader is None:
 925             return (None, None)
 926
 927         downloader_params = self._downloader.params
 928
 929         # Attempt to use provided username and password or .netrc data
 930         if downloader_params.get(username_option) is not None:
 931             username = downloader_params[username_option]
 932             password = downloader_params[password_option]
 933         else:
 934             username, password = self._get_netrc_login_info(netrc_machine)
 935
 936         return username, password
 937
 938     def _get_tfa_info(self, note='two-factor verification code'):
 939         """
 940         Get the two-factor authentication info
 941         TODO - asking the user will be required for sms/phone verify
 942         currently just uses the command line option
 943         If there's no info available, return None
 944         """
 945         if self._downloader is None:
 946             return None
 947         downloader_params = self._downloader.params
 948
 949         if downloader_params.get('twofactor') is not None:
 950             return downloader_params['twofactor']
 951
 952         return compat_getpass('Type %s and press [Return]: ' % note)
 953
 954     # Helper functions for extracting OpenGraph info
 955     @staticmethod
 956     def _og_regexes(prop):
 957         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 958         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 959                        % {'prop': re.escape(prop)})
 960         template = r'<meta[^>]+?%s[^>]+?%s'
 961         return [
 962             template % (property_re, content_re),
 963             template % (content_re, property_re),
 964         ]
 965
 966     @staticmethod
 967     def _meta_regex(prop):
 968         return r'''(?isx)<meta
 969                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 970                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 971
 972     def _og_search_property(self, prop, html, name=None, **kargs):
 973         if not isinstance(prop, (list, tuple)):
 974             prop = [prop]
 975         if name is None:
 976             name = 'OpenGraph %s' % prop[0]
 977         og_regexes = []
 978         for p in prop:
 979             og_regexes.extend(self._og_regexes(p))
 980         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 981         if escaped is None:
 982             return None
 983         return unescapeHTML(escaped)
 984
 985     def _og_search_thumbnail(self, html, **kargs):
 986         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 987
 988     def _og_search_description(self, html, **kargs):
 989         return self._og_search_property('description', html, fatal=False, **kargs)
 990
 991     def _og_search_title(self, html, **kargs):
 992         return self._og_search_property('title', html, **kargs)
 993
 994     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 995         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 996         if secure:
 997             regexes = self._og_regexes('video:secure_url') + regexes
 998         return self._html_search_regex(regexes, html, name, **kargs)
 999
1000     def _og_search_url(self, html, **kargs):
1001         return self._og_search_property('url', html, **kargs)
1002
1003     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
1004         if not isinstance(name, (list, tuple)):
1005             name = [name]
1006         if display_name is None:
1007             display_name = name[0]
1008         return self._html_search_regex(
1009             [self._meta_regex(n) for n in name],
1010             html, display_name, fatal=fatal, group='content', **kwargs)
1011
1012     def _dc_search_uploader(self, html):
1013         return self._html_search_meta('dc.creator', html, 'uploader')
1014
1015     def _rta_search(self, html):
1016         # See http://www.rtalabel.org/index.php?content=howtofaq#single
1017         if re.search(r'(?ix)<meta\s+name="rating"\s+'
1018                      r'     content="RTA-5042-1996-1400-1577-RTA"',
1019                      html):
1020             return 18
1021         return 0
1022
1023     def _media_rating_search(self, html):
1024         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
1025         rating = self._html_search_meta('rating', html)
1026
1027         if not rating:
1028             return None
1029
1030         RATING_TABLE = {
1031             'safe for kids': 0,
1032             'general': 8,
1033             '14 years': 14,
1034             'mature': 17,
1035             'restricted': 19,
1036         }
1037         return RATING_TABLE.get(rating.lower())
1038
1039     def _family_friendly_search(self, html):
1040         # See http://schema.org/VideoObject
1041         family_friendly = self._html_search_meta(
1042             'isFamilyFriendly', html, default=None)
1043
1044         if not family_friendly:
1045             return None
1046
1047         RATING_TABLE = {
1048             '1': 0,
1049             'true': 0,
1050             '0': 18,
1051             'false': 18,
1052         }
1053         return RATING_TABLE.get(family_friendly.lower())
1054
1055     def _twitter_search_player(self, html):
1056         return self._html_search_meta('twitter:player', html,
1057                                       'twitter card player')
1058
1059     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
1060         json_ld = self._search_regex(
1061             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
1062             html, 'JSON-LD', group='json_ld', **kwargs)
1063         default = kwargs.get('default', NO_DEFAULT)
1064         if not json_ld:
1065             return default if default is not NO_DEFAULT else {}
1066         # JSON-LD may be malformed and thus `fatal` should be respected.
1067         # At the same time `default` may be passed that assumes `fatal=False`
1068         # for _search_regex. Let's simulate the same behavior here as well.
1069         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
1070         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
1071
1072     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1073         if isinstance(json_ld, compat_str):
1074             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1075         if not json_ld:
1076             return {}
1077         info = {}
1078         if not isinstance(json_ld, (list, tuple, dict)):
1079             return info
1080         if isinstance(json_ld, dict):
1081             json_ld = [json_ld]
1082
1083         INTERACTION_TYPE_MAP = {
1084             'CommentAction': 'comment',
1085             'AgreeAction': 'like',
1086             'DisagreeAction': 'dislike',
1087             'LikeAction': 'like',
1088             'DislikeAction': 'dislike',
1089             'ListenAction': 'view',
1090             'WatchAction': 'view',
1091             'ViewAction': 'view',
1092         }
1093
1094         def extract_interaction_statistic(e):
1095             interaction_statistic = e.get('interactionStatistic')
1096             if not isinstance(interaction_statistic, list):
1097                 return
1098             for is_e in interaction_statistic:
1099                 if not isinstance(is_e, dict):
1100                     continue
1101                 if is_e.get('@type') != 'InteractionCounter':
1102                     continue
1103                 interaction_type = is_e.get('interactionType')
1104                 if not isinstance(interaction_type, compat_str):
1105                     continue
1106                 interaction_count = int_or_none(is_e.get('userInteractionCount'))
1107                 if interaction_count is None:
1108                     continue
1109                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1])
1110                 if not count_kind:
1111                     continue
1112                 count_key = '%s_count' % count_kind
1113                 if info.get(count_key) is not None:
1114                     continue
1115                 info[count_key] = interaction_count
1116
1117         def extract_video_object(e):
1118             assert e['@type'] == 'VideoObject'
1119             info.update({
1120                 'url': e.get('contentUrl'),
1121                 'title': unescapeHTML(e.get('name')),
1122                 'description': unescapeHTML(e.get('description')),
1123                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1124                 'duration': parse_duration(e.get('duration')),
1125                 'timestamp': unified_timestamp(e.get('uploadDate')),
1126                 'filesize': float_or_none(e.get('contentSize')),
1127                 'tbr': int_or_none(e.get('bitrate')),
1128                 'width': int_or_none(e.get('width')),
1129                 'height': int_or_none(e.get('height')),
1130                 'view_count': int_or_none(e.get('interactionCount')),
1131             })
1132             extract_interaction_statistic(e)
1133
1134         for e in json_ld:
1135             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1136                 item_type = e.get('@type')
1137                 if expected_type is not None and expected_type != item_type:
1138                     return info
1139                 if item_type in ('TVEpisode', 'Episode'):
1140                     info.update({
1141                         'episode': unescapeHTML(e.get('name')),
1142                         'episode_number': int_or_none(e.get('episodeNumber')),
1143                         'description': unescapeHTML(e.get('description')),
1144                     })
1145                     part_of_season = e.get('partOfSeason')
1146                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1147                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1148                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1149                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1150                         info['series'] = unescapeHTML(part_of_series.get('name'))
1151                 elif item_type in ('Article', 'NewsArticle'):
1152                     info.update({
1153                         'timestamp': parse_iso8601(e.get('datePublished')),
1154                         'title': unescapeHTML(e.get('headline')),
1155                         'description': unescapeHTML(e.get('articleBody')),
1156                     })
1157                 elif item_type == 'VideoObject':
1158                     extract_video_object(e)
1159                     continue
1160                 video = e.get('video')
1161                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1162                     extract_video_object(video)
1163                 break
1164         return dict((k, v) for k, v in info.items() if v is not None)
1165
1166     @staticmethod
1167     def _hidden_inputs(html):
1168         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1169         hidden_inputs = {}
1170         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1171             attrs = extract_attributes(input)
1172             if not input:
1173                 continue
1174             if attrs.get('type') not in ('hidden', 'submit'):
1175                 continue
1176             name = attrs.get('name') or attrs.get('id')
1177             value = attrs.get('value')
1178             if name and value is not None:
1179                 hidden_inputs[name] = value
1180         return hidden_inputs
1181
1182     def _form_hidden_inputs(self, form_id, html):
1183         form = self._search_regex(
1184             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1185             html, '%s form' % form_id, group='form')
1186         return self._hidden_inputs(form)
1187
    def _sort_formats(self, formats, field_preference=None):
        """
        Sort the list of format dicts in place, in ascending order of the
        key computed by _formats_key.

        When field_preference is a list or tuple of field names the key is
        the tuple of these fields' values, a missing value counting as -1
        ('' for 'format_id'). Otherwise the key is a fixed tuple built from
        preference, language preference, quality, bitrates, filesize,
        dimensions, protocol, extension, fps, source preference and
        format_id.

        As a side effect a missing 'tbr' is filled in from 'abr' + 'vbr' and
        a missing 'ext' is determined from 'url'.

        Raises ExtractorError if formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering: compare the requested fields only
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) sorts after every other protocol, rtsp before all
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # The position of the extension in ORDER is used as its
            # preference (a higher index sorts later), -1 when not listed
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuples compare element by element, so earlier entries take
            # precedence; missing numeric values count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1263
1264     def _check_formats(self, formats, video_id):
1265         if formats:
1266             formats[:] = filter(
1267                 lambda f: self._is_valid_url(
1268                     f['url'], video_id,
1269                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1270                 formats)
1271
1272     @staticmethod
1273     def _remove_duplicate_formats(formats):
1274         format_urls = set()
1275         unique_formats = []
1276         for f in formats:
1277             if f['url'] not in format_urls:
1278                 format_urls.add(f['url'])
1279                 unique_formats.append(f)
1280         formats[:] = unique_formats
1281
1282     def _is_valid_url(self, url, video_id, item='video', headers={}):
1283         url = self._proto_relative_url(url, scheme='http:')
1284         # For now assume non HTTP(S) URLs always valid
1285         if not (url.startswith('http://') or url.startswith('https://')):
1286             return True
1287         try:
1288             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1289             return True
1290         except ExtractorError as e:
1291             if isinstance(e.cause, compat_urllib_error.URLError):
1292                 self.to_screen(
1293                     '%s: %s URL is invalid, skipping' % (video_id, item))
1294                 return False
1295             raise
1296
1297     def http_scheme(self):
1298         """ Either "http:" or "https:", depending on the user's preferences """
1299         return (
1300             'http:'
1301             if self._downloader.params.get('prefer_insecure', False)
1302             else 'https:')
1303
1304     def _proto_relative_url(self, url, scheme=None):
1305         if url is None:
1306             return url
1307         if url.startswith('//'):
1308             if scheme is None:
1309                 scheme = self.http_scheme()
1310             return scheme + url
1311         else:
1312             return url
1313
1314     def _sleep(self, timeout, video_id, msg_template=None):
1315         if msg_template is None:
1316             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1317         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1318         self.to_screen(msg)
1319         time.sleep(timeout)
1320
1321     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1322                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1323                              fatal=True, m3u8_id=None):
1324         manifest = self._download_xml(
1325             manifest_url, video_id, 'Downloading f4m manifest',
1326             'Unable to download f4m manifest',
1327             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1328             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1329             transform_source=transform_source,
1330             fatal=fatal)
1331
1332         if manifest is False:
1333             return []
1334
1335         return self._parse_f4m_formats(
1336             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1337             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1338
1339     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1340                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1341                            fatal=True, m3u8_id=None):
1342         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1343         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1344         if akamai_pv is not None and ';' in akamai_pv.text:
1345             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1346             if playerVerificationChallenge.strip() != '':
1347                 return []
1348
1349         formats = []
1350         manifest_version = '1.0'
1351         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1352         if not media_nodes:
1353             manifest_version = '2.0'
1354             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1355         # Remove unsupported DRM protected media from final formats
1356         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1357         media_nodes = remove_encrypted_media(media_nodes)
1358         if not media_nodes:
1359             return formats
1360
1361         manifest_base_url = get_base_url(manifest)
1362
1363         bootstrap_info = xpath_element(
1364             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1365             'bootstrap info', default=None)
1366
1367         vcodec = None
1368         mime_type = xpath_text(
1369             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1370             'base URL', default=None)
1371         if mime_type and mime_type.startswith('audio/'):
1372             vcodec = 'none'
1373
1374         for i, media_el in enumerate(media_nodes):
1375             tbr = int_or_none(media_el.attrib.get('bitrate'))
1376             width = int_or_none(media_el.attrib.get('width'))
1377             height = int_or_none(media_el.attrib.get('height'))
1378             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1379             # If <bootstrapInfo> is present, the specified f4m is a
1380             # stream-level manifest, and only set-level manifests may refer to
1381             # external resources.  See section 11.4 and section 4 of F4M spec
1382             if bootstrap_info is None:
1383                 media_url = None
1384                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1385                 if manifest_version == '2.0':
1386                     media_url = media_el.attrib.get('href')
1387                 if media_url is None:
1388                     media_url = media_el.attrib.get('url')
1389                 if not media_url:
1390                     continue
1391                 manifest_url = (
1392                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1393                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1394                 # If media_url is itself a f4m manifest do the recursive extraction
1395                 # since bitrates in parent manifest (this one) and media_url manifest
1396                 # may differ leading to inability to resolve the format by requested
1397                 # bitrate in f4m downloader
1398                 ext = determine_ext(manifest_url)
1399                 if ext == 'f4m':
1400                     f4m_formats = self._extract_f4m_formats(
1401                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1402                         transform_source=transform_source, fatal=fatal)
1403                     # Sometimes stream-level manifest contains single media entry that
1404                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1405                     # At the same time parent's media entry in set-level manifest may
1406                     # contain it. We will copy it from parent in such cases.
1407                     if len(f4m_formats) == 1:
1408                         f = f4m_formats[0]
1409                         f.update({
1410                             'tbr': f.get('tbr') or tbr,
1411                             'width': f.get('width') or width,
1412                             'height': f.get('height') or height,
1413                             'format_id': f.get('format_id') if not tbr else format_id,
1414                             'vcodec': vcodec,
1415                         })
1416                     formats.extend(f4m_formats)
1417                     continue
1418                 elif ext == 'm3u8':
1419                     formats.extend(self._extract_m3u8_formats(
1420                         manifest_url, video_id, 'mp4', preference=preference,
1421                         m3u8_id=m3u8_id, fatal=fatal))
1422                     continue
1423             formats.append({
1424                 'format_id': format_id,
1425                 'url': manifest_url,
1426                 'manifest_url': manifest_url,
1427                 'ext': 'flv' if bootstrap_info is not None else None,
1428                 'protocol': 'f4m',
1429                 'tbr': tbr,
1430                 'width': width,
1431                 'height': height,
1432                 'vcodec': vcodec,
1433                 'preference': preference,
1434             })
1435         return formats
1436
1437     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1438         return {
1439             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1440             'url': m3u8_url,
1441             'ext': ext,
1442             'protocol': 'm3u8',
1443             'preference': preference - 100 if preference else -100,
1444             'resolution': 'multiple',
1445             'format_note': 'Quality selection URL',
1446         }
1447
1448     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1449                               entry_protocol='m3u8', preference=None,
1450                               m3u8_id=None, note=None, errnote=None,
1451                               fatal=True, live=False):
1452         res = self._download_webpage_handle(
1453             m3u8_url, video_id,
1454             note=note or 'Downloading m3u8 information',
1455             errnote=errnote or 'Failed to download m3u8 information',
1456             fatal=fatal)
1457
1458         if res is False:
1459             return []
1460
1461         m3u8_doc, urlh = res
1462         m3u8_url = urlh.geturl()
1463
1464         return self._parse_m3u8_formats(
1465             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1466             preference=preference, m3u8_id=m3u8_id, live=live)
1467
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse the text of an m3u8 playlist into a list of format dicts.

        m3u8_doc is the playlist content and m3u8_url the URL it came from
        (used to resolve relative entries and stored as 'manifest_url').

        * Playlists carrying Adobe Flash Access or Apple FairPlay markers
          yield an empty list.
        * A media playlist (one containing #EXT-X-TARGETDURATION) yields a
          single format pointing at m3u8_url itself.
        * Otherwise the playlist is treated as a master playlist: VIDEO and
          AUDIO #EXT-X-MEDIA renditions that have a URI, and every variant
          stream URI following an #EXT-X-STREAM-INF tag, become formats.

        ext, entry_protocol and preference are copied into every format.
        m3u8_id prefixes the generated format ids; when live is true no
        stream name or bitrate is appended to the ids of variant streams.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playlist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed #EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent #EXT-X-STREAM-INF tag; reset after
        # the stream URI that follows it has been consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            # Only renditions with their own URI become standalone formats
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag, it may still sometimes be present
            # (see [1] or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                continue
            else:
                # Any other non-empty line is taken as a stream URI described
                # by the preceding EXT-X-STREAM-INF tag (if any)
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1626
1627     @staticmethod
1628     def _xpath_ns(path, namespace=None):
1629         if not namespace:
1630             return path
1631         out = []
1632         for c in path.split('/'):
1633             if not c or c == '.':
1634                 out.append(c)
1635             else:
1636                 out.append('{%s}%s' % (namespace, c))
1637         return '/'.join(out)
1638
1639     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1640         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1641
1642         if smil is False:
1643             assert not fatal
1644             return []
1645
1646         namespace = self._parse_smil_namespace(smil)
1647
1648         return self._parse_smil_formats(
1649             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1650
1651     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1652         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1653         if smil is False:
1654             return {}
1655         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1656
1657     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1658         return self._download_xml(
1659             smil_url, video_id, 'Downloading SMIL file',
1660             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1661
1662     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1663         namespace = self._parse_smil_namespace(smil)
1664
1665         formats = self._parse_smil_formats(
1666             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1667         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1668
1669         video_id = os.path.splitext(url_basename(smil_url))[0]
1670         title = None
1671         description = None
1672         upload_date = None
1673         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1674             name = meta.attrib.get('name')
1675             content = meta.attrib.get('content')
1676             if not name or not content:
1677                 continue
1678             if not title and name == 'title':
1679                 title = content
1680             elif not description and name in ('description', 'abstract'):
1681                 description = content
1682             elif not upload_date and name == 'date':
1683                 upload_date = unified_strdate(content)
1684
1685         thumbnails = [{
1686             'id': image.get('type'),
1687             'url': image.get('src'),
1688             'width': int_or_none(image.get('width')),
1689             'height': int_or_none(image.get('height')),
1690         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1691
1692         return {
1693             'id': video_id,
1694             'title': title or video_id,
1695             'description': description,
1696             'upload_date': upload_date,
1697             'thumbnails': thumbnails,
1698             'formats': formats,
1699             'subtitles': subtitles,
1700         }
1701
1702     def _parse_smil_namespace(self, smil):
1703         return self._search_regex(
1704             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1705
1706     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1707         base = smil_url
1708         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1709             b = meta.get('base') or meta.get('httpBase')
1710             if b:
1711                 base = b
1712                 break
1713
1714         formats = []
1715         rtmp_count = 0
1716         http_count = 0
1717         m3u8_count = 0
1718
1719         srcs = []
1720         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1721         for medium in media:
1722             src = medium.get('src')
1723             if not src or src in srcs:
1724                 continue
1725             srcs.append(src)
1726
1727             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1728             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1729             width = int_or_none(medium.get('width'))
1730             height = int_or_none(medium.get('height'))
1731             proto = medium.get('proto')
1732             ext = medium.get('ext')
1733             src_ext = determine_ext(src)
1734             streamer = medium.get('streamer') or base
1735
1736             if proto == 'rtmp' or streamer.startswith('rtmp'):
1737                 rtmp_count += 1
1738                 formats.append({
1739                     'url': streamer,
1740                     'play_path': src,
1741                     'ext': 'flv',
1742                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1743                     'tbr': bitrate,
1744                     'filesize': filesize,
1745                     'width': width,
1746                     'height': height,
1747                 })
1748                 if transform_rtmp_url:
1749                     streamer, src = transform_rtmp_url(streamer, src)
1750                     formats[-1].update({
1751                         'url': streamer,
1752                         'play_path': src,
1753                     })
1754                 continue
1755
1756             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1757             src_url = src_url.strip()
1758
1759             if proto == 'm3u8' or src_ext == 'm3u8':
1760                 m3u8_formats = self._extract_m3u8_formats(
1761                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1762                 if len(m3u8_formats) == 1:
1763                     m3u8_count += 1
1764                     m3u8_formats[0].update({
1765                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1766                         'tbr': bitrate,
1767                         'width': width,
1768                         'height': height,
1769                     })
1770                 formats.extend(m3u8_formats)
1771                 continue
1772
1773             if src_ext == 'f4m':
1774                 f4m_url = src_url
1775                 if not f4m_params:
1776                     f4m_params = {
1777                         'hdcore': '3.2.0',
1778                         'plugin': 'flowplayer-3.2.0.1',
1779                     }
1780                 f4m_url += '&' if '?' in f4m_url else '?'
1781                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1782                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1783                 continue
1784
1785             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1786                 http_count += 1
1787                 formats.append({
1788                     'url': src_url,
1789                     'ext': ext or src_ext or 'flv',
1790                     'format_id': 'http-%d' % (bitrate or http_count),
1791                     'tbr': bitrate,
1792                     'filesize': filesize,
1793                     'width': width,
1794                     'height': height,
1795                 })
1796                 continue
1797
1798         return formats
1799
1800     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1801         urls = []
1802         subtitles = {}
1803         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1804             src = textstream.get('src')
1805             if not src or src in urls:
1806                 continue
1807             urls.append(src)
1808             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1809             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1810             subtitles.setdefault(lang, []).append({
1811                 'url': src,
1812                 'ext': ext,
1813             })
1814         return subtitles
1815
1816     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1817         xspf = self._download_xml(
1818             xspf_url, playlist_id, 'Downloading xpsf playlist',
1819             'Unable to download xspf manifest', fatal=fatal)
1820         if xspf is False:
1821             return []
1822         return self._parse_xspf(
1823             xspf, playlist_id, xspf_url=xspf_url,
1824             xspf_base_url=base_url(xspf_url))
1825
1826     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1827         NS_MAP = {
1828             'xspf': 'http://xspf.org/ns/0/',
1829             's1': 'http://static.streamone.nl/player/ns/0',
1830         }
1831
1832         entries = []
1833         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1834             title = xpath_text(
1835                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1836             description = xpath_text(
1837                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1838             thumbnail = xpath_text(
1839                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1840             duration = float_or_none(
1841                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1842
1843             formats = []
1844             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1845                 format_url = urljoin(xspf_base_url, location.text)
1846                 if not format_url:
1847                     continue
1848                 formats.append({
1849                     'url': format_url,
1850                     'manifest_url': xspf_url,
1851                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1852                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1853                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1854                 })
1855             self._sort_formats(formats)
1856
1857             entries.append({
1858                 'id': playlist_id,
1859                 'title': title,
1860                 'description': description,
1861                 'thumbnail': thumbnail,
1862                 'duration': duration,
1863                 'formats': formats,
1864             })
1865         return entries
1866
1867     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1868         res = self._download_xml_handle(
1869             mpd_url, video_id,
1870             note=note or 'Downloading MPD manifest',
1871             errnote=errnote or 'Failed to download MPD manifest',
1872             fatal=fatal)
1873         if res is False:
1874             return []
1875         mpd_doc, urlh = res
1876         mpd_base_url = base_url(urlh.geturl())
1877
1878         return self._parse_mpd_formats(
1879             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1880             formats_dict=formats_dict, mpd_url=mpd_url)
1881
1882     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1883         """
1884         Parse formats from MPD manifest.
1885         References:
1886          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1887             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1888          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1889         """
1890         if mpd_doc.get('type') == 'dynamic':
1891             return []
1892
1893         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1894
1895         def _add_ns(path):
1896             return self._xpath_ns(path, namespace)
1897
1898         def is_drm_protected(element):
1899             return element.find(_add_ns('ContentProtection')) is not None
1900
1901         def extract_multisegment_info(element, ms_parent_info):
1902             ms_info = ms_parent_info.copy()
1903
1904             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1905             # common attributes and elements.  We will only extract relevant
1906             # for us.
1907             def extract_common(source):
1908                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1909                 if segment_timeline is not None:
1910                     s_e = segment_timeline.findall(_add_ns('S'))
1911                     if s_e:
1912                         ms_info['total_number'] = 0
1913                         ms_info['s'] = []
1914                         for s in s_e:
1915                             r = int(s.get('r', 0))
1916                             ms_info['total_number'] += 1 + r
1917                             ms_info['s'].append({
1918                                 't': int(s.get('t', 0)),
1919                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1920                                 'd': int(s.attrib['d']),
1921                                 'r': r,
1922                             })
1923                 start_number = source.get('startNumber')
1924                 if start_number:
1925                     ms_info['start_number'] = int(start_number)
1926                 timescale = source.get('timescale')
1927                 if timescale:
1928                     ms_info['timescale'] = int(timescale)
1929                 segment_duration = source.get('duration')
1930                 if segment_duration:
1931                     ms_info['segment_duration'] = float(segment_duration)
1932
1933             def extract_Initialization(source):
1934                 initialization = source.find(_add_ns('Initialization'))
1935                 if initialization is not None:
1936                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1937
1938             segment_list = element.find(_add_ns('SegmentList'))
1939             if segment_list is not None:
1940                 extract_common(segment_list)
1941                 extract_Initialization(segment_list)
1942                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1943                 if segment_urls_e:
1944                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1945             else:
1946                 segment_template = element.find(_add_ns('SegmentTemplate'))
1947                 if segment_template is not None:
1948                     extract_common(segment_template)
1949                     media = segment_template.get('media')
1950                     if media:
1951                         ms_info['media'] = media
1952                     initialization = segment_template.get('initialization')
1953                     if initialization:
1954                         ms_info['initialization'] = initialization
1955                     else:
1956                         extract_Initialization(segment_template)
1957             return ms_info
1958
1959         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1960         formats = []
1961         for period in mpd_doc.findall(_add_ns('Period')):
1962             period_duration = parse_duration(period.get('duration')) or mpd_duration
1963             period_ms_info = extract_multisegment_info(period, {
1964                 'start_number': 1,
1965                 'timescale': 1,
1966             })
1967             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1968                 if is_drm_protected(adaptation_set):
1969                     continue
1970                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1971                 for representation in adaptation_set.findall(_add_ns('Representation')):
1972                     if is_drm_protected(representation):
1973                         continue
1974                     representation_attrib = adaptation_set.attrib.copy()
1975                     representation_attrib.update(representation.attrib)
1976                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1977                     mime_type = representation_attrib['mimeType']
1978                     content_type = mime_type.split('/')[0]
1979                     if content_type == 'text':
1980                         # TODO implement WebVTT downloading
1981                         pass
1982                     elif content_type in ('video', 'audio'):
1983                         base_url = ''
1984                         for element in (representation, adaptation_set, period, mpd_doc):
1985                             base_url_e = element.find(_add_ns('BaseURL'))
1986                             if base_url_e is not None:
1987                                 base_url = base_url_e.text + base_url
1988                                 if re.match(r'^https?://', base_url):
1989                                     break
1990                         if mpd_base_url and not re.match(r'^https?://', base_url):
1991                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1992                                 mpd_base_url += '/'
1993                             base_url = mpd_base_url + base_url
1994                         representation_id = representation_attrib.get('id')
1995                         lang = representation_attrib.get('lang')
1996                         url_el = representation.find(_add_ns('BaseURL'))
1997                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1998                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1999                         f = {
2000                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
2001                             'url': base_url,
2002                             'manifest_url': mpd_url,
2003                             'ext': mimetype2ext(mime_type),
2004                             'width': int_or_none(representation_attrib.get('width')),
2005                             'height': int_or_none(representation_attrib.get('height')),
2006                             'tbr': float_or_none(bandwidth, 1000),
2007                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
2008                             'fps': int_or_none(representation_attrib.get('frameRate')),
2009                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
2010                             'format_note': 'DASH %s' % content_type,
2011                             'filesize': filesize,
2012                             'container': mimetype2ext(mime_type) + '_dash',
2013                         }
2014                         f.update(parse_codecs(representation_attrib.get('codecs')))
2015                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
2016
2017                         def prepare_template(template_name, identifiers):
2018                             t = representation_ms_info[template_name]
2019                             t = t.replace('$RepresentationID$', representation_id)
2020                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
2021                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
2022                             t.replace('$$', '$')
2023                             return t
2024
2025                         # @initialization is a regular template like @media one
2026                         # so it should be handled just the same way (see
2027                         # https://github.com/rg3/youtube-dl/issues/11605)
2028                         if 'initialization' in representation_ms_info:
2029                             initialization_template = prepare_template(
2030                                 'initialization',
2031                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
2032                                 # $Time$ shall not be included for @initialization thus
2033                                 # only $Bandwidth$ remains
2034                                 ('Bandwidth', ))
2035                             representation_ms_info['initialization_url'] = initialization_template % {
2036                                 'Bandwidth': bandwidth,
2037                             }
2038
2039                         def location_key(location):
2040                             return 'url' if re.match(r'^https?://', location) else 'path'
2041
2042                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
2043
2044                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
2045                             media_location_key = location_key(media_template)
2046
2047                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
2048                             # can't be used at the same time
2049                             if '%(Number' in media_template and 's' not in representation_ms_info:
2050                                 segment_duration = None
2051                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
2052                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
2053                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
2054                                 representation_ms_info['fragments'] = [{
2055                                     media_location_key: media_template % {
2056                                         'Number': segment_number,
2057                                         'Bandwidth': bandwidth,
2058                                     },
2059                                     'duration': segment_duration,
2060                                 } for segment_number in range(
2061                                     representation_ms_info['start_number'],
2062                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
2063                             else:
2064                                 # $Number*$ or $Time$ in media template with S list available
2065                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
2066                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
2067                                 representation_ms_info['fragments'] = []
2068                                 segment_time = 0
2069                                 segment_d = None
2070                                 segment_number = representation_ms_info['start_number']
2071
2072                                 def add_segment_url():
2073                                     segment_url = media_template % {
2074                                         'Time': segment_time,
2075                                         'Bandwidth': bandwidth,
2076                                         'Number': segment_number,
2077                                     }
2078                                     representation_ms_info['fragments'].append({
2079                                         media_location_key: segment_url,
2080                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
2081                                     })
2082
2083                                 for num, s in enumerate(representation_ms_info['s']):
2084                                     segment_time = s.get('t') or segment_time
2085                                     segment_d = s['d']
2086                                     add_segment_url()
2087                                     segment_number += 1
2088                                     for r in range(s.get('r', 0)):
2089                                         segment_time += segment_d
2090                                         add_segment_url()
2091                                         segment_number += 1
2092                                     segment_time += segment_d
2093                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
2094                             # No media template
2095                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
2096                             # or any YouTube dashsegments video
2097                             fragments = []
2098                             segment_index = 0
2099                             timescale = representation_ms_info['timescale']
2100                             for s in representation_ms_info['s']:
2101                                 duration = float_or_none(s['d'], timescale)
2102                                 for r in range(s.get('r', 0) + 1):
2103                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
2104                                     fragments.append({
2105                                         location_key(segment_uri): segment_uri,
2106                                         'duration': duration,
2107                                     })
2108                                     segment_index += 1
2109                             representation_ms_info['fragments'] = fragments
2110                         elif 'segment_urls' in representation_ms_info:
2111                             # Segment URLs with no SegmentTimeline
2112                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2113                             # https://github.com/rg3/youtube-dl/pull/14844
2114                             fragments = []
2115                             segment_duration = float_or_none(
2116                                 representation_ms_info['segment_duration'],
2117                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2118                             for segment_url in representation_ms_info['segment_urls']:
2119                                 fragment = {
2120                                     location_key(segment_url): segment_url,
2121                                 }
2122                                 if segment_duration:
2123                                     fragment['duration'] = segment_duration
2124                                 fragments.append(fragment)
2125                             representation_ms_info['fragments'] = fragments
2126                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2127                         # No fragments key is present in this case.
2128                         if 'fragments' in representation_ms_info:
2129                             f.update({
2130                                 'fragment_base_url': base_url,
2131                                 'fragments': [],
2132                                 'protocol': 'http_dash_segments',
2133                             })
2134                             if 'initialization_url' in representation_ms_info:
2135                                 initialization_url = representation_ms_info['initialization_url']
2136                                 if not f.get('url'):
2137                                     f['url'] = initialization_url
2138                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2139                             f['fragments'].extend(representation_ms_info['fragments'])
2140                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2141                         # is not necessarily unique within a Period thus formats with
2142                         # the same `format_id` are quite possible. There are numerous examples
2143                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2144                         # https://github.com/rg3/youtube-dl/issues/13919)
2145                         full_info = formats_dict.get(representation_id, {}).copy()
2146                         full_info.update(f)
2147                         formats.append(full_info)
2148                     else:
2149                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2150         return formats
2151
2152     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2153         res = self._download_xml_handle(
2154             ism_url, video_id,
2155             note=note or 'Downloading ISM manifest',
2156             errnote=errnote or 'Failed to download ISM manifest',
2157             fatal=fatal)
2158         if res is False:
2159             return []
2160         ism_doc, urlh = res
2161
2162         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2163
2164     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2165         """
2166         Parse formats from ISM manifest.
2167         References:
2168          1. [MS-SSTR]: Smooth Streaming Protocol,
2169             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2170         """
2171         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2172             return []
2173
2174         duration = int(ism_doc.attrib['Duration'])
2175         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2176
2177         formats = []
2178         for stream in ism_doc.findall('StreamIndex'):
2179             stream_type = stream.get('Type')
2180             if stream_type not in ('video', 'audio'):
2181                 continue
2182             url_pattern = stream.attrib['Url']
2183             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2184             stream_name = stream.get('Name')
2185             for track in stream.findall('QualityLevel'):
2186                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2187                 # TODO: add support for WVC1 and WMAP
2188                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2189                     self.report_warning('%s is not a supported codec' % fourcc)
2190                     continue
2191                 tbr = int(track.attrib['Bitrate']) // 1000
2192                 # [1] does not mention Width and Height attributes. However,
2193                 # they're often present while MaxWidth and MaxHeight are
2194                 # missing, so should be used as fallbacks
2195                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2196                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2197                 sampling_rate = int_or_none(track.get('SamplingRate'))
2198
2199                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2200                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2201
2202                 fragments = []
2203                 fragment_ctx = {
2204                     'time': 0,
2205                 }
2206                 stream_fragments = stream.findall('c')
2207                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2208                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2209                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2210                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2211                     if not fragment_ctx['duration']:
2212                         try:
2213                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2214                         except IndexError:
2215                             next_fragment_time = duration
2216                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2217                     for _ in range(fragment_repeat):
2218                         fragments.append({
2219                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2220                             'duration': fragment_ctx['duration'] / stream_timescale,
2221                         })
2222                         fragment_ctx['time'] += fragment_ctx['duration']
2223
2224                 format_id = []
2225                 if ism_id:
2226                     format_id.append(ism_id)
2227                 if stream_name:
2228                     format_id.append(stream_name)
2229                 format_id.append(compat_str(tbr))
2230
2231                 formats.append({
2232                     'format_id': '-'.join(format_id),
2233                     'url': ism_url,
2234                     'manifest_url': ism_url,
2235                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2236                     'width': width,
2237                     'height': height,
2238                     'tbr': tbr,
2239                     'asr': sampling_rate,
2240                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2241                     'acodec': 'none' if stream_type == 'video' else fourcc,
2242                     'protocol': 'ism',
2243                     'fragments': fragments,
2244                     '_download_params': {
2245                         'duration': duration,
2246                         'timescale': stream_timescale,
2247                         'width': width or 0,
2248                         'height': height or 0,
2249                         'fourcc': fourcc,
2250                         'codec_private_data': track.get('CodecPrivateData'),
2251                         'sampling_rate': sampling_rate,
2252                         'channels': int_or_none(track.get('Channels', 2)),
2253                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2254                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2255                     },
2256                 })
2257         return formats
2258
2259     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2260         def absolute_url(item_url):
2261             return urljoin(base_url, item_url)
2262
2263         def parse_content_type(content_type):
2264             if not content_type:
2265                 return {}
2266             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2267             if ctr:
2268                 mimetype, codecs = ctr.groups()
2269                 f = parse_codecs(codecs)
2270                 f['ext'] = mimetype2ext(mimetype)
2271                 return f
2272             return {}
2273
2274         def _media_formats(src, cur_media_type, type_info={}):
2275             full_url = absolute_url(src)
2276             ext = type_info.get('ext') or determine_ext(full_url)
2277             if ext == 'm3u8':
2278                 is_plain_url = False
2279                 formats = self._extract_m3u8_formats(
2280                     full_url, video_id, ext='mp4',
2281                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2282                     preference=preference, fatal=False)
2283             elif ext == 'mpd':
2284                 is_plain_url = False
2285                 formats = self._extract_mpd_formats(
2286                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2287             else:
2288                 is_plain_url = True
2289                 formats = [{
2290                     'url': full_url,
2291                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2292                 }]
2293             return is_plain_url, formats
2294
2295         entries = []
2296         # amp-video and amp-audio are very similar to their HTML5 counterparts
2297         # so we wll include them right here (see
2298         # https://www.ampproject.org/docs/reference/components/amp-video)
2299         media_tags = [(media_tag, media_type, '')
2300                       for media_tag, media_type
2301                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2302         media_tags.extend(re.findall(
2303             # We only allow video|audio followed by a whitespace or '>'.
2304             # Allowing more characters may end up in significant slow down (see
2305             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2306             # http://www.porntrex.com/maps/videositemap.xml).
2307             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2308         for media_tag, media_type, media_content in media_tags:
2309             media_info = {
2310                 'formats': [],
2311                 'subtitles': {},
2312             }
2313             media_attributes = extract_attributes(media_tag)
2314             src = media_attributes.get('src')
2315             if src:
2316                 _, formats = _media_formats(src, media_type)
2317                 media_info['formats'].extend(formats)
2318             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2319             if media_content:
2320                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2321                     source_attributes = extract_attributes(source_tag)
2322                     src = source_attributes.get('src')
2323                     if not src:
2324                         continue
2325                     f = parse_content_type(source_attributes.get('type'))
2326                     is_plain_url, formats = _media_formats(src, media_type, f)
2327                     if is_plain_url:
2328                         # res attribute is not standard but seen several times
2329                         # in the wild
2330                         f.update({
2331                             'height': int_or_none(source_attributes.get('res')),
2332                             'format_id': source_attributes.get('label'),
2333                         })
2334                         f.update(formats[0])
2335                         media_info['formats'].append(f)
2336                     else:
2337                         media_info['formats'].extend(formats)
2338                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2339                     track_attributes = extract_attributes(track_tag)
2340                     kind = track_attributes.get('kind')
2341                     if not kind or kind in ('subtitles', 'captions'):
2342                         src = track_attributes.get('src')
2343                         if not src:
2344                             continue
2345                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2346                         media_info['subtitles'].setdefault(lang, []).append({
2347                             'url': absolute_url(src),
2348                         })
2349             if media_info['formats'] or media_info['subtitles']:
2350                 entries.append(media_info)
2351         return entries
2352
2353     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2354         formats = []
2355         hdcore_sign = 'hdcore=3.7.0'
2356         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2357         hds_host = hosts.get('hds')
2358         if hds_host:
2359             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2360         if 'hdcore=' not in f4m_url:
2361             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2362         f4m_formats = self._extract_f4m_formats(
2363             f4m_url, video_id, f4m_id='hds', fatal=False)
2364         for entry in f4m_formats:
2365             entry.update({'extra_param_to_segment_url': hdcore_sign})
2366         formats.extend(f4m_formats)
2367         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2368         hls_host = hosts.get('hls')
2369         if hls_host:
2370             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2371         formats.extend(self._extract_m3u8_formats(
2372             m3u8_url, video_id, 'mp4', 'm3u8_native',
2373             m3u8_id='hls', fatal=False))
2374         return formats
2375
2376     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2377         query = compat_urlparse.urlparse(url).query
2378         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2379         mobj = re.search(
2380             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2381         url_base = mobj.group('url')
2382         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2383         formats = []
2384
2385         def manifest_url(manifest):
2386             m_url = '%s/%s' % (http_base_url, manifest)
2387             if query:
2388                 m_url += '?%s' % query
2389             return m_url
2390
2391         if 'm3u8' not in skip_protocols:
2392             formats.extend(self._extract_m3u8_formats(
2393                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2394                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2395         if 'f4m' not in skip_protocols:
2396             formats.extend(self._extract_f4m_formats(
2397                 manifest_url('manifest.f4m'),
2398                 video_id, f4m_id='hds', fatal=False))
2399         if 'dash' not in skip_protocols:
2400             formats.extend(self._extract_mpd_formats(
2401                 manifest_url('manifest.mpd'),
2402                 video_id, mpd_id='dash', fatal=False))
2403         if re.search(r'(?:/smil:|\.smil)', url_base):
2404             if 'smil' not in skip_protocols:
2405                 rtmp_formats = self._extract_smil_formats(
2406                     manifest_url('jwplayer.smil'),
2407                     video_id, fatal=False)
2408                 for rtmp_format in rtmp_formats:
2409                     rtsp_format = rtmp_format.copy()
2410                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2411                     del rtsp_format['play_path']
2412                     del rtsp_format['ext']
2413                     rtsp_format.update({
2414                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2415                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2416                         'protocol': 'rtsp',
2417                     })
2418                     formats.extend([rtmp_format, rtsp_format])
2419         else:
2420             for protocol in ('rtmp', 'rtsp'):
2421                 if protocol not in skip_protocols:
2422                     formats.append({
2423                         'url': '%s:%s' % (protocol, url_base),
2424                         'format_id': protocol,
2425                         'protocol': protocol,
2426                     })
2427         return formats
2428
2429     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2430         mobj = re.search(
2431             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2432             webpage)
2433         if mobj:
2434             try:
2435                 jwplayer_data = self._parse_json(mobj.group('options'),
2436                                                  video_id=video_id,
2437                                                  transform_source=transform_source)
2438             except ExtractorError:
2439                 pass
2440             else:
2441                 if isinstance(jwplayer_data, dict):
2442                     return jwplayer_data
2443
2444     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2445         jwplayer_data = self._find_jwplayer_data(
2446             webpage, video_id, transform_source=js_to_json)
2447         return self._parse_jwplayer_data(
2448             jwplayer_data, video_id, *args, **kwargs)
2449
2450     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2451                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2452         # JWPlayer backward compatibility: flattened playlists
2453         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2454         if 'playlist' not in jwplayer_data:
2455             jwplayer_data = {'playlist': [jwplayer_data]}
2456
2457         entries = []
2458
2459         # JWPlayer backward compatibility: single playlist item
2460         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2461         if not isinstance(jwplayer_data['playlist'], list):
2462             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2463
2464         for video_data in jwplayer_data['playlist']:
2465             # JWPlayer backward compatibility: flattened sources
2466             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2467             if 'sources' not in video_data:
2468                 video_data['sources'] = [video_data]
2469
2470             this_video_id = video_id or video_data['mediaid']
2471
2472             formats = self._parse_jwplayer_formats(
2473                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2474                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2475
2476             subtitles = {}
2477             tracks = video_data.get('tracks')
2478             if tracks and isinstance(tracks, list):
2479                 for track in tracks:
2480                     if not isinstance(track, dict):
2481                         continue
2482                     track_kind = track.get('kind')
2483                     if not track_kind or not isinstance(track_kind, compat_str):
2484                         continue
2485                     if track_kind.lower() not in ('captions', 'subtitles'):
2486                         continue
2487                     track_url = urljoin(base_url, track.get('file'))
2488                     if not track_url:
2489                         continue
2490                     subtitles.setdefault(track.get('label') or 'en', []).append({
2491                         'url': self._proto_relative_url(track_url)
2492                     })
2493
2494             entry = {
2495                 'id': this_video_id,
2496                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2497                 'description': video_data.get('description'),
2498                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2499                 'timestamp': int_or_none(video_data.get('pubdate')),
2500                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2501                 'subtitles': subtitles,
2502             }
2503             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2504             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2505                 entry.update({
2506                     '_type': 'url_transparent',
2507                     'url': formats[0]['url'],
2508                 })
2509             else:
2510                 self._sort_formats(formats)
2511                 entry['formats'] = formats
2512             entries.append(entry)
2513         if len(entries) == 1:
2514             return entries[0]
2515         else:
2516             return self.playlist_result(entries)
2517
2518     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2519                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2520         urls = []
2521         formats = []
2522         for source in jwplayer_sources_data:
2523             if not isinstance(source, dict):
2524                 continue
2525             source_url = self._proto_relative_url(source.get('file'))
2526             if not source_url:
2527                 continue
2528             if base_url:
2529                 source_url = compat_urlparse.urljoin(base_url, source_url)
2530             if source_url in urls:
2531                 continue
2532             urls.append(source_url)
2533             source_type = source.get('type') or ''
2534             ext = mimetype2ext(source_type) or determine_ext(source_url)
2535             if source_type == 'hls' or ext == 'm3u8':
2536                 formats.extend(self._extract_m3u8_formats(
2537                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2538                     m3u8_id=m3u8_id, fatal=False))
2539             elif source_type == 'dash' or ext == 'mpd':
2540                 formats.extend(self._extract_mpd_formats(
2541                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2542             elif ext == 'smil':
2543                 formats.extend(self._extract_smil_formats(
2544                     source_url, video_id, fatal=False))
2545             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2546             elif source_type.startswith('audio') or ext in (
2547                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2548                 formats.append({
2549                     'url': source_url,
2550                     'vcodec': 'none',
2551                     'ext': ext,
2552                 })
2553             else:
2554                 height = int_or_none(source.get('height'))
2555                 if height is None:
2556                     # Often no height is provided but there is a label in
2557                     # format like "1080p", "720p SD", or 1080.
2558                     height = int_or_none(self._search_regex(
2559                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2560                         'height', default=None))
2561                 a_format = {
2562                     'url': source_url,
2563                     'width': int_or_none(source.get('width')),
2564                     'height': height,
2565                     'tbr': int_or_none(source.get('bitrate')),
2566                     'ext': ext,
2567                 }
2568                 if source_url.startswith('rtmp'):
2569                     a_format['ext'] = 'flv'
2570                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2571                     # of jwplayer.flash.swf
2572                     rtmp_url_parts = re.split(
2573                         r'((?:mp4|mp3|flv):)', source_url, 1)
2574                     if len(rtmp_url_parts) == 3:
2575                         rtmp_url, prefix, play_path = rtmp_url_parts
2576                         a_format.update({
2577                             'url': rtmp_url,
2578                             'play_path': prefix + play_path,
2579                         })
2580                     if rtmp_params:
2581                         a_format.update(rtmp_params)
2582                 formats.append(a_format)
2583         return formats
2584
2585     def _live_title(self, name):
2586         """ Generate the title for a live video """
2587         now = datetime.datetime.now()
2588         now_str = now.strftime('%Y-%m-%d %H:%M')
2589         return name + ' ' + now_str
2590
2591     def _int(self, v, name, fatal=False, **kwargs):
2592         res = int_or_none(v, **kwargs)
2593         if 'get_attr' in kwargs:
2594             print(getattr(v, kwargs['get_attr']))
2595         if res is None:
2596             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2597             if fatal:
2598                 raise ExtractorError(msg)
2599             else:
2600                 self._downloader.report_warning(msg)
2601         return res
2602
2603     def _float(self, v, name, fatal=False, **kwargs):
2604         res = float_or_none(v, **kwargs)
2605         if res is None:
2606             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2607             if fatal:
2608                 raise ExtractorError(msg)
2609             else:
2610                 self._downloader.report_warning(msg)
2611         return res
2612
2613     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2614                     path='/', secure=False, discard=False, rest={}, **kwargs):
2615         cookie = compat_cookiejar.Cookie(
2616             0, name, value, port, port is not None, domain, True,
2617             domain.startswith('.'), path, True, secure, expire_time,
2618             discard, None, None, rest)
2619         self._downloader.cookiejar.set_cookie(cookie)
2620
2621     def _get_cookies(self, url):
2622         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2623         req = sanitized_Request(url)
2624         self._downloader.cookiejar.add_cookie_header(req)
2625         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2626
2627     def get_testcases(self, include_onlymatching=False):
2628         t = getattr(self, '_TEST', None)
2629         if t:
2630             assert not hasattr(self, '_TESTS'), \
2631                 '%s has _TEST and _TESTS' % type(self).__name__
2632             tests = [t]
2633         else:
2634             tests = getattr(self, '_TESTS', [])
2635         for t in tests:
2636             if not include_onlymatching and t.get('only_matching', False):
2637                 continue
2638             t['name'] = type(self).__name__[:-len('IE')]
2639             yield t
2640
2641     def is_suitable(self, age_limit):
2642         """ Test whether the extractor is generally suitable for the given
2643         age limit (i.e. pornographic sites are not, all others usually are) """
2644
2645         any_restricted = False
2646         for tc in self.get_testcases(include_onlymatching=False):
2647             if tc.get('playlist', []):
2648                 tc = tc['playlist'][0]
2649             is_restricted = age_restricted(
2650                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2651             if not is_restricted:
2652                 return True
2653             any_restricted = any_restricted or is_restricted
2654         return not any_restricted
2655
2656     def extract_subtitles(self, *args, **kwargs):
2657         if (self._downloader.params.get('writesubtitles', False) or
2658                 self._downloader.params.get('listsubtitles')):
2659             return self._get_subtitles(*args, **kwargs)
2660         return {}
2661
2662     def _get_subtitles(self, *args, **kwargs):
2663         raise NotImplementedError('This method must be implemented by subclasses')
2664
2665     @staticmethod
2666     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2667         """ Merge subtitle items for one language. Items with duplicated URLs
2668         will be dropped. """
2669         list1_urls = set([item['url'] for item in subtitle_list1])
2670         ret = list(subtitle_list1)
2671         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2672         return ret
2673
2674     @classmethod
2675     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2676         """ Merge two subtitle dictionaries, language by language. """
2677         ret = dict(subtitle_dict1)
2678         for lang in subtitle_dict2:
2679             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2680         return ret
2681
2682     def extract_automatic_captions(self, *args, **kwargs):
2683         if (self._downloader.params.get('writeautomaticsub', False) or
2684                 self._downloader.params.get('listsubtitles')):
2685             return self._get_automatic_captions(*args, **kwargs)
2686         return {}
2687
2688     def _get_automatic_captions(self, *args, **kwargs):
2689         raise NotImplementedError('This method must be implemented by subclasses')
2690
2691     def mark_watched(self, *args, **kwargs):
2692         if (self._downloader.params.get('mark_watched', False) and
2693                 (self._get_login_info()[0] is not None or
2694                     self._downloader.params.get('cookiefile') is not None)):
2695             self._mark_watched(*args, **kwargs)
2696
2697     def _mark_watched(self, *args, **kwargs):
2698         raise NotImplementedError('This method must be implemented by subclasses')
2699
2700     def geo_verification_headers(self):
2701         headers = {}
2702         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2703         if geo_verification_proxy:
2704             headers['Ytdl-request-proxy'] = geo_verification_proxy
2705         return headers
2706
2707     def _generic_id(self, url):
2708         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2709
2710     def _generic_title(self, url):
2711         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2712
2713
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix asks for one result, a number for that many
    results and 'all' for _MAX_RESULTS results.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regular expression matching this extractor's queries."""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        Raises ExtractorError when query does not match the expected format
        or asks for a non-positive number of results.  A request for more
        than _MAX_RESULTS results is reported as a warning and capped.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """The search key (_SEARCH_KEY) of this extractor."""
        return self._SEARCH_KEY