Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30     compat_xml_parse_error,
  31 )
  32 from ..downloader.f4m import (
  33     get_base_url,
  34     remove_encrypted_media,
  35 )
  36 from ..utils import (
  37     NO_DEFAULT,
  38     age_restricted,
  39     base_url,
  40     bug_reports_message,
  41     clean_html,
  42     compiled_regex_type,
  43     determine_ext,
  44     determine_protocol,
  45     error_to_compat_str,
  46     ExtractorError,
  47     extract_attributes,
  48     fix_xml_ampersands,
  49     float_or_none,
  50     GeoRestrictedError,
  51     GeoUtils,
  52     int_or_none,
  53     js_to_json,
  54     mimetype2ext,
  55     orderedSet,
  56     parse_codecs,
  57     parse_duration,
  58     parse_iso8601,
  59     parse_m3u8_attributes,
  60     RegexNotFoundError,
  61     sanitized_Request,
  62     sanitize_filename,
  63     unescapeHTML,
  64     unified_strdate,
  65     unified_timestamp,
  66     update_Request,
  67     update_url_query,
  68     urljoin,
  69     url_basename,
  70     xpath_element,
  71     xpath_text,
  72     xpath_with_ns,
  73 )
  74
  75
  76 class InfoExtractor(object):
  77     """Information Extractor class.
  78
  79     Information extractors are the classes that, given a URL, extract
  80     information about the video (or videos) the URL refers to. This
  81     information includes the real video URL, the video title, author and
  82     others. The information is stored in a dictionary which is then
  83     passed to the YoutubeDL. The YoutubeDL processes this
  84     information possibly downloading the video to the file system, among
  85     other possible outcomes.
  86
  87     The type field determines the type of the result.
  88     By far the most common value (and the default if _type is missing) is
  89     "video", which indicates a single video.
  90
  91     For a video, the dictionaries must include the following fields:
  92
  93     id:             Video identifier.
  94     title:          Video title, unescaped.
  95
  96     Additionally, it must contain either a formats entry or a url one:
  97
  98     formats:        A list of dictionaries for each format available, ordered
  99                     from worst to best quality.
 100
 101                     Potential fields:
 102                     * url        Mandatory. The URL of the video file
 103                     * manifest_url
 104                                  The URL of the manifest file in case of
 105                                  fragmented media (DASH, hls, hds)
 106                     * ext        Will be calculated from URL if missing
 107                     * format     A human-readable description of the format
 108                                  ("mp4 container with h264/opus").
 109                                  Calculated from the format_id, width, height,
 110                                  and format_note fields if missing.
 111                     * format_id  A short description of the format
 112                                  ("mp4_h264_opus" or "19").
 113                                 Technically optional, but strongly recommended.
 114                     * format_note Additional info about the format
 115                                  ("3D" or "DASH video")
 116                     * width      Width of the video, if known
 117                     * height     Height of the video, if known
 118                     * resolution Textual description of width and height
 119                     * tbr        Average bitrate of audio and video in KBit/s
 120                     * abr        Average audio bitrate in KBit/s
 121                     * acodec     Name of the audio codec in use
 122                     * asr        Audio sampling rate in Hertz
 123                     * vbr        Average video bitrate in KBit/s
 124                     * fps        Frame rate
 125                     * vcodec     Name of the video codec in use
 126                     * container  Name of the container format
 127                     * filesize   The number of bytes, if known in advance
 128                     * filesize_approx  An estimate for the number of bytes
 129                     * player_url SWF Player URL (used for rtmpdump).
 130                     * protocol   The protocol that will be used for the actual
 131                                  download, lower-case.
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 133                                  "m3u8", "m3u8_native" or "http_dash_segments".
 134                     * fragment_base_url
 135                                  Base URL for fragments. Each fragment's path
 136                                  value (if present) will be relative to
 137                                  this URL.
 138                     * fragments  A list of fragments of a fragmented media.
 139                                  Each fragment entry must contain either an url
 140                                  or a path. If an url is present it should be
 141                                  considered by a client. Otherwise both path and
 142                                  fragment_base_url must be present. Here is
 143                                  the list of all potential fields:
 144                                  * "url" - fragment's URL
 145                                  * "path" - fragment's path relative to
 146                                             fragment_base_url
 147                                  * "duration" (optional, int or float)
 148                                  * "filesize" (optional, int)
 149                     * preference Order number of this format. If this field is
 150                                  present and not None, the formats get sorted
 151                                  by this field, regardless of all other values.
 152                                  -1 for default (order by other properties),
 153                                  -2 or smaller for less than default.
 154                                  < -1000 to hide the format (if there is
 155                                     another one which is strictly better)
 156                     * language   Language code, e.g. "de" or "en-US".
 157                     * language_preference  Is this in the language mentioned in
 158                                  the URL?
 159                                  10 if it's what the URL is about,
 160                                  -1 for default (don't know),
 161                                  -10 otherwise, other values reserved for now.
 162                     * quality    Order number of the video quality of this
 163                                  format, irrespective of the file format.
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * source_preference  Order number for this video source
 167                                   (quality takes higher priority)
 168                                  -1 for default (order by other properties),
 169                                  -2 or smaller for less than default.
 170                     * http_headers  A dictionary of additional HTTP headers
 171                                  to add to the request.
 172                     * stretched_ratio  If given and not 1, indicates that the
 173                                  video's pixels are not square.
 174                                  width : height ratio as float.
 175                     * no_resume  The server does not support resuming the
 176                                  (HTTP or RTMP) download. Boolean.
 177                     * downloader_options  A dictionary of downloader options as
 178                                  described in FileDownloader
 179
 180     url:            Final video URL.
 181     ext:            Video filename extension.
 182     format:         The video format, defaults to ext (used for --get-format)
 183     player_url:     SWF Player URL (used for rtmpdump).
 184
 185     The following fields are optional:
 186
 187     alt_title:      A secondary title of the video.
 188     display_id      An alternative identifier for the video, not necessarily
 189                     unique, but available before title. Typically, id is
 190                     something like "4234987", title "Dancing naked mole rats",
 191                     and display_id "dancing-naked-mole-rats"
 192     thumbnails:     A list of dictionaries, with the following entries:
 193                         * "id" (optional, string) - Thumbnail format ID
 194                         * "url"
 195                         * "preference" (optional, int) - quality of the image
 196                         * "width" (optional, int)
 197                         * "height" (optional, int)
 198                         * "resolution" (optional, string "{width}x{height}",
 199                                         deprecated)
 200                         * "filesize" (optional, int)
 201     thumbnail:      Full URL to a video thumbnail image.
 202     description:    Full video description.
 203     uploader:       Full name of the video uploader.
 204     license:        License name the video is licensed under.
 205     creator:        The creator of the video.
 206     release_date:   The date (YYYYMMDD) when the video was released.
 207     timestamp:      UNIX timestamp of the moment the video became available.
 208     upload_date:    Video upload date (YYYYMMDD).
 209                     If not explicitly set, calculated from timestamp.
 210     uploader_id:    Nickname or id of the video uploader.
 211     uploader_url:   Full URL to a personal webpage of the video uploader.
 212     location:       Physical location where the video was filmed.
 213     subtitles:      The available subtitles as a dictionary in the format
 214                     {tag: subformats}. "tag" is usually a language code, and
 215                     "subformats" is a list sorted from lower to higher
 216                     preference, each element is a dictionary with the "ext"
 217                     entry and one of:
 218                         * "data": The subtitles file contents
 219                         * "url": A URL pointing to the subtitles file
 220                     "ext" will be calculated from URL if missing
 221     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 222                     automatically generated captions
 223     duration:       Length of the video in seconds, as an integer or float.
 224     view_count:     How many users have watched the video on the platform.
 225     like_count:     Number of positive ratings of the video
 226     dislike_count:  Number of negative ratings of the video
 227     repost_count:   Number of reposts of the video
 228     average_rating: Average rating given by users, the scale used depends on the webpage
 229     comment_count:  Number of comments on the video
 230     comments:       A list of comments, each with one or more of the following
 231                     properties (all but one of text or html optional):
 232                         * "author" - human-readable name of the comment author
 233                         * "author_id" - user ID of the comment author
 234                         * "id" - Comment ID
 235                         * "html" - Comment as HTML
 236                         * "text" - Plain text of the comment
 237                         * "timestamp" - UNIX timestamp of comment
 238                         * "parent" - ID of the comment this one is replying to.
 239                                      Set to "root" to indicate that this is a
 240                                      comment to the original video.
 241     age_limit:      Age restriction for the video, as an integer (years)
 242     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 243                     should allow to get the same result again. (It will be set
 244                     by YoutubeDL if it's missing)
 245     categories:     A list of categories that the video falls in, for example
 246                     ["Sports", "Berlin"]
 247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 248     is_live:        True, False, or None (=unknown). Whether this video is a
 249                     live stream that goes on instead of a fixed-length video.
 250     start_time:     Time in seconds where the reproduction should start, as
 251                     specified in the URL.
 252     end_time:       Time in seconds where the reproduction should end, as
 253                     specified in the URL.
 254     chapters:       A list of dictionaries, with the following entries:
 255                         * "start_time" - The start time of the chapter in seconds
 256                         * "end_time" - The end time of the chapter in seconds
 257                         * "title" (optional, string)
 258
 259     The following fields should only be used when the video belongs to some logical
 260     chapter or section:
 261
 262     chapter:        Name or title of the chapter the video belongs to.
 263     chapter_number: Number of the chapter the video belongs to, as an integer.
 264     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 265
 266     The following fields should only be used when the video is an episode of some
 267     series, programme or podcast:
 268
 269     series:         Title of the series or programme the video episode belongs to.
 270     season:         Title of the season the video episode belongs to.
 271     season_number:  Number of the season the video episode belongs to, as an integer.
 272     season_id:      Id of the season the video episode belongs to, as a unicode string.
 273     episode:        Title of the video episode. Unlike mandatory video title field,
 274                     this field should denote the exact title of the video episode
 275                     without any kind of decoration.
 276     episode_number: Number of the video episode within a season, as an integer.
 277     episode_id:     Id of the video episode, as a unicode string.
 278
 279     The following fields should only be used when the media is a track or a part of
 280     a music album:
 281
 282     track:          Title of the track.
 283     track_number:   Number of the track within an album or a disc, as an integer.
 284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 285                     as a unicode string.
 286     artist:         Artist(s) of the track.
 287     genre:          Genre(s) of the track.
 288     album:          Title of the album the track belongs to.
 289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 290     album_artist:   List of all artists appeared on the album (e.g.
 291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 292                     and compilations).
 293     disc_number:    Number of the disc or other physical medium the track belongs to,
 294                     as an integer.
 295     release_year:   Year (YYYY) when the album was released.
 296
 297     Unless mentioned otherwise, the fields should be Unicode strings.
 298
 299     Unless mentioned otherwise, None is equivalent to absence of information.
 300
 301
 302     _type "playlist" indicates multiple videos.
 303     There must be a key "entries", which is a list, an iterable, or a PagedList
 304     object, each element of which is a valid dictionary by this specification.
 305
 306     Additionally, playlists can have "id", "title", "description", "uploader",
 307     "uploader_id", "uploader_url" attributes with the same semantics as videos
 308     (see above).
 309
 310
 311     _type "multi_video" indicates that there are multiple videos that
 312     form a single show, for example multiple acts of an opera or TV episode.
 313     It must have an entries key like a playlist and contain all the keys
 314     required for a video at the same time.
 315
 316
 317     _type "url" indicates that the video must be extracted from another
 318     location, possibly by a different extractor. Its only required key is:
 319     "url" - the next URL to extract.
 320     The key "ie_key" can be set to the class name (minus the trailing "IE",
 321     e.g. "Youtube") if the extractor class is known in advance.
 322     Additionally, the dictionary may have any properties of the resolved entity
 323     known in advance, for example "title" if the title of the referred video is
 324     known ahead of time.
 325
 326
 327     _type "url_transparent" entities have the same specification as "url", but
 328     indicate that the given additional information is more precise than the one
 329     associated with the resolved URL.
 330     This is useful when a site employs a video service that hosts the video and
 331     its technical metadata, but that video service does not embed a useful
 332     title, description etc.
 333
 334
 335     Subclasses of this one should re-define the _real_initialize() and
 336     _real_extract() methods and define a _VALID_URL regexp.
 337     Probably, they should also be added to the list of extractors.
 338
 339     _GEO_BYPASS attribute may be set to False in order to disable
 340     geo restriction bypass mechanisms for a particular extractor.
 341     Though it won't disable explicit geo restriction bypass based on
 342     country code provided with geo_bypass_country. (experimental)
 343
 344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 345     countries for this extractor. One of these countries will be used by
 346     geo restriction bypass mechanism right away in order to bypass
 347     geo restriction, of course, if the mechanism is not disabled. (experimental)
 348
 349     NB: both these geo attributes are experimental and may change in future
 350     or be completely removed.
 351
 352     Finally, the _WORKING attribute should be set to False for broken IEs
 353     in order to warn the users and skip the tests.
 354     """
 355
    # Becomes True in initialize() once _real_initialize() has run.
    _ready = False
    # Downloader this IE is attached to; assigned by set_downloader().
    _downloader = None
    # Fake IP sent as the X-Forwarded-For header by _request_webpage() when
    # geo restriction bypass is in effect; None while no fake IP is in use.
    _x_forwarded_for_ip = None
    # The three attributes below are described at the end of the class
    # docstring: geo bypass switch, presumably unrestricted countries, and
    # the "this IE is not broken" flag returned by working().
    _GEO_BYPASS = True
    _GEO_COUNTRIES = None
    _WORKING = True
 362
 363     def __init__(self, downloader=None):
 364         """Constructor. Receives an optional downloader."""
 365         self._ready = False
 366         self._x_forwarded_for_ip = None
 367         self.set_downloader(downloader)
 368
 369     @classmethod
 370     def suitable(cls, url):
 371         """Receives a URL and returns True if suitable for this IE."""
 372
 373         # This does not use has/getattr intentionally - we want to know whether
 374         # we have cached the regexp for *this* class, whereas getattr would also
 375         # match the superclass
 376         if '_VALID_URL_RE' not in cls.__dict__:
 377             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 378         return cls._VALID_URL_RE.match(url) is not None
 379
 380     @classmethod
 381     def _match_id(cls, url):
 382         if '_VALID_URL_RE' not in cls.__dict__:
 383             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 384         m = cls._VALID_URL_RE.match(url)
 385         assert m
 386         return compat_str(m.group('id'))
 387
 388     @classmethod
 389     def working(cls):
 390         """Getter method for _WORKING."""
 391         return cls._WORKING
 392
 393     def initialize(self):
 394         """Initializes an instance (authentication, etc)."""
 395         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 396         if not self._ready:
 397             self._real_initialize()
 398             self._ready = True
 399
 400     def _initialize_geo_bypass(self, countries):
 401         """
 402         Initialize geo restriction bypass mechanism.
 403
 404         This method is used to initialize geo bypass mechanism based on faking
 405         X-Forwarded-For HTTP header. A random country from provided country list
 406         is selected and a random IP belonging to this country is generated. This
 407         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 408         HTTP requests.
 409
 410         This method will be used for initial geo bypass mechanism initialization
 411         during the instance initialization with _GEO_COUNTRIES.
 412
 413         You may also manually call it from extractor's code if geo countries
 414         information is not available beforehand (e.g. obtained during
 415         extraction) or due to some another reason.
 416         """
 417         if not self._x_forwarded_for_ip:
 418             country_code = self._downloader.params.get('geo_bypass_country', None)
 419             # If there is no explicit country for geo bypass specified and
 420             # the extractor is known to be geo restricted let's fake IP
 421             # as X-Forwarded-For right away.
 422             if (not country_code and
 423                     self._GEO_BYPASS and
 424                     self._downloader.params.get('geo_bypass', True) and
 425                     countries):
 426                 country_code = random.choice(countries)
 427             if country_code:
 428                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 429                 if self._downloader.params.get('verbose', False):
 430                     self._downloader.to_screen(
 431                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 432                         % (self._x_forwarded_for_ip, country_code.upper()))
 433
 434     def extract(self, url):
 435         """Extracts URL information and returns it in list of dicts."""
 436         try:
 437             for _ in range(2):
 438                 try:
 439                     self.initialize()
 440                     ie_result = self._real_extract(url)
 441                     if self._x_forwarded_for_ip:
 442                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 443                     return ie_result
 444                 except GeoRestrictedError as e:
 445                     if self.__maybe_fake_ip_and_retry(e.countries):
 446                         continue
 447                     raise
 448         except ExtractorError:
 449             raise
 450         except compat_http_client.IncompleteRead as e:
 451             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 452         except (KeyError, StopIteration) as e:
 453             raise ExtractorError('An extractor error has occurred.', cause=e)
 454
 455     def __maybe_fake_ip_and_retry(self, countries):
 456         if (not self._downloader.params.get('geo_bypass_country', None) and
 457                 self._GEO_BYPASS and
 458                 self._downloader.params.get('geo_bypass', True) and
 459                 not self._x_forwarded_for_ip and
 460                 countries):
 461             country_code = random.choice(countries)
 462             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 463             if self._x_forwarded_for_ip:
 464                 self.report_warning(
 465                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 466                     % (self._x_forwarded_for_ip, country_code.upper()))
 467                 return True
 468         return False
 469
 470     def set_downloader(self, downloader):
 471         """Sets the downloader for this IE."""
 472         self._downloader = downloader
 473
 474     def _real_initialize(self):
 475         """Real initialization process. Redefine in subclasses."""
 476         pass
 477
 478     def _real_extract(self, url):
 479         """Real extraction process. Redefine in subclasses."""
 480         pass
 481
 482     @classmethod
 483     def ie_key(cls):
 484         """A string for getting the InfoExtractor with get_info_extractor"""
 485         return compat_str(cls.__name__[:-2])
 486
 487     @property
 488     def IE_NAME(self):
 489         return compat_str(type(self).__name__[:-2])
 490
 491     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 492         """ Returns the response handle """
 493         if note is None:
 494             self.report_download_webpage(video_id)
 495         elif note is not False:
 496             if video_id is None:
 497                 self.to_screen('%s' % (note,))
 498             else:
 499                 self.to_screen('%s: %s' % (video_id, note))
 500
 501         # Some sites check X-Forwarded-For HTTP header in order to figure out
 502         # the origin of the client behind proxy. This allows bypassing geo
 503         # restriction by faking this header's value to IP that belongs to some
 504         # geo unrestricted country. We will do so once we encounter any
 505         # geo restriction error.
 506         if self._x_forwarded_for_ip:
 507             if 'X-Forwarded-For' not in headers:
 508                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 509
 510         if isinstance(url_or_request, compat_urllib_request.Request):
 511             url_or_request = update_Request(
 512                 url_or_request, data=data, headers=headers, query=query)
 513         else:
 514             if query:
 515                 url_or_request = update_url_query(url_or_request, query)
 516             if data is not None or headers:
 517                 url_or_request = sanitized_Request(url_or_request, data, headers)
 518         try:
 519             return self._downloader.urlopen(url_or_request)
 520         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 521             if errnote is False:
 522                 return False
 523             if errnote is None:
 524                 errnote = 'Unable to download webpage'
 525
 526             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 527             if fatal:
 528                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 529             else:
 530                 self._downloader.report_warning(errmsg)
 531                 return False
 532
 533     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 534         """ Returns a tuple (page content as string, URL handle) """
 535         # Strip hashes from the URL (#1038)
 536         if isinstance(url_or_request, (compat_str, str)):
 537             url_or_request = url_or_request.partition('#')[0]
 538
 539         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 540         if urlh is False:
 541             assert not fatal
 542             return False
 543         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 544         return (content, urlh)
 545
 546     @staticmethod
 547     def _guess_encoding_from_content(content_type, webpage_bytes):
 548         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 549         if m:
 550             encoding = m.group(1)
 551         else:
 552             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 553                           webpage_bytes[:1024])
 554             if m:
 555                 encoding = m.group(1).decode('ascii')
 556             elif webpage_bytes.startswith(b'\xff\xfe'):
 557                 encoding = 'utf-16'
 558             else:
 559                 encoding = 'utf-8'
 560
 561         return encoding
 562
 563     def __check_blocked(self, content):
 564         first_block = content[:512]
 565         if ('<title>Access to this site is blocked</title>' in content and
 566                 'Websense' in first_block):
 567             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 568             blocked_iframe = self._html_search_regex(
 569                 r'<iframe src="([^"]+)"', content,
 570                 'Websense information URL', default=None)
 571             if blocked_iframe:
 572                 msg += ' Visit %s for more details' % blocked_iframe
 573             raise ExtractorError(msg, expected=True)
 574         if '<title>The URL you requested has been blocked</title>' in first_block:
 575             msg = (
 576                 'Access to this webpage has been blocked by Indian censorship. '
 577                 'Use a VPN or proxy server (with --proxy) to route around it.')
 578             block_msg = self._html_search_regex(
 579                 r'</h1><p>(.*?)</p>',
 580                 content, 'block message', default=None)
 581             if block_msg:
 582                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 583             raise ExtractorError(msg, expected=True)
 584         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 585                 'blocklist.rkn.gov.ru' in content):
 586             raise ExtractorError(
 587                 'Access to this webpage has been blocked by decision of the Russian government. '
 588                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 589                 expected=True)
 590
 591     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 592         content_type = urlh.headers.get('Content-Type', '')
 593         webpage_bytes = urlh.read()
 594         if prefix is not None:
 595             webpage_bytes = prefix + webpage_bytes
 596         if not encoding:
 597             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 598         if self._downloader.params.get('dump_intermediate_pages', False):
 599             self.to_screen('Dumping request to ' + urlh.geturl())
 600             dump = base64.b64encode(webpage_bytes).decode('ascii')
 601             self._downloader.to_screen(dump)
 602         if self._downloader.params.get('write_pages', False):
 603             basen = '%s_%s' % (video_id, urlh.geturl())
 604             if len(basen) > 240:
 605                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 606                 basen = basen[:240 - len(h)] + h
 607             raw_filename = basen + '.dump'
 608             filename = sanitize_filename(raw_filename, restricted=True)
 609             self.to_screen('Saving request to ' + filename)
 610             # Working around MAX_PATH limitation on Windows (see
 611             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 612             if compat_os_name == 'nt':
 613                 absfilepath = os.path.abspath(filename)
 614                 if len(absfilepath) > 259:
 615                     filename = '\\\\?\\' + absfilepath
 616             with open(filename, 'wb') as outf:
 617                 outf.write(webpage_bytes)
 618
 619         try:
 620             content = webpage_bytes.decode(encoding, 'replace')
 621         except LookupError:
 622             content = webpage_bytes.decode('utf-8', 'replace')
 623
 624         self.__check_blocked(content)
 625
 626         return content
 627
 628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 629         """ Returns the data of the page as a string """
 630         success = False
 631         try_count = 0
 632         while success is False:
 633             try:
 634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 635                 success = True
 636             except compat_http_client.IncompleteRead as e:
 637                 try_count += 1
 638                 if try_count >= tries:
 639                     raise e
 640                 self._sleep(timeout, video_id)
 641         if res is False:
 642             return res
 643         else:
 644             content, _ = res
 645             return content
 646
 647     def _download_xml_handle(
 648             self, url_or_request, video_id, note='Downloading XML',
 649             errnote='Unable to download XML', transform_source=None,
 650             fatal=True, encoding=None, data=None, headers={}, query={}):
 651         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)"""
 652         res = self._download_webpage_handle(
 653             url_or_request, video_id, note, errnote, fatal=fatal,
 654             encoding=encoding, data=data, headers=headers, query=query)
 655         if res is False:
 656             return res
 657         xml_string, urlh = res
 658         return self._parse_xml(
 659             xml_string, video_id, transform_source=transform_source,
 660             fatal=fatal), urlh
 661
 662     def _download_xml(self, url_or_request, video_id,
 663                       note='Downloading XML', errnote='Unable to download XML',
 664                       transform_source=None, fatal=True, encoding=None,
 665                       data=None, headers={}, query={}):
 666         """Return the xml as an xml.etree.ElementTree.Element"""
 667         res = self._download_xml_handle(
 668             url_or_request, video_id, note=note, errnote=errnote,
 669             transform_source=transform_source, fatal=fatal, encoding=encoding,
 670             data=data, headers=headers, query=query)
 671         return res if res is False else res[0]
 672
 673     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
 674         if transform_source:
 675             xml_string = transform_source(xml_string)
 676         try:
 677             return compat_etree_fromstring(xml_string.encode('utf-8'))
 678         except compat_xml_parse_error as ve:
 679             errmsg = '%s: Failed to parse XML ' % video_id
 680             if fatal:
 681                 raise ExtractorError(errmsg, cause=ve)
 682             else:
 683                 self.report_warning(errmsg + str(ve))
 684
 685     def _download_json(self, url_or_request, video_id,
 686                        note='Downloading JSON metadata',
 687                        errnote='Unable to download JSON metadata',
 688                        transform_source=None,
 689                        fatal=True, encoding=None, data=None, headers={}, query={}):
 690         json_string = self._download_webpage(
 691             url_or_request, video_id, note, errnote, fatal=fatal,
 692             encoding=encoding, data=data, headers=headers, query=query)
 693         if (not fatal) and json_string is False:
 694             return None
 695         return self._parse_json(
 696             json_string, video_id, transform_source=transform_source, fatal=fatal)
 697
 698     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 699         if transform_source:
 700             json_string = transform_source(json_string)
 701         try:
 702             return json.loads(json_string)
 703         except ValueError as ve:
 704             errmsg = '%s: Failed to parse JSON ' % video_id
 705             if fatal:
 706                 raise ExtractorError(errmsg, cause=ve)
 707             else:
 708                 self.report_warning(errmsg + str(ve))
 709
 710     def report_warning(self, msg, video_id=None):
 711         idstr = '' if video_id is None else '%s: ' % video_id
 712         self._downloader.report_warning(
 713             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 714
 715     def to_screen(self, msg):
 716         """Print msg to screen, prefixing it with '[ie_name]'"""
 717         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 718
 719     def report_extraction(self, id_or_name):
 720         """Report information extraction."""
 721         self.to_screen('%s: Extracting information' % id_or_name)
 722
 723     def report_download_webpage(self, video_id):
 724         """Report webpage download."""
 725         self.to_screen('%s: Downloading webpage' % video_id)
 726
 727     def report_age_confirmation(self):
 728         """Report attempt to confirm age."""
 729         self.to_screen('Confirming age')
 730
 731     def report_login(self):
 732         """Report attempt to log in."""
 733         self.to_screen('Logging in')
 734
 735     @staticmethod
 736     def raise_login_required(msg='This video is only available for registered users'):
 737         raise ExtractorError(
 738             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 739             expected=True)
 740
 741     @staticmethod
 742     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 743         raise GeoRestrictedError(msg, countries=countries)
 744
 745     # Methods for following #608
 746     @staticmethod
 747     def url_result(url, ie=None, video_id=None, video_title=None):
 748         """Returns a URL that points to a page that should be processed"""
 749         # TODO: ie should be the class used for getting the info
 750         video_info = {'_type': 'url',
 751                       'url': url,
 752                       'ie_key': ie}
 753         if video_id is not None:
 754             video_info['id'] = video_id
 755         if video_title is not None:
 756             video_info['title'] = video_title
 757         return video_info
 758
 759     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
 760         urls = orderedSet(
 761             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 762             for m in matches)
 763         return self.playlist_result(
 764             urls, playlist_id=playlist_id, playlist_title=playlist_title)
 765
 766     @staticmethod
 767     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 768         """Returns a playlist"""
 769         video_info = {'_type': 'playlist',
 770                       'entries': entries}
 771         if playlist_id:
 772             video_info['id'] = playlist_id
 773         if playlist_title:
 774             video_info['title'] = playlist_title
 775         if playlist_description:
 776             video_info['description'] = playlist_description
 777         return video_info
 778
 779     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 780         """
 781         Perform a regex search on the given string, using a single or a list of
 782         patterns returning the first matching group.
 783         In case of failure return a default value or raise a WARNING or a
 784         RegexNotFoundError, depending on fatal, specifying the field name.
 785         """
 786         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 787             mobj = re.search(pattern, string, flags)
 788         else:
 789             for p in pattern:
 790                 mobj = re.search(p, string, flags)
 791                 if mobj:
 792                     break
 793
 794         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 795             _name = '\033[0;34m%s\033[0m' % name
 796         else:
 797             _name = name
 798
 799         if mobj:
 800             if group is None:
 801                 # return the first matching group
 802                 return next(g for g in mobj.groups() if g is not None)
 803             else:
 804                 return mobj.group(group)
 805         elif default is not NO_DEFAULT:
 806             return default
 807         elif fatal:
 808             raise RegexNotFoundError('Unable to extract %s' % _name)
 809         else:
 810             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 811             return None
 812
 813     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 814         """
 815         Like _search_regex, but strips HTML tags and unescapes entities.
 816         """
 817         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 818         if res:
 819             return clean_html(res).strip()
 820         else:
 821             return res
 822
 823     def _get_netrc_login_info(self, netrc_machine=None):
 824         username = None
 825         password = None
 826         netrc_machine = netrc_machine or self._NETRC_MACHINE
 827
 828         if self._downloader.params.get('usenetrc', False):
 829             try:
 830                 info = netrc.netrc().authenticators(netrc_machine)
 831                 if info is not None:
 832                     username = info[0]
 833                     password = info[2]
 834                 else:
 835                     raise netrc.NetrcParseError(
 836                         'No authenticators for %s' % netrc_machine)
 837             except (IOError, netrc.NetrcParseError) as err:
 838                 self._downloader.report_warning(
 839                     'parsing .netrc: %s' % error_to_compat_str(err))
 840
 841         return username, password
 842
 843     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 844         """
 845         Get the login info as (username, password)
 846         First look for the manually specified credentials using username_option
 847         and password_option as keys in params dictionary. If no such credentials
 848         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 849         value.
 850         If there's no info available, return (None, None)
 851         """
 852         if self._downloader is None:
 853             return (None, None)
 854
 855         downloader_params = self._downloader.params
 856
 857         # Attempt to use provided username and password or .netrc data
 858         if downloader_params.get(username_option) is not None:
 859             username = downloader_params[username_option]
 860             password = downloader_params[password_option]
 861         else:
 862             username, password = self._get_netrc_login_info(netrc_machine)
 863
 864         return username, password
 865
 866     def _get_tfa_info(self, note='two-factor verification code'):
 867         """
 868         Get the two-factor authentication info
 869         TODO - asking the user will be required for sms/phone verify
 870         currently just uses the command line option
 871         If there's no info available, return None
 872         """
 873         if self._downloader is None:
 874             return None
 875         downloader_params = self._downloader.params
 876
 877         if downloader_params.get('twofactor') is not None:
 878             return downloader_params['twofactor']
 879
 880         return compat_getpass('Type %s and press [Return]: ' % note)
 881
 882     # Helper functions for extracting OpenGraph info
 883     @staticmethod
 884     def _og_regexes(prop):
 885         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 886         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 887                        % {'prop': re.escape(prop)})
 888         template = r'<meta[^>]+?%s[^>]+?%s'
 889         return [
 890             template % (property_re, content_re),
 891             template % (content_re, property_re),
 892         ]
 893
 894     @staticmethod
 895     def _meta_regex(prop):
 896         return r'''(?isx)<meta
 897                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 898                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 899
 900     def _og_search_property(self, prop, html, name=None, **kargs):
 901         if not isinstance(prop, (list, tuple)):
 902             prop = [prop]
 903         if name is None:
 904             name = 'OpenGraph %s' % prop[0]
 905         og_regexes = []
 906         for p in prop:
 907             og_regexes.extend(self._og_regexes(p))
 908         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 909         if escaped is None:
 910             return None
 911         return unescapeHTML(escaped)
 912
 913     def _og_search_thumbnail(self, html, **kargs):
 914         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 915
 916     def _og_search_description(self, html, **kargs):
 917         return self._og_search_property('description', html, fatal=False, **kargs)
 918
 919     def _og_search_title(self, html, **kargs):
 920         return self._og_search_property('title', html, **kargs)
 921
 922     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 923         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 924         if secure:
 925             regexes = self._og_regexes('video:secure_url') + regexes
 926         return self._html_search_regex(regexes, html, name, **kargs)
 927
 928     def _og_search_url(self, html, **kargs):
 929         return self._og_search_property('url', html, **kargs)
 930
 931     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 932         if not isinstance(name, (list, tuple)):
 933             name = [name]
 934         if display_name is None:
 935             display_name = name[0]
 936         return self._html_search_regex(
 937             [self._meta_regex(n) for n in name],
 938             html, display_name, fatal=fatal, group='content', **kwargs)
 939
 940     def _dc_search_uploader(self, html):
 941         return self._html_search_meta('dc.creator', html, 'uploader')
 942
 943     def _rta_search(self, html):
 944         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 945         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 946                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 947                      html):
 948             return 18
 949         return 0
 950
 951     def _media_rating_search(self, html):
 952         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 953         rating = self._html_search_meta('rating', html)
 954
 955         if not rating:
 956             return None
 957
 958         RATING_TABLE = {
 959             'safe for kids': 0,
 960             'general': 8,
 961             '14 years': 14,
 962             'mature': 17,
 963             'restricted': 19,
 964         }
 965         return RATING_TABLE.get(rating.lower())
 966
 967     def _family_friendly_search(self, html):
 968         # See http://schema.org/VideoObject
 969         family_friendly = self._html_search_meta(
 970             'isFamilyFriendly', html, default=None)
 971
 972         if not family_friendly:
 973             return None
 974
 975         RATING_TABLE = {
 976             '1': 0,
 977             'true': 0,
 978             '0': 18,
 979             'false': 18,
 980         }
 981         return RATING_TABLE.get(family_friendly.lower())
 982
 983     def _twitter_search_player(self, html):
 984         return self._html_search_meta('twitter:player', html,
 985                                       'twitter card player')
 986
 987     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 988         json_ld = self._search_regex(
 989             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 990             html, 'JSON-LD', group='json_ld', **kwargs)
 991         default = kwargs.get('default', NO_DEFAULT)
 992         if not json_ld:
 993             return default if default is not NO_DEFAULT else {}
 994         # JSON-LD may be malformed and thus `fatal` should be respected.
 995         # At the same time `default` may be passed that assumes `fatal=False`
 996         # for _search_regex. Let's simulate the same behavior here as well.
 997         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 998         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 999
1000     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
1001         if isinstance(json_ld, compat_str):
1002             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
1003         if not json_ld:
1004             return {}
1005         info = {}
1006         if not isinstance(json_ld, (list, tuple, dict)):
1007             return info
1008         if isinstance(json_ld, dict):
1009             json_ld = [json_ld]
1010
1011         def extract_video_object(e):
1012             assert e['@type'] == 'VideoObject'
1013             info.update({
1014                 'url': e.get('contentUrl'),
1015                 'title': unescapeHTML(e.get('name')),
1016                 'description': unescapeHTML(e.get('description')),
1017                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
1018                 'duration': parse_duration(e.get('duration')),
1019                 'timestamp': unified_timestamp(e.get('uploadDate')),
1020                 'filesize': float_or_none(e.get('contentSize')),
1021                 'tbr': int_or_none(e.get('bitrate')),
1022                 'width': int_or_none(e.get('width')),
1023                 'height': int_or_none(e.get('height')),
1024                 'view_count': int_or_none(e.get('interactionCount')),
1025             })
1026
1027         for e in json_ld:
1028             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')):
1029                 item_type = e.get('@type')
1030                 if expected_type is not None and expected_type != item_type:
1031                     return info
1032                 if item_type in ('TVEpisode', 'Episode'):
1033                     info.update({
1034                         'episode': unescapeHTML(e.get('name')),
1035                         'episode_number': int_or_none(e.get('episodeNumber')),
1036                         'description': unescapeHTML(e.get('description')),
1037                     })
1038                     part_of_season = e.get('partOfSeason')
1039                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'):
1040                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1041                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1042                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'):
1043                         info['series'] = unescapeHTML(part_of_series.get('name'))
1044                 elif item_type in ('Article', 'NewsArticle'):
1045                     info.update({
1046                         'timestamp': parse_iso8601(e.get('datePublished')),
1047                         'title': unescapeHTML(e.get('headline')),
1048                         'description': unescapeHTML(e.get('articleBody')),
1049                     })
1050                 elif item_type == 'VideoObject':
1051                     extract_video_object(e)
1052                     continue
1053                 video = e.get('video')
1054                 if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1055                     extract_video_object(video)
1056                 break
1057         return dict((k, v) for k, v in info.items() if v is not None)
1058
1059     @staticmethod
1060     def _hidden_inputs(html):
1061         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1062         hidden_inputs = {}
1063         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1064             attrs = extract_attributes(input)
1065             if not input:
1066                 continue
1067             if attrs.get('type') not in ('hidden', 'submit'):
1068                 continue
1069             name = attrs.get('name') or attrs.get('id')
1070             value = attrs.get('value')
1071             if name and value is not None:
1072                 hidden_inputs[name] = value
1073         return hidden_inputs
1074
1075     def _form_hidden_inputs(self, form_id, html):
1076         form = self._search_regex(
1077             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1078             html, '%s form' % form_id, group='form')
1079         return self._hidden_inputs(form)
1080
    def _sort_formats(self, formats, field_preference=None):
        """Sort the formats list in place.

        The list is sorted in ascending order of a per-format key, so
        formats with higher key values end up last.  When field_preference
        is a list or tuple of field names, the key consists of exactly those
        fields (missing values count as -1, or '' for 'format_id').
        Otherwise the built-in key assembled at the end of _formats_key is
        used.

        As a side effect a missing 'tbr' is filled in from 'abr' + 'vbr' and
        a missing 'ext' is derived from the format's URL.

        Raises ExtractorError when formats is empty.
        """
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # Sort key for a single format dict f
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            # Caller-supplied ordering: compare only the requested fields
            if isinstance(field_preference, (list, tuple)):
                return tuple(
                    f.get(field)
                    if f.get(field) is not None
                    else ('' if field == 'format_id' else -1)
                    for field in field_preference)

            # An explicit 'preference' is taken as is; the f4f/f4m penalty
            # only applies when none was given
            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # http(s) ranks highest, rtsp lowest, anything else in between
            protocol = f.get('protocol') or determine_protocol(f)
            proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)

            # ORDER lists extensions from least to most preferred; an
            # extension's index is its score, unknown extensions get -1
            if f.get('vcodec') == 'none':  # audio only
                preference -= 50
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if f.get('acodec') == 'none':  # video only
                    preference -= 40
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier entries take precedence over later
            # ones; missing numeric fields count as -1
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
1156
1157     def _check_formats(self, formats, video_id):
1158         if formats:
1159             formats[:] = filter(
1160                 lambda f: self._is_valid_url(
1161                     f['url'], video_id,
1162                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1163                 formats)
1164
1165     @staticmethod
1166     def _remove_duplicate_formats(formats):
1167         format_urls = set()
1168         unique_formats = []
1169         for f in formats:
1170             if f['url'] not in format_urls:
1171                 format_urls.add(f['url'])
1172                 unique_formats.append(f)
1173         formats[:] = unique_formats
1174
1175     def _is_valid_url(self, url, video_id, item='video', headers={}):
1176         url = self._proto_relative_url(url, scheme='http:')
1177         # For now assume non HTTP(S) URLs always valid
1178         if not (url.startswith('http://') or url.startswith('https://')):
1179             return True
1180         try:
1181             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1182             return True
1183         except ExtractorError as e:
1184             if isinstance(e.cause, compat_urllib_error.URLError):
1185                 self.to_screen(
1186                     '%s: %s URL is invalid, skipping' % (video_id, item))
1187                 return False
1188             raise
1189
1190     def http_scheme(self):
1191         """ Either "http:" or "https:", depending on the user's preferences """
1192         return (
1193             'http:'
1194             if self._downloader.params.get('prefer_insecure', False)
1195             else 'https:')
1196
1197     def _proto_relative_url(self, url, scheme=None):
1198         if url is None:
1199             return url
1200         if url.startswith('//'):
1201             if scheme is None:
1202                 scheme = self.http_scheme()
1203             return scheme + url
1204         else:
1205             return url
1206
1207     def _sleep(self, timeout, video_id, msg_template=None):
1208         if msg_template is None:
1209             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1210         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1211         self.to_screen(msg)
1212         time.sleep(timeout)
1213
1214     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1215                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1216                              fatal=True, m3u8_id=None):
1217         manifest = self._download_xml(
1218             manifest_url, video_id, 'Downloading f4m manifest',
1219             'Unable to download f4m manifest',
1220             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1221             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1222             transform_source=transform_source,
1223             fatal=fatal)
1224
1225         if manifest is False:
1226             return []
1227
1228         return self._parse_f4m_formats(
1229             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1230             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1231
1232     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1233                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1234                            fatal=True, m3u8_id=None):
1235         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1236         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1237         if akamai_pv is not None and ';' in akamai_pv.text:
1238             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1239             if playerVerificationChallenge.strip() != '':
1240                 return []
1241
1242         formats = []
1243         manifest_version = '1.0'
1244         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1245         if not media_nodes:
1246             manifest_version = '2.0'
1247             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1248         # Remove unsupported DRM protected media from final formats
1249         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1250         media_nodes = remove_encrypted_media(media_nodes)
1251         if not media_nodes:
1252             return formats
1253
1254         manifest_base_url = get_base_url(manifest)
1255
1256         bootstrap_info = xpath_element(
1257             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1258             'bootstrap info', default=None)
1259
1260         vcodec = None
1261         mime_type = xpath_text(
1262             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1263             'base URL', default=None)
1264         if mime_type and mime_type.startswith('audio/'):
1265             vcodec = 'none'
1266
1267         for i, media_el in enumerate(media_nodes):
1268             tbr = int_or_none(media_el.attrib.get('bitrate'))
1269             width = int_or_none(media_el.attrib.get('width'))
1270             height = int_or_none(media_el.attrib.get('height'))
1271             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1272             # If <bootstrapInfo> is present, the specified f4m is a
1273             # stream-level manifest, and only set-level manifests may refer to
1274             # external resources.  See section 11.4 and section 4 of F4M spec
1275             if bootstrap_info is None:
1276                 media_url = None
1277                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1278                 if manifest_version == '2.0':
1279                     media_url = media_el.attrib.get('href')
1280                 if media_url is None:
1281                     media_url = media_el.attrib.get('url')
1282                 if not media_url:
1283                     continue
1284                 manifest_url = (
1285                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1286                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1287                 # If media_url is itself a f4m manifest do the recursive extraction
1288                 # since bitrates in parent manifest (this one) and media_url manifest
1289                 # may differ leading to inability to resolve the format by requested
1290                 # bitrate in f4m downloader
1291                 ext = determine_ext(manifest_url)
1292                 if ext == 'f4m':
1293                     f4m_formats = self._extract_f4m_formats(
1294                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1295                         transform_source=transform_source, fatal=fatal)
1296                     # Sometimes stream-level manifest contains single media entry that
1297                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1298                     # At the same time parent's media entry in set-level manifest may
1299                     # contain it. We will copy it from parent in such cases.
1300                     if len(f4m_formats) == 1:
1301                         f = f4m_formats[0]
1302                         f.update({
1303                             'tbr': f.get('tbr') or tbr,
1304                             'width': f.get('width') or width,
1305                             'height': f.get('height') or height,
1306                             'format_id': f.get('format_id') if not tbr else format_id,
1307                             'vcodec': vcodec,
1308                         })
1309                     formats.extend(f4m_formats)
1310                     continue
1311                 elif ext == 'm3u8':
1312                     formats.extend(self._extract_m3u8_formats(
1313                         manifest_url, video_id, 'mp4', preference=preference,
1314                         m3u8_id=m3u8_id, fatal=fatal))
1315                     continue
1316             formats.append({
1317                 'format_id': format_id,
1318                 'url': manifest_url,
1319                 'manifest_url': manifest_url,
1320                 'ext': 'flv' if bootstrap_info is not None else None,
1321                 'protocol': 'f4m',
1322                 'tbr': tbr,
1323                 'width': width,
1324                 'height': height,
1325                 'vcodec': vcodec,
1326                 'preference': preference,
1327             })
1328         return formats
1329
1330     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1331         return {
1332             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1333             'url': m3u8_url,
1334             'ext': ext,
1335             'protocol': 'm3u8',
1336             'preference': preference - 100 if preference else -100,
1337             'resolution': 'multiple',
1338             'format_note': 'Quality selection URL',
1339         }
1340
1341     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1342                               entry_protocol='m3u8', preference=None,
1343                               m3u8_id=None, note=None, errnote=None,
1344                               fatal=True, live=False):
1345         res = self._download_webpage_handle(
1346             m3u8_url, video_id,
1347             note=note or 'Downloading m3u8 information',
1348             errnote=errnote or 'Failed to download m3u8 information',
1349             fatal=fatal)
1350
1351         if res is False:
1352             return []
1353
1354         m3u8_doc, urlh = res
1355         m3u8_url = urlh.geturl()
1356
1357         return self._parse_m3u8_formats(
1358             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1359             preference=preference, m3u8_id=m3u8_id, live=live)
1360
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse the text of an m3u8 playlist into a list of format dicts.

        m3u8_doc is the playlist text and m3u8_url the URL it came from,
        which is used to resolve non-absolute entries and is recorded as
        'manifest_url'.  ext, entry_protocol and preference are copied into
        every produced format; m3u8_id, when set, prefixes the format ids.
        When live is true no stream name or bitrate is appended to the
        format id of variant streams.

        Returns an empty list for playlists carrying Adobe Flash Access or
        Apple FairPlay markers, a single format for a media playlist, and
        otherwise one format per EXT-X-MEDIA VIDEO/AUDIO rendition that has
        a URI plus one format per variant stream URI line.
        """
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
            return []

        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
            return []

        formats = []

        # Absolute http(s) URLs are kept as is, anything else is resolved
        # against the playlist URL
        format_url = lambda u: (
            u
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))

        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211

        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without quality renditions.
        # Fortunately, a master playlist can be easily distinguished from a media
        # playlist based on the availability of particular tags. As of
        # [1, 4.3.3, 4.3.4] master playlist tags MUST NOT appear in a media
        # playlist and vice versa.
        # As of [1, 4.3.3.1] the #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in a master playlist, thus we can
        # clearly detect a media playlist with this criterion.

        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
            return [{
                'url': m3u8_url,
                'format_id': m3u8_id,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }]

        # GROUP-ID -> list of parsed EXT-X-MEDIA attribute dicts
        groups = {}
        # Attributes of the most recent EXT-X-STREAM-INF tag; they apply to
        # the next URI line and are reset once that line has been consumed
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Record the rendition in its group and, for VIDEO/AUDIO
            # renditions that have their own URI, emit a format for it
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
                return
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
                return
            media_url = media.get('URI')
            if media_url:
                format_id = []
                for v in (m3u8_id, group_id, name):
                    if v:
                        format_id.append(v)
                f = {
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'ext': ext,
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'
                formats.append(f)

        def build_stream_name():
            # Although the specification does not mention a NAME attribute
            # for the EXT-X-STREAM-INF tag, it may still be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            if stream_name:
                return stream_name
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from the corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
                return
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
                extract_media(line)
            elif line.startswith('#') or not line.strip():
                # Any other tag, comment or blank line
                continue
            else:
                # A URI line: build a format from it and from the attributes
                # of the preceding EXT-X-STREAM-INF tag (if any)
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                format_id = []
                if m3u8_id:
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                # format_id intact.
                if not live:
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                f = {
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'tbr': tbr,
                    'ext': ext,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                }
                resolution = last_stream_inf.get('RESOLUTION')
                if resolution:
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                    if mobj:
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                mobj = re.search(
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                if mobj:
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                    f.update({
                        'vbr': vbr,
                        'abr': abr,
                    })
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                f.update(codecs)
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains an EXT-X-STREAM-INF tag which references an AUDIO
                # rendition group but does not have CODECS and, despite
                # referencing an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        # the same GROUP-ID
                        f['acodec'] = 'none'
                formats.append(f)
                last_stream_inf = {}
        return formats
1519
1520     @staticmethod
1521     def _xpath_ns(path, namespace=None):
1522         if not namespace:
1523             return path
1524         out = []
1525         for c in path.split('/'):
1526             if not c or c == '.':
1527                 out.append(c)
1528             else:
1529                 out.append('{%s}%s' % (namespace, c))
1530         return '/'.join(out)
1531
1532     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1533         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1534
1535         if smil is False:
1536             assert not fatal
1537             return []
1538
1539         namespace = self._parse_smil_namespace(smil)
1540
1541         return self._parse_smil_formats(
1542             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1543
1544     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1545         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1546         if smil is False:
1547             return {}
1548         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1549
1550     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1551         return self._download_xml(
1552             smil_url, video_id, 'Downloading SMIL file',
1553             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1554
1555     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1556         namespace = self._parse_smil_namespace(smil)
1557
1558         formats = self._parse_smil_formats(
1559             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1560         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1561
1562         video_id = os.path.splitext(url_basename(smil_url))[0]
1563         title = None
1564         description = None
1565         upload_date = None
1566         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1567             name = meta.attrib.get('name')
1568             content = meta.attrib.get('content')
1569             if not name or not content:
1570                 continue
1571             if not title and name == 'title':
1572                 title = content
1573             elif not description and name in ('description', 'abstract'):
1574                 description = content
1575             elif not upload_date and name == 'date':
1576                 upload_date = unified_strdate(content)
1577
1578         thumbnails = [{
1579             'id': image.get('type'),
1580             'url': image.get('src'),
1581             'width': int_or_none(image.get('width')),
1582             'height': int_or_none(image.get('height')),
1583         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1584
1585         return {
1586             'id': video_id,
1587             'title': title or video_id,
1588             'description': description,
1589             'upload_date': upload_date,
1590             'thumbnails': thumbnails,
1591             'formats': formats,
1592             'subtitles': subtitles,
1593         }
1594
1595     def _parse_smil_namespace(self, smil):
1596         return self._search_regex(
1597             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1598
1599     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1600         base = smil_url
1601         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1602             b = meta.get('base') or meta.get('httpBase')
1603             if b:
1604                 base = b
1605                 break
1606
1607         formats = []
1608         rtmp_count = 0
1609         http_count = 0
1610         m3u8_count = 0
1611
1612         srcs = []
1613         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1614         for medium in media:
1615             src = medium.get('src')
1616             if not src or src in srcs:
1617                 continue
1618             srcs.append(src)
1619
1620             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1621             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1622             width = int_or_none(medium.get('width'))
1623             height = int_or_none(medium.get('height'))
1624             proto = medium.get('proto')
1625             ext = medium.get('ext')
1626             src_ext = determine_ext(src)
1627             streamer = medium.get('streamer') or base
1628
1629             if proto == 'rtmp' or streamer.startswith('rtmp'):
1630                 rtmp_count += 1
1631                 formats.append({
1632                     'url': streamer,
1633                     'play_path': src,
1634                     'ext': 'flv',
1635                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1636                     'tbr': bitrate,
1637                     'filesize': filesize,
1638                     'width': width,
1639                     'height': height,
1640                 })
1641                 if transform_rtmp_url:
1642                     streamer, src = transform_rtmp_url(streamer, src)
1643                     formats[-1].update({
1644                         'url': streamer,
1645                         'play_path': src,
1646                     })
1647                 continue
1648
1649             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1650             src_url = src_url.strip()
1651
1652             if proto == 'm3u8' or src_ext == 'm3u8':
1653                 m3u8_formats = self._extract_m3u8_formats(
1654                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1655                 if len(m3u8_formats) == 1:
1656                     m3u8_count += 1
1657                     m3u8_formats[0].update({
1658                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1659                         'tbr': bitrate,
1660                         'width': width,
1661                         'height': height,
1662                     })
1663                 formats.extend(m3u8_formats)
1664                 continue
1665
1666             if src_ext == 'f4m':
1667                 f4m_url = src_url
1668                 if not f4m_params:
1669                     f4m_params = {
1670                         'hdcore': '3.2.0',
1671                         'plugin': 'flowplayer-3.2.0.1',
1672                     }
1673                 f4m_url += '&' if '?' in f4m_url else '?'
1674                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1675                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1676                 continue
1677
1678             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1679                 http_count += 1
1680                 formats.append({
1681                     'url': src_url,
1682                     'ext': ext or src_ext or 'flv',
1683                     'format_id': 'http-%d' % (bitrate or http_count),
1684                     'tbr': bitrate,
1685                     'filesize': filesize,
1686                     'width': width,
1687                     'height': height,
1688                 })
1689                 continue
1690
1691         return formats
1692
1693     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1694         urls = []
1695         subtitles = {}
1696         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1697             src = textstream.get('src')
1698             if not src or src in urls:
1699                 continue
1700             urls.append(src)
1701             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1702             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1703             subtitles.setdefault(lang, []).append({
1704                 'url': src,
1705                 'ext': ext,
1706             })
1707         return subtitles
1708
1709     def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
1710         xspf = self._download_xml(
1711             xspf_url, playlist_id, 'Downloading xpsf playlist',
1712             'Unable to download xspf manifest', fatal=fatal)
1713         if xspf is False:
1714             return []
1715         return self._parse_xspf(
1716             xspf, playlist_id, xspf_url=xspf_url,
1717             xspf_base_url=base_url(xspf_url))
1718
1719     def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
1720         NS_MAP = {
1721             'xspf': 'http://xspf.org/ns/0/',
1722             's1': 'http://static.streamone.nl/player/ns/0',
1723         }
1724
1725         entries = []
1726         for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1727             title = xpath_text(
1728                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1729             description = xpath_text(
1730                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1731             thumbnail = xpath_text(
1732                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1733             duration = float_or_none(
1734                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1735
1736             formats = []
1737             for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
1738                 format_url = urljoin(xspf_base_url, location.text)
1739                 if not format_url:
1740                     continue
1741                 formats.append({
1742                     'url': format_url,
1743                     'manifest_url': xspf_url,
1744                     'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1745                     'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1746                     'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1747                 })
1748             self._sort_formats(formats)
1749
1750             entries.append({
1751                 'id': playlist_id,
1752                 'title': title,
1753                 'description': description,
1754                 'thumbnail': thumbnail,
1755                 'duration': duration,
1756                 'formats': formats,
1757             })
1758         return entries
1759
1760     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1761         res = self._download_xml_handle(
1762             mpd_url, video_id,
1763             note=note or 'Downloading MPD manifest',
1764             errnote=errnote or 'Failed to download MPD manifest',
1765             fatal=fatal)
1766         if res is False:
1767             return []
1768         mpd_doc, urlh = res
1769         mpd_base_url = base_url(urlh.geturl())
1770
1771         return self._parse_mpd_formats(
1772             mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
1773             formats_dict=formats_dict, mpd_url=mpd_url)
1774
1775     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1776         """
1777         Parse formats from MPD manifest.
1778         References:
1779          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1780             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1781          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1782         """
1783         if mpd_doc.get('type') == 'dynamic':
1784             return []
1785
1786         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1787
1788         def _add_ns(path):
1789             return self._xpath_ns(path, namespace)
1790
1791         def is_drm_protected(element):
1792             return element.find(_add_ns('ContentProtection')) is not None
1793
1794         def extract_multisegment_info(element, ms_parent_info):
1795             ms_info = ms_parent_info.copy()
1796
1797             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1798             # common attributes and elements.  We will only extract relevant
1799             # for us.
1800             def extract_common(source):
1801                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1802                 if segment_timeline is not None:
1803                     s_e = segment_timeline.findall(_add_ns('S'))
1804                     if s_e:
1805                         ms_info['total_number'] = 0
1806                         ms_info['s'] = []
1807                         for s in s_e:
1808                             r = int(s.get('r', 0))
1809                             ms_info['total_number'] += 1 + r
1810                             ms_info['s'].append({
1811                                 't': int(s.get('t', 0)),
1812                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1813                                 'd': int(s.attrib['d']),
1814                                 'r': r,
1815                             })
1816                 start_number = source.get('startNumber')
1817                 if start_number:
1818                     ms_info['start_number'] = int(start_number)
1819                 timescale = source.get('timescale')
1820                 if timescale:
1821                     ms_info['timescale'] = int(timescale)
1822                 segment_duration = source.get('duration')
1823                 if segment_duration:
1824                     ms_info['segment_duration'] = float(segment_duration)
1825
1826             def extract_Initialization(source):
1827                 initialization = source.find(_add_ns('Initialization'))
1828                 if initialization is not None:
1829                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1830
1831             segment_list = element.find(_add_ns('SegmentList'))
1832             if segment_list is not None:
1833                 extract_common(segment_list)
1834                 extract_Initialization(segment_list)
1835                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1836                 if segment_urls_e:
1837                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1838             else:
1839                 segment_template = element.find(_add_ns('SegmentTemplate'))
1840                 if segment_template is not None:
1841                     extract_common(segment_template)
1842                     media = segment_template.get('media')
1843                     if media:
1844                         ms_info['media'] = media
1845                     initialization = segment_template.get('initialization')
1846                     if initialization:
1847                         ms_info['initialization'] = initialization
1848                     else:
1849                         extract_Initialization(segment_template)
1850             return ms_info
1851
1852         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1853         formats = []
1854         for period in mpd_doc.findall(_add_ns('Period')):
1855             period_duration = parse_duration(period.get('duration')) or mpd_duration
1856             period_ms_info = extract_multisegment_info(period, {
1857                 'start_number': 1,
1858                 'timescale': 1,
1859             })
1860             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1861                 if is_drm_protected(adaptation_set):
1862                     continue
1863                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1864                 for representation in adaptation_set.findall(_add_ns('Representation')):
1865                     if is_drm_protected(representation):
1866                         continue
1867                     representation_attrib = adaptation_set.attrib.copy()
1868                     representation_attrib.update(representation.attrib)
1869                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1870                     mime_type = representation_attrib['mimeType']
1871                     content_type = mime_type.split('/')[0]
1872                     if content_type == 'text':
1873                         # TODO implement WebVTT downloading
1874                         pass
1875                     elif content_type in ('video', 'audio'):
1876                         base_url = ''
1877                         for element in (representation, adaptation_set, period, mpd_doc):
1878                             base_url_e = element.find(_add_ns('BaseURL'))
1879                             if base_url_e is not None:
1880                                 base_url = base_url_e.text + base_url
1881                                 if re.match(r'^https?://', base_url):
1882                                     break
1883                         if mpd_base_url and not re.match(r'^https?://', base_url):
1884                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1885                                 mpd_base_url += '/'
1886                             base_url = mpd_base_url + base_url
1887                         representation_id = representation_attrib.get('id')
1888                         lang = representation_attrib.get('lang')
1889                         url_el = representation.find(_add_ns('BaseURL'))
1890                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1891                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1892                         f = {
1893                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1894                             'url': base_url,
1895                             'manifest_url': mpd_url,
1896                             'ext': mimetype2ext(mime_type),
1897                             'width': int_or_none(representation_attrib.get('width')),
1898                             'height': int_or_none(representation_attrib.get('height')),
1899                             'tbr': float_or_none(bandwidth, 1000),
1900                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1901                             'fps': int_or_none(representation_attrib.get('frameRate')),
1902                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1903                             'format_note': 'DASH %s' % content_type,
1904                             'filesize': filesize,
1905                             'container': mimetype2ext(mime_type) + '_dash',
1906                         }
1907                         f.update(parse_codecs(representation_attrib.get('codecs')))
1908                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1909
1910                         def prepare_template(template_name, identifiers):
1911                             t = representation_ms_info[template_name]
1912                             t = t.replace('$RepresentationID$', representation_id)
1913                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1914                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1915                             t.replace('$$', '$')
1916                             return t
1917
1918                         # @initialization is a regular template like @media one
1919                         # so it should be handled just the same way (see
1920                         # https://github.com/rg3/youtube-dl/issues/11605)
1921                         if 'initialization' in representation_ms_info:
1922                             initialization_template = prepare_template(
1923                                 'initialization',
1924                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1925                                 # $Time$ shall not be included for @initialization thus
1926                                 # only $Bandwidth$ remains
1927                                 ('Bandwidth', ))
1928                             representation_ms_info['initialization_url'] = initialization_template % {
1929                                 'Bandwidth': bandwidth,
1930                             }
1931
1932                         def location_key(location):
1933                             return 'url' if re.match(r'^https?://', location) else 'path'
1934
1935                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1936
1937                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1938                             media_location_key = location_key(media_template)
1939
1940                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1941                             # can't be used at the same time
1942                             if '%(Number' in media_template and 's' not in representation_ms_info:
1943                                 segment_duration = None
1944                                 if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
1945                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1946                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1947                                 representation_ms_info['fragments'] = [{
1948                                     media_location_key: media_template % {
1949                                         'Number': segment_number,
1950                                         'Bandwidth': bandwidth,
1951                                     },
1952                                     'duration': segment_duration,
1953                                 } for segment_number in range(
1954                                     representation_ms_info['start_number'],
1955                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1956                             else:
1957                                 # $Number*$ or $Time$ in media template with S list available
1958                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1959                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1960                                 representation_ms_info['fragments'] = []
1961                                 segment_time = 0
1962                                 segment_d = None
1963                                 segment_number = representation_ms_info['start_number']
1964
1965                                 def add_segment_url():
1966                                     segment_url = media_template % {
1967                                         'Time': segment_time,
1968                                         'Bandwidth': bandwidth,
1969                                         'Number': segment_number,
1970                                     }
1971                                     representation_ms_info['fragments'].append({
1972                                         media_location_key: segment_url,
1973                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1974                                     })
1975
1976                                 for num, s in enumerate(representation_ms_info['s']):
1977                                     segment_time = s.get('t') or segment_time
1978                                     segment_d = s['d']
1979                                     add_segment_url()
1980                                     segment_number += 1
1981                                     for r in range(s.get('r', 0)):
1982                                         segment_time += segment_d
1983                                         add_segment_url()
1984                                         segment_number += 1
1985                                     segment_time += segment_d
1986                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1987                             # No media template
1988                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1989                             # or any YouTube dashsegments video
1990                             fragments = []
1991                             segment_index = 0
1992                             timescale = representation_ms_info['timescale']
1993                             for s in representation_ms_info['s']:
1994                                 duration = float_or_none(s['d'], timescale)
1995                                 for r in range(s.get('r', 0) + 1):
1996                                     segment_uri = representation_ms_info['segment_urls'][segment_index]
1997                                     fragments.append({
1998                                         location_key(segment_uri): segment_uri,
1999                                         'duration': duration,
2000                                     })
2001                                     segment_index += 1
2002                             representation_ms_info['fragments'] = fragments
2003                         elif 'segment_urls' in representation_ms_info:
2004                             # Segment URLs with no SegmentTimeline
2005                             # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
2006                             # https://github.com/rg3/youtube-dl/pull/14844
2007                             fragments = []
2008                             segment_duration = float_or_none(
2009                                 representation_ms_info['segment_duration'],
2010                                 representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
2011                             for segment_url in representation_ms_info['segment_urls']:
2012                                 fragment = {
2013                                     location_key(segment_url): segment_url,
2014                                 }
2015                                 if segment_duration:
2016                                     fragment['duration'] = segment_duration
2017                                 fragments.append(fragment)
2018                             representation_ms_info['fragments'] = fragments
2019                         # NB: MPD manifest may contain direct URLs to unfragmented media.
2020                         # No fragments key is present in this case.
2021                         if 'fragments' in representation_ms_info:
2022                             f.update({
2023                                 'fragment_base_url': base_url,
2024                                 'fragments': [],
2025                                 'protocol': 'http_dash_segments',
2026                             })
2027                             if 'initialization_url' in representation_ms_info:
2028                                 initialization_url = representation_ms_info['initialization_url']
2029                                 if not f.get('url'):
2030                                     f['url'] = initialization_url
2031                                 f['fragments'].append({location_key(initialization_url): initialization_url})
2032                             f['fragments'].extend(representation_ms_info['fragments'])
2033                         # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
2034                         # is not necessarily unique within a Period thus formats with
2035                         # the same `format_id` are quite possible. There are numerous examples
2036                         # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
2037                         # https://github.com/rg3/youtube-dl/issues/13919)
2038                         full_info = formats_dict.get(representation_id, {}).copy()
2039                         full_info.update(f)
2040                         formats.append(full_info)
2041                     else:
2042                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
2043         return formats
2044
2045     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
2046         res = self._download_xml_handle(
2047             ism_url, video_id,
2048             note=note or 'Downloading ISM manifest',
2049             errnote=errnote or 'Failed to download ISM manifest',
2050             fatal=fatal)
2051         if res is False:
2052             return []
2053         ism_doc, urlh = res
2054
2055         return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
2056
2057     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2058         """
2059         Parse formats from ISM manifest.
2060         References:
2061          1. [MS-SSTR]: Smooth Streaming Protocol,
2062             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2063         """
2064         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2065             return []
2066
2067         duration = int(ism_doc.attrib['Duration'])
2068         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2069
2070         formats = []
2071         for stream in ism_doc.findall('StreamIndex'):
2072             stream_type = stream.get('Type')
2073             if stream_type not in ('video', 'audio'):
2074                 continue
2075             url_pattern = stream.attrib['Url']
2076             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2077             stream_name = stream.get('Name')
2078             for track in stream.findall('QualityLevel'):
2079                 fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
2080                 # TODO: add support for WVC1 and WMAP
2081                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2082                     self.report_warning('%s is not a supported codec' % fourcc)
2083                     continue
2084                 tbr = int(track.attrib['Bitrate']) // 1000
2085                 # [1] does not mention Width and Height attributes. However,
2086                 # they're often present while MaxWidth and MaxHeight are
2087                 # missing, so should be used as fallbacks
2088                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2089                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2090                 sampling_rate = int_or_none(track.get('SamplingRate'))
2091
2092                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2093                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2094
2095                 fragments = []
2096                 fragment_ctx = {
2097                     'time': 0,
2098                 }
2099                 stream_fragments = stream.findall('c')
2100                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2101                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2102                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2103                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2104                     if not fragment_ctx['duration']:
2105                         try:
2106                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2107                         except IndexError:
2108                             next_fragment_time = duration
2109                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2110                     for _ in range(fragment_repeat):
2111                         fragments.append({
2112                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2113                             'duration': fragment_ctx['duration'] / stream_timescale,
2114                         })
2115                         fragment_ctx['time'] += fragment_ctx['duration']
2116
2117                 format_id = []
2118                 if ism_id:
2119                     format_id.append(ism_id)
2120                 if stream_name:
2121                     format_id.append(stream_name)
2122                 format_id.append(compat_str(tbr))
2123
2124                 formats.append({
2125                     'format_id': '-'.join(format_id),
2126                     'url': ism_url,
2127                     'manifest_url': ism_url,
2128                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2129                     'width': width,
2130                     'height': height,
2131                     'tbr': tbr,
2132                     'asr': sampling_rate,
2133                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2134                     'acodec': 'none' if stream_type == 'video' else fourcc,
2135                     'protocol': 'ism',
2136                     'fragments': fragments,
2137                     '_download_params': {
2138                         'duration': duration,
2139                         'timescale': stream_timescale,
2140                         'width': width or 0,
2141                         'height': height or 0,
2142                         'fourcc': fourcc,
2143                         'codec_private_data': track.get('CodecPrivateData'),
2144                         'sampling_rate': sampling_rate,
2145                         'channels': int_or_none(track.get('Channels', 2)),
2146                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2147                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2148                     },
2149                 })
2150         return formats
2151
2152     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2153         def absolute_url(item_url):
2154             return urljoin(base_url, item_url)
2155
2156         def parse_content_type(content_type):
2157             if not content_type:
2158                 return {}
2159             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2160             if ctr:
2161                 mimetype, codecs = ctr.groups()
2162                 f = parse_codecs(codecs)
2163                 f['ext'] = mimetype2ext(mimetype)
2164                 return f
2165             return {}
2166
2167         def _media_formats(src, cur_media_type, type_info={}):
2168             full_url = absolute_url(src)
2169             ext = type_info.get('ext') or determine_ext(full_url)
2170             if ext == 'm3u8':
2171                 is_plain_url = False
2172                 formats = self._extract_m3u8_formats(
2173                     full_url, video_id, ext='mp4',
2174                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2175                     preference=preference, fatal=False)
2176             elif ext == 'mpd':
2177                 is_plain_url = False
2178                 formats = self._extract_mpd_formats(
2179                     full_url, video_id, mpd_id=mpd_id, fatal=False)
2180             else:
2181                 is_plain_url = True
2182                 formats = [{
2183                     'url': full_url,
2184                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2185                 }]
2186             return is_plain_url, formats
2187
2188         entries = []
2189         # amp-video and amp-audio are very similar to their HTML5 counterparts
2190         # so we wll include them right here (see
2191         # https://www.ampproject.org/docs/reference/components/amp-video)
2192         media_tags = [(media_tag, media_type, '')
2193                       for media_tag, media_type
2194                       in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
2195         media_tags.extend(re.findall(
2196             # We only allow video|audio followed by a whitespace or '>'.
2197             # Allowing more characters may end up in significant slow down (see
2198             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2199             # http://www.porntrex.com/maps/videositemap.xml).
2200             r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2201         for media_tag, media_type, media_content in media_tags:
2202             media_info = {
2203                 'formats': [],
2204                 'subtitles': {},
2205             }
2206             media_attributes = extract_attributes(media_tag)
2207             src = media_attributes.get('src')
2208             if src:
2209                 _, formats = _media_formats(src, media_type)
2210                 media_info['formats'].extend(formats)
2211             media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
2212             if media_content:
2213                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2214                     source_attributes = extract_attributes(source_tag)
2215                     src = source_attributes.get('src')
2216                     if not src:
2217                         continue
2218                     f = parse_content_type(source_attributes.get('type'))
2219                     is_plain_url, formats = _media_formats(src, media_type, f)
2220                     if is_plain_url:
2221                         # res attribute is not standard but seen several times
2222                         # in the wild
2223                         f.update({
2224                             'height': int_or_none(source_attributes.get('res')),
2225                             'format_id': source_attributes.get('label'),
2226                         })
2227                         f.update(formats[0])
2228                         media_info['formats'].append(f)
2229                     else:
2230                         media_info['formats'].extend(formats)
2231                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2232                     track_attributes = extract_attributes(track_tag)
2233                     kind = track_attributes.get('kind')
2234                     if not kind or kind in ('subtitles', 'captions'):
2235                         src = track_attributes.get('src')
2236                         if not src:
2237                             continue
2238                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2239                         media_info['subtitles'].setdefault(lang, []).append({
2240                             'url': absolute_url(src),
2241                         })
2242             if media_info['formats'] or media_info['subtitles']:
2243                 entries.append(media_info)
2244         return entries
2245
2246     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2247         formats = []
2248         hdcore_sign = 'hdcore=3.7.0'
2249         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2250         hds_host = hosts.get('hds')
2251         if hds_host:
2252             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2253         if 'hdcore=' not in f4m_url:
2254             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2255         f4m_formats = self._extract_f4m_formats(
2256             f4m_url, video_id, f4m_id='hds', fatal=False)
2257         for entry in f4m_formats:
2258             entry.update({'extra_param_to_segment_url': hdcore_sign})
2259         formats.extend(f4m_formats)
2260         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2261         hls_host = hosts.get('hls')
2262         if hls_host:
2263             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2264         formats.extend(self._extract_m3u8_formats(
2265             m3u8_url, video_id, 'mp4', 'm3u8_native',
2266             m3u8_id='hls', fatal=False))
2267         return formats
2268
2269     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2270         query = compat_urlparse.urlparse(url).query
2271         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2272         mobj = re.search(
2273             r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
2274         url_base = mobj.group('url')
2275         http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
2276         formats = []
2277
2278         def manifest_url(manifest):
2279             m_url = '%s/%s' % (http_base_url, manifest)
2280             if query:
2281                 m_url += '?%s' % query
2282             return m_url
2283
2284         if 'm3u8' not in skip_protocols:
2285             formats.extend(self._extract_m3u8_formats(
2286                 manifest_url('playlist.m3u8'), video_id, 'mp4',
2287                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2288         if 'f4m' not in skip_protocols:
2289             formats.extend(self._extract_f4m_formats(
2290                 manifest_url('manifest.f4m'),
2291                 video_id, f4m_id='hds', fatal=False))
2292         if 'dash' not in skip_protocols:
2293             formats.extend(self._extract_mpd_formats(
2294                 manifest_url('manifest.mpd'),
2295                 video_id, mpd_id='dash', fatal=False))
2296         if re.search(r'(?:/smil:|\.smil)', url_base):
2297             if 'smil' not in skip_protocols:
2298                 rtmp_formats = self._extract_smil_formats(
2299                     manifest_url('jwplayer.smil'),
2300                     video_id, fatal=False)
2301                 for rtmp_format in rtmp_formats:
2302                     rtsp_format = rtmp_format.copy()
2303                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2304                     del rtsp_format['play_path']
2305                     del rtsp_format['ext']
2306                     rtsp_format.update({
2307                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2308                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2309                         'protocol': 'rtsp',
2310                     })
2311                     formats.extend([rtmp_format, rtsp_format])
2312         else:
2313             for protocol in ('rtmp', 'rtsp'):
2314                 if protocol not in skip_protocols:
2315                     formats.append({
2316                         'url': '%s:%s' % (protocol, url_base),
2317                         'format_id': protocol,
2318                         'protocol': protocol,
2319                     })
2320         return formats
2321
2322     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2323         mobj = re.search(
2324             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2325             webpage)
2326         if mobj:
2327             try:
2328                 jwplayer_data = self._parse_json(mobj.group('options'),
2329                                                  video_id=video_id,
2330                                                  transform_source=transform_source)
2331             except ExtractorError:
2332                 pass
2333             else:
2334                 if isinstance(jwplayer_data, dict):
2335                     return jwplayer_data
2336
2337     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2338         jwplayer_data = self._find_jwplayer_data(
2339             webpage, video_id, transform_source=js_to_json)
2340         return self._parse_jwplayer_data(
2341             jwplayer_data, video_id, *args, **kwargs)
2342
2343     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2344                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2345         # JWPlayer backward compatibility: flattened playlists
2346         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2347         if 'playlist' not in jwplayer_data:
2348             jwplayer_data = {'playlist': [jwplayer_data]}
2349
2350         entries = []
2351
2352         # JWPlayer backward compatibility: single playlist item
2353         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2354         if not isinstance(jwplayer_data['playlist'], list):
2355             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2356
2357         for video_data in jwplayer_data['playlist']:
2358             # JWPlayer backward compatibility: flattened sources
2359             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2360             if 'sources' not in video_data:
2361                 video_data['sources'] = [video_data]
2362
2363             this_video_id = video_id or video_data['mediaid']
2364
2365             formats = self._parse_jwplayer_formats(
2366                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2367                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2368
2369             subtitles = {}
2370             tracks = video_data.get('tracks')
2371             if tracks and isinstance(tracks, list):
2372                 for track in tracks:
2373                     if not isinstance(track, dict):
2374                         continue
2375                     track_kind = track.get('kind')
2376                     if not track_kind or not isinstance(track_kind, compat_str):
2377                         continue
2378                     if track_kind.lower() not in ('captions', 'subtitles'):
2379                         continue
2380                     track_url = urljoin(base_url, track.get('file'))
2381                     if not track_url:
2382                         continue
2383                     subtitles.setdefault(track.get('label') or 'en', []).append({
2384                         'url': self._proto_relative_url(track_url)
2385                     })
2386
2387             entry = {
2388                 'id': this_video_id,
2389                 'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
2390                 'description': video_data.get('description'),
2391                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2392                 'timestamp': int_or_none(video_data.get('pubdate')),
2393                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2394                 'subtitles': subtitles,
2395             }
2396             # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
2397             if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
2398                 entry.update({
2399                     '_type': 'url_transparent',
2400                     'url': formats[0]['url'],
2401                 })
2402             else:
2403                 self._sort_formats(formats)
2404                 entry['formats'] = formats
2405             entries.append(entry)
2406         if len(entries) == 1:
2407             return entries[0]
2408         else:
2409             return self.playlist_result(entries)
2410
2411     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2412                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2413         urls = []
2414         formats = []
2415         for source in jwplayer_sources_data:
2416             if not isinstance(source, dict):
2417                 continue
2418             source_url = self._proto_relative_url(source.get('file'))
2419             if not source_url:
2420                 continue
2421             if base_url:
2422                 source_url = compat_urlparse.urljoin(base_url, source_url)
2423             if source_url in urls:
2424                 continue
2425             urls.append(source_url)
2426             source_type = source.get('type') or ''
2427             ext = mimetype2ext(source_type) or determine_ext(source_url)
2428             if source_type == 'hls' or ext == 'm3u8':
2429                 formats.extend(self._extract_m3u8_formats(
2430                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2431                     m3u8_id=m3u8_id, fatal=False))
2432             elif source_type == 'dash' or ext == 'mpd':
2433                 formats.extend(self._extract_mpd_formats(
2434                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2435             elif ext == 'smil':
2436                 formats.extend(self._extract_smil_formats(
2437                     source_url, video_id, fatal=False))
2438             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2439             elif source_type.startswith('audio') or ext in (
2440                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2441                 formats.append({
2442                     'url': source_url,
2443                     'vcodec': 'none',
2444                     'ext': ext,
2445                 })
2446             else:
2447                 height = int_or_none(source.get('height'))
2448                 if height is None:
2449                     # Often no height is provided but there is a label in
2450                     # format like "1080p", "720p SD", or 1080.
2451                     height = int_or_none(self._search_regex(
2452                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2453                         'height', default=None))
2454                 a_format = {
2455                     'url': source_url,
2456                     'width': int_or_none(source.get('width')),
2457                     'height': height,
2458                     'tbr': int_or_none(source.get('bitrate')),
2459                     'ext': ext,
2460                 }
2461                 if source_url.startswith('rtmp'):
2462                     a_format['ext'] = 'flv'
2463                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2464                     # of jwplayer.flash.swf
2465                     rtmp_url_parts = re.split(
2466                         r'((?:mp4|mp3|flv):)', source_url, 1)
2467                     if len(rtmp_url_parts) == 3:
2468                         rtmp_url, prefix, play_path = rtmp_url_parts
2469                         a_format.update({
2470                             'url': rtmp_url,
2471                             'play_path': prefix + play_path,
2472                         })
2473                     if rtmp_params:
2474                         a_format.update(rtmp_params)
2475                 formats.append(a_format)
2476         return formats
2477
2478     def _live_title(self, name):
2479         """ Generate the title for a live video """
2480         now = datetime.datetime.now()
2481         now_str = now.strftime('%Y-%m-%d %H:%M')
2482         return name + ' ' + now_str
2483
2484     def _int(self, v, name, fatal=False, **kwargs):
2485         res = int_or_none(v, **kwargs)
2486         if 'get_attr' in kwargs:
2487             print(getattr(v, kwargs['get_attr']))
2488         if res is None:
2489             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2490             if fatal:
2491                 raise ExtractorError(msg)
2492             else:
2493                 self._downloader.report_warning(msg)
2494         return res
2495
2496     def _float(self, v, name, fatal=False, **kwargs):
2497         res = float_or_none(v, **kwargs)
2498         if res is None:
2499             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2500             if fatal:
2501                 raise ExtractorError(msg)
2502             else:
2503                 self._downloader.report_warning(msg)
2504         return res
2505
2506     def _set_cookie(self, domain, name, value, expire_time=None, port=None,
2507                     path='/', secure=False, discard=False, rest={}, **kwargs):
2508         cookie = compat_cookiejar.Cookie(
2509             0, name, value, port, port is not None, domain, True,
2510             domain.startswith('.'), path, True, secure, expire_time,
2511             discard, None, None, rest)
2512         self._downloader.cookiejar.set_cookie(cookie)
2513
2514     def _get_cookies(self, url):
2515         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2516         req = sanitized_Request(url)
2517         self._downloader.cookiejar.add_cookie_header(req)
2518         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2519
2520     def get_testcases(self, include_onlymatching=False):
2521         t = getattr(self, '_TEST', None)
2522         if t:
2523             assert not hasattr(self, '_TESTS'), \
2524                 '%s has _TEST and _TESTS' % type(self).__name__
2525             tests = [t]
2526         else:
2527             tests = getattr(self, '_TESTS', [])
2528         for t in tests:
2529             if not include_onlymatching and t.get('only_matching', False):
2530                 continue
2531             t['name'] = type(self).__name__[:-len('IE')]
2532             yield t
2533
2534     def is_suitable(self, age_limit):
2535         """ Test whether the extractor is generally suitable for the given
2536         age limit (i.e. pornographic sites are not, all others usually are) """
2537
2538         any_restricted = False
2539         for tc in self.get_testcases(include_onlymatching=False):
2540             if tc.get('playlist', []):
2541                 tc = tc['playlist'][0]
2542             is_restricted = age_restricted(
2543                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2544             if not is_restricted:
2545                 return True
2546             any_restricted = any_restricted or is_restricted
2547         return not any_restricted
2548
2549     def extract_subtitles(self, *args, **kwargs):
2550         if (self._downloader.params.get('writesubtitles', False) or
2551                 self._downloader.params.get('listsubtitles')):
2552             return self._get_subtitles(*args, **kwargs)
2553         return {}
2554
2555     def _get_subtitles(self, *args, **kwargs):
2556         raise NotImplementedError('This method must be implemented by subclasses')
2557
2558     @staticmethod
2559     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2560         """ Merge subtitle items for one language. Items with duplicated URLs
2561         will be dropped. """
2562         list1_urls = set([item['url'] for item in subtitle_list1])
2563         ret = list(subtitle_list1)
2564         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2565         return ret
2566
2567     @classmethod
2568     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2569         """ Merge two subtitle dictionaries, language by language. """
2570         ret = dict(subtitle_dict1)
2571         for lang in subtitle_dict2:
2572             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2573         return ret
2574
2575     def extract_automatic_captions(self, *args, **kwargs):
2576         if (self._downloader.params.get('writeautomaticsub', False) or
2577                 self._downloader.params.get('listsubtitles')):
2578             return self._get_automatic_captions(*args, **kwargs)
2579         return {}
2580
2581     def _get_automatic_captions(self, *args, **kwargs):
2582         raise NotImplementedError('This method must be implemented by subclasses')
2583
2584     def mark_watched(self, *args, **kwargs):
2585         if (self._downloader.params.get('mark_watched', False) and
2586                 (self._get_login_info()[0] is not None or
2587                     self._downloader.params.get('cookiefile') is not None)):
2588             self._mark_watched(*args, **kwargs)
2589
2590     def _mark_watched(self, *args, **kwargs):
2591         raise NotImplementedError('This method must be implemented by subclasses')
2592
2593     def geo_verification_headers(self):
2594         headers = {}
2595         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2596         if geo_verification_proxy:
2597             headers['Ytdl-request-proxy'] = geo_verification_proxy
2598         return headers
2599
2600     def _generic_id(self, url):
2601         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2602
2603     def _generic_title(self, url):
2604         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2605
2606
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[1-9][0-9]*):{query}
    where an empty prefix asks for one result, "all" for _MAX_RESULTS
    results and a number for that many results.
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching this extractor's search URLs"""
        pattern = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return pattern % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url matches the search URL pattern"""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search URL and fetch the requested number of results"""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            # Cap the request at the maximum and tell the user about it
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        message = 'This method must be implemented by subclasses'
        raise NotImplementedError(message)

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY"""
        return self._SEARCH_KEY