Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py

   1 # coding: utf-8
   2 from __future__ import unicode_literals
   3
   4 import base64
   5 import datetime
   6 import hashlib
   7 import json
   8 import netrc
   9 import os
  10 import random
  11 import re
  12 import socket
  13 import sys
  14 import time
  15 import math
  16
  17 from ..compat import (
  18     compat_cookiejar,
  19     compat_cookies,
  20     compat_etree_fromstring,
  21     compat_getpass,
  22     compat_http_client,
  23     compat_os_name,
  24     compat_str,
  25     compat_urllib_error,
  26     compat_urllib_parse_unquote,
  27     compat_urllib_parse_urlencode,
  28     compat_urllib_request,
  29     compat_urlparse,
  30 )
  31 from ..downloader.f4m import remove_encrypted_media
  32 from ..utils import (
  33     NO_DEFAULT,
  34     age_restricted,
  35     base_url,
  36     bug_reports_message,
  37     clean_html,
  38     compiled_regex_type,
  39     determine_ext,
  40     determine_protocol,
  41     error_to_compat_str,
  42     ExtractorError,
  43     extract_attributes,
  44     fix_xml_ampersands,
  45     float_or_none,
  46     GeoRestrictedError,
  47     GeoUtils,
  48     int_or_none,
  49     js_to_json,
  50     mimetype2ext,
  51     orderedSet,
  52     parse_codecs,
  53     parse_duration,
  54     parse_iso8601,
  55     parse_m3u8_attributes,
  56     RegexNotFoundError,
  57     sanitized_Request,
  58     sanitize_filename,
  59     unescapeHTML,
  60     unified_strdate,
  61     unified_timestamp,
  62     update_Request,
  63     update_url_query,
  64     urljoin,
  65     url_basename,
  66     xpath_element,
  67     xpath_text,
  68     xpath_with_ns,
  69 )
  70
  71
  72 class InfoExtractor(object):
  73     """Information Extractor class.
  74
  75     Information extractors are the classes that, given a URL, extract
  76     information about the video (or videos) the URL refers to. This
  77     information includes the real video URL, the video title, author and
  78     others. The information is stored in a dictionary which is then
  79     passed to the YoutubeDL. The YoutubeDL processes this
  80     information possibly downloading the video to the file system, among
  81     other possible outcomes.
  82
  83     The type field determines the type of the result.
  84     By far the most common value (and the default if _type is missing) is
  85     "video", which indicates a single video.
  86
  87     For a video, the dictionaries must include the following fields:
  88
  89     id:             Video identifier.
  90     title:          Video title, unescaped.
  91
  92     Additionally, it must contain either a formats entry or a url one:
  93
  94     formats:        A list of dictionaries for each format available, ordered
  95                     from worst to best quality.
  96
  97                     Potential fields:
  98                     * url        Mandatory. The URL of the video file
  99                     * manifest_url
 100                                  The URL of the manifest file in case of
 101                                  fragmented media (DASH, hls, hds)
 102                     * ext        Will be calculated from URL if missing
 103                     * format     A human-readable description of the format
 104                                  ("mp4 container with h264/opus").
 105                                  Calculated from the format_id, width, height.
 106                                  and format_note fields if missing.
 107                     * format_id  A short description of the format
 108                                  ("mp4_h264_opus" or "19").
 109                                 Technically optional, but strongly recommended.
 110                     * format_note Additional info about the format
 111                                  ("3D" or "DASH video")
 112                     * width      Width of the video, if known
 113                     * height     Height of the video, if known
 114                     * resolution Textual description of width and height
 115                     * tbr        Average bitrate of audio and video in KBit/s
 116                     * abr        Average audio bitrate in KBit/s
 117                     * acodec     Name of the audio codec in use
 118                     * asr        Audio sampling rate in Hertz
 119                     * vbr        Average video bitrate in KBit/s
 120                     * fps        Frame rate
 121                     * vcodec     Name of the video codec in use
 122                     * container  Name of the container format
 123                     * filesize   The number of bytes, if known in advance
 124                     * filesize_approx  An estimate for the number of bytes
 125                     * player_url SWF Player URL (used for rtmpdump).
 126                     * protocol   The protocol that will be used for the actual
 127                                  download, lower-case.
 128                                  "http", "https", "rtsp", "rtmp", "rtmpe",
 129                                  "m3u8", "m3u8_native" or "http_dash_segments".
 130                     * fragment_base_url
 131                                  Base URL for fragments. Each fragment's path
 132                                  value (if present) will be relative to
 133                                  this URL.
 134                     * fragments  A list of fragments of a fragmented media.
 135                                  Each fragment entry must contain either an url
 136                                  or a path. If an url is present it should be
 137                                  considered by a client. Otherwise both path and
 138                                  fragment_base_url must be present. Here is
 139                                  the list of all potential fields:
 140                                  * "url" - fragment's URL
 141                                  * "path" - fragment's path relative to
 142                                             fragment_base_url
 143                                  * "duration" (optional, int or float)
 144                                  * "filesize" (optional, int)
 145                     * preference Order number of this format. If this field is
 146                                  present and not None, the formats get sorted
 147                                  by this field, regardless of all other values.
 148                                  -1 for default (order by other properties),
 149                                  -2 or smaller for less than default.
 150                                  < -1000 to hide the format (if there is
 151                                     another one which is strictly better)
 152                     * language   Language code, e.g. "de" or "en-US".
 153                     * language_preference  Is this in the language mentioned in
 154                                  the URL?
 155                                  10 if it's what the URL is about,
 156                                  -1 for default (don't know),
 157                                  -10 otherwise, other values reserved for now.
 158                     * quality    Order number of the video quality of this
 159                                  format, irrespective of the file format.
 160                                  -1 for default (order by other properties),
 161                                  -2 or smaller for less than default.
 162                     * source_preference  Order number for this video source
 163                                   (quality takes higher priority)
 164                                  -1 for default (order by other properties),
 165                                  -2 or smaller for less than default.
 166                     * http_headers  A dictionary of additional HTTP headers
 167                                  to add to the request.
 168                     * stretched_ratio  If given and not 1, indicates that the
 169                                  video's pixels are not square.
 170                                  width : height ratio as float.
 171                     * no_resume  The server does not support resuming the
 172                                  (HTTP or RTMP) download. Boolean.
 173
 174     url:            Final video URL.
 175     ext:            Video filename extension.
 176     format:         The video format, defaults to ext (used for --get-format)
 177     player_url:     SWF Player URL (used for rtmpdump).
 178
 179     The following fields are optional:
 180
 181     alt_title:      A secondary title of the video.
 182     display_id      An alternative identifier for the video, not necessarily
 183                     unique, but available before title. Typically, id is
 184                     something like "4234987", title "Dancing naked mole rats",
 185                     and display_id "dancing-naked-mole-rats"
 186     thumbnails:     A list of dictionaries, with the following entries:
 187                         * "id" (optional, string) - Thumbnail format ID
 188                         * "url"
 189                         * "preference" (optional, int) - quality of the image
 190                         * "width" (optional, int)
 191                         * "height" (optional, int)
 192                         * "resolution" (optional, string "{width}x{height}",
 193                                         deprecated)
 194                         * "filesize" (optional, int)
 195     thumbnail:      Full URL to a video thumbnail image.
 196     description:    Full video description.
 197     uploader:       Full name of the video uploader.
 198     license:        License name the video is licensed under.
 199     creator:        The creator of the video.
 200     release_date:   The date (YYYYMMDD) when the video was released.
 201     timestamp:      UNIX timestamp of the moment the video became available.
 202     upload_date:    Video upload date (YYYYMMDD).
 203                     If not explicitly set, calculated from timestamp.
 204     uploader_id:    Nickname or id of the video uploader.
 205     uploader_url:   Full URL to a personal webpage of the video uploader.
 206     location:       Physical location where the video was filmed.
 207     subtitles:      The available subtitles as a dictionary in the format
 208                     {tag: subformats}. "tag" is usually a language code, and
 209                     "subformats" is a list sorted from lower to higher
 210                     preference, each element is a dictionary with the "ext"
 211                     entry and one of:
 212                         * "data": The subtitles file contents
 213                         * "url": A URL pointing to the subtitles file
 214                     "ext" will be calculated from URL if missing
 215     automatic_captions: Like 'subtitles', used by the YoutubeIE for
 216                     automatically generated captions
 217     duration:       Length of the video in seconds, as an integer or float.
 218     view_count:     How many users have watched the video on the platform.
 219     like_count:     Number of positive ratings of the video
 220     dislike_count:  Number of negative ratings of the video
 221     repost_count:   Number of reposts of the video
 222     average_rating: Average rating given by users, the scale used depends on the webpage
 223     comment_count:  Number of comments on the video
 224     comments:       A list of comments, each with one or more of the following
 225                     properties (all but one of text or html optional):
 226                         * "author" - human-readable name of the comment author
 227                         * "author_id" - user ID of the comment author
 228                         * "id" - Comment ID
 229                         * "html" - Comment as HTML
 230                         * "text" - Plain text of the comment
 231                         * "timestamp" - UNIX timestamp of comment
 232                         * "parent" - ID of the comment this one is replying to.
 233                                      Set to "root" to indicate that this is a
 234                                      comment to the original video.
 235     age_limit:      Age restriction for the video, as an integer (years)
 236     webpage_url:    The URL to the video webpage, if given to youtube-dl it
 237                     should allow to get the same result again. (It will be set
 238                     by YoutubeDL if it's missing)
 239     categories:     A list of categories that the video falls in, for example
 240                     ["Sports", "Berlin"]
 241     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"]
 242     is_live:        True, False, or None (=unknown). Whether this video is a
 243                     live stream that goes on instead of a fixed-length video.
 244     start_time:     Time in seconds where the reproduction should start, as
 245                     specified in the URL.
 246     end_time:       Time in seconds where the reproduction should end, as
 247                     specified in the URL.
 248     chapters:       A list of dictionaries, with the following entries:
 249                         * "start_time" - The start time of the chapter in seconds
 250                         * "end_time" - The end time of the chapter in seconds
 251                         * "title" (optional, string)
 252
 253     The following fields should only be used when the video belongs to some logical
 254     chapter or section:
 255
 256     chapter:        Name or title of the chapter the video belongs to.
 257     chapter_number: Number of the chapter the video belongs to, as an integer.
 258     chapter_id:     Id of the chapter the video belongs to, as a unicode string.
 259
 260     The following fields should only be used when the video is an episode of some
 261     series, programme or podcast:
 262
 263     series:         Title of the series or programme the video episode belongs to.
 264     season:         Title of the season the video episode belongs to.
 265     season_number:  Number of the season the video episode belongs to, as an integer.
 266     season_id:      Id of the season the video episode belongs to, as a unicode string.
 267     episode:        Title of the video episode. Unlike mandatory video title field,
 268                     this field should denote the exact title of the video episode
 269                     without any kind of decoration.
 270     episode_number: Number of the video episode within a season, as an integer.
 271     episode_id:     Id of the video episode, as a unicode string.
 272
 273     The following fields should only be used when the media is a track or a part of
 274     a music album:
 275
 276     track:          Title of the track.
 277     track_number:   Number of the track within an album or a disc, as an integer.
 278     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii),
 279                     as a unicode string.
 280     artist:         Artist(s) of the track.
 281     genre:          Genre(s) of the track.
 282     album:          Title of the album the track belongs to.
 283     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc).
 284     album_artist:   List of all artists appeared on the album (e.g.
 285                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits
 286                     and compilations).
 287     disc_number:    Number of the disc or other physical medium the track belongs to,
 288                     as an integer.
 289     release_year:   Year (YYYY) when the album was released.
 290
 291     Unless mentioned otherwise, the fields should be Unicode strings.
 292
 293     Unless mentioned otherwise, None is equivalent to absence of information.
 294
 295
 296     _type "playlist" indicates multiple videos.
 297     There must be a key "entries", which is a list, an iterable, or a PagedList
 298     object, each element of which is a valid dictionary by this specification.
 299
 300     Additionally, playlists can have "title", "description" and "id" attributes
 301     with the same semantics as videos (see above).
 302
 303
 304     _type "multi_video" indicates that there are multiple videos that
 305     form a single show, for example multiple acts of an opera or TV episode.
 306     It must have an entries key like a playlist and contain all the keys
 307     required for a video at the same time.
 308
 309
 310     _type "url" indicates that the video must be extracted from another
 311     location, possibly by a different extractor. Its only required key is:
 312     "url" - the next URL to extract.
 313     The key "ie_key" can be set to the class name (minus the trailing "IE",
 314     e.g. "Youtube") if the extractor class is known in advance.
 315     Additionally, the dictionary may have any properties of the resolved entity
 316     known in advance, for example "title" if the title of the referred video is
 317     known ahead of time.
 318
 319
 320     _type "url_transparent" entities have the same specification as "url", but
 321     indicate that the given additional information is more precise than the one
 322     associated with the resolved URL.
 323     This is useful when a site employs a video service that hosts the video and
 324     its technical metadata, but that video service does not embed a useful
 325     title, description etc.
 326
 327
 328     Subclasses of this one should re-define the _real_initialize() and
 329     _real_extract() methods and define a _VALID_URL regexp.
 330     Probably, they should also be added to the list of extractors.
 331
 332     _GEO_BYPASS attribute may be set to False in order to disable
 333     geo restriction bypass mechanisms for a particular extractor.
 334     Though it won't disable explicit geo restriction bypass based on
 335     country code provided with geo_bypass_country. (experimental)
 336
 337     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted
 338     countries for this extractor. One of these countries will be used by
 339     geo restriction bypass mechanism right away in order to bypass
 340     geo restriction, of course, if the mechanism is not disabled. (experimental)
 341
 342     NB: both these geo attributes are experimental and may change in future
 343     or be completely removed.
 344
 345     Finally, the _WORKING attribute should be set to False for broken IEs
 346     in order to warn the users and skip the tests.
 347     """
 348
    # Set to True by initialize() once _real_initialize() has been run.
    _ready = False
    # Downloader object assigned through set_downloader(); the methods of
    # this class read its params and use it for URL opening and output.
    _downloader = None
    # Fake IP sent as the X-Forwarded-For HTTP header for geo restriction
    # bypass; None while no fake IP has been chosen.
    _x_forwarded_for_ip = None
    # Set to False to disable the geo restriction bypass mechanisms for a
    # particular extractor.
    _GEO_BYPASS = True
    # Optional list of presumably geo unrestricted countries for this
    # extractor.
    _GEO_COUNTRIES = None
    # Set to False for broken extractors in order to warn the users and skip
    # the tests.
    _WORKING = True
 355
 356     def __init__(self, downloader=None):
 357         """Constructor. Receives an optional downloader."""
 358         self._ready = False
 359         self._x_forwarded_for_ip = None
 360         self.set_downloader(downloader)
 361
 362     @classmethod
 363     def suitable(cls, url):
 364         """Receives a URL and returns True if suitable for this IE."""
 365
 366         # This does not use has/getattr intentionally - we want to know whether
 367         # we have cached the regexp for *this* class, whereas getattr would also
 368         # match the superclass
 369         if '_VALID_URL_RE' not in cls.__dict__:
 370             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 371         return cls._VALID_URL_RE.match(url) is not None
 372
 373     @classmethod
 374     def _match_id(cls, url):
 375         if '_VALID_URL_RE' not in cls.__dict__:
 376             cls._VALID_URL_RE = re.compile(cls._VALID_URL)
 377         m = cls._VALID_URL_RE.match(url)
 378         assert m
 379         return m.group('id')
 380
 381     @classmethod
 382     def working(cls):
 383         """Getter method for _WORKING."""
 384         return cls._WORKING
 385
 386     def initialize(self):
 387         """Initializes an instance (authentication, etc)."""
 388         self._initialize_geo_bypass(self._GEO_COUNTRIES)
 389         if not self._ready:
 390             self._real_initialize()
 391             self._ready = True
 392
 393     def _initialize_geo_bypass(self, countries):
 394         """
 395         Initialize geo restriction bypass mechanism.
 396
 397         This method is used to initialize geo bypass mechanism based on faking
 398         X-Forwarded-For HTTP header. A random country from provided country list
 399         is selected and a random IP belonging to this country is generated. This
 400         IP will be passed as X-Forwarded-For HTTP header in all subsequent
 401         HTTP requests.
 402
 403         This method will be used for initial geo bypass mechanism initialization
 404         during the instance initialization with _GEO_COUNTRIES.
 405
 406         You may also manually call it from extractor's code if geo countries
 407         information is not available beforehand (e.g. obtained during
 408         extraction) or due to some another reason.
 409         """
 410         if not self._x_forwarded_for_ip:
 411             country_code = self._downloader.params.get('geo_bypass_country', None)
 412             # If there is no explicit country for geo bypass specified and
 413             # the extractor is known to be geo restricted let's fake IP
 414             # as X-Forwarded-For right away.
 415             if (not country_code and
 416                     self._GEO_BYPASS and
 417                     self._downloader.params.get('geo_bypass', True) and
 418                     countries):
 419                 country_code = random.choice(countries)
 420             if country_code:
 421                 self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 422                 if self._downloader.params.get('verbose', False):
 423                     self._downloader.to_stdout(
 424                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.'
 425                         % (self._x_forwarded_for_ip, country_code.upper()))
 426
 427     def extract(self, url):
 428         """Extracts URL information and returns it in list of dicts."""
 429         try:
 430             for _ in range(2):
 431                 try:
 432                     self.initialize()
 433                     ie_result = self._real_extract(url)
 434                     if self._x_forwarded_for_ip:
 435                         ie_result['__x_forwarded_for_ip'] = self._x_forwarded_for_ip
 436                     return ie_result
 437                 except GeoRestrictedError as e:
 438                     if self.__maybe_fake_ip_and_retry(e.countries):
 439                         continue
 440                     raise
 441         except ExtractorError:
 442             raise
 443         except compat_http_client.IncompleteRead as e:
 444             raise ExtractorError('A network error has occurred.', cause=e, expected=True)
 445         except (KeyError, StopIteration) as e:
 446             raise ExtractorError('An extractor error has occurred.', cause=e)
 447
 448     def __maybe_fake_ip_and_retry(self, countries):
 449         if (not self._downloader.params.get('geo_bypass_country', None) and
 450                 self._GEO_BYPASS and
 451                 self._downloader.params.get('geo_bypass', True) and
 452                 not self._x_forwarded_for_ip and
 453                 countries):
 454             country_code = random.choice(countries)
 455             self._x_forwarded_for_ip = GeoUtils.random_ipv4(country_code)
 456             if self._x_forwarded_for_ip:
 457                 self.report_warning(
 458                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.'
 459                     % (self._x_forwarded_for_ip, country_code.upper()))
 460                 return True
 461         return False
 462
 463     def set_downloader(self, downloader):
 464         """Sets the downloader for this IE."""
 465         self._downloader = downloader
 466
 467     def _real_initialize(self):
 468         """Real initialization process. Redefine in subclasses."""
 469         pass
 470
 471     def _real_extract(self, url):
 472         """Real extraction process. Redefine in subclasses."""
 473         pass
 474
 475     @classmethod
 476     def ie_key(cls):
 477         """A string for getting the InfoExtractor with get_info_extractor"""
 478         return compat_str(cls.__name__[:-2])
 479
 480     @property
 481     def IE_NAME(self):
 482         return compat_str(type(self).__name__[:-2])
 483
 484     def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers={}, query={}):
 485         """ Returns the response handle """
 486         if note is None:
 487             self.report_download_webpage(video_id)
 488         elif note is not False:
 489             if video_id is None:
 490                 self.to_screen('%s' % (note,))
 491             else:
 492                 self.to_screen('%s: %s' % (video_id, note))
 493         if isinstance(url_or_request, compat_urllib_request.Request):
 494             url_or_request = update_Request(
 495                 url_or_request, data=data, headers=headers, query=query)
 496         else:
 497             if query:
 498                 url_or_request = update_url_query(url_or_request, query)
 499             if data is not None or headers:
 500                 url_or_request = sanitized_Request(url_or_request, data, headers)
 501         try:
 502             return self._downloader.urlopen(url_or_request)
 503         except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
 504             if errnote is False:
 505                 return False
 506             if errnote is None:
 507                 errnote = 'Unable to download webpage'
 508
 509             errmsg = '%s: %s' % (errnote, error_to_compat_str(err))
 510             if fatal:
 511                 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
 512             else:
 513                 self._downloader.report_warning(errmsg)
 514                 return False
 515
 516     def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 517         """ Returns a tuple (page content as string, URL handle) """
 518         # Strip hashes from the URL (#1038)
 519         if isinstance(url_or_request, (compat_str, str)):
 520             url_or_request = url_or_request.partition('#')[0]
 521
 522         # Some sites check X-Forwarded-For HTTP header in order to figure out
 523         # the origin of the client behind proxy. This allows bypassing geo
 524         # restriction by faking this header's value to IP that belongs to some
 525         # geo unrestricted country. We will do so once we encounter any
 526         # geo restriction error.
 527         if self._x_forwarded_for_ip:
 528             if 'X-Forwarded-For' not in headers:
 529                 headers['X-Forwarded-For'] = self._x_forwarded_for_ip
 530
 531         urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query)
 532         if urlh is False:
 533             assert not fatal
 534             return False
 535         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding)
 536         return (content, urlh)
 537
 538     @staticmethod
 539     def _guess_encoding_from_content(content_type, webpage_bytes):
 540         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
 541         if m:
 542             encoding = m.group(1)
 543         else:
 544             m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
 545                           webpage_bytes[:1024])
 546             if m:
 547                 encoding = m.group(1).decode('ascii')
 548             elif webpage_bytes.startswith(b'\xff\xfe'):
 549                 encoding = 'utf-16'
 550             else:
 551                 encoding = 'utf-8'
 552
 553         return encoding
 554
 555     def __check_blocked(self, content):
 556         first_block = content[:512]
 557         if ('<title>Access to this site is blocked</title>' in content and
 558                 'Websense' in first_block):
 559             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
 560             blocked_iframe = self._html_search_regex(
 561                 r'<iframe src="([^"]+)"', content,
 562                 'Websense information URL', default=None)
 563             if blocked_iframe:
 564                 msg += ' Visit %s for more details' % blocked_iframe
 565             raise ExtractorError(msg, expected=True)
 566         if '<title>The URL you requested has been blocked</title>' in first_block:
 567             msg = (
 568                 'Access to this webpage has been blocked by Indian censorship. '
 569                 'Use a VPN or proxy server (with --proxy) to route around it.')
 570             block_msg = self._html_search_regex(
 571                 r'</h1><p>(.*?)</p>',
 572                 content, 'block message', default=None)
 573             if block_msg:
 574                 msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
 575             raise ExtractorError(msg, expected=True)
 576         if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
 577                 'blocklist.rkn.gov.ru' in content):
 578             raise ExtractorError(
 579                 'Access to this webpage has been blocked by decision of the Russian government. '
 580                 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
 581                 expected=True)
 582
 583     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
 584         content_type = urlh.headers.get('Content-Type', '')
 585         webpage_bytes = urlh.read()
 586         if prefix is not None:
 587             webpage_bytes = prefix + webpage_bytes
 588         if not encoding:
 589             encoding = self._guess_encoding_from_content(content_type, webpage_bytes)
 590         if self._downloader.params.get('dump_intermediate_pages', False):
 591             try:
 592                 url = url_or_request.get_full_url()
 593             except AttributeError:
 594                 url = url_or_request
 595             self.to_screen('Dumping request to ' + url)
 596             dump = base64.b64encode(webpage_bytes).decode('ascii')
 597             self._downloader.to_screen(dump)
 598         if self._downloader.params.get('write_pages', False):
 599             try:
 600                 url = url_or_request.get_full_url()
 601             except AttributeError:
 602                 url = url_or_request
 603             basen = '%s_%s' % (video_id, url)
 604             if len(basen) > 240:
 605                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
 606                 basen = basen[:240 - len(h)] + h
 607             raw_filename = basen + '.dump'
 608             filename = sanitize_filename(raw_filename, restricted=True)
 609             self.to_screen('Saving request to ' + filename)
 610             # Working around MAX_PATH limitation on Windows (see
 611             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
 612             if compat_os_name == 'nt':
 613                 absfilepath = os.path.abspath(filename)
 614                 if len(absfilepath) > 259:
 615                     filename = '\\\\?\\' + absfilepath
 616             with open(filename, 'wb') as outf:
 617                 outf.write(webpage_bytes)
 618
 619         try:
 620             content = webpage_bytes.decode(encoding, 'replace')
 621         except LookupError:
 622             content = webpage_bytes.decode('utf-8', 'replace')
 623
 624         self.__check_blocked(content)
 625
 626         return content
 627
 628     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}):
 629         """ Returns the data of the page as a string """
 630         success = False
 631         try_count = 0
 632         while success is False:
 633             try:
 634                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query)
 635                 success = True
 636             except compat_http_client.IncompleteRead as e:
 637                 try_count += 1
 638                 if try_count >= tries:
 639                     raise e
 640                 self._sleep(timeout, video_id)
 641         if res is False:
 642             return res
 643         else:
 644             content, _ = res
 645             return content
 646
 647     def _download_xml(self, url_or_request, video_id,
 648                       note='Downloading XML', errnote='Unable to download XML',
 649                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
 650         """Return the xml as an xml.etree.ElementTree.Element"""
 651         xml_string = self._download_webpage(
 652             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
 653         if xml_string is False:
 654             return xml_string
 655         if transform_source:
 656             xml_string = transform_source(xml_string)
 657         return compat_etree_fromstring(xml_string.encode('utf-8'))
 658
 659     def _download_json(self, url_or_request, video_id,
 660                        note='Downloading JSON metadata',
 661                        errnote='Unable to download JSON metadata',
 662                        transform_source=None,
 663                        fatal=True, encoding=None, data=None, headers={}, query={}):
 664         json_string = self._download_webpage(
 665             url_or_request, video_id, note, errnote, fatal=fatal,
 666             encoding=encoding, data=data, headers=headers, query=query)
 667         if (not fatal) and json_string is False:
 668             return None
 669         return self._parse_json(
 670             json_string, video_id, transform_source=transform_source, fatal=fatal)
 671
 672     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True):
 673         if transform_source:
 674             json_string = transform_source(json_string)
 675         try:
 676             return json.loads(json_string)
 677         except ValueError as ve:
 678             errmsg = '%s: Failed to parse JSON ' % video_id
 679             if fatal:
 680                 raise ExtractorError(errmsg, cause=ve)
 681             else:
 682                 self.report_warning(errmsg + str(ve))
 683
 684     def report_warning(self, msg, video_id=None):
 685         idstr = '' if video_id is None else '%s: ' % video_id
 686         self._downloader.report_warning(
 687             '[%s] %s%s' % (self.IE_NAME, idstr, msg))
 688
 689     def to_screen(self, msg):
 690         """Print msg to screen, prefixing it with '[ie_name]'"""
 691         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg))
 692
 693     def report_extraction(self, id_or_name):
 694         """Report information extraction."""
 695         self.to_screen('%s: Extracting information' % id_or_name)
 696
 697     def report_download_webpage(self, video_id):
 698         """Report webpage download."""
 699         self.to_screen('%s: Downloading webpage' % video_id)
 700
 701     def report_age_confirmation(self):
 702         """Report attempt to confirm age."""
 703         self.to_screen('Confirming age')
 704
 705     def report_login(self):
 706         """Report attempt to log in."""
 707         self.to_screen('Logging in')
 708
 709     @staticmethod
 710     def raise_login_required(msg='This video is only available for registered users'):
 711         raise ExtractorError(
 712             '%s. Use --username and --password or --netrc to provide account credentials.' % msg,
 713             expected=True)
 714
 715     @staticmethod
 716     def raise_geo_restricted(msg='This video is not available from your location due to geo restriction', countries=None):
 717         raise GeoRestrictedError(msg, countries=countries)
 718
 719     # Methods for following #608
 720     @staticmethod
 721     def url_result(url, ie=None, video_id=None, video_title=None):
 722         """Returns a URL that points to a page that should be processed"""
 723         # TODO: ie should be the class used for getting the info
 724         video_info = {'_type': 'url',
 725                       'url': url,
 726                       'ie_key': ie}
 727         if video_id is not None:
 728             video_info['id'] = video_id
 729         if video_title is not None:
 730             video_info['title'] = video_title
 731         return video_info
 732
 733     def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
 734         urlrs = orderedSet(
 735             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
 736             for m in matches)
 737         return self.playlist_result(
 738             urlrs, playlist_id=video_id, playlist_title=video_title)
 739
 740     @staticmethod
 741     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
 742         """Returns a playlist"""
 743         video_info = {'_type': 'playlist',
 744                       'entries': entries}
 745         if playlist_id:
 746             video_info['id'] = playlist_id
 747         if playlist_title:
 748             video_info['title'] = playlist_title
 749         if playlist_description:
 750             video_info['description'] = playlist_description
 751         return video_info
 752
 753     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 754         """
 755         Perform a regex search on the given string, using a single or a list of
 756         patterns returning the first matching group.
 757         In case of failure return a default value or raise a WARNING or a
 758         RegexNotFoundError, depending on fatal, specifying the field name.
 759         """
 760         if isinstance(pattern, (str, compat_str, compiled_regex_type)):
 761             mobj = re.search(pattern, string, flags)
 762         else:
 763             for p in pattern:
 764                 mobj = re.search(p, string, flags)
 765                 if mobj:
 766                     break
 767
 768         if not self._downloader.params.get('no_color') and compat_os_name != 'nt' and sys.stderr.isatty():
 769             _name = '\033[0;34m%s\033[0m' % name
 770         else:
 771             _name = name
 772
 773         if mobj:
 774             if group is None:
 775                 # return the first matching group
 776                 return next(g for g in mobj.groups() if g is not None)
 777             else:
 778                 return mobj.group(group)
 779         elif default is not NO_DEFAULT:
 780             return default
 781         elif fatal:
 782             raise RegexNotFoundError('Unable to extract %s' % _name)
 783         else:
 784             self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
 785             return None
 786
 787     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
 788         """
 789         Like _search_regex, but strips HTML tags and unescapes entities.
 790         """
 791         res = self._search_regex(pattern, string, name, default, fatal, flags, group)
 792         if res:
 793             return clean_html(res).strip()
 794         else:
 795             return res
 796
 797     def _get_netrc_login_info(self, netrc_machine=None):
 798         username = None
 799         password = None
 800         netrc_machine = netrc_machine or self._NETRC_MACHINE
 801
 802         if self._downloader.params.get('usenetrc', False):
 803             try:
 804                 info = netrc.netrc().authenticators(netrc_machine)
 805                 if info is not None:
 806                     username = info[0]
 807                     password = info[2]
 808                 else:
 809                     raise netrc.NetrcParseError(
 810                         'No authenticators for %s' % netrc_machine)
 811             except (IOError, netrc.NetrcParseError) as err:
 812                 self._downloader.report_warning(
 813                     'parsing .netrc: %s' % error_to_compat_str(err))
 814
 815         return username, password
 816
 817     def _get_login_info(self, username_option='username', password_option='password', netrc_machine=None):
 818         """
 819         Get the login info as (username, password)
 820         First look for the manually specified credentials using username_option
 821         and password_option as keys in params dictionary. If no such credentials
 822         available look in the netrc file using the netrc_machine or _NETRC_MACHINE
 823         value.
 824         If there's no info available, return (None, None)
 825         """
 826         if self._downloader is None:
 827             return (None, None)
 828
 829         downloader_params = self._downloader.params
 830
 831         # Attempt to use provided username and password or .netrc data
 832         if downloader_params.get(username_option) is not None:
 833             username = downloader_params[username_option]
 834             password = downloader_params[password_option]
 835         else:
 836             username, password = self._get_netrc_login_info(netrc_machine)
 837
 838         return username, password
 839
 840     def _get_tfa_info(self, note='two-factor verification code'):
 841         """
 842         Get the two-factor authentication info
 843         TODO - asking the user will be required for sms/phone verify
 844         currently just uses the command line option
 845         If there's no info available, return None
 846         """
 847         if self._downloader is None:
 848             return None
 849         downloader_params = self._downloader.params
 850
 851         if downloader_params.get('twofactor') is not None:
 852             return downloader_params['twofactor']
 853
 854         return compat_getpass('Type %s and press [Return]: ' % note)
 855
 856     # Helper functions for extracting OpenGraph info
 857     @staticmethod
 858     def _og_regexes(prop):
 859         content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
 860         property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
 861                        % {'prop': re.escape(prop)})
 862         template = r'<meta[^>]+?%s[^>]+?%s'
 863         return [
 864             template % (property_re, content_re),
 865             template % (content_re, property_re),
 866         ]
 867
 868     @staticmethod
 869     def _meta_regex(prop):
 870         return r'''(?isx)<meta
 871                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1)
 872                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop)
 873
 874     def _og_search_property(self, prop, html, name=None, **kargs):
 875         if not isinstance(prop, (list, tuple)):
 876             prop = [prop]
 877         if name is None:
 878             name = 'OpenGraph %s' % prop[0]
 879         og_regexes = []
 880         for p in prop:
 881             og_regexes.extend(self._og_regexes(p))
 882         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs)
 883         if escaped is None:
 884             return None
 885         return unescapeHTML(escaped)
 886
 887     def _og_search_thumbnail(self, html, **kargs):
 888         return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs)
 889
 890     def _og_search_description(self, html, **kargs):
 891         return self._og_search_property('description', html, fatal=False, **kargs)
 892
 893     def _og_search_title(self, html, **kargs):
 894         return self._og_search_property('title', html, **kargs)
 895
 896     def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
 897         regexes = self._og_regexes('video') + self._og_regexes('video:url')
 898         if secure:
 899             regexes = self._og_regexes('video:secure_url') + regexes
 900         return self._html_search_regex(regexes, html, name, **kargs)
 901
 902     def _og_search_url(self, html, **kargs):
 903         return self._og_search_property('url', html, **kargs)
 904
 905     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
 906         if not isinstance(name, (list, tuple)):
 907             name = [name]
 908         if display_name is None:
 909             display_name = name[0]
 910         return self._html_search_regex(
 911             [self._meta_regex(n) for n in name],
 912             html, display_name, fatal=fatal, group='content', **kwargs)
 913
 914     def _dc_search_uploader(self, html):
 915         return self._html_search_meta('dc.creator', html, 'uploader')
 916
 917     def _rta_search(self, html):
 918         # See http://www.rtalabel.org/index.php?content=howtofaq#single
 919         if re.search(r'(?ix)<meta\s+name="rating"\s+'
 920                      r'     content="RTA-5042-1996-1400-1577-RTA"',
 921                      html):
 922             return 18
 923         return 0
 924
 925     def _media_rating_search(self, html):
 926         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
 927         rating = self._html_search_meta('rating', html)
 928
 929         if not rating:
 930             return None
 931
 932         RATING_TABLE = {
 933             'safe for kids': 0,
 934             'general': 8,
 935             '14 years': 14,
 936             'mature': 17,
 937             'restricted': 19,
 938         }
 939         return RATING_TABLE.get(rating.lower())
 940
 941     def _family_friendly_search(self, html):
 942         # See http://schema.org/VideoObject
 943         family_friendly = self._html_search_meta('isFamilyFriendly', html)
 944
 945         if not family_friendly:
 946             return None
 947
 948         RATING_TABLE = {
 949             '1': 0,
 950             'true': 0,
 951             '0': 18,
 952             'false': 18,
 953         }
 954         return RATING_TABLE.get(family_friendly.lower())
 955
 956     def _twitter_search_player(self, html):
 957         return self._html_search_meta('twitter:player', html,
 958                                       'twitter card player')
 959
 960     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs):
 961         json_ld = self._search_regex(
 962             r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
 963             html, 'JSON-LD', group='json_ld', **kwargs)
 964         default = kwargs.get('default', NO_DEFAULT)
 965         if not json_ld:
 966             return default if default is not NO_DEFAULT else {}
 967         # JSON-LD may be malformed and thus `fatal` should be respected.
 968         # At the same time `default` may be passed that assumes `fatal=False`
 969         # for _search_regex. Let's simulate the same behavior here as well.
 970         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False
 971         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type)
 972
 973     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None):
 974         if isinstance(json_ld, compat_str):
 975             json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
 976         if not json_ld:
 977             return {}
 978         info = {}
 979         if not isinstance(json_ld, (list, tuple, dict)):
 980             return info
 981         if isinstance(json_ld, dict):
 982             json_ld = [json_ld]
 983
 984         def extract_video_object(e):
 985             assert e['@type'] == 'VideoObject'
 986             info.update({
 987                 'url': e.get('contentUrl'),
 988                 'title': unescapeHTML(e.get('name')),
 989                 'description': unescapeHTML(e.get('description')),
 990                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
 991                 'duration': parse_duration(e.get('duration')),
 992                 'timestamp': unified_timestamp(e.get('uploadDate')),
 993                 'filesize': float_or_none(e.get('contentSize')),
 994                 'tbr': int_or_none(e.get('bitrate')),
 995                 'width': int_or_none(e.get('width')),
 996                 'height': int_or_none(e.get('height')),
 997                 'view_count': int_or_none(e.get('interactionCount')),
 998             })
 999
1000         for e in json_ld:
1001             if e.get('@context') == 'http://schema.org':
1002                 item_type = e.get('@type')
1003                 if expected_type is not None and expected_type != item_type:
1004                     return info
1005                 if item_type == 'TVEpisode':
1006                     info.update({
1007                         'episode': unescapeHTML(e.get('name')),
1008                         'episode_number': int_or_none(e.get('episodeNumber')),
1009                         'description': unescapeHTML(e.get('description')),
1010                     })
1011                     part_of_season = e.get('partOfSeason')
1012                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
1013                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
1014                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries')
1015                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
1016                         info['series'] = unescapeHTML(part_of_series.get('name'))
1017                 elif item_type == 'Article':
1018                     info.update({
1019                         'timestamp': parse_iso8601(e.get('datePublished')),
1020                         'title': unescapeHTML(e.get('headline')),
1021                         'description': unescapeHTML(e.get('articleBody')),
1022                     })
1023                 elif item_type == 'VideoObject':
1024                     extract_video_object(e)
1025                 elif item_type == 'WebPage':
1026                     video = e.get('video')
1027                     if isinstance(video, dict) and video.get('@type') == 'VideoObject':
1028                         extract_video_object(video)
1029                 break
1030         return dict((k, v) for k, v in info.items() if v is not None)
1031
1032     @staticmethod
1033     def _hidden_inputs(html):
1034         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html)
1035         hidden_inputs = {}
1036         for input in re.findall(r'(?i)(<input[^>]+>)', html):
1037             attrs = extract_attributes(input)
1038             if not input:
1039                 continue
1040             if attrs.get('type') not in ('hidden', 'submit'):
1041                 continue
1042             name = attrs.get('name') or attrs.get('id')
1043             value = attrs.get('value')
1044             if name and value is not None:
1045                 hidden_inputs[name] = value
1046         return hidden_inputs
1047
1048     def _form_hidden_inputs(self, form_id, html):
1049         form = self._search_regex(
1050             r'(?is)<form[^>]+?id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id,
1051             html, '%s form' % form_id, group='form')
1052         return self._hidden_inputs(form)
1053
1054     def _sort_formats(self, formats, field_preference=None):
1055         if not formats:
1056             raise ExtractorError('No video formats found')
1057
1058         for f in formats:
1059             # Automatically determine tbr when missing based on abr and vbr (improves
1060             # formats sorting in some cases)
1061             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
1062                 f['tbr'] = f['abr'] + f['vbr']
1063
1064         def _formats_key(f):
1065             # TODO remove the following workaround
1066             from ..utils import determine_ext
1067             if not f.get('ext') and 'url' in f:
1068                 f['ext'] = determine_ext(f['url'])
1069
1070             if isinstance(field_preference, (list, tuple)):
1071                 return tuple(
1072                     f.get(field)
1073                     if f.get(field) is not None
1074                     else ('' if field == 'format_id' else -1)
1075                     for field in field_preference)
1076
1077             preference = f.get('preference')
1078             if preference is None:
1079                 preference = 0
1080                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
1081                     preference -= 0.5
1082
1083             protocol = f.get('protocol') or determine_protocol(f)
1084             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1)
1085
1086             if f.get('vcodec') == 'none':  # audio only
1087                 preference -= 50
1088                 if self._downloader.params.get('prefer_free_formats'):
1089                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
1090                 else:
1091                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
1092                 ext_preference = 0
1093                 try:
1094                     audio_ext_preference = ORDER.index(f['ext'])
1095                 except ValueError:
1096                     audio_ext_preference = -1
1097             else:
1098                 if f.get('acodec') == 'none':  # video only
1099                     preference -= 40
1100                 if self._downloader.params.get('prefer_free_formats'):
1101                     ORDER = ['flv', 'mp4', 'webm']
1102                 else:
1103                     ORDER = ['webm', 'flv', 'mp4']
1104                 try:
1105                     ext_preference = ORDER.index(f['ext'])
1106                 except ValueError:
1107                     ext_preference = -1
1108                 audio_ext_preference = 0
1109
1110             return (
1111                 preference,
1112                 f.get('language_preference') if f.get('language_preference') is not None else -1,
1113                 f.get('quality') if f.get('quality') is not None else -1,
1114                 f.get('tbr') if f.get('tbr') is not None else -1,
1115                 f.get('filesize') if f.get('filesize') is not None else -1,
1116                 f.get('vbr') if f.get('vbr') is not None else -1,
1117                 f.get('height') if f.get('height') is not None else -1,
1118                 f.get('width') if f.get('width') is not None else -1,
1119                 proto_preference,
1120                 ext_preference,
1121                 f.get('abr') if f.get('abr') is not None else -1,
1122                 audio_ext_preference,
1123                 f.get('fps') if f.get('fps') is not None else -1,
1124                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
1125                 f.get('source_preference') if f.get('source_preference') is not None else -1,
1126                 f.get('format_id') if f.get('format_id') is not None else '',
1127             )
1128         formats.sort(key=_formats_key)
1129
1130     def _check_formats(self, formats, video_id):
1131         if formats:
1132             formats[:] = filter(
1133                 lambda f: self._is_valid_url(
1134                     f['url'], video_id,
1135                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
1136                 formats)
1137
1138     @staticmethod
1139     def _remove_duplicate_formats(formats):
1140         format_urls = set()
1141         unique_formats = []
1142         for f in formats:
1143             if f['url'] not in format_urls:
1144                 format_urls.add(f['url'])
1145                 unique_formats.append(f)
1146         formats[:] = unique_formats
1147
1148     def _is_valid_url(self, url, video_id, item='video', headers={}):
1149         url = self._proto_relative_url(url, scheme='http:')
1150         # For now assume non HTTP(S) URLs always valid
1151         if not (url.startswith('http://') or url.startswith('https://')):
1152             return True
1153         try:
1154             self._request_webpage(url, video_id, 'Checking %s URL' % item, headers=headers)
1155             return True
1156         except ExtractorError as e:
1157             if isinstance(e.cause, compat_urllib_error.URLError):
1158                 self.to_screen(
1159                     '%s: %s URL is invalid, skipping' % (video_id, item))
1160                 return False
1161             raise
1162
1163     def http_scheme(self):
1164         """ Either "http:" or "https:", depending on the user's preferences """
1165         return (
1166             'http:'
1167             if self._downloader.params.get('prefer_insecure', False)
1168             else 'https:')
1169
1170     def _proto_relative_url(self, url, scheme=None):
1171         if url is None:
1172             return url
1173         if url.startswith('//'):
1174             if scheme is None:
1175                 scheme = self.http_scheme()
1176             return scheme + url
1177         else:
1178             return url
1179
1180     def _sleep(self, timeout, video_id, msg_template=None):
1181         if msg_template is None:
1182             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
1183         msg = msg_template % {'video_id': video_id, 'timeout': timeout}
1184         self.to_screen(msg)
1185         time.sleep(timeout)
1186
1187     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
1188                              transform_source=lambda s: fix_xml_ampersands(s).strip(),
1189                              fatal=True, m3u8_id=None):
1190         manifest = self._download_xml(
1191             manifest_url, video_id, 'Downloading f4m manifest',
1192             'Unable to download f4m manifest',
1193             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
1194             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
1195             transform_source=transform_source,
1196             fatal=fatal)
1197
1198         if manifest is False:
1199             return []
1200
1201         return self._parse_f4m_formats(
1202             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1203             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id)
1204
1205     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None,
1206                            transform_source=lambda s: fix_xml_ampersands(s).strip(),
1207                            fatal=True, m3u8_id=None):
1208         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy
1209         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0')
1210         if akamai_pv is not None and ';' in akamai_pv.text:
1211             playerVerificationChallenge = akamai_pv.text.split(';')[0]
1212             if playerVerificationChallenge.strip() != '':
1213                 return []
1214
1215         formats = []
1216         manifest_version = '1.0'
1217         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
1218         if not media_nodes:
1219             manifest_version = '2.0'
1220             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
1221         # Remove unsupported DRM protected media from final formats
1222         # rendition (see https://github.com/rg3/youtube-dl/issues/8573).
1223         media_nodes = remove_encrypted_media(media_nodes)
1224         if not media_nodes:
1225             return formats
1226         base_url = xpath_text(
1227             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
1228             'base URL', default=None)
1229         if base_url:
1230             base_url = base_url.strip()
1231
1232         bootstrap_info = xpath_element(
1233             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'],
1234             'bootstrap info', default=None)
1235
1236         vcodec = None
1237         mime_type = xpath_text(
1238             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'],
1239             'base URL', default=None)
1240         if mime_type and mime_type.startswith('audio/'):
1241             vcodec = 'none'
1242
1243         for i, media_el in enumerate(media_nodes):
1244             tbr = int_or_none(media_el.attrib.get('bitrate'))
1245             width = int_or_none(media_el.attrib.get('width'))
1246             height = int_or_none(media_el.attrib.get('height'))
1247             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)]))
1248             # If <bootstrapInfo> is present, the specified f4m is a
1249             # stream-level manifest, and only set-level manifests may refer to
1250             # external resources.  See section 11.4 and section 4 of F4M spec
1251             if bootstrap_info is None:
1252                 media_url = None
1253                 # @href is introduced in 2.0, see section 11.6 of F4M spec
1254                 if manifest_version == '2.0':
1255                     media_url = media_el.attrib.get('href')
1256                 if media_url is None:
1257                     media_url = media_el.attrib.get('url')
1258                 if not media_url:
1259                     continue
1260                 manifest_url = (
1261                     media_url if media_url.startswith('http://') or media_url.startswith('https://')
1262                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
1263                 # If media_url is itself a f4m manifest do the recursive extraction
1264                 # since bitrates in parent manifest (this one) and media_url manifest
1265                 # may differ leading to inability to resolve the format by requested
1266                 # bitrate in f4m downloader
1267                 ext = determine_ext(manifest_url)
1268                 if ext == 'f4m':
1269                     f4m_formats = self._extract_f4m_formats(
1270                         manifest_url, video_id, preference=preference, f4m_id=f4m_id,
1271                         transform_source=transform_source, fatal=fatal)
1272                     # Sometimes stream-level manifest contains single media entry that
1273                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player).
1274                     # At the same time parent's media entry in set-level manifest may
1275                     # contain it. We will copy it from parent in such cases.
1276                     if len(f4m_formats) == 1:
1277                         f = f4m_formats[0]
1278                         f.update({
1279                             'tbr': f.get('tbr') or tbr,
1280                             'width': f.get('width') or width,
1281                             'height': f.get('height') or height,
1282                             'format_id': f.get('format_id') if not tbr else format_id,
1283                             'vcodec': vcodec,
1284                         })
1285                     formats.extend(f4m_formats)
1286                     continue
1287                 elif ext == 'm3u8':
1288                     formats.extend(self._extract_m3u8_formats(
1289                         manifest_url, video_id, 'mp4', preference=preference,
1290                         m3u8_id=m3u8_id, fatal=fatal))
1291                     continue
1292             formats.append({
1293                 'format_id': format_id,
1294                 'url': manifest_url,
1295                 'manifest_url': manifest_url,
1296                 'ext': 'flv' if bootstrap_info is not None else None,
1297                 'tbr': tbr,
1298                 'width': width,
1299                 'height': height,
1300                 'vcodec': vcodec,
1301                 'preference': preference,
1302             })
1303         return formats
1304
1305     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
1306         return {
1307             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
1308             'url': m3u8_url,
1309             'ext': ext,
1310             'protocol': 'm3u8',
1311             'preference': preference - 100 if preference else -100,
1312             'resolution': 'multiple',
1313             'format_note': 'Quality selection URL',
1314         }
1315
1316     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
1317                               entry_protocol='m3u8', preference=None,
1318                               m3u8_id=None, note=None, errnote=None,
1319                               fatal=True, live=False):
1320         res = self._download_webpage_handle(
1321             m3u8_url, video_id,
1322             note=note or 'Downloading m3u8 information',
1323             errnote=errnote or 'Failed to download m3u8 information',
1324             fatal=fatal)
1325
1326         if res is False:
1327             return []
1328
1329         m3u8_doc, urlh = res
1330         m3u8_url = urlh.geturl()
1331
1332         return self._parse_m3u8_formats(
1333             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
1334             preference=preference, m3u8_id=m3u8_id, live=live)
1335
1336     def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
1337                             entry_protocol='m3u8', preference=None,
1338                             m3u8_id=None, live=False):
1339         if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
1340             return []
1341
1342         formats = []
1343
1344         format_url = lambda u: (
1345             u
1346             if re.match(r'^https?://', u)
1347             else compat_urlparse.urljoin(m3u8_url, u))
1348
1349         # References:
1350         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
1351         # 2. https://github.com/rg3/youtube-dl/issues/12211
1352
1353         # We should try extracting formats only from master playlists [1, 4.3.4],
1354         # i.e. playlists that describe available qualities. On the other hand
1355         # media playlists [1, 4.3.3] should be returned as is since they contain
1356         # just the media without qualities renditions.
1357         # Fortunately, master playlist can be easily distinguished from media
1358         # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
1359         # master playlist tags MUST NOT appear in a media playist and vice versa.
1360         # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
1361         # media playlist and MUST NOT appear in master playlist thus we can
1362         # clearly detect media playlist with this criterion.
1363
1364         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
1365             return [{
1366                 'url': m3u8_url,
1367                 'format_id': m3u8_id,
1368                 'ext': ext,
1369                 'protocol': entry_protocol,
1370                 'preference': preference,
1371             }]
1372
1373         groups = {}
1374         last_stream_inf = {}
1375
1376         def extract_media(x_media_line):
1377             media = parse_m3u8_attributes(x_media_line)
1378             # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
1379             media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
1380             if not (media_type and group_id and name):
1381                 return
1382             groups.setdefault(group_id, []).append(media)
1383             if media_type not in ('VIDEO', 'AUDIO'):
1384                 return
1385             media_url = media.get('URI')
1386             if media_url:
1387                 format_id = []
1388                 for v in (group_id, name):
1389                     if v:
1390                         format_id.append(v)
1391                 f = {
1392                     'format_id': '-'.join(format_id),
1393                     'url': format_url(media_url),
1394                     'manifest_url': m3u8_url,
1395                     'language': media.get('LANGUAGE'),
1396                     'ext': ext,
1397                     'protocol': entry_protocol,
1398                     'preference': preference,
1399                 }
1400                 if media_type == 'AUDIO':
1401                     f['vcodec'] = 'none'
1402                 formats.append(f)
1403
1404         def build_stream_name():
1405             # Despite specification does not mention NAME attribute for
1406             # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
1407             # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
1408             # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
1409             stream_name = last_stream_inf.get('NAME')
1410             if stream_name:
1411                 return stream_name
1412             # If there is no NAME in EXT-X-STREAM-INF it will be obtained
1413             # from corresponding rendition group
1414             stream_group_id = last_stream_inf.get('VIDEO')
1415             if not stream_group_id:
1416                 return
1417             stream_group = groups.get(stream_group_id)
1418             if not stream_group:
1419                 return stream_group_id
1420             rendition = stream_group[0]
1421             return rendition.get('NAME') or stream_group_id
1422
1423         for line in m3u8_doc.splitlines():
1424             if line.startswith('#EXT-X-STREAM-INF:'):
1425                 last_stream_inf = parse_m3u8_attributes(line)
1426             elif line.startswith('#EXT-X-MEDIA:'):
1427                 extract_media(line)
1428             elif line.startswith('#') or not line.strip():
1429                 continue
1430             else:
1431                 tbr = float_or_none(
1432                     last_stream_inf.get('AVERAGE-BANDWIDTH') or
1433                     last_stream_inf.get('BANDWIDTH'), scale=1000)
1434                 format_id = []
1435                 if m3u8_id:
1436                     format_id.append(m3u8_id)
1437                 stream_name = build_stream_name()
1438                 # Bandwidth of live streams may differ over time thus making
1439                 # format_id unpredictable. So it's better to keep provided
1440                 # format_id intact.
1441                 if not live:
1442                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
1443                 manifest_url = format_url(line.strip())
1444                 f = {
1445                     'format_id': '-'.join(format_id),
1446                     'url': manifest_url,
1447                     'manifest_url': m3u8_url,
1448                     'tbr': tbr,
1449                     'ext': ext,
1450                     'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
1451                     'protocol': entry_protocol,
1452                     'preference': preference,
1453                 }
1454                 resolution = last_stream_inf.get('RESOLUTION')
1455                 if resolution:
1456                     mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
1457                     if mobj:
1458                         f['width'] = int(mobj.group('width'))
1459                         f['height'] = int(mobj.group('height'))
1460                 # Unified Streaming Platform
1461                 mobj = re.search(
1462                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
1463                 if mobj:
1464                     abr, vbr = mobj.groups()
1465                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
1466                     f.update({
1467                         'vbr': vbr,
1468                         'abr': abr,
1469                     })
1470                 codecs = parse_codecs(last_stream_inf.get('CODECS'))
1471                 f.update(codecs)
1472                 audio_group_id = last_stream_inf.get('AUDIO')
1473                 # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
1474                 # references a rendition group MUST have a CODECS attribute.
1475                 # However, this is not always respected, for example, [2]
1476                 # contains EXT-X-STREAM-INF tag which references AUDIO
1477                 # rendition group but does not have CODECS and despite
1478                 # referencing audio group an audio group, it represents
1479                 # a complete (with audio and video) format. So, for such cases
1480                 # we will ignore references to rendition groups and treat them
1481                 # as complete formats.
1482                 if audio_group_id and codecs and f.get('vcodec') != 'none':
1483                     audio_group = groups.get(audio_group_id)
1484                     if audio_group and audio_group[0].get('URI'):
1485                         # TODO: update acodec for audio only formats with
1486                         # the same GROUP-ID
1487                         f['acodec'] = 'none'
1488                 formats.append(f)
1489                 last_stream_inf = {}
1490         return formats
1491
1492     @staticmethod
1493     def _xpath_ns(path, namespace=None):
1494         if not namespace:
1495             return path
1496         out = []
1497         for c in path.split('/'):
1498             if not c or c == '.':
1499                 out.append(c)
1500             else:
1501                 out.append('{%s}%s' % (namespace, c))
1502         return '/'.join(out)
1503
1504     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
1505         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
1506
1507         if smil is False:
1508             assert not fatal
1509             return []
1510
1511         namespace = self._parse_smil_namespace(smil)
1512
1513         return self._parse_smil_formats(
1514             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1515
1516     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
1517         smil = self._download_smil(smil_url, video_id, fatal=fatal)
1518         if smil is False:
1519             return {}
1520         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1521
1522     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None):
1523         return self._download_xml(
1524             smil_url, video_id, 'Downloading SMIL file',
1525             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source)
1526
1527     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
1528         namespace = self._parse_smil_namespace(smil)
1529
1530         formats = self._parse_smil_formats(
1531             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
1532         subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
1533
1534         video_id = os.path.splitext(url_basename(smil_url))[0]
1535         title = None
1536         description = None
1537         upload_date = None
1538         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1539             name = meta.attrib.get('name')
1540             content = meta.attrib.get('content')
1541             if not name or not content:
1542                 continue
1543             if not title and name == 'title':
1544                 title = content
1545             elif not description and name in ('description', 'abstract'):
1546                 description = content
1547             elif not upload_date and name == 'date':
1548                 upload_date = unified_strdate(content)
1549
1550         thumbnails = [{
1551             'id': image.get('type'),
1552             'url': image.get('src'),
1553             'width': int_or_none(image.get('width')),
1554             'height': int_or_none(image.get('height')),
1555         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
1556
1557         return {
1558             'id': video_id,
1559             'title': title or video_id,
1560             'description': description,
1561             'upload_date': upload_date,
1562             'thumbnails': thumbnails,
1563             'formats': formats,
1564             'subtitles': subtitles,
1565         }
1566
1567     def _parse_smil_namespace(self, smil):
1568         return self._search_regex(
1569             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None)
1570
1571     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
1572         base = smil_url
1573         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
1574             b = meta.get('base') or meta.get('httpBase')
1575             if b:
1576                 base = b
1577                 break
1578
1579         formats = []
1580         rtmp_count = 0
1581         http_count = 0
1582         m3u8_count = 0
1583
1584         srcs = []
1585         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
1586         for medium in media:
1587             src = medium.get('src')
1588             if not src or src in srcs:
1589                 continue
1590             srcs.append(src)
1591
1592             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
1593             filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
1594             width = int_or_none(medium.get('width'))
1595             height = int_or_none(medium.get('height'))
1596             proto = medium.get('proto')
1597             ext = medium.get('ext')
1598             src_ext = determine_ext(src)
1599             streamer = medium.get('streamer') or base
1600
1601             if proto == 'rtmp' or streamer.startswith('rtmp'):
1602                 rtmp_count += 1
1603                 formats.append({
1604                     'url': streamer,
1605                     'play_path': src,
1606                     'ext': 'flv',
1607                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
1608                     'tbr': bitrate,
1609                     'filesize': filesize,
1610                     'width': width,
1611                     'height': height,
1612                 })
1613                 if transform_rtmp_url:
1614                     streamer, src = transform_rtmp_url(streamer, src)
1615                     formats[-1].update({
1616                         'url': streamer,
1617                         'play_path': src,
1618                     })
1619                 continue
1620
1621             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
1622             src_url = src_url.strip()
1623
1624             if proto == 'm3u8' or src_ext == 'm3u8':
1625                 m3u8_formats = self._extract_m3u8_formats(
1626                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
1627                 if len(m3u8_formats) == 1:
1628                     m3u8_count += 1
1629                     m3u8_formats[0].update({
1630                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
1631                         'tbr': bitrate,
1632                         'width': width,
1633                         'height': height,
1634                     })
1635                 formats.extend(m3u8_formats)
1636                 continue
1637
1638             if src_ext == 'f4m':
1639                 f4m_url = src_url
1640                 if not f4m_params:
1641                     f4m_params = {
1642                         'hdcore': '3.2.0',
1643                         'plugin': 'flowplayer-3.2.0.1',
1644                     }
1645                 f4m_url += '&' if '?' in f4m_url else '?'
1646                 f4m_url += compat_urllib_parse_urlencode(f4m_params)
1647                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
1648                 continue
1649
1650             if src_url.startswith('http') and self._is_valid_url(src, video_id):
1651                 http_count += 1
1652                 formats.append({
1653                     'url': src_url,
1654                     'ext': ext or src_ext or 'flv',
1655                     'format_id': 'http-%d' % (bitrate or http_count),
1656                     'tbr': bitrate,
1657                     'filesize': filesize,
1658                     'width': width,
1659                     'height': height,
1660                 })
1661                 continue
1662
1663         return formats
1664
1665     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
1666         urls = []
1667         subtitles = {}
1668         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
1669             src = textstream.get('src')
1670             if not src or src in urls:
1671                 continue
1672             urls.append(src)
1673             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
1674             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
1675             subtitles.setdefault(lang, []).append({
1676                 'url': src,
1677                 'ext': ext,
1678             })
1679         return subtitles
1680
1681     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True):
1682         xspf = self._download_xml(
1683             playlist_url, playlist_id, 'Downloading xpsf playlist',
1684             'Unable to download xspf manifest', fatal=fatal)
1685         if xspf is False:
1686             return []
1687         return self._parse_xspf(xspf, playlist_id)
1688
1689     def _parse_xspf(self, playlist, playlist_id):
1690         NS_MAP = {
1691             'xspf': 'http://xspf.org/ns/0/',
1692             's1': 'http://static.streamone.nl/player/ns/0',
1693         }
1694
1695         entries = []
1696         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
1697             title = xpath_text(
1698                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
1699             description = xpath_text(
1700                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
1701             thumbnail = xpath_text(
1702                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
1703             duration = float_or_none(
1704                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
1705
1706             formats = [{
1707                 'url': location.text,
1708                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
1709                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
1710                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
1711             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))]
1712             self._sort_formats(formats)
1713
1714             entries.append({
1715                 'id': playlist_id,
1716                 'title': title,
1717                 'description': description,
1718                 'thumbnail': thumbnail,
1719                 'duration': duration,
1720                 'formats': formats,
1721             })
1722         return entries
1723
1724     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
1725         res = self._download_webpage_handle(
1726             mpd_url, video_id,
1727             note=note or 'Downloading MPD manifest',
1728             errnote=errnote or 'Failed to download MPD manifest',
1729             fatal=fatal)
1730         if res is False:
1731             return []
1732         mpd, urlh = res
1733         mpd_base_url = base_url(urlh.geturl())
1734
1735         return self._parse_mpd_formats(
1736             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url,
1737             formats_dict=formats_dict, mpd_url=mpd_url)
1738
1739     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
1740         """
1741         Parse formats from MPD manifest.
1742         References:
1743          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
1744             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
1745          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
1746         """
1747         if mpd_doc.get('type') == 'dynamic':
1748             return []
1749
1750         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
1751
1752         def _add_ns(path):
1753             return self._xpath_ns(path, namespace)
1754
1755         def is_drm_protected(element):
1756             return element.find(_add_ns('ContentProtection')) is not None
1757
1758         def extract_multisegment_info(element, ms_parent_info):
1759             ms_info = ms_parent_info.copy()
1760
1761             # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
1762             # common attributes and elements.  We will only extract relevant
1763             # for us.
1764             def extract_common(source):
1765                 segment_timeline = source.find(_add_ns('SegmentTimeline'))
1766                 if segment_timeline is not None:
1767                     s_e = segment_timeline.findall(_add_ns('S'))
1768                     if s_e:
1769                         ms_info['total_number'] = 0
1770                         ms_info['s'] = []
1771                         for s in s_e:
1772                             r = int(s.get('r', 0))
1773                             ms_info['total_number'] += 1 + r
1774                             ms_info['s'].append({
1775                                 't': int(s.get('t', 0)),
1776                                 # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
1777                                 'd': int(s.attrib['d']),
1778                                 'r': r,
1779                             })
1780                 start_number = source.get('startNumber')
1781                 if start_number:
1782                     ms_info['start_number'] = int(start_number)
1783                 timescale = source.get('timescale')
1784                 if timescale:
1785                     ms_info['timescale'] = int(timescale)
1786                 segment_duration = source.get('duration')
1787                 if segment_duration:
1788                     ms_info['segment_duration'] = int(segment_duration)
1789
1790             def extract_Initialization(source):
1791                 initialization = source.find(_add_ns('Initialization'))
1792                 if initialization is not None:
1793                     ms_info['initialization_url'] = initialization.attrib['sourceURL']
1794
1795             segment_list = element.find(_add_ns('SegmentList'))
1796             if segment_list is not None:
1797                 extract_common(segment_list)
1798                 extract_Initialization(segment_list)
1799                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
1800                 if segment_urls_e:
1801                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
1802             else:
1803                 segment_template = element.find(_add_ns('SegmentTemplate'))
1804                 if segment_template is not None:
1805                     extract_common(segment_template)
1806                     media = segment_template.get('media')
1807                     if media:
1808                         ms_info['media'] = media
1809                     initialization = segment_template.get('initialization')
1810                     if initialization:
1811                         ms_info['initialization'] = initialization
1812                     else:
1813                         extract_Initialization(segment_template)
1814             return ms_info
1815
1816         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
1817         formats = []
1818         for period in mpd_doc.findall(_add_ns('Period')):
1819             period_duration = parse_duration(period.get('duration')) or mpd_duration
1820             period_ms_info = extract_multisegment_info(period, {
1821                 'start_number': 1,
1822                 'timescale': 1,
1823             })
1824             for adaptation_set in period.findall(_add_ns('AdaptationSet')):
1825                 if is_drm_protected(adaptation_set):
1826                     continue
1827                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
1828                 for representation in adaptation_set.findall(_add_ns('Representation')):
1829                     if is_drm_protected(representation):
1830                         continue
1831                     representation_attrib = adaptation_set.attrib.copy()
1832                     representation_attrib.update(representation.attrib)
1833                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
1834                     mime_type = representation_attrib['mimeType']
1835                     content_type = mime_type.split('/')[0]
1836                     if content_type == 'text':
1837                         # TODO implement WebVTT downloading
1838                         pass
1839                     elif content_type in ('video', 'audio'):
1840                         base_url = ''
1841                         for element in (representation, adaptation_set, period, mpd_doc):
1842                             base_url_e = element.find(_add_ns('BaseURL'))
1843                             if base_url_e is not None:
1844                                 base_url = base_url_e.text + base_url
1845                                 if re.match(r'^https?://', base_url):
1846                                     break
1847                         if mpd_base_url and not re.match(r'^https?://', base_url):
1848                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
1849                                 mpd_base_url += '/'
1850                             base_url = mpd_base_url + base_url
1851                         representation_id = representation_attrib.get('id')
1852                         lang = representation_attrib.get('lang')
1853                         url_el = representation.find(_add_ns('BaseURL'))
1854                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
1855                         bandwidth = int_or_none(representation_attrib.get('bandwidth'))
1856                         f = {
1857                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
1858                             'url': base_url,
1859                             'manifest_url': mpd_url,
1860                             'ext': mimetype2ext(mime_type),
1861                             'width': int_or_none(representation_attrib.get('width')),
1862                             'height': int_or_none(representation_attrib.get('height')),
1863                             'tbr': float_or_none(bandwidth, 1000),
1864                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
1865                             'fps': int_or_none(representation_attrib.get('frameRate')),
1866                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
1867                             'format_note': 'DASH %s' % content_type,
1868                             'filesize': filesize,
1869                         }
1870                         f.update(parse_codecs(representation_attrib.get('codecs')))
1871                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)
1872
1873                         def prepare_template(template_name, identifiers):
1874                             t = representation_ms_info[template_name]
1875                             t = t.replace('$RepresentationID$', representation_id)
1876                             t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
1877                             t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
1878                             t.replace('$$', '$')
1879                             return t
1880
1881                         # @initialization is a regular template like @media one
1882                         # so it should be handled just the same way (see
1883                         # https://github.com/rg3/youtube-dl/issues/11605)
1884                         if 'initialization' in representation_ms_info:
1885                             initialization_template = prepare_template(
1886                                 'initialization',
1887                                 # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
1888                                 # $Time$ shall not be included for @initialization thus
1889                                 # only $Bandwidth$ remains
1890                                 ('Bandwidth', ))
1891                             representation_ms_info['initialization_url'] = initialization_template % {
1892                                 'Bandwidth': bandwidth,
1893                             }
1894
1895                         if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
1896
1897                             media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
1898
1899                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
1900                             # can't be used at the same time
1901                             if '%(Number' in media_template and 's' not in representation_ms_info:
1902                                 segment_duration = None
1903                                 if 'total_number' not in representation_ms_info and 'segment_duration':
1904                                     segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
1905                                     representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
1906                                 representation_ms_info['fragments'] = [{
1907                                     'url': media_template % {
1908                                         'Number': segment_number,
1909                                         'Bandwidth': bandwidth,
1910                                     },
1911                                     'duration': segment_duration,
1912                                 } for segment_number in range(
1913                                     representation_ms_info['start_number'],
1914                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])]
1915                             else:
1916                                 # $Number*$ or $Time$ in media template with S list available
1917                                 # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
1918                                 # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
1919                                 representation_ms_info['fragments'] = []
1920                                 segment_time = 0
1921                                 segment_d = None
1922                                 segment_number = representation_ms_info['start_number']
1923
1924                                 def add_segment_url():
1925                                     segment_url = media_template % {
1926                                         'Time': segment_time,
1927                                         'Bandwidth': bandwidth,
1928                                         'Number': segment_number,
1929                                     }
1930                                     representation_ms_info['fragments'].append({
1931                                         'url': segment_url,
1932                                         'duration': float_or_none(segment_d, representation_ms_info['timescale']),
1933                                     })
1934
1935                                 for num, s in enumerate(representation_ms_info['s']):
1936                                     segment_time = s.get('t') or segment_time
1937                                     segment_d = s['d']
1938                                     add_segment_url()
1939                                     segment_number += 1
1940                                     for r in range(s.get('r', 0)):
1941                                         segment_time += segment_d
1942                                         add_segment_url()
1943                                         segment_number += 1
1944                                     segment_time += segment_d
1945                         elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
1946                             # No media template
1947                             # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
1948                             # or any YouTube dashsegments video
1949                             fragments = []
1950                             segment_index = 0
1951                             timescale = representation_ms_info['timescale']
1952                             for s in representation_ms_info['s']:
1953                                 duration = float_or_none(s['d'], timescale)
1954                                 for r in range(s.get('r', 0) + 1):
1955                                     fragments.append({
1956                                         'url': representation_ms_info['segment_urls'][segment_index],
1957                                         'duration': duration,
1958                                     })
1959                                     segment_index += 1
1960                             representation_ms_info['fragments'] = fragments
1961                         # NB: MPD manifest may contain direct URLs to unfragmented media.
1962                         # No fragments key is present in this case.
1963                         if 'fragments' in representation_ms_info:
1964                             f.update({
1965                                 'fragments': [],
1966                                 'protocol': 'http_dash_segments',
1967                             })
1968                             if 'initialization_url' in representation_ms_info:
1969                                 initialization_url = representation_ms_info['initialization_url']
1970                                 if not f.get('url'):
1971                                     f['url'] = initialization_url
1972                                 f['fragments'].append({'url': initialization_url})
1973                             f['fragments'].extend(representation_ms_info['fragments'])
1974                             for fragment in f['fragments']:
1975                                 fragment['url'] = urljoin(base_url, fragment['url'])
1976                         try:
1977                             existing_format = next(
1978                                 fo for fo in formats
1979                                 if fo['format_id'] == representation_id)
1980                         except StopIteration:
1981                             full_info = formats_dict.get(representation_id, {}).copy()
1982                             full_info.update(f)
1983                             formats.append(full_info)
1984                         else:
1985                             existing_format.update(f)
1986                     else:
1987                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
1988         return formats
1989
1990     def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
1991         res = self._download_webpage_handle(
1992             ism_url, video_id,
1993             note=note or 'Downloading ISM manifest',
1994             errnote=errnote or 'Failed to download ISM manifest',
1995             fatal=fatal)
1996         if res is False:
1997             return []
1998         ism, urlh = res
1999
2000         return self._parse_ism_formats(
2001             compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
2002
2003     def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
2004         """
2005         Parse formats from ISM manifest.
2006         References:
2007          1. [MS-SSTR]: Smooth Streaming Protocol,
2008             https://msdn.microsoft.com/en-us/library/ff469518.aspx
2009         """
2010         if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
2011             return []
2012
2013         duration = int(ism_doc.attrib['Duration'])
2014         timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
2015
2016         formats = []
2017         for stream in ism_doc.findall('StreamIndex'):
2018             stream_type = stream.get('Type')
2019             if stream_type not in ('video', 'audio'):
2020                 continue
2021             url_pattern = stream.attrib['Url']
2022             stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
2023             stream_name = stream.get('Name')
2024             for track in stream.findall('QualityLevel'):
2025                 fourcc = track.get('FourCC')
2026                 # TODO: add support for WVC1 and WMAP
2027                 if fourcc not in ('H264', 'AVC1', 'AACL'):
2028                     self.report_warning('%s is not a supported codec' % fourcc)
2029                     continue
2030                 tbr = int(track.attrib['Bitrate']) // 1000
2031                 # [1] does not mention Width and Height attributes. However,
2032                 # they're often present while MaxWidth and MaxHeight are
2033                 # missing, so should be used as fallbacks
2034                 width = int_or_none(track.get('MaxWidth') or track.get('Width'))
2035                 height = int_or_none(track.get('MaxHeight') or track.get('Height'))
2036                 sampling_rate = int_or_none(track.get('SamplingRate'))
2037
2038                 track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
2039                 track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
2040
2041                 fragments = []
2042                 fragment_ctx = {
2043                     'time': 0,
2044                 }
2045                 stream_fragments = stream.findall('c')
2046                 for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
2047                     fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
2048                     fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
2049                     fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
2050                     if not fragment_ctx['duration']:
2051                         try:
2052                             next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
2053                         except IndexError:
2054                             next_fragment_time = duration
2055                         fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
2056                     for _ in range(fragment_repeat):
2057                         fragments.append({
2058                             'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
2059                             'duration': fragment_ctx['duration'] / stream_timescale,
2060                         })
2061                         fragment_ctx['time'] += fragment_ctx['duration']
2062
2063                 format_id = []
2064                 if ism_id:
2065                     format_id.append(ism_id)
2066                 if stream_name:
2067                     format_id.append(stream_name)
2068                 format_id.append(compat_str(tbr))
2069
2070                 formats.append({
2071                     'format_id': '-'.join(format_id),
2072                     'url': ism_url,
2073                     'manifest_url': ism_url,
2074                     'ext': 'ismv' if stream_type == 'video' else 'isma',
2075                     'width': width,
2076                     'height': height,
2077                     'tbr': tbr,
2078                     'asr': sampling_rate,
2079                     'vcodec': 'none' if stream_type == 'audio' else fourcc,
2080                     'acodec': 'none' if stream_type == 'video' else fourcc,
2081                     'protocol': 'ism',
2082                     'fragments': fragments,
2083                     '_download_params': {
2084                         'duration': duration,
2085                         'timescale': stream_timescale,
2086                         'width': width or 0,
2087                         'height': height or 0,
2088                         'fourcc': fourcc,
2089                         'codec_private_data': track.get('CodecPrivateData'),
2090                         'sampling_rate': sampling_rate,
2091                         'channels': int_or_none(track.get('Channels', 2)),
2092                         'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
2093                         'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
2094                     },
2095                 })
2096         return formats
2097
2098     def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
2099         def absolute_url(video_url):
2100             return compat_urlparse.urljoin(base_url, video_url)
2101
2102         def parse_content_type(content_type):
2103             if not content_type:
2104                 return {}
2105             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
2106             if ctr:
2107                 mimetype, codecs = ctr.groups()
2108                 f = parse_codecs(codecs)
2109                 f['ext'] = mimetype2ext(mimetype)
2110                 return f
2111             return {}
2112
2113         def _media_formats(src, cur_media_type):
2114             full_url = absolute_url(src)
2115             ext = determine_ext(full_url)
2116             if ext == 'm3u8':
2117                 is_plain_url = False
2118                 formats = self._extract_m3u8_formats(
2119                     full_url, video_id, ext='mp4',
2120                     entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
2121                     preference=preference)
2122             elif ext == 'mpd':
2123                 is_plain_url = False
2124                 formats = self._extract_mpd_formats(
2125                     full_url, video_id, mpd_id=mpd_id)
2126             else:
2127                 is_plain_url = True
2128                 formats = [{
2129                     'url': full_url,
2130                     'vcodec': 'none' if cur_media_type == 'audio' else None,
2131                 }]
2132             return is_plain_url, formats
2133
2134         entries = []
2135         media_tags = [(media_tag, media_type, '')
2136                       for media_tag, media_type
2137                       in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
2138         media_tags.extend(re.findall(
2139             # We only allow video|audio followed by a whitespace or '>'.
2140             # Allowing more characters may end up in significant slow down (see
2141             # https://github.com/rg3/youtube-dl/issues/11979, example URL:
2142             # http://www.porntrex.com/maps/videositemap.xml).
2143             r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
2144         for media_tag, media_type, media_content in media_tags:
2145             media_info = {
2146                 'formats': [],
2147                 'subtitles': {},
2148             }
2149             media_attributes = extract_attributes(media_tag)
2150             src = media_attributes.get('src')
2151             if src:
2152                 _, formats = _media_formats(src, media_type)
2153                 media_info['formats'].extend(formats)
2154             media_info['thumbnail'] = media_attributes.get('poster')
2155             if media_content:
2156                 for source_tag in re.findall(r'<source[^>]+>', media_content):
2157                     source_attributes = extract_attributes(source_tag)
2158                     src = source_attributes.get('src')
2159                     if not src:
2160                         continue
2161                     is_plain_url, formats = _media_formats(src, media_type)
2162                     if is_plain_url:
2163                         f = parse_content_type(source_attributes.get('type'))
2164                         f.update(formats[0])
2165                         media_info['formats'].append(f)
2166                     else:
2167                         media_info['formats'].extend(formats)
2168                 for track_tag in re.findall(r'<track[^>]+>', media_content):
2169                     track_attributes = extract_attributes(track_tag)
2170                     kind = track_attributes.get('kind')
2171                     if not kind or kind in ('subtitles', 'captions'):
2172                         src = track_attributes.get('src')
2173                         if not src:
2174                             continue
2175                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
2176                         media_info['subtitles'].setdefault(lang, []).append({
2177                             'url': absolute_url(src),
2178                         })
2179             if media_info['formats'] or media_info['subtitles']:
2180                 entries.append(media_info)
2181         return entries
2182
2183     def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
2184         formats = []
2185         hdcore_sign = 'hdcore=3.7.0'
2186         f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
2187         hds_host = hosts.get('hds')
2188         if hds_host:
2189             f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
2190         if 'hdcore=' not in f4m_url:
2191             f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
2192         f4m_formats = self._extract_f4m_formats(
2193             f4m_url, video_id, f4m_id='hds', fatal=False)
2194         for entry in f4m_formats:
2195             entry.update({'extra_param_to_segment_url': hdcore_sign})
2196         formats.extend(f4m_formats)
2197         m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
2198         hls_host = hosts.get('hls')
2199         if hls_host:
2200             m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
2201         formats.extend(self._extract_m3u8_formats(
2202             m3u8_url, video_id, 'mp4', 'm3u8_native',
2203             m3u8_id='hls', fatal=False))
2204         return formats
2205
2206     def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
2207         url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
2208         url_base = self._search_regex(
2209             r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
2210         http_base_url = '%s:%s' % ('http', url_base)
2211         formats = []
2212         if 'm3u8' not in skip_protocols:
2213             formats.extend(self._extract_m3u8_formats(
2214                 http_base_url + '/playlist.m3u8', video_id, 'mp4',
2215                 m3u8_entry_protocol, m3u8_id='hls', fatal=False))
2216         if 'f4m' not in skip_protocols:
2217             formats.extend(self._extract_f4m_formats(
2218                 http_base_url + '/manifest.f4m',
2219                 video_id, f4m_id='hds', fatal=False))
2220         if 'dash' not in skip_protocols:
2221             formats.extend(self._extract_mpd_formats(
2222                 http_base_url + '/manifest.mpd',
2223                 video_id, mpd_id='dash', fatal=False))
2224         if re.search(r'(?:/smil:|\.smil)', url_base):
2225             if 'smil' not in skip_protocols:
2226                 rtmp_formats = self._extract_smil_formats(
2227                     http_base_url + '/jwplayer.smil',
2228                     video_id, fatal=False)
2229                 for rtmp_format in rtmp_formats:
2230                     rtsp_format = rtmp_format.copy()
2231                     rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
2232                     del rtsp_format['play_path']
2233                     del rtsp_format['ext']
2234                     rtsp_format.update({
2235                         'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
2236                         'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
2237                         'protocol': 'rtsp',
2238                     })
2239                     formats.extend([rtmp_format, rtsp_format])
2240         else:
2241             for protocol in ('rtmp', 'rtsp'):
2242                 if protocol not in skip_protocols:
2243                     formats.append({
2244                         'url': '%s:%s' % (protocol, url_base),
2245                         'format_id': protocol,
2246                         'protocol': protocol,
2247                     })
2248         return formats
2249
2250     def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
2251         mobj = re.search(
2252             r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
2253             webpage)
2254         if mobj:
2255             try:
2256                 jwplayer_data = self._parse_json(mobj.group('options'),
2257                                                  video_id=video_id,
2258                                                  transform_source=transform_source)
2259             except ExtractorError:
2260                 pass
2261             else:
2262                 if isinstance(jwplayer_data, dict):
2263                     return jwplayer_data
2264
2265     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs):
2266         jwplayer_data = self._find_jwplayer_data(
2267             webpage, video_id, transform_source=js_to_json)
2268         return self._parse_jwplayer_data(
2269             jwplayer_data, video_id, *args, **kwargs)
2270
2271     def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
2272                              m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2273         # JWPlayer backward compatibility: flattened playlists
2274         # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
2275         if 'playlist' not in jwplayer_data:
2276             jwplayer_data = {'playlist': [jwplayer_data]}
2277
2278         entries = []
2279
2280         # JWPlayer backward compatibility: single playlist item
2281         # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
2282         if not isinstance(jwplayer_data['playlist'], list):
2283             jwplayer_data['playlist'] = [jwplayer_data['playlist']]
2284
2285         for video_data in jwplayer_data['playlist']:
2286             # JWPlayer backward compatibility: flattened sources
2287             # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
2288             if 'sources' not in video_data:
2289                 video_data['sources'] = [video_data]
2290
2291             this_video_id = video_id or video_data['mediaid']
2292
2293             formats = self._parse_jwplayer_formats(
2294                 video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
2295                 mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
2296             self._sort_formats(formats)
2297
2298             subtitles = {}
2299             tracks = video_data.get('tracks')
2300             if tracks and isinstance(tracks, list):
2301                 for track in tracks:
2302                     if track.get('kind') != 'captions':
2303                         continue
2304                     track_url = urljoin(base_url, track.get('file'))
2305                     if not track_url:
2306                         continue
2307                     subtitles.setdefault(track.get('label') or 'en', []).append({
2308                         'url': self._proto_relative_url(track_url)
2309                     })
2310
2311             entries.append({
2312                 'id': this_video_id,
2313                 'title': video_data['title'] if require_title else video_data.get('title'),
2314                 'description': video_data.get('description'),
2315                 'thumbnail': self._proto_relative_url(video_data.get('image')),
2316                 'timestamp': int_or_none(video_data.get('pubdate')),
2317                 'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
2318                 'subtitles': subtitles,
2319                 'formats': formats,
2320             })
2321         if len(entries) == 1:
2322             return entries[0]
2323         else:
2324             return self.playlist_result(entries)
2325
2326     def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
2327                                 m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
2328         urls = []
2329         formats = []
2330         for source in jwplayer_sources_data:
2331             source_url = self._proto_relative_url(source.get('file'))
2332             if not source_url:
2333                 continue
2334             if base_url:
2335                 source_url = compat_urlparse.urljoin(base_url, source_url)
2336             if source_url in urls:
2337                 continue
2338             urls.append(source_url)
2339             source_type = source.get('type') or ''
2340             ext = mimetype2ext(source_type) or determine_ext(source_url)
2341             if source_type == 'hls' or ext == 'm3u8':
2342                 formats.extend(self._extract_m3u8_formats(
2343                     source_url, video_id, 'mp4', entry_protocol='m3u8_native',
2344                     m3u8_id=m3u8_id, fatal=False))
2345             elif ext == 'mpd':
2346                 formats.extend(self._extract_mpd_formats(
2347                     source_url, video_id, mpd_id=mpd_id, fatal=False))
2348             elif ext == 'smil':
2349                 formats.extend(self._extract_smil_formats(
2350                     source_url, video_id, fatal=False))
2351             # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
2352             elif source_type.startswith('audio') or ext in (
2353                     'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
2354                 formats.append({
2355                     'url': source_url,
2356                     'vcodec': 'none',
2357                     'ext': ext,
2358                 })
2359             else:
2360                 height = int_or_none(source.get('height'))
2361                 if height is None:
2362                     # Often no height is provided but there is a label in
2363                     # format like "1080p", "720p SD", or 1080.
2364                     height = int_or_none(self._search_regex(
2365                         r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
2366                         'height', default=None))
2367                 a_format = {
2368                     'url': source_url,
2369                     'width': int_or_none(source.get('width')),
2370                     'height': height,
2371                     'tbr': int_or_none(source.get('bitrate')),
2372                     'ext': ext,
2373                 }
2374                 if source_url.startswith('rtmp'):
2375                     a_format['ext'] = 'flv'
2376                     # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
2377                     # of jwplayer.flash.swf
2378                     rtmp_url_parts = re.split(
2379                         r'((?:mp4|mp3|flv):)', source_url, 1)
2380                     if len(rtmp_url_parts) == 3:
2381                         rtmp_url, prefix, play_path = rtmp_url_parts
2382                         a_format.update({
2383                             'url': rtmp_url,
2384                             'play_path': prefix + play_path,
2385                         })
2386                     if rtmp_params:
2387                         a_format.update(rtmp_params)
2388                 formats.append(a_format)
2389         return formats
2390
2391     def _live_title(self, name):
2392         """ Generate the title for a live video """
2393         now = datetime.datetime.now()
2394         now_str = now.strftime('%Y-%m-%d %H:%M')
2395         return name + ' ' + now_str
2396
2397     def _int(self, v, name, fatal=False, **kwargs):
2398         res = int_or_none(v, **kwargs)
2399         if 'get_attr' in kwargs:
2400             print(getattr(v, kwargs['get_attr']))
2401         if res is None:
2402             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2403             if fatal:
2404                 raise ExtractorError(msg)
2405             else:
2406                 self._downloader.report_warning(msg)
2407         return res
2408
2409     def _float(self, v, name, fatal=False, **kwargs):
2410         res = float_or_none(v, **kwargs)
2411         if res is None:
2412             msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
2413             if fatal:
2414                 raise ExtractorError(msg)
2415             else:
2416                 self._downloader.report_warning(msg)
2417         return res
2418
2419     def _set_cookie(self, domain, name, value, expire_time=None):
2420         cookie = compat_cookiejar.Cookie(
2421             0, name, value, None, None, domain, None,
2422             None, '/', True, False, expire_time, '', None, None, None)
2423         self._downloader.cookiejar.set_cookie(cookie)
2424
2425     def _get_cookies(self, url):
2426         """ Return a compat_cookies.SimpleCookie with the cookies for the url """
2427         req = sanitized_Request(url)
2428         self._downloader.cookiejar.add_cookie_header(req)
2429         return compat_cookies.SimpleCookie(req.get_header('Cookie'))
2430
2431     def get_testcases(self, include_onlymatching=False):
2432         t = getattr(self, '_TEST', None)
2433         if t:
2434             assert not hasattr(self, '_TESTS'), \
2435                 '%s has _TEST and _TESTS' % type(self).__name__
2436             tests = [t]
2437         else:
2438             tests = getattr(self, '_TESTS', [])
2439         for t in tests:
2440             if not include_onlymatching and t.get('only_matching', False):
2441                 continue
2442             t['name'] = type(self).__name__[:-len('IE')]
2443             yield t
2444
2445     def is_suitable(self, age_limit):
2446         """ Test whether the extractor is generally suitable for the given
2447         age limit (i.e. pornographic sites are not, all others usually are) """
2448
2449         any_restricted = False
2450         for tc in self.get_testcases(include_onlymatching=False):
2451             if tc.get('playlist', []):
2452                 tc = tc['playlist'][0]
2453             is_restricted = age_restricted(
2454                 tc.get('info_dict', {}).get('age_limit'), age_limit)
2455             if not is_restricted:
2456                 return True
2457             any_restricted = any_restricted or is_restricted
2458         return not any_restricted
2459
2460     def extract_subtitles(self, *args, **kwargs):
2461         if (self._downloader.params.get('writesubtitles', False) or
2462                 self._downloader.params.get('listsubtitles')):
2463             return self._get_subtitles(*args, **kwargs)
2464         return {}
2465
2466     def _get_subtitles(self, *args, **kwargs):
2467         raise NotImplementedError('This method must be implemented by subclasses')
2468
2469     @staticmethod
2470     def _merge_subtitle_items(subtitle_list1, subtitle_list2):
2471         """ Merge subtitle items for one language. Items with duplicated URLs
2472         will be dropped. """
2473         list1_urls = set([item['url'] for item in subtitle_list1])
2474         ret = list(subtitle_list1)
2475         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
2476         return ret
2477
2478     @classmethod
2479     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
2480         """ Merge two subtitle dictionaries, language by language. """
2481         ret = dict(subtitle_dict1)
2482         for lang in subtitle_dict2:
2483             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2484         return ret
2485
2486     def extract_automatic_captions(self, *args, **kwargs):
2487         if (self._downloader.params.get('writeautomaticsub', False) or
2488                 self._downloader.params.get('listsubtitles')):
2489             return self._get_automatic_captions(*args, **kwargs)
2490         return {}
2491
2492     def _get_automatic_captions(self, *args, **kwargs):
2493         raise NotImplementedError('This method must be implemented by subclasses')
2494
2495     def mark_watched(self, *args, **kwargs):
2496         if (self._downloader.params.get('mark_watched', False) and
2497                 (self._get_login_info()[0] is not None or
2498                     self._downloader.params.get('cookiefile') is not None)):
2499             self._mark_watched(*args, **kwargs)
2500
2501     def _mark_watched(self, *args, **kwargs):
2502         raise NotImplementedError('This method must be implemented by subclasses')
2503
2504     def geo_verification_headers(self):
2505         headers = {}
2506         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
2507         if geo_verification_proxy:
2508             headers['Ytdl-request-proxy'] = geo_verification_proxy
2509         return headers
2510
2511     def _generic_id(self, url):
2512         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0])
2513
2514     def _generic_title(self, url):
2515         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0])
2516
2517
2518 class SearchInfoExtractor(InfoExtractor):
2519     """
2520     Base class for paged search queries extractors.
2521     They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
2522     Instances should define _SEARCH_KEY and _MAX_RESULTS.
2523     """
2524
2525     @classmethod
2526     def _make_valid_url(cls):
2527         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
2528
2529     @classmethod
2530     def suitable(cls, url):
2531         return re.match(cls._make_valid_url(), url) is not None
2532
2533     def _real_extract(self, query):
2534         mobj = re.match(self._make_valid_url(), query)
2535         if mobj is None:
2536             raise ExtractorError('Invalid search query "%s"' % query)
2537
2538         prefix = mobj.group('prefix')
2539         query = mobj.group('query')
2540         if prefix == '':
2541             return self._get_n_results(query, 1)
2542         elif prefix == 'all':
2543             return self._get_n_results(query, self._MAX_RESULTS)
2544         else:
2545             n = int(prefix)
2546             if n <= 0:
2547                 raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
2548             elif n > self._MAX_RESULTS:
2549                 self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
2550                 n = self._MAX_RESULTS
2551             return self._get_n_results(query, n)
2552
2553     def _get_n_results(self, query, n):
2554         """Get a specified number of results for a query"""
2555         raise NotImplementedError('This method must be implemented by subclasses')
2556
2557     @property
2558     def SEARCH_KEY(self):
2559         return self._SEARCH_KEY