2 from __future__ 
import unicode_literals
 
  17 from ..compat 
import ( 
  20     compat_etree_fromstring
, 
  27     compat_urllib_parse_unquote
, 
  28     compat_urllib_parse_urlencode
, 
  29     compat_urllib_request
, 
  31     compat_xml_parse_error
, 
  33 from ..downloader
.f4m 
import ( 
  35     remove_encrypted_media
, 
  61     parse_m3u8_attributes
, 
  79 class InfoExtractor(object): 
  80     """Information Extractor class. 
  82     Information extractors are the classes that, given a URL, extract 
  83     information about the video (or videos) the URL refers to. This 
  84     information includes the real video URL, the video title, author and 
  85     others. The information is stored in a dictionary which is then 
  86     passed to the YoutubeDL. The YoutubeDL processes this 
  87     information possibly downloading the video to the file system, among 
  88     other possible outcomes. 
  90     The type field determines the type of the result. 
  91     By far the most common value (and the default if _type is missing) is 
  92     "video", which indicates a single video. 
  94     For a video, the dictionaries must include the following fields: 
  97     title:          Video title, unescaped. 
  99     Additionally, it must contain either a formats entry or a url one: 
 101     formats:        A list of dictionaries for each format available, ordered 
 102                     from worst to best quality. 
 105                     * url        Mandatory. The URL of the video file 
 107                                  The URL of the manifest file in case of 
 108                                  fragmented media (DASH, hls, hds) 
 109                     * ext        Will be calculated from URL if missing 
 110                     * format     A human-readable description of the format 
 111                                  ("mp4 container with h264/opus"). 
 112                                  Calculated from the format_id, width, height. 
 113                                  and format_note fields if missing. 
 114                     * format_id  A short description of the format 
 115                                  ("mp4_h264_opus" or "19"). 
 116                                 Technically optional, but strongly recommended. 
 117                     * format_note Additional info about the format 
 118                                  ("3D" or "DASH video") 
 119                     * width      Width of the video, if known 
 120                     * height     Height of the video, if known 
 121                     * resolution Textual description of width and height 
 122                     * tbr        Average bitrate of audio and video in KBit/s 
 123                     * abr        Average audio bitrate in KBit/s 
 124                     * acodec     Name of the audio codec in use 
 125                     * asr        Audio sampling rate in Hertz 
 126                     * vbr        Average video bitrate in KBit/s 
 128                     * vcodec     Name of the video codec in use 
 129                     * container  Name of the container format 
 130                     * filesize   The number of bytes, if known in advance 
 131                     * filesize_approx  An estimate for the number of bytes 
 132                     * player_url SWF Player URL (used for rtmpdump). 
 133                     * protocol   The protocol that will be used for the actual 
 134                                  download, lower-case. 
 135                                  "http", "https", "rtsp", "rtmp", "rtmpe", 
 136                                  "m3u8", "m3u8_native" or "http_dash_segments". 
 138                                  Base URL for fragments. Each fragment's path 
 139                                  value (if present) will be relative to 
 141                     * fragments  A list of fragments of a fragmented media. 
 142                                  Each fragment entry must contain either an url 
 143                                  or a path. If an url is present it should be 
 144                                  considered by a client. Otherwise both path and 
 145                                  fragment_base_url must be present. Here is 
 146                                  the list of all potential fields: 
 147                                  * "url" - fragment's URL 
 148                                  * "path" - fragment's path relative to 
 150                                  * "duration" (optional, int or float) 
 151                                  * "filesize" (optional, int) 
 152                     * preference Order number of this format. If this field is 
 153                                  present and not None, the formats get sorted 
 154                                  by this field, regardless of all other values. 
 155                                  -1 for default (order by other properties), 
 156                                  -2 or smaller for less than default. 
 157                                  < -1000 to hide the format (if there is 
 158                                     another one which is strictly better) 
 159                     * language   Language code, e.g. "de" or "en-US". 
 160                     * language_preference  Is this in the language mentioned in 
 162                                  10 if it's what the URL is about, 
 163                                  -1 for default (don't know), 
 164                                  -10 otherwise, other values reserved for now. 
 165                     * quality    Order number of the video quality of this 
 166                                  format, irrespective of the file format. 
 167                                  -1 for default (order by other properties), 
 168                                  -2 or smaller for less than default. 
 169                     * source_preference  Order number for this video source 
 170                                   (quality takes higher priority) 
 171                                  -1 for default (order by other properties), 
 172                                  -2 or smaller for less than default. 
 173                     * http_headers  A dictionary of additional HTTP headers 
 174                                  to add to the request. 
 175                     * stretched_ratio  If given and not 1, indicates that the 
 176                                  video's pixels are not square. 
 177                                  width : height ratio as float. 
 178                     * no_resume  The server does not support resuming the 
 179                                  (HTTP or RTMP) download. Boolean. 
 180                     * downloader_options  A dictionary of downloader options as 
 181                                  described in FileDownloader 
 183     url:            Final video URL. 
 184     ext:            Video filename extension. 
 185     format:         The video format, defaults to ext (used for --get-format) 
 186     player_url:     SWF Player URL (used for rtmpdump). 
 188     The following fields are optional: 
 190     alt_title:      A secondary title of the video. 
 191     display_id      An alternative identifier for the video, not necessarily 
 192                     unique, but available before title. Typically, id is 
 193                     something like "4234987", title "Dancing naked mole rats", 
 194                     and display_id "dancing-naked-mole-rats" 
 195     thumbnails:     A list of dictionaries, with the following entries: 
 196                         * "id" (optional, string) - Thumbnail format ID 
 198                         * "preference" (optional, int) - quality of the image 
 199                         * "width" (optional, int) 
 200                         * "height" (optional, int) 
                        * "resolution" (optional, string "{width}x{height}",
 203                         * "filesize" (optional, int) 
 204     thumbnail:      Full URL to a video thumbnail image. 
 205     description:    Full video description. 
 206     uploader:       Full name of the video uploader. 
 207     license:        License name the video is licensed under. 
 208     creator:        The creator of the video. 
 209     release_date:   The date (YYYYMMDD) when the video was released. 
 210     timestamp:      UNIX timestamp of the moment the video became available. 
 211     upload_date:    Video upload date (YYYYMMDD). 
 212                     If not explicitly set, calculated from timestamp. 
 213     uploader_id:    Nickname or id of the video uploader. 
 214     uploader_url:   Full URL to a personal webpage of the video uploader. 
 215     channel:        Full name of the channel the video is uploaded on. 
 216                     Note that channel fields may or may not repeat uploader 
 217                     fields. This depends on a particular extractor. 
 218     channel_id:     Id of the channel. 
 219     channel_url:    Full URL to a channel webpage. 
 220     location:       Physical location where the video was filmed. 
 221     subtitles:      The available subtitles as a dictionary in the format 
 222                     {tag: subformats}. "tag" is usually a language code, and 
 223                     "subformats" is a list sorted from lower to higher 
 224                     preference, each element is a dictionary with the "ext" 
 226                         * "data": The subtitles file contents 
 227                         * "url": A URL pointing to the subtitles file 
 228                     "ext" will be calculated from URL if missing 
 229     automatic_captions: Like 'subtitles', used by the YoutubeIE for 
 230                     automatically generated captions 
 231     duration:       Length of the video in seconds, as an integer or float. 
 232     view_count:     How many users have watched the video on the platform. 
 233     like_count:     Number of positive ratings of the video 
 234     dislike_count:  Number of negative ratings of the video 
 235     repost_count:   Number of reposts of the video 
    average_rating: Average rating given by users, the scale used depends on the webpage
 237     comment_count:  Number of comments on the video 
 238     comments:       A list of comments, each with one or more of the following 
 239                     properties (all but one of text or html optional): 
 240                         * "author" - human-readable name of the comment author 
 241                         * "author_id" - user ID of the comment author 
 243                         * "html" - Comment as HTML 
 244                         * "text" - Plain text of the comment 
 245                         * "timestamp" - UNIX timestamp of comment 
 246                         * "parent" - ID of the comment this one is replying to. 
 247                                      Set to "root" to indicate that this is a 
 248                                      comment to the original video. 
 249     age_limit:      Age restriction for the video, as an integer (years) 
 250     webpage_url:    The URL to the video webpage, if given to youtube-dl it 
 251                     should allow to get the same result again. (It will be set 
 252                     by YoutubeDL if it's missing) 
 253     categories:     A list of categories that the video falls in, for example 
 255     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"] 
 256     is_live:        True, False, or None (=unknown). Whether this video is a 
 257                     live stream that goes on instead of a fixed-length video. 
 258     start_time:     Time in seconds where the reproduction should start, as 
 259                     specified in the URL. 
 260     end_time:       Time in seconds where the reproduction should end, as 
 261                     specified in the URL. 
 262     chapters:       A list of dictionaries, with the following entries: 
 263                         * "start_time" - The start time of the chapter in seconds 
 264                         * "end_time" - The end time of the chapter in seconds 
 265                         * "title" (optional, string) 
 267     The following fields should only be used when the video belongs to some logical 
 270     chapter:        Name or title of the chapter the video belongs to. 
 271     chapter_number: Number of the chapter the video belongs to, as an integer. 
 272     chapter_id:     Id of the chapter the video belongs to, as a unicode string. 
 274     The following fields should only be used when the video is an episode of some 
 275     series, programme or podcast: 
 277     series:         Title of the series or programme the video episode belongs to. 
 278     season:         Title of the season the video episode belongs to. 
 279     season_number:  Number of the season the video episode belongs to, as an integer. 
 280     season_id:      Id of the season the video episode belongs to, as a unicode string. 
 281     episode:        Title of the video episode. Unlike mandatory video title field, 
 282                     this field should denote the exact title of the video episode 
 283                     without any kind of decoration. 
 284     episode_number: Number of the video episode within a season, as an integer. 
 285     episode_id:     Id of the video episode, as a unicode string. 
 287     The following fields should only be used when the media is a track or a part of 
 290     track:          Title of the track. 
 291     track_number:   Number of the track within an album or a disc, as an integer. 
 292     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii), 
 294     artist:         Artist(s) of the track. 
 295     genre:          Genre(s) of the track. 
 296     album:          Title of the album the track belongs to. 
 297     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). 
 298     album_artist:   List of all artists appeared on the album (e.g. 
 299                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits 
 301     disc_number:    Number of the disc or other physical medium the track belongs to, 
 303     release_year:   Year (YYYY) when the album was released. 
 305     Unless mentioned otherwise, the fields should be Unicode strings. 
 307     Unless mentioned otherwise, None is equivalent to absence of information. 
 310     _type "playlist" indicates multiple videos. 
 311     There must be a key "entries", which is a list, an iterable, or a PagedList 
 312     object, each element of which is a valid dictionary by this specification. 
 314     Additionally, playlists can have "id", "title", "description", "uploader", 
 315     "uploader_id", "uploader_url" attributes with the same semantics as videos 
 319     _type "multi_video" indicates that there are multiple videos that 
    form a single show, for example multiple acts of an opera or TV episode.
 321     It must have an entries key like a playlist and contain all the keys 
 322     required for a video at the same time. 
 325     _type "url" indicates that the video must be extracted from another 
 326     location, possibly by a different extractor. Its only required key is: 
 327     "url" - the next URL to extract. 
 328     The key "ie_key" can be set to the class name (minus the trailing "IE", 
 329     e.g. "Youtube") if the extractor class is known in advance. 
 330     Additionally, the dictionary may have any properties of the resolved entity 
 331     known in advance, for example "title" if the title of the referred video is 
 335     _type "url_transparent" entities have the same specification as "url", but 
 336     indicate that the given additional information is more precise than the one 
 337     associated with the resolved URL. 
 338     This is useful when a site employs a video service that hosts the video and 
 339     its technical metadata, but that video service does not embed a useful 
 340     title, description etc. 
 343     Subclasses of this one should re-define the _real_initialize() and 
 344     _real_extract() methods and define a _VALID_URL regexp. 
 345     Probably, they should also be added to the list of extractors. 
 347     _GEO_BYPASS attribute may be set to False in order to disable 
 348     geo restriction bypass mechanisms for a particular extractor. 
 349     Though it won't disable explicit geo restriction bypass based on 
 350     country code provided with geo_bypass_country. 
 352     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted 
 353     countries for this extractor. One of these countries will be used by 
 354     geo restriction bypass mechanism right away in order to bypass 
 355     geo restriction, of course, if the mechanism is not disabled. 
 357     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted 
 358     IP blocks in CIDR notation for this extractor. One of these IP blocks 
 359     will be used by geo restriction bypass mechanism similarly 
 362     Finally, the _WORKING attribute should be set to False for broken IEs 
 363     in order to warn the users and skip the tests. 
    # Fake X-Forwarded-For IP used to bypass geo restriction; set per instance
    # by _initialize_geo_bypass() (None means no bypass IP is active).
    _x_forwarded_for_ip = None
    # See the class docstring: list of presumably geo-unrestricted country
    # codes for this extractor, or None when unknown.
    _GEO_COUNTRIES = None
    # See the class docstring: list of presumably geo-unrestricted IP blocks
    # in CIDR notation, or None when unknown.
    _GEO_IP_BLOCKS = None
    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        # No fake X-Forwarded-For IP yet; _initialize_geo_bypass() fills this
        # in later if geo-restriction bypassing is activated.
        self._x_forwarded_for_ip = None
        self.set_downloader(downloader)
    # NOTE(review): presumably decorated with @classmethod (it takes cls);
    # the decorator line is not visible in this chunk -- confirm.
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
    # NOTE(review): presumably a @classmethod like suitable(); the decorator
    # line is not visible in this chunk -- confirm.
    def _match_id(cls, url):
        """Return the mandatory 'id' group of _VALID_URL matched against url."""
        # Same per-class regexp cache as suitable(): check cls.__dict__ so a
        # subclass does not reuse the pattern compiled for its superclass.
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        m = cls._VALID_URL_RE.match(url)
        # m is assumed non-None here (callers pass URLs already accepted by
        # suitable()); m.group('id') would raise AttributeError otherwise.
        return compat_str(m.group('id'))
 401         """Getter method for _WORKING.""" 
 404     def initialize(self
): 
 405         """Initializes an instance (authentication, etc).""" 
 406         self
._initialize
_geo
_bypass
({ 
 407             'countries': self
._GEO
_COUNTRIES
, 
 408             'ip_blocks': self
._GEO
_IP
_BLOCKS
, 
 411             self
._real
_initialize
() 
 414     def _initialize_geo_bypass(self
, geo_bypass_context
): 
 416         Initialize geo restriction bypass mechanism. 
 418         This method is used to initialize geo bypass mechanism based on faking 
 419         X-Forwarded-For HTTP header. A random country from provided country list 
 420         is selected and a random IP belonging to this country is generated. This 
 421         IP will be passed as X-Forwarded-For HTTP header in all subsequent 
 424         This method will be used for initial geo bypass mechanism initialization 
 425         during the instance initialization with _GEO_COUNTRIES and 
 428         You may also manually call it from extractor's code if geo bypass 
 429         information is not available beforehand (e.g. obtained during 
 430         extraction) or due to some other reason. In this case you should pass 
 431         this information in geo bypass context passed as first argument. It may 
 432         contain following fields: 
 434         countries:  List of geo unrestricted countries (similar 
 436         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation 
 437                     (similar to _GEO_IP_BLOCKS) 
 440         if not self
._x
_forwarded
_for
_ip
: 
 442             # Geo bypass mechanism is explicitly disabled by user 
 443             if not self
._downloader
.params
.get('geo_bypass', True): 
 446             if not geo_bypass_context
: 
 447                 geo_bypass_context 
= {} 
 449             # Backward compatibility: previously _initialize_geo_bypass 
 450             # expected a list of countries, some 3rd party code may still use 
 452             if isinstance(geo_bypass_context
, (list, tuple)): 
 453                 geo_bypass_context 
= { 
 454                     'countries': geo_bypass_context
, 
 457             # The whole point of geo bypass mechanism is to fake IP 
 458             # as X-Forwarded-For HTTP header based on some IP block or 
 461             # Path 1: bypassing based on IP block in CIDR notation 
 463             # Explicit IP block specified by user, use it right away 
 464             # regardless of whether extractor is geo bypassable or not 
 465             ip_block 
= self
._downloader
.params
.get('geo_bypass_ip_block', None) 
 467             # Otherwise use random IP block from geo bypass context but only 
 468             # if extractor is known as geo bypassable 
 470                 ip_blocks 
= geo_bypass_context
.get('ip_blocks') 
 471                 if self
._GEO
_BYPASS 
and ip_blocks
: 
 472                     ip_block 
= random
.choice(ip_blocks
) 
 475                 self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(ip_block
) 
 476                 if self
._downloader
.params
.get('verbose', False): 
 477                     self
._downloader
.to_screen( 
 478                         '[debug] Using fake IP %s as X-Forwarded-For.' 
 479                         % self
._x
_forwarded
_for
_ip
) 
 482             # Path 2: bypassing based on country code 
 484             # Explicit country code specified by user, use it right away 
 485             # regardless of whether extractor is geo bypassable or not 
 486             country 
= self
._downloader
.params
.get('geo_bypass_country', None) 
 488             # Otherwise use random country code from geo bypass context but 
 489             # only if extractor is known as geo bypassable 
 491                 countries 
= geo_bypass_context
.get('countries') 
 492                 if self
._GEO
_BYPASS 
and countries
: 
 493                     country 
= random
.choice(countries
) 
 496                 self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(country
) 
 497                 if self
._downloader
.params
.get('verbose', False): 
 498                     self
._downloader
.to_screen( 
 499                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.' 
 500                         % (self
._x
_forwarded
_for
_ip
, country
.upper())) 
 502     def extract(self
, url
): 
 503         """Extracts URL information and returns it in list of dicts.""" 
 508                     ie_result 
= self
._real
_extract
(url
) 
 509                     if self
._x
_forwarded
_for
_ip
: 
 510                         ie_result
['__x_forwarded_for_ip'] = self
._x
_forwarded
_for
_ip
 
 512                 except GeoRestrictedError 
as e
: 
 513                     if self
.__maybe
_fake
_ip
_and
_retry
(e
.countries
): 
 516         except ExtractorError
: 
 518         except compat_http_client
.IncompleteRead 
as e
: 
 519             raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True) 
 520         except (KeyError, StopIteration) as e
: 
 521             raise ExtractorError('An extractor error has occurred.', cause
=e
) 
 523     def __maybe_fake_ip_and_retry(self
, countries
): 
 524         if (not self
._downloader
.params
.get('geo_bypass_country', None) and 
 526                 self
._downloader
.params
.get('geo_bypass', True) and 
 527                 not self
._x
_forwarded
_for
_ip 
and 
 529             country_code 
= random
.choice(countries
) 
 530             self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(country_code
) 
 531             if self
._x
_forwarded
_for
_ip
: 
 533                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' 
 534                     % (self
._x
_forwarded
_for
_ip
, country_code
.upper())) 
    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        # Stored for later use by the network/output helpers in this class
        # (self._downloader.urlopen, .params, .to_screen, ...); may be None,
        # as in the default of __init__.
        self._downloader = downloader
    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        # Intentionally a no-op in the base class.
 552         """A string for getting the InfoExtractor with get_info_extractor""" 
 553         return compat_str(cls
.__name
__[:-2]) 
 557         return compat_str(type(self
).__name
__[:-2]) 
 560     def __can_accept_status_code(err
, expected_status
): 
 561         assert isinstance(err
, compat_urllib_error
.HTTPError
) 
 562         if expected_status 
is None: 
 564         if isinstance(expected_status
, compat_integer_types
): 
 565             return err
.code 
== expected_status
 
 566         elif isinstance(expected_status
, (list, tuple)): 
 567             return err
.code 
in expected_status
 
 568         elif callable(expected_status
): 
 569             return expected_status(err
.code
) is True 
 573     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query
={}, expected_status
=None): 
 575         Return the response handle. 
 577         See _download_webpage docstring for arguments specification. 
 580             self
.report_download_webpage(video_id
) 
 581         elif note 
is not False: 
 583                 self
.to_screen('%s' % (note
,)) 
 585                 self
.to_screen('%s: %s' % (video_id
, note
)) 
 587         # Some sites check X-Forwarded-For HTTP header in order to figure out 
 588         # the origin of the client behind proxy. This allows bypassing geo 
 589         # restriction by faking this header's value to IP that belongs to some 
 590         # geo unrestricted country. We will do so once we encounter any 
 591         # geo restriction error. 
 592         if self
._x
_forwarded
_for
_ip
: 
 593             if 'X-Forwarded-For' not in headers
: 
 594                 headers
['X-Forwarded-For'] = self
._x
_forwarded
_for
_ip
 
 596         if isinstance(url_or_request
, compat_urllib_request
.Request
): 
 597             url_or_request 
= update_Request( 
 598                 url_or_request
, data
=data
, headers
=headers
, query
=query
) 
 601                 url_or_request 
= update_url_query(url_or_request
, query
) 
 602             if data 
is not None or headers
: 
 603                 url_or_request 
= sanitized_Request(url_or_request
, data
, headers
) 
 605             return self
._downloader
.urlopen(url_or_request
) 
 606         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 607             if isinstance(err
, compat_urllib_error
.HTTPError
): 
 608                 if self
.__can
_accept
_status
_code
(err
, expected_status
): 
 609                     # Retain reference to error to prevent file object from 
 610                     # being closed before it can be read. Works around the 
 611                     # effects of <https://bugs.python.org/issue15002> 
 612                     # introduced in Python 3.4.1. 
 619                 errnote 
= 'Unable to download webpage' 
 621             errmsg 
= '%s: %s' % (errnote
, error_to_compat_str(err
)) 
 623                 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
) 
 625                 self
._downloader
.report_warning(errmsg
) 
 628     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query
={}, expected_status
=None): 
 630         Return a tuple (page content as string, URL handle). 
 632         See _download_webpage docstring for arguments specification. 
 634         # Strip hashes from the URL (#1038) 
 635         if isinstance(url_or_request
, (compat_str
, str)): 
 636             url_or_request 
= url_or_request
.partition('#')[0] 
 638         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
, expected_status
=expected_status
) 
 642         content 
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
) 
 643         return (content
, urlh
) 
 646     def _guess_encoding_from_content(content_type
, webpage_bytes
): 
 647         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 649             encoding 
= m
.group(1) 
 651             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 652                           webpage_bytes[:1024]) 
 654                 encoding = m.group(1).decode('ascii') 
 655             elif webpage_bytes.startswith(b'\xff\xfe'): 
 662     def __check_blocked(self, content): 
 663         first_block = content[:512] 
 664         if ('<title>Access to this site is blocked</title>' in content and 
 665                 'Websense' in first_block): 
 666             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
 667             blocked_iframe = self._html_search_regex( 
 668                 r'<iframe src="([^
"]+)"', content, 
 669                 'Websense information URL
', default=None) 
 671                 msg += ' Visit 
%s for more details
' % blocked_iframe 
 672             raise ExtractorError(msg, expected=True) 
 673         if '<title
>The URL you requested has been blocked
</title
>' in first_block: 
 675                 'Access to this webpage has been blocked by Indian censorship
. ' 
 676                 'Use a VPN 
or proxy 
server (with --proxy
) to route around it
.') 
 677             block_msg = self._html_search_regex( 
 678                 r'</h1
><p
>(.*?
)</p
>', 
 679                 content, 'block message
', default=None) 
 681                 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ') 
 682             raise ExtractorError(msg, expected=True) 
 683         if ('<title
>TTK 
:: ŠŠ¾ŃŃŃŠæ Šŗ ŃŠµŃŃŃŃŃ Š¾Š³ŃŠ°Š½ŠøŃен
</title
>' in content and 
 684                 'blocklist
.rkn
.gov
.ru
' in content): 
 685             raise ExtractorError( 
 686                 'Access to this webpage has been blocked by decision of the Russian government
. ' 
 687                 'Visit http
://blocklist
.rkn
.gov
.ru
/ for a block reason
.', 
 690     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 
 691         content_type = urlh.headers.get('Content
-Type
', '') 
 692         webpage_bytes = urlh.read() 
 693         if prefix is not None: 
 694             webpage_bytes = prefix + webpage_bytes 
 696             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 
 697         if self._downloader.params.get('dump_intermediate_pages
', False): 
 698             self.to_screen('Dumping request to 
' + urlh.geturl()) 
 699             dump = base64.b64encode(webpage_bytes).decode('ascii
') 
 700             self._downloader.to_screen(dump) 
 701         if self._downloader.params.get('write_pages
', False): 
 702             basen = '%s_%s' % (video_id, urlh.geturl()) 
 704                 h = '___
' + hashlib.md5(basen.encode('utf
-8')).hexdigest() 
 705                 basen = basen[:240 - len(h)] + h 
 706             raw_filename = basen + '.dump
' 
 707             filename = sanitize_filename(raw_filename, restricted=True) 
 708             self.to_screen('Saving request to 
' + filename) 
 709             # Working around MAX_PATH limitation on Windows (see 
 710             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 
 711             if compat_os_name == 'nt
': 
 712                 absfilepath = os.path.abspath(filename) 
 713                 if len(absfilepath) > 259: 
 714                     filename = '\\\\?
\\' + absfilepath 
 715             with open(filename, 'wb
') as outf: 
 716                 outf.write(webpage_bytes) 
 719             content = webpage_bytes.decode(encoding, 'replace
') 
 721             content = webpage_bytes.decode('utf
-8', 'replace
') 
 723         self.__check_blocked(content) 
 727     def _download_webpage( 
 728             self, url_or_request, video_id, note=None, errnote=None, 
 729             fatal=True, tries=1, timeout=5, encoding=None, data=None, 
 730             headers={}, query={}, expected_status=None): 
 732         Return the data of the page as a string. 
 735         url_or_request -- plain text URL as a string or 
 736             a compat_urllib_request.Requestobject 
 737         video_id -- Video/playlist/item identifier (string) 
 740         note -- note printed before downloading (string) 
 741         errnote -- note printed in case of an error (string) 
 742         fatal -- flag denoting whether error should be considered fatal, 
 743             i.e. whether it should cause ExtractionError to be raised, 
 744             otherwise a warning will be reported and extraction continued 
 745         tries -- number of tries 
 746         timeout -- sleep interval between tries 
 747         encoding -- encoding for a page content decoding, guessed automatically 
 748             when not explicitly specified 
 749         data -- POST data (bytes) 
 750         headers -- HTTP headers (dict) 
 751         query -- URL query (dict) 
 752         expected_status -- allows to accept failed HTTP requests (non 2xx 
 753             status code) by explicitly specifying a set of accepted status 
 754             codes. Can be any of the following entities: 
 755                 - an integer type specifying an exact failed status code to 
 757                 - a list or a tuple of integer types specifying a list of 
 758                   failed status codes to accept 
 759                 - a callable accepting an actual failed status code and 
 760                   returning True if it should be accepted 
 761             Note that this argument does not affect success status codes (2xx) 
 762             which are always accepted. 
 767         while success is False: 
 769                 res = self._download_webpage_handle( 
 770                     url_or_request, video_id, note, errnote, fatal, 
 771                     encoding=encoding, data=data, headers=headers, query=query, 
 772                     expected_status=expected_status) 
 774             except compat_http_client.IncompleteRead as e: 
 776                 if try_count >= tries: 
 778                 self._sleep(timeout, video_id) 
 785     def _download_xml_handle( 
 786             self, url_or_request, video_id, note='Downloading XML
', 
 787             errnote='Unable to download XML
', transform_source=None, 
 788             fatal=True, encoding=None, data=None, headers={}, query={}, 
 789             expected_status=None): 
 791         Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle). 
 793         See _download_webpage docstring for arguments specification. 
 795         res = self._download_webpage_handle( 
 796             url_or_request, video_id, note, errnote, fatal=fatal, 
 797             encoding=encoding, data=data, headers=headers, query=query, 
 798             expected_status=expected_status) 
 801         xml_string, urlh = res 
 802         return self._parse_xml( 
 803             xml_string, video_id, transform_source=transform_source, 
 807             self, url_or_request, video_id, 
 808             note='Downloading XML
', errnote='Unable to download XML
', 
 809             transform_source=None, fatal=True, encoding=None, 
 810             data=None, headers={}, query={}, expected_status=None): 
 812         Return the xml as an xml.etree.ElementTree.Element. 
 814         See _download_webpage docstring for arguments specification. 
 816         res = self._download_xml_handle( 
 817             url_or_request, video_id, note=note, errnote=errnote, 
 818             transform_source=transform_source, fatal=fatal, encoding=encoding, 
 819             data=data, headers=headers, query=query, 
 820             expected_status=expected_status) 
 821         return res if res is False else res[0] 
 823     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): 
 825             xml_string = transform_source(xml_string) 
 827             return compat_etree_fromstring(xml_string.encode('utf
-8')) 
 828         except compat_xml_parse_error as ve: 
 829             errmsg = '%s: Failed to parse XML 
' % video_id 
 831                 raise ExtractorError(errmsg, cause=ve) 
 833                 self.report_warning(errmsg + str(ve)) 
 835     def _download_json_handle( 
 836             self, url_or_request, video_id, note='Downloading JSON metadata
', 
 837             errnote='Unable to download JSON metadata
', transform_source=None, 
 838             fatal=True, encoding=None, data=None, headers={}, query={}, 
 839             expected_status=None): 
 841         Return a tuple (JSON object, URL handle). 
 843         See _download_webpage docstring for arguments specification. 
 845         res = self._download_webpage_handle( 
 846             url_or_request, video_id, note, errnote, fatal=fatal, 
 847             encoding=encoding, data=data, headers=headers, query=query, 
 848             expected_status=expected_status) 
 851         json_string, urlh = res 
 852         return self._parse_json( 
 853             json_string, video_id, transform_source=transform_source, 
 857             self, url_or_request, video_id, note='Downloading JSON metadata
', 
 858             errnote='Unable to download JSON metadata
', transform_source=None, 
 859             fatal=True, encoding=None, data=None, headers={}, query={}, 
 860             expected_status=None): 
 862         Return the JSON object as a dict. 
 864         See _download_webpage docstring for arguments specification. 
 866         res = self._download_json_handle( 
 867             url_or_request, video_id, note=note, errnote=errnote, 
 868             transform_source=transform_source, fatal=fatal, encoding=encoding, 
 869             data=data, headers=headers, query=query, 
 870             expected_status=expected_status) 
 871         return res if res is False else res[0] 
 873     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): 
 875             json_string = transform_source(json_string) 
 877             return json.loads(json_string) 
 878         except ValueError as ve: 
 879             errmsg = '%s: Failed to parse JSON 
' % video_id 
 881                 raise ExtractorError(errmsg, cause=ve) 
 883                 self.report_warning(errmsg + str(ve)) 
 885     def report_warning(self, msg, video_id=None): 
 886         idstr = '' if video_id is None else '%s: ' % video_id 
 887         self._downloader.report_warning( 
 888             '[%s] %s%s' % (self.IE_NAME, idstr, msg)) 
 890     def to_screen(self, msg): 
 891         """Print msg to screen, prefixing it with '[ie_name
]'""" 
 892         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) 
 894     def report_extraction(self, id_or_name): 
 895         """Report information extraction.""" 
 896         self.to_screen('%s: Extracting information
' % id_or_name) 
 898     def report_download_webpage(self, video_id): 
 899         """Report webpage download.""" 
 900         self.to_screen('%s: Downloading webpage
' % video_id) 
 902     def report_age_confirmation(self): 
 903         """Report attempt to confirm age.""" 
 904         self.to_screen('Confirming age
') 
 906     def report_login(self): 
 907         """Report attempt to log in.""" 
 908         self.to_screen('Logging 
in') 
 911     def raise_login_required(msg='This video 
is only available 
for registered users
'): 
 912         raise ExtractorError( 
 913             '%s. Use 
--username 
and --password 
or --netrc to provide account credentials
.' % msg, 
 917     def raise_geo_restricted(msg='This video 
is not available 
from your location due to geo restriction
', countries=None): 
 918         raise GeoRestrictedError(msg, countries=countries) 
 920     # Methods for following #608 
 922     def url_result(url, ie=None, video_id=None, video_title=None): 
 923         """Returns a URL that points to a page that should be processed""" 
 924         # TODO: ie should be the class used for getting the info 
 925         video_info = {'_type
': 'url
', 
 928         if video_id is not None: 
 929             video_info['id'] = video_id 
 930         if video_title is not None: 
 931             video_info['title
'] = video_title 
 934     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): 
 936             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) 
 938         return self.playlist_result( 
 939             urls, playlist_id=playlist_id, playlist_title=playlist_title) 
 942     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 
 943         """Returns a playlist""" 
 944         video_info = {'_type
': 'playlist
', 
 947             video_info['id'] = playlist_id 
 949             video_info['title
'] = playlist_title 
 950         if playlist_description: 
 951             video_info['description
'] = playlist_description 
 954     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 956         Perform a regex search on the given string, using a single or a list of 
 957         patterns returning the first matching group. 
 958         In case of failure return a default value or raise a WARNING or a 
 959         RegexNotFoundError, depending on fatal, specifying the field name. 
 961         if isinstance(pattern, (str, compat_str, compiled_regex_type)): 
 962             mobj = re.search(pattern, string, flags) 
 965                 mobj = re.search(p, string, flags) 
 969         if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty(): 
 970             _name = '\033[0;34m
%s\033[0m
' % name 
 976                 # return the first matching group 
 977                 return next(g for g in mobj.groups() if g is not None) 
 979                 return mobj.group(group) 
 980         elif default is not NO_DEFAULT: 
 983             raise RegexNotFoundError('Unable to extract 
%s' % _name) 
 985             self._downloader.report_warning('unable to extract 
%s' % _name + bug_reports_message()) 
 988     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 990         Like _search_regex, but strips HTML tags and unescapes entities. 
 992         res = self._search_regex(pattern, string, name, default, fatal, flags, group) 
 994             return clean_html(res).strip() 
 998     def _get_netrc_login_info(self, netrc_machine=None): 
1001         netrc_machine = netrc_machine or self._NETRC_MACHINE 
1003         if self._downloader.params.get('usenetrc
', False): 
1005                 info = netrc.netrc().authenticators(netrc_machine) 
1006                 if info is not None: 
1010                     raise netrc.NetrcParseError( 
1011                         'No authenticators 
for %s' % netrc_machine) 
1012             except (IOError, netrc.NetrcParseError) as err: 
1013                 self._downloader.report_warning( 
1014                     'parsing 
.netrc
: %s' % error_to_compat_str(err)) 
1016         return username, password 
1018     def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None): 
1020         Get the login info as (username, password) 
1021         First look for the manually specified credentials using username_option 
1022         and password_option as keys in params dictionary. If no such credentials 
1023         available look in the netrc file using the netrc_machine or _NETRC_MACHINE 
1025         If there's no info available
, return (None, None) 
1027         if self._downloader is None: 
1030         downloader_params = self._downloader.params 
1032         # Attempt to use provided username and password or .netrc data 
1033         if downloader_params.get(username_option) is not None: 
1034             username = downloader_params[username_option] 
1035             password = downloader_params[password_option] 
1037             username, password = self._get_netrc_login_info(netrc_machine) 
1039         return username, password 
1041     def _get_tfa_info(self, note='two-factor verification code'): 
1043         Get the two
-factor authentication info
 
1044         TODO 
- asking the user will be required 
for sms
/phone verify
 
1045         currently just uses the command line option
 
1046         If there
's no info available, return None 
1048         if self._downloader is None: 
1050         downloader_params = self._downloader.params 
1052         if downloader_params.get('twofactor
') is not None: 
1053             return downloader_params['twofactor
'] 
1055         return compat_getpass('Type 
%s and press 
[Return
]: ' % note) 
1057     # Helper functions for extracting OpenGraph info 
1059     def _og_regexes(prop): 
1060         content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))' 
1061         property_re = (r'(?
:name|
property)=(?
:\'og
:%(prop)s\'|
"og:%(prop)s"|\s
*og
:%(prop)s\b)' 
1062                        % {'prop
': re.escape(prop)}) 
1063         template = r'<meta
[^
>]+?
%s[^
>]+?
%s' 
1065             template % (property_re, content_re), 
1066             template % (content_re, property_re), 
1070     def _meta_regex(prop): 
1071         return r'''(?isx)<meta 
1072                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
1073                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) 
1075     def _og_search_property(self, prop, html, name=None, **kargs): 
1076         if not isinstance(prop, (list, tuple)): 
1079             name = 'OpenGraph 
%s' % prop[0] 
1082             og_regexes.extend(self._og_regexes(p)) 
1083         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) 
1086         return unescapeHTML(escaped) 
1088     def _og_search_thumbnail(self, html, **kargs): 
1089         return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs) 
1091     def _og_search_description(self, html, **kargs): 
1092         return self._og_search_property('description
', html, fatal=False, **kargs) 
1094     def _og_search_title(self, html, **kargs): 
1095         return self._og_search_property('title
', html, **kargs) 
1097     def _og_search_video_url(self, html, name='video url
', secure=True, **kargs): 
1098         regexes = self._og_regexes('video
') + self._og_regexes('video
:url
') 
1100             regexes = self._og_regexes('video
:secure_url
') + regexes 
1101         return self._html_search_regex(regexes, html, name, **kargs) 
1103     def _og_search_url(self, html, **kargs): 
1104         return self._og_search_property('url
', html, **kargs) 
1106     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): 
1107         if not isinstance(name, (list, tuple)): 
1109         if display_name is None: 
1110             display_name = name[0] 
1111         return self._html_search_regex( 
1112             [self._meta_regex(n) for n in name], 
1113             html, display_name, fatal=fatal, group='content
', **kwargs) 
1115     def _dc_search_uploader(self, html): 
1116         return self._html_search_meta('dc
.creator
', html, 'uploader
') 
1118     def _rta_search(self, html): 
1119         # See http://www.rtalabel.org/index.php?content=howtofaq#single 
1120         if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+' 
1121                      r'     content
="RTA-5042-1996-1400-1577-RTA"', 
1126     def _media_rating_search(self, html): 
1127         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ 
1128         rating = self._html_search_meta('rating
', html) 
1140         return RATING_TABLE.get(rating.lower()) 
1142     def _family_friendly_search(self, html): 
1143         # See http://schema.org/VideoObject 
1144         family_friendly = self._html_search_meta( 
1145             'isFamilyFriendly
', html, default=None) 
1147         if not family_friendly: 
1156         return RATING_TABLE.get(family_friendly.lower()) 
1158     def _twitter_search_player(self, html): 
1159         return self._html_search_meta('twitter
:player
', html, 
1160                                       'twitter card player
') 
1162     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): 
1163         json_ld = self._search_regex( 
1164             JSON_LD_RE, html, 'JSON
-LD
', group='json_ld
', **kwargs) 
1165         default = kwargs.get('default
', NO_DEFAULT) 
1167             return default if default is not NO_DEFAULT else {} 
1168         # JSON-LD may be malformed and thus `fatal` should be respected. 
1169         # At the same time `default` may be passed that assumes `fatal=False` 
1170         # for _search_regex. Let's simulate the same behavior here 
as well
. 
1171         fatal 
= kwargs
.get('fatal', True) if default 
== NO_DEFAULT 
else False 
1172         return self
._json
_ld
(json_ld
, video_id
, fatal
=fatal
, expected_type
=expected_type
) 
1174     def _json_ld(self
, json_ld
, video_id
, fatal
=True, expected_type
=None): 
1175         if isinstance(json_ld
, compat_str
): 
1176             json_ld 
= self
._parse
_json
(json_ld
, video_id
, fatal
=fatal
) 
1180         if not isinstance(json_ld
, (list, tuple, dict)): 
1182         if isinstance(json_ld
, dict): 
1185         INTERACTION_TYPE_MAP 
= { 
1186             'CommentAction': 'comment', 
1187             'AgreeAction': 'like', 
1188             'DisagreeAction': 'dislike', 
1189             'LikeAction': 'like', 
1190             'DislikeAction': 'dislike', 
1191             'ListenAction': 'view', 
1192             'WatchAction': 'view', 
1193             'ViewAction': 'view', 
1196         def extract_interaction_statistic(e
): 
1197             interaction_statistic 
= e
.get('interactionStatistic') 
1198             if not isinstance(interaction_statistic
, list): 
1200             for is_e 
in interaction_statistic
: 
1201                 if not isinstance(is_e
, dict): 
1203                 if is_e
.get('@type') != 'InteractionCounter': 
1205                 interaction_type 
= is_e
.get('interactionType') 
1206                 if not isinstance(interaction_type
, compat_str
): 
1208                 interaction_count 
= int_or_none(is_e
.get('userInteractionCount')) 
1209                 if interaction_count 
is None: 
1211                 count_kind 
= INTERACTION_TYPE_MAP
.get(interaction_type
.split('/')[-1]) 
1214                 count_key 
= '%s_count' % count_kind
 
1215                 if info
.get(count_key
) is not None: 
1217                 info
[count_key
] = interaction_count
 
1219         def extract_video_object(e
): 
1220             assert e
['@type'] == 'VideoObject' 
1222                 'url': url_or_none(e
.get('contentUrl')), 
1223                 'title': unescapeHTML(e
.get('name')), 
1224                 'description': unescapeHTML(e
.get('description')), 
1225                 'thumbnail': url_or_none(e
.get('thumbnailUrl') or e
.get('thumbnailURL')), 
1226                 'duration': parse_duration(e
.get('duration')), 
1227                 'timestamp': unified_timestamp(e
.get('uploadDate')), 
1228                 'filesize': float_or_none(e
.get('contentSize')), 
1229                 'tbr': int_or_none(e
.get('bitrate')), 
1230                 'width': int_or_none(e
.get('width')), 
1231                 'height': int_or_none(e
.get('height')), 
1232                 'view_count': int_or_none(e
.get('interactionCount')), 
1234             extract_interaction_statistic(e
) 
1237             if isinstance(e
.get('@context'), compat_str
) and re
.match(r
'^https?://schema.org/?$', e
.get('@context')): 
1238                 item_type 
= e
.get('@type') 
1239                 if expected_type 
is not None and expected_type 
!= item_type
: 
1241                 if item_type 
in ('TVEpisode', 'Episode'): 
1243                         'episode': unescapeHTML(e
.get('name')), 
1244                         'episode_number': int_or_none(e
.get('episodeNumber')), 
1245                         'description': unescapeHTML(e
.get('description')), 
1247                     part_of_season 
= e
.get('partOfSeason') 
1248                     if isinstance(part_of_season
, dict) and part_of_season
.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): 
1249                         info
['season_number'] = int_or_none(part_of_season
.get('seasonNumber')) 
1250                     part_of_series 
= e
.get('partOfSeries') or e
.get('partOfTVSeries') 
1251                     if isinstance(part_of_series
, dict) and part_of_series
.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): 
1252                         info
['series'] = unescapeHTML(part_of_series
.get('name')) 
1253                 elif item_type 
in ('Article', 'NewsArticle'): 
1255                         'timestamp': parse_iso8601(e
.get('datePublished')), 
1256                         'title': unescapeHTML(e
.get('headline')), 
1257                         'description': unescapeHTML(e
.get('articleBody')), 
1259                 elif item_type 
== 'VideoObject': 
1260                     extract_video_object(e
) 
1262                 video 
= e
.get('video') 
1263                 if isinstance(video
, dict) and video
.get('@type') == 'VideoObject': 
1264                     extract_video_object(video
) 
1266         return dict((k
, v
) for k
, v 
in info
.items() if v 
is not None) 
1269     def _hidden_inputs(html
): 
1270         html 
= re
.sub(r
'<!--(?:(?!<!--).)*-->', '', html
) 
1272         for input in re
.findall(r
'(?i)(<input[^>]+>)', html
): 
1273             attrs 
= extract_attributes(input) 
1276             if attrs
.get('type') not in ('hidden', 'submit'): 
1278             name 
= attrs
.get('name') or attrs
.get('id') 
1279             value 
= attrs
.get('value') 
1280             if name 
and value 
is not None: 
1281                 hidden_inputs
[name
] = value
 
1282         return hidden_inputs
 
1284     def _form_hidden_inputs(self
, form_id
, html
): 
1285         form 
= self
._search
_regex
( 
1286             r
'(?is)<form[^>]+?id=(["\'])%s\
1[^
>]*>(?P
<form
>.+?
)</form
>' % form_id, 
1287             html, '%s form
' % form_id, group='form
') 
1288         return self._hidden_inputs(form) 
1290     def _sort_formats(self, formats, field_preference=None): 
1292             raise ExtractorError('No video formats found
') 
1295             # Automatically determine tbr when missing based on abr and vbr (improves 
1296             # formats sorting in some cases) 
1297             if 'tbr
' not in f and f.get('abr
') is not None and f.get('vbr
') is not None: 
1298                 f['tbr
'] = f['abr
'] + f['vbr
'] 
1300         def _formats_key(f): 
1301             # TODO remove the following workaround 
1302             from ..utils import determine_ext 
1303             if not f.get('ext
') and 'url
' in f: 
1304                 f['ext
'] = determine_ext(f['url
']) 
1306             if isinstance(field_preference, (list, tuple)): 
1309                     if f.get(field) is not None 
1310                     else ('' if field == 'format_id
' else -1) 
1311                     for field in field_preference) 
1313             preference = f.get('preference
') 
1314             if preference is None: 
1316                 if f.get('ext
') in ['f4f
', 'f4m
']:  # Not yet supported 
1319             protocol = f.get('protocol
') or determine_protocol(f) 
1320             proto_preference = 0 if protocol in ['http
', 'https
'] else (-0.5 if protocol == 'rtsp
' else -0.1) 
1322             if f.get('vcodec
') == 'none
':  # audio only 
1324                 if self._downloader.params.get('prefer_free_formats
'): 
1325                     ORDER = ['aac
', 'mp3
', 'm4a
', 'webm
', 'ogg
', 'opus
'] 
1327                     ORDER = ['webm
', 'opus
', 'ogg
', 'mp3
', 'aac
', 'm4a
'] 
1330                     audio_ext_preference = ORDER.index(f['ext
']) 
1332                     audio_ext_preference = -1 
1334                 if f.get('acodec
') == 'none
':  # video only 
1336                 if self._downloader.params.get('prefer_free_formats
'): 
1337                     ORDER = ['flv
', 'mp4
', 'webm
'] 
1339                     ORDER = ['webm
', 'flv
', 'mp4
'] 
1341                     ext_preference = ORDER.index(f['ext
']) 
1344                 audio_ext_preference = 0 
1348                 f.get('language_preference
') if f.get('language_preference
') is not None else -1, 
1349                 f.get('quality
') if f.get('quality
') is not None else -1, 
1350                 f.get('tbr
') if f.get('tbr
') is not None else -1, 
1351                 f.get('filesize
') if f.get('filesize
') is not None else -1, 
1352                 f.get('vbr
') if f.get('vbr
') is not None else -1, 
1353                 f.get('height
') if f.get('height
') is not None else -1, 
1354                 f.get('width
') if f.get('width
') is not None else -1, 
1357                 f.get('abr
') if f.get('abr
') is not None else -1, 
1358                 audio_ext_preference, 
1359                 f.get('fps
') if f.get('fps
') is not None else -1, 
1360                 f.get('filesize_approx
') if f.get('filesize_approx
') is not None else -1, 
1361                 f.get('source_preference
') if f.get('source_preference
') is not None else -1, 
1362                 f.get('format_id
') if f.get('format_id
') is not None else '', 
1364         formats.sort(key=_formats_key) 
1366     def _check_formats(self, formats, video_id): 
1368             formats[:] = filter( 
1369                 lambda f: self._is_valid_url( 
1371                     item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'), 
1375     def _remove_duplicate_formats(formats): 
1379             if f['url
'] not in format_urls: 
1380                 format_urls.add(f['url
']) 
1381                 unique_formats.append(f) 
1382         formats[:] = unique_formats 
1384     def _is_valid_url(self, url, video_id, item='video
', headers={}): 
1385         url = self._proto_relative_url(url, scheme='http
:') 
1386         # For now assume non HTTP(S) URLs always valid 
1387         if not (url.startswith('http
://') or url.startswith('https
://')): 
1390             self._request_webpage(url, video_id, 'Checking 
%s URL
' % item, headers=headers) 
1392         except ExtractorError as e: 
1393             if isinstance(e.cause, compat_urllib_error.URLError): 
1395                     '%s: %s URL 
is invalid
, skipping
' % (video_id, item)) 
1399     def http_scheme(self): 
1400         """ Either "http:" or "https:", depending on the user's preferences 
""" 
1403             if self._downloader.params.get('prefer_insecure', False) 
1406     def _proto_relative_url(self, url, scheme=None): 
1409         if url.startswith('//'): 
1411                 scheme = self.http_scheme() 
1416     def _sleep(self, timeout, video_id, msg_template=None): 
1417         if msg_template is None: 
1418             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' 
1419         msg = msg_template % {'video_id': video_id, 'timeout': timeout} 
1423     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, 
1424                              transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1425                              fatal=True, m3u8_id=None): 
1426         manifest = self._download_xml( 
1427             manifest_url, video_id, 'Downloading f4m manifest', 
1428             'Unable to download f4m manifest', 
1429             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests 
1430             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) 
1431             transform_source=transform_source, 
1434         if manifest is False: 
1437         return self._parse_f4m_formats( 
1438             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1439             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) 
1441     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, 
1442                            transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1443                            fatal=True, m3u8_id=None): 
1444         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy 
1445         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') 
1446         if akamai_pv is not None and ';' in akamai_pv.text: 
1447             playerVerificationChallenge = akamai_pv.text.split(';')[0] 
1448             if playerVerificationChallenge.strip() != '': 
1452         manifest_version = '1.0' 
1453         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') 
1455             manifest_version = '2.0' 
1456             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') 
1457         # Remove unsupported DRM protected media from final formats 
1458         # rendition (see https://github.com/rg3/youtube-dl/issues/8573). 
1459         media_nodes = remove_encrypted_media(media_nodes) 
1463         manifest_base_url = get_base_url(manifest) 
1465         bootstrap_info = xpath_element( 
1466             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 
1467             'bootstrap info', default=None) 
1470         mime_type = xpath_text( 
1471             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], 
1472             'base URL', default=None) 
1473         if mime_type and mime_type.startswith('audio/'): 
1476         for i, media_el in enumerate(media_nodes): 
1477             tbr = int_or_none(media_el.attrib.get('bitrate')) 
1478             width = int_or_none(media_el.attrib.get('width')) 
1479             height = int_or_none(media_el.attrib.get('height')) 
1480             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) 
1481             # If <bootstrapInfo> is present, the specified f4m is a 
1482             # stream-level manifest, and only set-level manifests may refer to 
1483             # external resources.  See section 11.4 and section 4 of F4M spec 
1484             if bootstrap_info is None: 
1486                 # @href is introduced in 2.0, see section 11.6 of F4M spec 
1487                 if manifest_version == '2.0': 
1488                     media_url = media_el.attrib.get('href') 
1489                 if media_url is None: 
1490                     media_url = media_el.attrib.get('url') 
1494                     media_url if media_url.startswith('http://') or media_url.startswith('https://') 
1495                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) 
1496                 # If media_url is itself a f4m manifest do the recursive extraction 
1497                 # since bitrates in parent manifest (this one) and media_url manifest 
1498                 # may differ leading to inability to resolve the format by requested 
1499                 # bitrate in f4m downloader 
1500                 ext = determine_ext(manifest_url) 
1502                     f4m_formats = self._extract_f4m_formats( 
1503                         manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1504                         transform_source=transform_source, fatal=fatal) 
1505                     # Sometimes stream-level manifest contains single media entry that 
1506                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). 
1507                     # At the same time parent's media entry in set-level manifest may 
1508                     # contain it. We will copy it from parent in such cases. 
1509                     if len(f4m_formats) == 1: 
1512                             'tbr': f.get('tbr') or tbr, 
1513                             'width': f.get('width') or width, 
1514                             'height': f.get('height') or height, 
1515                             'format_id': f.get('format_id') if not tbr else format_id, 
1518                     formats.extend(f4m_formats) 
1521                     formats.extend(self._extract_m3u8_formats( 
1522                         manifest_url, video_id, 'mp4', preference=preference, 
1523                         m3u8_id=m3u8_id, fatal=fatal)) 
1526                 'format_id': format_id, 
1527                 'url': manifest_url, 
1528                 'manifest_url': manifest_url, 
1529                 'ext': 'flv' if bootstrap_info is not None else None, 
1535                 'preference': preference, 
    def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None):
        # Build a single "meta" format entry that points at the master m3u8
        # playlist itself, letting the downloader do quality selection at
        # download time instead of extraction time.
        # NOTE(review): lines are missing from this chunk (the surrounding
        # 'return {' and 'url'/'ext' keys are not visible here).
            'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])),
            # Ranked 100 below any concrete format so it is never auto-selected.
            'preference': preference - 100 if preference else -100,
            'resolution': 'multiple',
            'format_note': 'Quality selection URL',
    def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                              entry_protocol='m3u8', preference=None,
                              m3u8_id=None, note=None, errnote=None,
                              fatal=True, live=False):
        """Download an HLS (m3u8) playlist and return the formats parsed
        from it via _parse_m3u8_formats.

        NOTE(review): some lines of this method (the URL argument to the
        download call and the failed-download early return) are missing
        from this chunk.
        """
        res = self._download_webpage_handle(
            note=note or 'Downloading m3u8 information',
            errnote=errnote or 'Failed to download m3u8 information',
        m3u8_doc, urlh = res
        # Use the final URL after any redirects so relative segment/playlist
        # URIs resolve against the location actually served.
        m3u8_url = urlh.geturl()
        return self._parse_m3u8_formats(
            m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
            preference=preference, m3u8_id=m3u8_id, live=live)
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of format dicts.

        Distinguishes master playlists (parsed per EXT-X-STREAM-INF /
        EXT-X-MEDIA tag) from media playlists (returned as a single format).
        NOTE(review): a number of interior lines are missing from this
        chunk (DRM early returns' bodies, several dict literals' open/close
        lines, etc.).
        """
        # DRM-protected playlists are not downloadable; bodies of these two
        # checks (presumably early returns) are not visible in this chunk.
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
        # Resolve a possibly-relative URI against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
                'format_id': m3u8_id,
                'protocol': entry_protocol,
                'preference': preference,
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Parse one EXT-X-MEDIA tag: register the rendition in its group
            # and, for VIDEO/AUDIO renditions with a URI, emit a format.
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
            media_url = media.get('URI')
                for v in (m3u8_id, group_id, name):
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'protocol': entry_protocol,
                    'preference': preference,
                # An audio-only rendition carries no video stream.
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            # Fall back to the first rendition's NAME in the referenced group.
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # Main scan: EXT-X-STREAM-INF attributes are remembered and applied
        # to the next non-comment line, which is the variant playlist URI.
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
            elif line.startswith('#') or not line.strip():
                # Prefer AVERAGE-BANDWIDTH over peak BANDWIDTH for tbr (kbit/s).
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                resolution = last_stream_inf.get('RESOLUTION')
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    abr, vbr = mobj.groups()
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing an audio group it represents a complete
                # (with audio and video) format. So, for such cases we will
                # ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        f['acodec'] = 'none'
                # Reset so stale attributes never leak onto the next variant.
                last_stream_inf = {}
    def _xpath_ns(path, namespace=None):
        # Qualify every tag step of an XPath with the given XML namespace
        # using ElementTree's '{uri}tag' syntax; '.' and empty steps are
        # passed through untouched (they name no element).
        # NOTE(review): no `self` parameter — presumably decorated
        # @staticmethod on a line not visible in this chunk; the `out`
        # accumulator's initialization is also missing here.
        for c in path.split('/'):
            if not c or c == '.':
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        """Download a SMIL document and return only its formats.

        NOTE(review): the failed-download handling between the download and
        the namespace parse is missing from this chunk.
        """
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        """Download a SMIL document and return a full info dict parsed from it.

        NOTE(review): the failed-download handling between these two calls
        is missing from this chunk.
        """
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1759     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): 
1760         return self._download_xml( 
1761             smil_url, video_id, 'Downloading SMIL file', 
1762             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) 
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        """Parse a SMIL document into an info dict (formats, subtitles,
        title/description/date metadata and thumbnails).

        NOTE(review): several interior lines are missing from this chunk
        (initialization of title/description/upload_date, the thumbnails
        list opener, the returned dict opener, and loop `continue`s).
        """
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Derive an id from the manifest file name (basename without extension).
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Collect title/description/upload date from <head><meta> entries;
        # only the first occurrence of each is kept.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1804     def _parse_smil_namespace(self, smil): 
1805         return self._search_regex( 
1806             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) 
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        """Extract the format list from a SMIL document.

        Each <video>/<audio> element is dispatched on its protocol or source
        extension: RTMP, HLS (m3u8), HDS (f4m), DASH (mpd), Smooth Streaming
        (ism) or a plain progressive HTTP URL.
        NOTE(review): many interior lines are missing from this chunk
        (base/srcs/counter initialization, `continue`s, several dict
        literals' open/close lines, the final return).
        """
        # A <meta base=...> (or httpBase) in <head> provides the base URL
        # that relative media sources are resolved against.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip entries without a source and deduplicate repeated sources.
            if not src or src in srcs:
            # system-bitrate/systemBitrate is in bit/s; stored as kbit/s.
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                # Caller-supplied hook may rewrite the streamer/playpath pair.
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                # A single-entry HLS playlist carries no quality metadata of
                # its own, so copy bitrate/size info from this SMIL medium.
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            elif src_ext == 'f4m':
                        'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            elif src_ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    src_url, video_id, mpd_id='dash', fatal=False))
            elif re.search(r'\.ism/[Mm]anifest', src_url):
                formats.extend(self._extract_ism_formats(
                    src_url, video_id, ism_id='mss', fatal=False))
            elif src_url.startswith('http') and self._is_valid_url(src, video_id):
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        """Extract subtitles from <textstream> elements of a SMIL document,
        keyed by language with *subtitles_lang* as the fallback.

        NOTE(review): initialization of `subtitles`/`urls`, the dedupe
        `continue`, the appended dict's body and the return are missing
        from this chunk.
        """
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            if not src or src in urls:
            # Prefer the explicit ext attribute, then the MIME type, then
            # whatever the URL itself suggests.
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        """Download an XSPF playlist and parse it into entries.

        NOTE(review): the failed-download check between download and parse
        is missing from this chunk. Also note the 'xpsf' typo in the
        download note below — runtime string, left untouched here.
        """
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        """Parse an XSPF playlist document into a list of entry info dicts.

        The 's1' namespace carries StreamOne player extensions (per-location
        label/width/height). NOTE(review): several interior lines are
        missing from this chunk (the NS_MAP/entries openers, title/format
        dict openers, and the final return).
        """
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; stored as seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            # Each <location> is one downloadable format of the same track.
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        """Download a DASH MPD manifest and return formats parsed from it.

        NOTE(review): `formats_dict={}` is a mutable default argument —
        risky if any caller mutates it; worth changing to None upstream.
        NOTE(review): the download-URL argument and failed-download
        handling lines are missing from this chunk.
        """
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Relative BaseURLs in the MPD resolve against the final (post-redirect)
        # manifest location.
        mpd_base_url = base_url(urlh.geturl())
        return self._parse_mpd_formats(
            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.

        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP

        NOTE(review): many interior lines are missing from this chunk
        (the `_add_ns` def line, several loop/dict openers, `continue`s,
        and the final return).
        """
        # Live ('dynamic') manifests are handled elsewhere; body of this
        # check is not visible in this chunk.
        if mpd_doc.get('type') == 'dynamic':
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge segment info of `element` over its parent's (Period ->
            # AdaptationSet -> Representation inherit and override).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r repeats the same segment r additional times.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            # SegmentList takes precedence; otherwise fall back to
            # SegmentTemplate on the same element.
            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # Skip DRM-protected content entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Walk up the hierarchy accumulating BaseURL fragments
                        # until an absolute URL is formed.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube-specific extension carrying the total size.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # These language codes mean "multiple/undetermined/
                            # not applicable" — treat as no language.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert a DASH $...$ URL template into a Python
                            # %-format string.
                            tmpl = representation_ms_info[template_name]
                            # First of, % characters outside $...$ templates
                            # must be escaped by doubling for proper processing
                            # by % operator string formatting used further (see
                            # https://github.com/rg3/youtube-dl/issues/16867).
                                    in_template = not in_template
                                elif c == '%' and not in_template:
                            # Next, $...$ templates are translated to their
                            # %(...) counterparts to be used with % operator
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace returns a new string —
                            # this result is discarded, so '$$' escapes are
                            # never unescaped; likely should read
                            # t = t.replace('$$', '$'). Verify before fixing.
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        def location_key(location):
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:

                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)

                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # @r expands to r extra repeats of this segment.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/rg3/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
                        # https://github.com/rg3/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        """Download a Smooth Streaming (ISM) manifest and return formats
        parsed from it.

        NOTE(review): the download-URL argument, failed-download handling
        and the unpacking of `res` into `ism_doc, urlh` are missing from
        this chunk.
        """
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM (Smooth Streaming) manifest.

        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # NOTE(review): several source lines are missing from this excerpt
        # (e.g. the early return for live/protected manifests, the
        # `formats`/`fragments`/`fragment_ctx`/`format_id` initializations
        # and some dict literals), so statements below appear truncated.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
        duration = int(ism_doc.attrib['Duration'])
        # Default timescale per [MS-SSTR] is 10,000,000 ticks per second.
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # An AudioTag of '255' denotes AAC audio when FourCC is absent.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))
                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # 't' (start time) may be omitted; keep the running time.
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']
                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))
                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Parameters consumed by the ISM downloader, not exposed
                    # to the user.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        # Scan a webpage for HTML5 <video>/<audio> (and AMP equivalents)
        # tags and build a list of entry dicts with formats and subtitles.
        # NOTE(review): several source lines are missing from this excerpt
        # (e.g. early returns in the helpers, `media_info` initialization and
        # some conditionals), so statements below appear truncated.
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)
        def parse_content_type(content_type):
            # Extract ext/codec info from a MIME "type" attribute value.
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
        def _media_formats(src, cur_media_type, type_info={}):
            # NOTE(review): mutable default argument `type_info={}`; not
            # mutated in the visible lines, but worth confirming upstream.
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    # A missing kind defaults to subtitles per the HTML spec.
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            for f in media_info['formats']:
                f.setdefault('http_headers', {})['Referer'] = base_url
            # Only emit an entry when something usable was found.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        # Derive both HDS (f4m) and HLS (m3u8) manifest URLs from an Akamai
        # manifest URL and extract formats from each.
        # NOTE(review): some source lines are missing from this excerpt
        # (e.g. the `formats` list initialization, the `if hds_host:` /
        # `if hls_host:` guards and the return), so statements below appear
        # truncated. Also note the mutable default argument `hosts={}`.
        hdcore_sign = 'hdcore=3.7.0'
        # Akamai uses /i/ for HLS and /z/ for HDS paths.
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # Propagate the hdcore signature to every segment URL.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        # Probe a Wowza streaming server URL over several protocols
        # (HLS, HDS, DASH, SMIL/RTMP/RTSP) and collect all formats.
        # NOTE(review): some source lines are missing from this excerpt
        # (e.g. the `mobj = re.search(` line, the `formats` initialization,
        # helper returns and dict literals), so statements below appear
        # truncated. Also note the mutable default `skip_protocols=[]`.
        query = compat_urlparse.urlparse(url).query
        # Strip a trailing manifest filename so we can re-append per protocol.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        def manifest_url(manifest):
            # Append the per-protocol manifest name, preserving the query.
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # Derive an RTSP variant from each RTMP format.
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        # Locate a jwplayer('...').setup({...}) call in the page and return
        # the parsed options dict (None when absent or unparseable).
        # NOTE(review): some source lines are missing from this excerpt
        # (the `re.search(` call, the guarding `if`/`try:` lines), so the
        # body below is incomplete as shown.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            except ExtractorError:
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2563     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): 
2564         jwplayer_data = self._find_jwplayer_data( 
2565             webpage, video_id, transform_source=js_to_json) 
2566         return self._parse_jwplayer_data( 
2567             jwplayer_data, video_id, *args, **kwargs) 
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Turn a jwplayer configuration dict into entry dict(s): formats,
        # subtitles and metadata per playlist item.
        # NOTE(review): some source lines are missing from this excerpt
        # (e.g. `entries = []`, `subtitles = {}`, several `continue`
        # statements and the single-entry return), so statements below
        # appear truncated.
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}
        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]
            this_video_id = video_id or video_data['mediaid']
            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert jwplayer "sources" entries into format dicts, dispatching
        # on declared type / extension (HLS, DASH, SMIL, audio, plain HTTP,
        # RTMP).
        # NOTE(review): some source lines are missing from this excerpt
        # (e.g. `urls = []` / `formats = []`, several `continue` statements,
        # the `a_format` dict literal and the return), so statements below
        # appear truncated.
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = self._proto_relative_url(source.get('file'))
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # Deduplicate sources that point at the same URL.
            if source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                        a_format.update(rtmp_params)
                formats.append(a_format)
2704     def _live_title(self, name): 
2705         """ Generate the title 
for a live video 
""" 
2706         now = datetime.datetime.now() 
2707         now_str = now.strftime('%Y-%m-%d %H:%M') 
2708         return name + ' ' + now_str 
    def _int(self, v, name, fatal=False, **kwargs):
        # Parse *v* as an int; on failure either raise (fatal=True) or warn.
        # NOTE(review): some source lines are missing from this excerpt
        # (the `if res is None:` / `if fatal:` / `else:` branches and the
        # return), so statements below appear truncated.
        res = int_or_none(v, **kwargs)
        if 'get_attr' in kwargs:
            # NOTE(review): bare print() looks like a debugging leftover —
            # consider removing or routing through the downloader's logger.
            print(getattr(v, kwargs['get_attr']))
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
    def _float(self, v, name, fatal=False, **kwargs):
        # Parse *v* as a float; on failure either raise (fatal=True) or warn.
        # NOTE(review): some source lines are missing from this excerpt
        # (the `if res is None:` / `if fatal:` / `else:` branches and the
        # return), so statements below appear truncated.
        res = float_or_none(v, **kwargs)
            msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
                raise ExtractorError(msg)
                self._downloader.report_warning(msg)
2732     def _set_cookie(self, domain, name, value, expire_time=None, port=None, 
2733                     path='/', secure=False, discard=False, rest={}, **kwargs): 
2734         cookie = compat_cookiejar.Cookie( 
2735             0, name, value, port, port is not None, domain, True, 
2736             domain.startswith('.'), path, True, secure, expire_time, 
2737             discard, None, None, rest) 
2738         self._downloader.cookiejar.set_cookie(cookie) 
2740     def _get_cookies(self, url): 
2741         """ Return a compat_cookies
.SimpleCookie 
with the cookies 
for the url 
""" 
2742         req = sanitized_Request(url) 
2743         self._downloader.cookiejar.add_cookie_header(req) 
2744         return compat_cookies.SimpleCookie(req.get_header('Cookie')) 
    def get_testcases(self, include_onlymatching=False):
        # Yield the extractor's test cases from _TEST/_TESTS, optionally
        # skipping only_matching entries.
        # NOTE(review): some source lines are missing from this excerpt
        # (the `if t:` branches, the loop over `tests` and the `yield`),
        # so statements below appear truncated.
        t = getattr(self, '_TEST', None)
            # Defining both _TEST and _TESTS is a programming error.
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            if not include_onlymatching and t.get('only_matching', False):
            # Derive the test name from the class name minus the 'IE' suffix.
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are).
        """
        # NOTE(review): at least one source line is missing from this
        # excerpt (the body of the `if not is_restricted:` branch), so the
        # loop below appears truncated.
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist test cases, judge by the first entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
2775     def extract_subtitles(self, *args, **kwargs): 
2776         if (self._downloader.params.get('writesubtitles', False) or 
2777                 self._downloader.params.get('listsubtitles')): 
2778             return self._get_subtitles(*args, **kwargs) 
2781     def _get_subtitles(self, *args, **kwargs): 
2782         raise NotImplementedError('This method must be implemented by subclasses') 
    def _merge_subtitle_items(subtitle_list1, subtitle_list2):
        """ Merge subtitle items for one language. Items with duplicated
        URLs will be dropped. """
        # NOTE(review): this excerpt is missing the decorator line above the
        # def (presumably @staticmethod — confirm) and the `return ret`
        # statement, so the body below is incomplete as shown.
        list1_urls = set([item['url'] for item in subtitle_list1])
        ret = list(subtitle_list1)
        # Keep list1's items first; append only list2 items with new URLs.
        ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls])
    def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2):
        """ Merge two subtitle dictionaries, language by language. """
        # NOTE(review): this excerpt is missing the decorator line above the
        # def (presumably @classmethod — confirm) and the `return ret`
        # statement, so the body below is incomplete as shown.
        ret = dict(subtitle_dict1)
        for lang in subtitle_dict2:
            ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang])
2801     def extract_automatic_captions(self, *args, **kwargs): 
2802         if (self._downloader.params.get('writeautomaticsub', False) or 
2803                 self._downloader.params.get('listsubtitles')): 
2804             return self._get_automatic_captions(*args, **kwargs) 
2807     def _get_automatic_captions(self, *args, **kwargs): 
2808         raise NotImplementedError('This method must be implemented by subclasses') 
2810     def mark_watched(self, *args, **kwargs): 
2811         if (self._downloader.params.get('mark_watched', False) and 
2812                 (self._get_login_info()[0] is not None or 
2813                     self._downloader.params.get('cookiefile') is not None)): 
2814             self._mark_watched(*args, **kwargs) 
2816     def _mark_watched(self, *args, **kwargs): 
2817         raise NotImplementedError('This method must be implemented by subclasses') 
    def geo_verification_headers(self):
        # Build HTTP headers for geo-verification requests, carrying the
        # user's --geo-verification-proxy if set.
        # NOTE(review): some source lines are missing from this excerpt
        # (the `headers` dict initialization and the return), so the body
        # below is incomplete as shown.
        geo_verification_proxy = self._downloader.params.get('geo_verification_proxy')
        if geo_verification_proxy:
            headers['Ytdl-request-proxy'] = geo_verification_proxy
2826     def _generic_id(self, url): 
2827         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) 
2829     def _generic_title(self, url): 
2830         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) 
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    # NOTE(review): several source lines are missing from this excerpt
    # (decorators such as @classmethod/@property, the `if mobj is None:` /
    # `if prefix == '':` guards and the n-parsing in _real_extract), so some
    # methods below appear truncated.

    def _make_valid_url(cls):
        # Regex matching "<key>:<query>", "<key><N>:<query>" or
        # "<key>all:<query>".
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # An extractor is suitable iff the URL matches its search pattern.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')
    def SEARCH_KEY(self):
        # Expose the search key; presumably wrapped by @property upstream
        # (decorator not visible in this excerpt — confirm).
        return self._SEARCH_KEY