2 from __future__ 
import unicode_literals
 
  17 from ..compat 
import ( 
  20     compat_etree_fromstring
, 
  26     compat_urllib_parse_unquote
, 
  27     compat_urllib_parse_urlencode
, 
  28     compat_urllib_request
, 
  30     compat_xml_parse_error
, 
  32 from ..downloader
.f4m 
import ( 
  34     remove_encrypted_media
, 
  59     parse_m3u8_attributes
, 
  76 class InfoExtractor(object): 
  77     """Information Extractor class. 
  79     Information extractors are the classes that, given a URL, extract 
  80     information about the video (or videos) the URL refers to. This 
  81     information includes the real video URL, the video title, author and 
  82     others. The information is stored in a dictionary which is then 
  83     passed to the YoutubeDL. The YoutubeDL processes this 
  84     information possibly downloading the video to the file system, among 
  85     other possible outcomes. 
  87     The type field determines the type of the result. 
  88     By far the most common value (and the default if _type is missing) is 
  89     "video", which indicates a single video. 
  91     For a video, the dictionaries must include the following fields: 
  94     title:          Video title, unescaped. 
  96     Additionally, it must contain either a formats entry or a url one: 
  98     formats:        A list of dictionaries for each format available, ordered 
  99                     from worst to best quality. 
 102                     * url        Mandatory. The URL of the video file 
 104                                  The URL of the manifest file in case of 
 105                                  fragmented media (DASH, hls, hds) 
 106                     * ext        Will be calculated from URL if missing 
 107                     * format     A human-readable description of the format 
 108                                  ("mp4 container with h264/opus"). 
  109                                  Calculated from the format_id, width, height, 
 110                                  and format_note fields if missing. 
 111                     * format_id  A short description of the format 
 112                                  ("mp4_h264_opus" or "19"). 
 113                                 Technically optional, but strongly recommended. 
 114                     * format_note Additional info about the format 
 115                                  ("3D" or "DASH video") 
 116                     * width      Width of the video, if known 
 117                     * height     Height of the video, if known 
 118                     * resolution Textual description of width and height 
 119                     * tbr        Average bitrate of audio and video in KBit/s 
 120                     * abr        Average audio bitrate in KBit/s 
 121                     * acodec     Name of the audio codec in use 
 122                     * asr        Audio sampling rate in Hertz 
 123                     * vbr        Average video bitrate in KBit/s 
 125                     * vcodec     Name of the video codec in use 
 126                     * container  Name of the container format 
 127                     * filesize   The number of bytes, if known in advance 
 128                     * filesize_approx  An estimate for the number of bytes 
 129                     * player_url SWF Player URL (used for rtmpdump). 
 130                     * protocol   The protocol that will be used for the actual 
 131                                  download, lower-case. 
 132                                  "http", "https", "rtsp", "rtmp", "rtmpe", 
 133                                  "m3u8", "m3u8_native" or "http_dash_segments". 
 135                                  Base URL for fragments. Each fragment's path 
 136                                  value (if present) will be relative to 
 138                     * fragments  A list of fragments of a fragmented media. 
 139                                  Each fragment entry must contain either an url 
 140                                  or a path. If an url is present it should be 
 141                                  considered by a client. Otherwise both path and 
 142                                  fragment_base_url must be present. Here is 
 143                                  the list of all potential fields: 
 144                                  * "url" - fragment's URL 
 145                                  * "path" - fragment's path relative to 
 147                                  * "duration" (optional, int or float) 
 148                                  * "filesize" (optional, int) 
 149                     * preference Order number of this format. If this field is 
 150                                  present and not None, the formats get sorted 
 151                                  by this field, regardless of all other values. 
 152                                  -1 for default (order by other properties), 
 153                                  -2 or smaller for less than default. 
 154                                  < -1000 to hide the format (if there is 
 155                                     another one which is strictly better) 
 156                     * language   Language code, e.g. "de" or "en-US". 
 157                     * language_preference  Is this in the language mentioned in 
 159                                  10 if it's what the URL is about, 
 160                                  -1 for default (don't know), 
 161                                  -10 otherwise, other values reserved for now. 
 162                     * quality    Order number of the video quality of this 
 163                                  format, irrespective of the file format. 
 164                                  -1 for default (order by other properties), 
 165                                  -2 or smaller for less than default. 
 166                     * source_preference  Order number for this video source 
 167                                   (quality takes higher priority) 
 168                                  -1 for default (order by other properties), 
 169                                  -2 or smaller for less than default. 
 170                     * http_headers  A dictionary of additional HTTP headers 
 171                                  to add to the request. 
 172                     * stretched_ratio  If given and not 1, indicates that the 
 173                                  video's pixels are not square. 
 174                                  width : height ratio as float. 
 175                     * no_resume  The server does not support resuming the 
 176                                  (HTTP or RTMP) download. Boolean. 
 177                     * downloader_options  A dictionary of downloader options as 
 178                                  described in FileDownloader 
 180     url:            Final video URL. 
 181     ext:            Video filename extension. 
 182     format:         The video format, defaults to ext (used for --get-format) 
 183     player_url:     SWF Player URL (used for rtmpdump). 
 185     The following fields are optional: 
 187     alt_title:      A secondary title of the video. 
 188     display_id      An alternative identifier for the video, not necessarily 
 189                     unique, but available before title. Typically, id is 
 190                     something like "4234987", title "Dancing naked mole rats", 
 191                     and display_id "dancing-naked-mole-rats" 
 192     thumbnails:     A list of dictionaries, with the following entries: 
 193                         * "id" (optional, string) - Thumbnail format ID 
 195                         * "preference" (optional, int) - quality of the image 
 196                         * "width" (optional, int) 
 197                         * "height" (optional, int) 
  198                         * "resolution" (optional, string "{width}x{height}", 
 200                         * "filesize" (optional, int) 
 201     thumbnail:      Full URL to a video thumbnail image. 
 202     description:    Full video description. 
 203     uploader:       Full name of the video uploader. 
 204     license:        License name the video is licensed under. 
 205     creator:        The creator of the video. 
 206     release_date:   The date (YYYYMMDD) when the video was released. 
 207     timestamp:      UNIX timestamp of the moment the video became available. 
 208     upload_date:    Video upload date (YYYYMMDD). 
 209                     If not explicitly set, calculated from timestamp. 
 210     uploader_id:    Nickname or id of the video uploader. 
 211     uploader_url:   Full URL to a personal webpage of the video uploader. 
 212     location:       Physical location where the video was filmed. 
 213     subtitles:      The available subtitles as a dictionary in the format 
 214                     {tag: subformats}. "tag" is usually a language code, and 
 215                     "subformats" is a list sorted from lower to higher 
 216                     preference, each element is a dictionary with the "ext" 
 218                         * "data": The subtitles file contents 
 219                         * "url": A URL pointing to the subtitles file 
 220                     "ext" will be calculated from URL if missing 
 221     automatic_captions: Like 'subtitles', used by the YoutubeIE for 
 222                     automatically generated captions 
 223     duration:       Length of the video in seconds, as an integer or float. 
 224     view_count:     How many users have watched the video on the platform. 
 225     like_count:     Number of positive ratings of the video 
 226     dislike_count:  Number of negative ratings of the video 
 227     repost_count:   Number of reposts of the video 
  228     average_rating: Average rating given by users, the scale used depends on the webpage 
 229     comment_count:  Number of comments on the video 
 230     comments:       A list of comments, each with one or more of the following 
 231                     properties (all but one of text or html optional): 
 232                         * "author" - human-readable name of the comment author 
 233                         * "author_id" - user ID of the comment author 
 235                         * "html" - Comment as HTML 
 236                         * "text" - Plain text of the comment 
 237                         * "timestamp" - UNIX timestamp of comment 
 238                         * "parent" - ID of the comment this one is replying to. 
 239                                      Set to "root" to indicate that this is a 
 240                                      comment to the original video. 
 241     age_limit:      Age restriction for the video, as an integer (years) 
 242     webpage_url:    The URL to the video webpage, if given to youtube-dl it 
 243                     should allow to get the same result again. (It will be set 
 244                     by YoutubeDL if it's missing) 
 245     categories:     A list of categories that the video falls in, for example 
 247     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"] 
 248     is_live:        True, False, or None (=unknown). Whether this video is a 
 249                     live stream that goes on instead of a fixed-length video. 
 250     start_time:     Time in seconds where the reproduction should start, as 
 251                     specified in the URL. 
 252     end_time:       Time in seconds where the reproduction should end, as 
 253                     specified in the URL. 
 254     chapters:       A list of dictionaries, with the following entries: 
 255                         * "start_time" - The start time of the chapter in seconds 
 256                         * "end_time" - The end time of the chapter in seconds 
 257                         * "title" (optional, string) 
 259     The following fields should only be used when the video belongs to some logical 
 262     chapter:        Name or title of the chapter the video belongs to. 
 263     chapter_number: Number of the chapter the video belongs to, as an integer. 
 264     chapter_id:     Id of the chapter the video belongs to, as a unicode string. 
 266     The following fields should only be used when the video is an episode of some 
 267     series, programme or podcast: 
 269     series:         Title of the series or programme the video episode belongs to. 
 270     season:         Title of the season the video episode belongs to. 
 271     season_number:  Number of the season the video episode belongs to, as an integer. 
 272     season_id:      Id of the season the video episode belongs to, as a unicode string. 
 273     episode:        Title of the video episode. Unlike mandatory video title field, 
 274                     this field should denote the exact title of the video episode 
 275                     without any kind of decoration. 
 276     episode_number: Number of the video episode within a season, as an integer. 
 277     episode_id:     Id of the video episode, as a unicode string. 
 279     The following fields should only be used when the media is a track or a part of 
 282     track:          Title of the track. 
 283     track_number:   Number of the track within an album or a disc, as an integer. 
 284     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii), 
 286     artist:         Artist(s) of the track. 
 287     genre:          Genre(s) of the track. 
 288     album:          Title of the album the track belongs to. 
 289     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). 
 290     album_artist:   List of all artists appeared on the album (e.g. 
 291                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits 
 293     disc_number:    Number of the disc or other physical medium the track belongs to, 
 295     release_year:   Year (YYYY) when the album was released. 
 297     Unless mentioned otherwise, the fields should be Unicode strings. 
 299     Unless mentioned otherwise, None is equivalent to absence of information. 
 302     _type "playlist" indicates multiple videos. 
 303     There must be a key "entries", which is a list, an iterable, or a PagedList 
 304     object, each element of which is a valid dictionary by this specification. 
 306     Additionally, playlists can have "id", "title", "description", "uploader", 
 307     "uploader_id", "uploader_url" attributes with the same semantics as videos 
 311     _type "multi_video" indicates that there are multiple videos that 
 312     form a single show, for examples multiple acts of an opera or TV episode. 
 313     It must have an entries key like a playlist and contain all the keys 
 314     required for a video at the same time. 
 317     _type "url" indicates that the video must be extracted from another 
 318     location, possibly by a different extractor. Its only required key is: 
 319     "url" - the next URL to extract. 
 320     The key "ie_key" can be set to the class name (minus the trailing "IE", 
 321     e.g. "Youtube") if the extractor class is known in advance. 
 322     Additionally, the dictionary may have any properties of the resolved entity 
 323     known in advance, for example "title" if the title of the referred video is 
 327     _type "url_transparent" entities have the same specification as "url", but 
 328     indicate that the given additional information is more precise than the one 
 329     associated with the resolved URL. 
 330     This is useful when a site employs a video service that hosts the video and 
 331     its technical metadata, but that video service does not embed a useful 
 332     title, description etc. 
 335     Subclasses of this one should re-define the _real_initialize() and 
 336     _real_extract() methods and define a _VALID_URL regexp. 
 337     Probably, they should also be added to the list of extractors. 
 339     _GEO_BYPASS attribute may be set to False in order to disable 
 340     geo restriction bypass mechanisms for a particular extractor. 
 341     Though it won't disable explicit geo restriction bypass based on 
 342     country code provided with geo_bypass_country. 
 344     _GEO_COUNTRIES attribute may contain a list of presumably geo unrestricted 
 345     countries for this extractor. One of these countries will be used by 
 346     geo restriction bypass mechanism right away in order to bypass 
 347     geo restriction, of course, if the mechanism is not disabled. 
 349     _GEO_IP_BLOCKS attribute may contain a list of presumably geo unrestricted 
 350     IP blocks in CIDR notation for this extractor. One of these IP blocks 
 351     will be used by geo restriction bypass mechanism similarly 
 354     Finally, the _WORKING attribute should be set to False for broken IEs 
 355     in order to warn the users and skip the tests. 
 360     _x_forwarded_for_ip 
= None 
 362     _GEO_COUNTRIES 
= None 
 363     _GEO_IP_BLOCKS 
= None 
 366     def __init__(self
, downloader
=None): 
 367         """Constructor. Receives an optional downloader.""" 
 369         self
._x
_forwarded
_for
_ip 
= None 
 370         self
.set_downloader(downloader
) 
 373     def suitable(cls
, url
): 
 374         """Receives a URL and returns True if suitable for this IE.""" 
 376         # This does not use has/getattr intentionally - we want to know whether 
 377         # we have cached the regexp for *this* class, whereas getattr would also 
 378         # match the superclass 
 379         if '_VALID_URL_RE' not in cls
.__dict
__: 
 380             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 381         return cls
._VALID
_URL
_RE
.match(url
) is not None 
 384     def _match_id(cls
, url
): 
 385         if '_VALID_URL_RE' not in cls
.__dict
__: 
 386             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 387         m 
= cls
._VALID
_URL
_RE
.match(url
) 
 389         return compat_str(m
.group('id')) 
 393         """Getter method for _WORKING.""" 
 396     def initialize(self
): 
 397         """Initializes an instance (authentication, etc).""" 
 398         self
._initialize
_geo
_bypass
({ 
 399             'countries': self
._GEO
_COUNTRIES
, 
 400             'ip_blocks': self
._GEO
_IP
_BLOCKS
, 
 403             self
._real
_initialize
() 
 406     def _initialize_geo_bypass(self
, geo_bypass_context
): 
 408         Initialize geo restriction bypass mechanism. 
 410         This method is used to initialize geo bypass mechanism based on faking 
 411         X-Forwarded-For HTTP header. A random country from provided country list 
 412         is selected and a random IP belonging to this country is generated. This 
 413         IP will be passed as X-Forwarded-For HTTP header in all subsequent 
 416         This method will be used for initial geo bypass mechanism initialization 
 417         during the instance initialization with _GEO_COUNTRIES and 
 420         You may also manually call it from extractor's code if geo bypass 
 421         information is not available beforehand (e.g. obtained during 
 422         extraction) or due to some other reason. In this case you should pass 
 423         this information in geo bypass context passed as first argument. It may 
 424         contain following fields: 
 426         countries:  List of geo unrestricted countries (similar 
 428         ip_blocks:  List of geo unrestricted IP blocks in CIDR notation 
 429                     (similar to _GEO_IP_BLOCKS) 
 432         if not self
._x
_forwarded
_for
_ip
: 
 434             # Geo bypass mechanism is explicitly disabled by user 
 435             if not self
._downloader
.params
.get('geo_bypass', True): 
 438             if not geo_bypass_context
: 
 439                 geo_bypass_context 
= {} 
 441             # Backward compatibility: previously _initialize_geo_bypass 
 442             # expected a list of countries, some 3rd party code may still use 
 444             if isinstance(geo_bypass_context
, (list, tuple)): 
 445                 geo_bypass_context 
= { 
 446                     'countries': geo_bypass_context
, 
 449             # The whole point of geo bypass mechanism is to fake IP 
 450             # as X-Forwarded-For HTTP header based on some IP block or 
 453             # Path 1: bypassing based on IP block in CIDR notation 
 455             # Explicit IP block specified by user, use it right away 
 456             # regardless of whether extractor is geo bypassable or not 
 457             ip_block 
= self
._downloader
.params
.get('geo_bypass_ip_block', None) 
 459             # Otherwise use random IP block from geo bypass context but only 
 460             # if extractor is known as geo bypassable 
 462                 ip_blocks 
= geo_bypass_context
.get('ip_blocks') 
 463                 if self
._GEO
_BYPASS 
and ip_blocks
: 
 464                     ip_block 
= random
.choice(ip_blocks
) 
 467                 self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(ip_block
) 
 468                 if self
._downloader
.params
.get('verbose', False): 
 469                     self
._downloader
.to_screen( 
 470                         '[debug] Using fake IP %s as X-Forwarded-For.' 
 471                         % self
._x
_forwarded
_for
_ip
) 
 474             # Path 2: bypassing based on country code 
 476             # Explicit country code specified by user, use it right away 
 477             # regardless of whether extractor is geo bypassable or not 
 478             country 
= self
._downloader
.params
.get('geo_bypass_country', None) 
 480             # Otherwise use random country code from geo bypass context but 
 481             # only if extractor is known as geo bypassable 
 483                 countries 
= geo_bypass_context
.get('countries') 
 484                 if self
._GEO
_BYPASS 
and countries
: 
 485                     country 
= random
.choice(countries
) 
 488                 self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(country
) 
 489                 if self
._downloader
.params
.get('verbose', False): 
 490                     self
._downloader
.to_screen( 
 491                         '[debug] Using fake IP %s (%s) as X-Forwarded-For.' 
 492                         % (self
._x
_forwarded
_for
_ip
, country
.upper())) 
 494     def extract(self
, url
): 
 495         """Extracts URL information and returns it in list of dicts.""" 
 500                     ie_result 
= self
._real
_extract
(url
) 
 501                     if self
._x
_forwarded
_for
_ip
: 
 502                         ie_result
['__x_forwarded_for_ip'] = self
._x
_forwarded
_for
_ip
 
 504                 except GeoRestrictedError 
as e
: 
 505                     if self
.__maybe
_fake
_ip
_and
_retry
(e
.countries
): 
 508         except ExtractorError
: 
 510         except compat_http_client
.IncompleteRead 
as e
: 
 511             raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True) 
 512         except (KeyError, StopIteration) as e
: 
 513             raise ExtractorError('An extractor error has occurred.', cause
=e
) 
 515     def __maybe_fake_ip_and_retry(self
, countries
): 
 516         if (not self
._downloader
.params
.get('geo_bypass_country', None) and 
 518                 self
._downloader
.params
.get('geo_bypass', True) and 
 519                 not self
._x
_forwarded
_for
_ip 
and 
 521             country_code 
= random
.choice(countries
) 
 522             self
._x
_forwarded
_for
_ip 
= GeoUtils
.random_ipv4(country_code
) 
 523             if self
._x
_forwarded
_for
_ip
: 
 525                     'Video is geo restricted. Retrying extraction with fake IP %s (%s) as X-Forwarded-For.' 
 526                     % (self
._x
_forwarded
_for
_ip
, country_code
.upper())) 
 530     def set_downloader(self
, downloader
): 
 531         """Sets the downloader for this IE.""" 
 532         self
._downloader 
= downloader
 
 534     def _real_initialize(self
): 
 535         """Real initialization process. Redefine in subclasses.""" 
 538     def _real_extract(self
, url
): 
 539         """Real extraction process. Redefine in subclasses.""" 
 544         """A string for getting the InfoExtractor with get_info_extractor""" 
 545         return compat_str(cls
.__name
__[:-2]) 
 549         return compat_str(type(self
).__name
__[:-2]) 
 551     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query
={}): 
 552         """ Returns the response handle """ 
 554             self
.report_download_webpage(video_id
) 
 555         elif note 
is not False: 
 557                 self
.to_screen('%s' % (note
,)) 
 559                 self
.to_screen('%s: %s' % (video_id
, note
)) 
 561         # Some sites check X-Forwarded-For HTTP header in order to figure out 
 562         # the origin of the client behind proxy. This allows bypassing geo 
 563         # restriction by faking this header's value to IP that belongs to some 
 564         # geo unrestricted country. We will do so once we encounter any 
 565         # geo restriction error. 
 566         if self
._x
_forwarded
_for
_ip
: 
 567             if 'X-Forwarded-For' not in headers
: 
 568                 headers
['X-Forwarded-For'] = self
._x
_forwarded
_for
_ip
 
 570         if isinstance(url_or_request
, compat_urllib_request
.Request
): 
 571             url_or_request 
= update_Request( 
 572                 url_or_request
, data
=data
, headers
=headers
, query
=query
) 
 575                 url_or_request 
= update_url_query(url_or_request
, query
) 
 576             if data 
is not None or headers
: 
 577                 url_or_request 
= sanitized_Request(url_or_request
, data
, headers
) 
 579             return self
._downloader
.urlopen(url_or_request
) 
 580         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 584                 errnote 
= 'Unable to download webpage' 
 586             errmsg 
= '%s: %s' % (errnote
, error_to_compat_str(err
)) 
 588                 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
) 
 590                 self
._downloader
.report_warning(errmsg
) 
 593     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query
={}): 
 594         """ Returns a tuple (page content as string, URL handle) """ 
 595         # Strip hashes from the URL (#1038) 
 596         if isinstance(url_or_request
, (compat_str
, str)): 
 597             url_or_request 
= url_or_request
.partition('#')[0] 
 599         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
) 
 603         content 
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
) 
 604         return (content
, urlh
) 
 607     def _guess_encoding_from_content(content_type
, webpage_bytes
): 
 608         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 610             encoding 
= m
.group(1) 
 612             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 613                           webpage_bytes[:1024]) 
 615                 encoding = m.group(1).decode('ascii') 
 616             elif webpage_bytes.startswith(b'\xff\xfe'): 
 623     def __check_blocked(self, content): 
 624         first_block = content[:512] 
 625         if ('<title>Access to this site is blocked</title>' in content and 
 626                 'Websense' in first_block): 
 627             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
 628             blocked_iframe = self._html_search_regex( 
 629                 r'<iframe src="([^
"]+)"', content, 
 630                 'Websense information URL
', default=None) 
 632                 msg += ' Visit 
%s for more details
' % blocked_iframe 
 633             raise ExtractorError(msg, expected=True) 
 634         if '<title
>The URL you requested has been blocked
</title
>' in first_block: 
 636                 'Access to this webpage has been blocked by Indian censorship
. ' 
 637                 'Use a VPN 
or proxy 
server (with --proxy
) to route around it
.') 
 638             block_msg = self._html_search_regex( 
 639                 r'</h1
><p
>(.*?
)</p
>', 
 640                 content, 'block message
', default=None) 
 642                 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ') 
 643             raise ExtractorError(msg, expected=True) 
 644         if ('<title
>TTK 
:: ŠŠ¾ŃŃŃŠæ Šŗ ŃŠµŃŃŃŃŃ Š¾Š³ŃŠ°Š½ŠøŃен
</title
>' in content and 
 645                 'blocklist
.rkn
.gov
.ru
' in content): 
 646             raise ExtractorError( 
 647                 'Access to this webpage has been blocked by decision of the Russian government
. ' 
 648                 'Visit http
://blocklist
.rkn
.gov
.ru
/ for a block reason
.', 
 651     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 
 652         content_type = urlh.headers.get('Content
-Type
', '') 
 653         webpage_bytes = urlh.read() 
 654         if prefix is not None: 
 655             webpage_bytes = prefix + webpage_bytes 
 657             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 
 658         if self._downloader.params.get('dump_intermediate_pages
', False): 
 659             self.to_screen('Dumping request to 
' + urlh.geturl()) 
 660             dump = base64.b64encode(webpage_bytes).decode('ascii
') 
 661             self._downloader.to_screen(dump) 
 662         if self._downloader.params.get('write_pages
', False): 
 663             basen = '%s_%s' % (video_id, urlh.geturl()) 
 665                 h = '___
' + hashlib.md5(basen.encode('utf
-8')).hexdigest() 
 666                 basen = basen[:240 - len(h)] + h 
 667             raw_filename = basen + '.dump
' 
 668             filename = sanitize_filename(raw_filename, restricted=True) 
 669             self.to_screen('Saving request to 
' + filename) 
 670             # Working around MAX_PATH limitation on Windows (see 
 671             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 
 672             if compat_os_name == 'nt
': 
 673                 absfilepath = os.path.abspath(filename) 
 674                 if len(absfilepath) > 259: 
 675                     filename = '\\\\?
\\' + absfilepath 
 676             with open(filename, 'wb
') as outf: 
 677                 outf.write(webpage_bytes) 
 680             content = webpage_bytes.decode(encoding, 'replace
') 
 682             content = webpage_bytes.decode('utf
-8', 'replace
') 
 684         self.__check_blocked(content) 
 688     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): 
 689         """ Returns the data of the page as a string """ 
 692         while success is False: 
 694                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) 
 696             except compat_http_client.IncompleteRead as e: 
 698                 if try_count >= tries: 
 700                 self._sleep(timeout, video_id) 
 707     def _download_xml_handle( 
 708             self, url_or_request, video_id, note='Downloading XML
', 
 709             errnote='Unable to download XML
', transform_source=None, 
 710             fatal=True, encoding=None, data=None, headers={}, query={}): 
 711         """Return a tuple (xml as an xml.etree.ElementTree.Element, URL handle)""" 
 712         res = self._download_webpage_handle( 
 713             url_or_request, video_id, note, errnote, fatal=fatal, 
 714             encoding=encoding, data=data, headers=headers, query=query) 
 717         xml_string, urlh = res 
 718         return self._parse_xml( 
 719             xml_string, video_id, transform_source=transform_source, 
 722     def _download_xml(self, url_or_request, video_id, 
 723                       note='Downloading XML
', errnote='Unable to download XML
', 
 724                       transform_source=None, fatal=True, encoding=None, 
 725                       data=None, headers={}, query={}): 
 726         """Return the xml as an xml.etree.ElementTree.Element""" 
 727         res = self._download_xml_handle( 
 728             url_or_request, video_id, note=note, errnote=errnote, 
 729             transform_source=transform_source, fatal=fatal, encoding=encoding, 
 730             data=data, headers=headers, query=query) 
 731         return res if res is False else res[0] 
 733     def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True): 
 735             xml_string = transform_source(xml_string) 
 737             return compat_etree_fromstring(xml_string.encode('utf
-8')) 
 738         except compat_xml_parse_error as ve: 
 739             errmsg = '%s: Failed to parse XML 
' % video_id 
 741                 raise ExtractorError(errmsg, cause=ve) 
 743                 self.report_warning(errmsg + str(ve)) 
 745     def _download_json_handle( 
 746             self, url_or_request, video_id, note='Downloading JSON metadata
', 
 747             errnote='Unable to download JSON metadata
', transform_source=None, 
 748             fatal=True, encoding=None, data=None, headers={}, query={}): 
 749         """Return a tuple (JSON object, URL handle)""" 
 750         res = self._download_webpage_handle( 
 751             url_or_request, video_id, note, errnote, fatal=fatal, 
 752             encoding=encoding, data=data, headers=headers, query=query) 
 755         json_string, urlh = res 
 756         return self._parse_json( 
 757             json_string, video_id, transform_source=transform_source, 
 761             self, url_or_request, video_id, note='Downloading JSON metadata
', 
 762             errnote='Unable to download JSON metadata
', transform_source=None, 
 763             fatal=True, encoding=None, data=None, headers={}, query={}): 
 764         res = self._download_json_handle( 
 765             url_or_request, video_id, note=note, errnote=errnote, 
 766             transform_source=transform_source, fatal=fatal, encoding=encoding, 
 767             data=data, headers=headers, query=query) 
 768         return res if res is False else res[0] 
 770     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): 
 772             json_string = transform_source(json_string) 
 774             return json.loads(json_string) 
 775         except ValueError as ve: 
 776             errmsg = '%s: Failed to parse JSON 
' % video_id 
 778                 raise ExtractorError(errmsg, cause=ve) 
 780                 self.report_warning(errmsg + str(ve)) 
 782     def report_warning(self, msg, video_id=None): 
 783         idstr = '' if video_id is None else '%s: ' % video_id 
 784         self._downloader.report_warning( 
 785             '[%s] %s%s' % (self.IE_NAME, idstr, msg)) 
 787     def to_screen(self, msg): 
 788         """Print msg to screen, prefixing it with '[ie_name
]'""" 
 789         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) 
 791     def report_extraction(self, id_or_name): 
 792         """Report information extraction.""" 
 793         self.to_screen('%s: Extracting information
' % id_or_name) 
 795     def report_download_webpage(self, video_id): 
 796         """Report webpage download.""" 
 797         self.to_screen('%s: Downloading webpage
' % video_id) 
 799     def report_age_confirmation(self): 
 800         """Report attempt to confirm age.""" 
 801         self.to_screen('Confirming age
') 
 803     def report_login(self): 
 804         """Report attempt to log in.""" 
 805         self.to_screen('Logging 
in') 
 808     def raise_login_required(msg='This video 
is only available 
for registered users
'): 
 809         raise ExtractorError( 
 810             '%s. Use 
--username 
and --password 
or --netrc to provide account credentials
.' % msg, 
 814     def raise_geo_restricted(msg='This video 
is not available 
from your location due to geo restriction
', countries=None): 
 815         raise GeoRestrictedError(msg, countries=countries) 
 817     # Methods for following #608 
 819     def url_result(url, ie=None, video_id=None, video_title=None): 
 820         """Returns a URL that points to a page that should be processed""" 
 821         # TODO: ie should be the class used for getting the info 
 822         video_info = {'_type
': 'url
', 
 825         if video_id is not None: 
 826             video_info['id'] = video_id 
 827         if video_title is not None: 
 828             video_info['title
'] = video_title 
 831     def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None): 
 833             self.url_result(self._proto_relative_url(getter(m) if getter else m), ie) 
 835         return self.playlist_result( 
 836             urls, playlist_id=playlist_id, playlist_title=playlist_title) 
 839     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 
 840         """Returns a playlist""" 
 841         video_info = {'_type
': 'playlist
', 
 844             video_info['id'] = playlist_id 
 846             video_info['title
'] = playlist_title 
 847         if playlist_description: 
 848             video_info['description
'] = playlist_description 
 851     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 853         Perform a regex search on the given string, using a single or a list of 
 854         patterns returning the first matching group. 
 855         In case of failure return a default value or raise a WARNING or a 
 856         RegexNotFoundError, depending on fatal, specifying the field name. 
 858         if isinstance(pattern, (str, compat_str, compiled_regex_type)): 
 859             mobj = re.search(pattern, string, flags) 
 862                 mobj = re.search(p, string, flags) 
 866         if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty(): 
 867             _name = '\033[0;34m
%s\033[0m
' % name 
 873                 # return the first matching group 
 874                 return next(g for g in mobj.groups() if g is not None) 
 876                 return mobj.group(group) 
 877         elif default is not NO_DEFAULT: 
 880             raise RegexNotFoundError('Unable to extract 
%s' % _name) 
 882             self._downloader.report_warning('unable to extract 
%s' % _name + bug_reports_message()) 
 885     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 887         Like _search_regex, but strips HTML tags and unescapes entities. 
 889         res = self._search_regex(pattern, string, name, default, fatal, flags, group) 
 891             return clean_html(res).strip() 
 895     def _get_netrc_login_info(self, netrc_machine=None): 
 898         netrc_machine = netrc_machine or self._NETRC_MACHINE 
 900         if self._downloader.params.get('usenetrc
', False): 
 902                 info = netrc.netrc().authenticators(netrc_machine) 
 907                     raise netrc.NetrcParseError( 
 908                         'No authenticators 
for %s' % netrc_machine) 
 909             except (IOError, netrc.NetrcParseError) as err: 
 910                 self._downloader.report_warning( 
 911                     'parsing 
.netrc
: %s' % error_to_compat_str(err)) 
 913         return username, password 
 915     def _get_login_info(self, username_option='username
', password_option='password
', netrc_machine=None): 
 917         Get the login info as (username, password) 
 918         First look for the manually specified credentials using username_option 
 919         and password_option as keys in params dictionary. If no such credentials 
 920         available look in the netrc file using the netrc_machine or _NETRC_MACHINE 
 922         If there's no info available
, return (None, None) 
 924         if self._downloader is None: 
 927         downloader_params = self._downloader.params 
 929         # Attempt to use provided username and password or .netrc data 
 930         if downloader_params.get(username_option) is not None: 
 931             username = downloader_params[username_option] 
 932             password = downloader_params[password_option] 
 934             username, password = self._get_netrc_login_info(netrc_machine) 
 936         return username, password 
 938     def _get_tfa_info(self, note='two-factor verification code'): 
 940         Get the two
-factor authentication info
 
 941         TODO 
- asking the user will be required 
for sms
/phone verify
 
 942         currently just uses the command line option
 
 943         If there
's no info available, return None 
 945         if self._downloader is None: 
 947         downloader_params = self._downloader.params 
 949         if downloader_params.get('twofactor
') is not None: 
 950             return downloader_params['twofactor
'] 
 952         return compat_getpass('Type 
%s and press 
[Return
]: ' % note) 
 954     # Helper functions for extracting OpenGraph info 
 956     def _og_regexes(prop): 
 957         content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))' 
 958         property_re = (r'(?
:name|
property)=(?
:\'og
:%(prop)s\'|
"og:%(prop)s"|\s
*og
:%(prop)s\b)' 
 959                        % {'prop
': re.escape(prop)}) 
 960         template = r'<meta
[^
>]+?
%s[^
>]+?
%s' 
 962             template % (property_re, content_re), 
 963             template % (content_re, property_re), 
 967     def _meta_regex(prop): 
 968         return r'''(?isx)<meta 
 969                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
 970                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) 
 972     def _og_search_property(self, prop, html, name=None, **kargs): 
 973         if not isinstance(prop, (list, tuple)): 
 976             name = 'OpenGraph 
%s' % prop[0] 
 979             og_regexes.extend(self._og_regexes(p)) 
 980         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) 
 983         return unescapeHTML(escaped) 
 985     def _og_search_thumbnail(self, html, **kargs): 
 986         return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs) 
 988     def _og_search_description(self, html, **kargs): 
 989         return self._og_search_property('description
', html, fatal=False, **kargs) 
 991     def _og_search_title(self, html, **kargs): 
 992         return self._og_search_property('title
', html, **kargs) 
 994     def _og_search_video_url(self, html, name='video url
', secure=True, **kargs): 
 995         regexes = self._og_regexes('video
') + self._og_regexes('video
:url
') 
 997             regexes = self._og_regexes('video
:secure_url
') + regexes 
 998         return self._html_search_regex(regexes, html, name, **kargs) 
1000     def _og_search_url(self, html, **kargs): 
1001         return self._og_search_property('url
', html, **kargs) 
1003     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): 
1004         if not isinstance(name, (list, tuple)): 
1006         if display_name is None: 
1007             display_name = name[0] 
1008         return self._html_search_regex( 
1009             [self._meta_regex(n) for n in name], 
1010             html, display_name, fatal=fatal, group='content
', **kwargs) 
1012     def _dc_search_uploader(self, html): 
1013         return self._html_search_meta('dc
.creator
', html, 'uploader
') 
1015     def _rta_search(self, html): 
1016         # See http://www.rtalabel.org/index.php?content=howtofaq#single 
1017         if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+' 
1018                      r'     content
="RTA-5042-1996-1400-1577-RTA"', 
1023     def _media_rating_search(self, html): 
1024         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ 
1025         rating = self._html_search_meta('rating
', html) 
1037         return RATING_TABLE.get(rating.lower()) 
1039     def _family_friendly_search(self, html): 
1040         # See http://schema.org/VideoObject 
1041         family_friendly = self._html_search_meta( 
1042             'isFamilyFriendly
', html, default=None) 
1044         if not family_friendly: 
1053         return RATING_TABLE.get(family_friendly.lower()) 
1055     def _twitter_search_player(self, html): 
1056         return self._html_search_meta('twitter
:player
', html, 
1057                                       'twitter card player
') 
1059     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): 
1060         json_ld = self._search_regex( 
1061             r'(?s
)<script
[^
>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', 
1062             html, 'JSON-LD', group='json_ld', **kwargs) 
1063         default = kwargs.get('default', NO_DEFAULT) 
1065             return default if default is not NO_DEFAULT else {} 
1066         # JSON-LD may be malformed and thus `fatal` should be respected. 
1067         # At the same time `default` may be passed that assumes `fatal=False` 
1068         # for _search_regex. Let's simulate the same behavior here as well. 
1069         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False 
1070         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) 
1072     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): 
1073         if isinstance(json_ld, compat_str): 
1074             json_ld = self._parse_json(json_ld, video_id, fatal=fatal) 
1078         if not isinstance(json_ld, (list, tuple, dict)): 
1080         if isinstance(json_ld, dict): 
1083         INTERACTION_TYPE_MAP = { 
1084             'CommentAction': 'comment', 
1085             'AgreeAction': 'like', 
1086             'DisagreeAction': 'dislike', 
1087             'LikeAction': 'like', 
1088             'DislikeAction': 'dislike', 
1089             'ListenAction': 'view', 
1090             'WatchAction': 'view', 
1091             'ViewAction': 'view', 
1094         def extract_interaction_statistic(e): 
1095             interaction_statistic = e.get('interactionStatistic') 
1096             if not isinstance(interaction_statistic, list): 
1098             for is_e in interaction_statistic: 
1099                 if not isinstance(is_e, dict): 
1101                 if is_e.get('@type') != 'InteractionCounter': 
1103                 interaction_type = is_e.get('interactionType') 
1104                 if not isinstance(interaction_type, compat_str): 
1106                 interaction_count = int_or_none(is_e.get('userInteractionCount')) 
1107                 if interaction_count is None: 
1109                 count_kind = INTERACTION_TYPE_MAP.get(interaction_type.split('/')[-1]) 
1112                 count_key = '%s_count' % count_kind 
1113                 if info.get(count_key) is not None: 
1115                 info[count_key] = interaction_count 
1117         def extract_video_object(e): 
1118             assert e['@type'] == 'VideoObject' 
1120                 'url': e.get('contentUrl'), 
1121                 'title': unescapeHTML(e.get('name')), 
1122                 'description': unescapeHTML(e.get('description')), 
1123                 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), 
1124                 'duration': parse_duration(e.get('duration')), 
1125                 'timestamp': unified_timestamp(e.get('uploadDate')), 
1126                 'filesize': float_or_none(e.get('contentSize')), 
1127                 'tbr': int_or_none(e.get('bitrate')), 
1128                 'width': int_or_none(e.get('width')), 
1129                 'height': int_or_none(e.get('height')), 
1130                 'view_count': int_or_none(e.get('interactionCount')), 
1132             extract_interaction_statistic(e) 
1135             if isinstance(e.get('@context'), compat_str) and re.match(r'^https?://schema.org/?$', e.get('@context')): 
1136                 item_type = e.get('@type') 
1137                 if expected_type is not None and expected_type != item_type: 
1139                 if item_type in ('TVEpisode', 'Episode'): 
1141                         'episode': unescapeHTML(e.get('name')), 
1142                         'episode_number': int_or_none(e.get('episodeNumber')), 
1143                         'description': unescapeHTML(e.get('description')), 
1145                     part_of_season = e.get('partOfSeason') 
1146                     if isinstance(part_of_season, dict) and part_of_season.get('@type') in ('TVSeason', 'Season', 'CreativeWorkSeason'): 
1147                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) 
1148                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') 
1149                     if isinstance(part_of_series, dict) and part_of_series.get('@type') in ('TVSeries', 'Series', 'CreativeWorkSeries'): 
1150                         info['series'] = unescapeHTML(part_of_series.get('name')) 
1151                 elif item_type in ('Article', 'NewsArticle'): 
1153                         'timestamp': parse_iso8601(e.get('datePublished')), 
1154                         'title': unescapeHTML(e.get('headline')), 
1155                         'description': unescapeHTML(e.get('articleBody')), 
1157                 elif item_type == 'VideoObject': 
1158                     extract_video_object(e) 
1160                 video = e.get('video') 
1161                 if isinstance(video, dict) and video.get('@type') == 'VideoObject': 
1162                     extract_video_object(video) 
1164         return dict((k, v) for k, v in info.items() if v is not None) 
1167     def _hidden_inputs(html): 
1168         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) 
1170         for input in re.findall(r'(?i)(<input[^>]+>)', html): 
1171             attrs = extract_attributes(input) 
1174             if attrs.get('type') not in ('hidden', 'submit'): 
1176             name = attrs.get('name') or attrs.get('id') 
1177             value = attrs.get('value') 
1178             if name and value is not None: 
1179                 hidden_inputs[name] = value 
1180         return hidden_inputs 
1182     def _form_hidden_inputs(self, form_id, html): 
1183         form = self._search_regex( 
1184             r'(?is)<form[^>]+?id=(["\'])%s\
1[^
>]*>(?P
<form
>.+?
)</form
>' % form_id, 
1185             html, '%s form
' % form_id, group='form
') 
1186         return self._hidden_inputs(form) 
1188     def _sort_formats(self, formats, field_preference=None): 
1190             raise ExtractorError('No video formats found
') 
1193             # Automatically determine tbr when missing based on abr and vbr (improves 
1194             # formats sorting in some cases) 
1195             if 'tbr
' not in f and f.get('abr
') is not None and f.get('vbr
') is not None: 
1196                 f['tbr
'] = f['abr
'] + f['vbr
'] 
1198         def _formats_key(f): 
1199             # TODO remove the following workaround 
1200             from ..utils import determine_ext 
1201             if not f.get('ext
') and 'url
' in f: 
1202                 f['ext
'] = determine_ext(f['url
']) 
1204             if isinstance(field_preference, (list, tuple)): 
1207                     if f.get(field) is not None 
1208                     else ('' if field == 'format_id
' else -1) 
1209                     for field in field_preference) 
1211             preference = f.get('preference
') 
1212             if preference is None: 
1214                 if f.get('ext
') in ['f4f
', 'f4m
']:  # Not yet supported 
1217             protocol = f.get('protocol
') or determine_protocol(f) 
1218             proto_preference = 0 if protocol in ['http
', 'https
'] else (-0.5 if protocol == 'rtsp
' else -0.1) 
1220             if f.get('vcodec
') == 'none
':  # audio only 
1222                 if self._downloader.params.get('prefer_free_formats
'): 
1223                     ORDER = ['aac
', 'mp3
', 'm4a
', 'webm
', 'ogg
', 'opus
'] 
1225                     ORDER = ['webm
', 'opus
', 'ogg
', 'mp3
', 'aac
', 'm4a
'] 
1228                     audio_ext_preference = ORDER.index(f['ext
']) 
1230                     audio_ext_preference = -1 
1232                 if f.get('acodec
') == 'none
':  # video only 
1234                 if self._downloader.params.get('prefer_free_formats
'): 
1235                     ORDER = ['flv
', 'mp4
', 'webm
'] 
1237                     ORDER = ['webm
', 'flv
', 'mp4
'] 
1239                     ext_preference = ORDER.index(f['ext
']) 
1242                 audio_ext_preference = 0 
1246                 f.get('language_preference
') if f.get('language_preference
') is not None else -1, 
1247                 f.get('quality
') if f.get('quality
') is not None else -1, 
1248                 f.get('tbr
') if f.get('tbr
') is not None else -1, 
1249                 f.get('filesize
') if f.get('filesize
') is not None else -1, 
1250                 f.get('vbr
') if f.get('vbr
') is not None else -1, 
1251                 f.get('height
') if f.get('height
') is not None else -1, 
1252                 f.get('width
') if f.get('width
') is not None else -1, 
1255                 f.get('abr
') if f.get('abr
') is not None else -1, 
1256                 audio_ext_preference, 
1257                 f.get('fps
') if f.get('fps
') is not None else -1, 
1258                 f.get('filesize_approx
') if f.get('filesize_approx
') is not None else -1, 
1259                 f.get('source_preference
') if f.get('source_preference
') is not None else -1, 
1260                 f.get('format_id
') if f.get('format_id
') is not None else '', 
1262         formats.sort(key=_formats_key) 
1264     def _check_formats(self, formats, video_id): 
1266             formats[:] = filter( 
1267                 lambda f: self._is_valid_url( 
1269                     item='%s video format
' % f.get('format_id
') if f.get('format_id
') else 'video
'), 
1273     def _remove_duplicate_formats(formats): 
1277             if f['url
'] not in format_urls: 
1278                 format_urls.add(f['url
']) 
1279                 unique_formats.append(f) 
1280         formats[:] = unique_formats 
1282     def _is_valid_url(self, url, video_id, item='video
', headers={}): 
1283         url = self._proto_relative_url(url, scheme='http
:') 
1284         # For now assume non HTTP(S) URLs always valid 
1285         if not (url.startswith('http
://') or url.startswith('https
://')): 
1288             self._request_webpage(url, video_id, 'Checking 
%s URL
' % item, headers=headers) 
1290         except ExtractorError as e: 
1291             if isinstance(e.cause, compat_urllib_error.URLError): 
1293                     '%s: %s URL 
is invalid
, skipping
' % (video_id, item)) 
1297     def http_scheme(self): 
1298         """ Either "http:" or "https:", depending on the user's preferences 
""" 
1301             if self._downloader.params.get('prefer_insecure', False) 
1304     def _proto_relative_url(self, url, scheme=None): 
1307         if url.startswith('//'): 
1309                 scheme = self.http_scheme() 
1314     def _sleep(self, timeout, video_id, msg_template=None): 
1315         if msg_template is None: 
1316             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' 
1317         msg = msg_template % {'video_id': video_id, 'timeout': timeout} 
1321     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, 
1322                              transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1323                              fatal=True, m3u8_id=None): 
1324         manifest = self._download_xml( 
1325             manifest_url, video_id, 'Downloading f4m manifest', 
1326             'Unable to download f4m manifest', 
1327             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests 
1328             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) 
1329             transform_source=transform_source, 
1332         if manifest is False: 
1335         return self._parse_f4m_formats( 
1336             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1337             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) 
1339     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, 
1340                            transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1341                            fatal=True, m3u8_id=None): 
1342         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy 
1343         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') 
1344         if akamai_pv is not None and ';' in akamai_pv.text: 
1345             playerVerificationChallenge = akamai_pv.text.split(';')[0] 
1346             if playerVerificationChallenge.strip() != '': 
1350         manifest_version = '1.0' 
1351         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') 
1353             manifest_version = '2.0' 
1354             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') 
1355         # Remove unsupported DRM protected media from final formats 
1356         # rendition (see https://github.com/rg3/youtube-dl/issues/8573). 
1357         media_nodes = remove_encrypted_media(media_nodes) 
1361         manifest_base_url = get_base_url(manifest) 
1363         bootstrap_info = xpath_element( 
1364             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 
1365             'bootstrap info', default=None) 
1368         mime_type = xpath_text( 
1369             manifest, ['{http://ns.adobe.com/f4m/1.0}mimeType', '{http://ns.adobe.com/f4m/2.0}mimeType'], 
1370             'base URL', default=None) 
1371         if mime_type and mime_type.startswith('audio/'): 
1374         for i, media_el in enumerate(media_nodes): 
1375             tbr = int_or_none(media_el.attrib.get('bitrate')) 
1376             width = int_or_none(media_el.attrib.get('width')) 
1377             height = int_or_none(media_el.attrib.get('height')) 
1378             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) 
1379             # If <bootstrapInfo> is present, the specified f4m is a 
1380             # stream-level manifest, and only set-level manifests may refer to 
1381             # external resources.  See section 11.4 and section 4 of F4M spec 
1382             if bootstrap_info is None: 
1384                 # @href is introduced in 2.0, see section 11.6 of F4M spec 
1385                 if manifest_version == '2.0': 
1386                     media_url = media_el.attrib.get('href') 
1387                 if media_url is None: 
1388                     media_url = media_el.attrib.get('url') 
1392                     media_url if media_url.startswith('http://') or media_url.startswith('https://') 
1393                     else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) 
1394                 # If media_url is itself a f4m manifest do the recursive extraction 
1395                 # since bitrates in parent manifest (this one) and media_url manifest 
1396                 # may differ leading to inability to resolve the format by requested 
1397                 # bitrate in f4m downloader 
1398                 ext = determine_ext(manifest_url) 
1400                     f4m_formats = self._extract_f4m_formats( 
1401                         manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1402                         transform_source=transform_source, fatal=fatal) 
1403                     # Sometimes stream-level manifest contains single media entry that 
1404                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). 
1405                     # At the same time parent's media entry in set-level manifest may 
1406                     # contain it. We will copy it from parent in such cases. 
1407                     if len(f4m_formats) == 1: 
1410                             'tbr': f.get('tbr') or tbr, 
1411                             'width': f.get('width') or width, 
1412                             'height': f.get('height') or height, 
1413                             'format_id': f.get('format_id') if not tbr else format_id, 
1416                     formats.extend(f4m_formats) 
1419                     formats.extend(self._extract_m3u8_formats( 
1420                         manifest_url, video_id, 'mp4', preference=preference, 
1421                         m3u8_id=m3u8_id, fatal=fatal)) 
1424                 'format_id': format_id, 
1425                 'url': manifest_url, 
1426                 'manifest_url': manifest_url, 
1427                 'ext': 'flv' if bootstrap_info is not None else None, 
1433                 'preference': preference, 
1437     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): 
1439             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 
1443             'preference': preference - 100 if preference else -100, 
1444             'resolution': 'multiple', 
1445             'format_note': 'Quality selection URL', 
1448     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, 
1449                               entry_protocol='m3u8', preference=None, 
1450                               m3u8_id=None, note=None, errnote=None, 
1451                               fatal=True, live=False): 
1452         res = self._download_webpage_handle( 
1454             note=note or 'Downloading m3u8 information', 
1455             errnote=errnote or 'Failed to download m3u8 information', 
1461         m3u8_doc, urlh = res 
1462         m3u8_url = urlh.geturl() 
1464         return self._parse_m3u8_formats( 
1465             m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, 
1466             preference=preference, m3u8_id=m3u8_id, live=live) 
    def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
                            entry_protocol='m3u8', preference=None,
                            m3u8_id=None, live=False):
        """Parse an HLS playlist document into a list of format dicts.

        m3u8_doc is the playlist text; m3u8_url is the (final) URL it was
        fetched from and is used to resolve relative variant/rendition URIs.
        NOTE(review): several interior lines of this method are missing from
        this excerpt; the comments below describe only the visible code.
        """
        # DRM-protected playlists cannot be downloaded.
        if '#EXT-X-FAXS-CM:' in m3u8_doc:  # Adobe Flash Access
        if re.search(r'#EXT-X-SESSION-KEY:.*?URI="skd://', m3u8_doc):  # Apple FairPlay
        # Resolve a possibly-relative URI against the playlist URL.
        format_url = lambda u: (
            if re.match(r'^https?://', u)
            else compat_urlparse.urljoin(m3u8_url, u))
        # References:
        # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
        # 2. https://github.com/rg3/youtube-dl/issues/12211
        # We should try extracting formats only from master playlists [1, 4.3.4],
        # i.e. playlists that describe available qualities. On the other hand
        # media playlists [1, 4.3.3] should be returned as is since they contain
        # just the media without qualities renditions.
        # Fortunately, master playlist can be easily distinguished from media
        # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
        # master playlist tags MUST NOT appear in a media playist and vice versa.
        # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
        # media playlist and MUST NOT appear in master playlist thus we can
        # clearly detect media playlist with this criterion.
        if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is
                'format_id': m3u8_id,
                'protocol': entry_protocol,
                'preference': preference,
        # Attributes of the most recently seen #EXT-X-STREAM-INF tag; the
        # variant URI is on the following non-comment line.
        last_stream_inf = {}

        def extract_media(x_media_line):
            # Parse one #EXT-X-MEDIA tag (alternative rendition).
            media = parse_m3u8_attributes(x_media_line)
            # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
            media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
            if not (media_type and group_id and name):
            groups.setdefault(group_id, []).append(media)
            if media_type not in ('VIDEO', 'AUDIO'):
            media_url = media.get('URI')
                for v in (m3u8_id, group_id, name):
                    'format_id': '-'.join(format_id),
                    'url': format_url(media_url),
                    'manifest_url': m3u8_url,
                    'language': media.get('LANGUAGE'),
                    'protocol': entry_protocol,
                    'preference': preference,
                # Audio-only renditions carry no video stream.
                if media_type == 'AUDIO':
                    f['vcodec'] = 'none'

        def build_stream_name():
            # Despite specification does not mention NAME attribute for
            # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
            # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
            # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
            stream_name = last_stream_inf.get('NAME')
            # If there is no NAME in EXT-X-STREAM-INF it will be obtained
            # from corresponding rendition group
            stream_group_id = last_stream_inf.get('VIDEO')
            if not stream_group_id:
            stream_group = groups.get(stream_group_id)
            if not stream_group:
                return stream_group_id
            # Fall back to the first rendition's NAME, then the group id.
            rendition = stream_group[0]
            return rendition.get('NAME') or stream_group_id

        # Walk the playlist line by line, pairing each #EXT-X-STREAM-INF
        # tag with the variant URI that follows it.
        for line in m3u8_doc.splitlines():
            if line.startswith('#EXT-X-STREAM-INF:'):
                last_stream_inf = parse_m3u8_attributes(line)
            elif line.startswith('#EXT-X-MEDIA:'):
            elif line.startswith('#') or not line.strip():
                # AVERAGE-BANDWIDTH is preferred over BANDWIDTH when present.
                tbr = float_or_none(
                    last_stream_inf.get('AVERAGE-BANDWIDTH') or
                    last_stream_inf.get('BANDWIDTH'), scale=1000)
                    format_id.append(m3u8_id)
                stream_name = build_stream_name()
                # Bandwidth of live streams may differ over time thus making
                # format_id unpredictable. So it's better to keep provided
                    format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats)))
                manifest_url = format_url(line.strip())
                    'format_id': '-'.join(format_id),
                    'url': manifest_url,
                    'manifest_url': m3u8_url,
                    'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
                    'protocol': entry_protocol,
                    'preference': preference,
                # RESOLUTION is "<width>x<height>" per [1, 4.3.4.2].
                resolution = last_stream_inf.get('RESOLUTION')
                    mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
                        f['width'] = int(mobj.group('width'))
                        f['height'] = int(mobj.group('height'))
                # Unified Streaming Platform
                    r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url'])
                    abr, vbr = mobj.groups()
                    # URL-embedded bitrates are in bits/s; scale to kbit/s.
                    abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000)
                codecs = parse_codecs(last_stream_inf.get('CODECS'))
                audio_group_id = last_stream_inf.get('AUDIO')
                # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
                # references a rendition group MUST have a CODECS attribute.
                # However, this is not always respected, for example, [2]
                # contains EXT-X-STREAM-INF tag which references AUDIO
                # rendition group but does not have CODECS and despite
                # referencing audio group an audio group, it represents
                # a complete (with audio and video) format. So, for such cases
                # we will ignore references to rendition groups and treat them
                # as complete formats.
                if audio_group_id and codecs and f.get('vcodec') != 'none':
                    audio_group = groups.get(audio_group_id)
                    if audio_group and audio_group[0].get('URI'):
                        # TODO: update acodec for audio only formats with
                        f['acodec'] = 'none'
                # Reset so stray URI lines are not paired with a stale tag.
                last_stream_inf = {}
    def _xpath_ns(path, namespace=None):
        # Qualify each step of an XPath with the given XML namespace,
        # producing ElementTree's '{namespace}tag' notation.
        # NOTE(review): the @staticmethod decorator and accumulator setup
        # ('out = [...]') are not visible in this excerpt.
        for c in path.split('/'):
            # '.' and empty steps (leading './') need no namespace prefix.
            if not c or c == '.':
                out.append('{%s}%s' % (namespace, c))
        return '/'.join(out)
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None):
        # Download a SMIL document and parse only its formats.
        # NOTE(review): the guard handling a failed (False) download is not
        # visible in this excerpt.
        smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source)
        namespace = self._parse_smil_namespace(smil)
        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and parse it into a full info dict
        # (formats plus metadata), as opposed to _extract_smil_formats().
        # NOTE(review): the failed-download guard is not visible in this excerpt.
        smil = self._download_smil(smil_url, video_id, fatal=fatal)
        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1657     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): 
1658         return self._download_xml( 
1659             smil_url, video_id, 'Downloading SMIL file', 
1660             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) 
    def _parse_smil(self, smil, smil_url, video_id, f4m_params=None):
        # Build a full info dict (formats, subtitles, metadata, thumbnails)
        # from an already-parsed SMIL XML tree.
        # NOTE(review): several interior lines (e.g. title/description
        # initialization and the returned dict's opening) are missing from
        # this excerpt.
        namespace = self._parse_smil_namespace(smil)
        formats = self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
        subtitles = self._parse_smil_subtitles(smil, namespace=namespace)
        # Fall back to the SMIL file's basename (sans extension) as the id.
        video_id = os.path.splitext(url_basename(smil_url))[0]
        # Harvest metadata from <head><meta name=... content=...> entries.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            name = meta.attrib.get('name')
            content = meta.attrib.get('content')
            if not name or not content:
            if not title and name == 'title':
            elif not description and name in ('description', 'abstract'):
                description = content
            elif not upload_date and name == 'date':
                upload_date = unified_strdate(content)
        # Thumbnails come from <image> elements that carry a src.
            'id': image.get('type'),
            'url': image.get('src'),
            'width': int_or_none(image.get('width')),
            'height': int_or_none(image.get('height')),
        } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')]
            'title': title or video_id,
            'description': description,
            'upload_date': upload_date,
            'thumbnails': thumbnails,
            'subtitles': subtitles,
1702     def _parse_smil_namespace(self, smil): 
1703         return self._search_regex( 
1704             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) 
    def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None):
        # Extract format dicts from the <video>/<audio> elements of a SMIL
        # tree, dispatching per protocol (RTMP, HLS, HDS, plain HTTP).
        # NOTE(review): many interior lines (counters, dict openings, the
        # final return) are missing from this excerpt.
        # Base URL for relative sources, from <head><meta base=.../httpBase=...>.
        for meta in smil.findall(self._xpath_ns('./head/meta', namespace)):
            b = meta.get('base') or meta.get('httpBase')
        media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace))
        for medium in media:
            src = medium.get('src')
            # Skip missing and duplicate sources.
            if not src or src in srcs:
            # system-bitrate/systemBitrate is in bits/s; scale to kbit/s.
            bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000)
            filesize = int_or_none(medium.get('size') or medium.get('fileSize'))
            width = int_or_none(medium.get('width'))
            height = int_or_none(medium.get('height'))
            proto = medium.get('proto')
            ext = medium.get('ext')
            src_ext = determine_ext(src)
            streamer = medium.get('streamer') or base
            # RTMP: streamer is the RTMP endpoint, src the play path.
            if proto == 'rtmp' or streamer.startswith('rtmp'):
                    'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate),
                    'filesize': filesize,
                # Allow the caller to rewrite streamer/play path in place.
                if transform_rtmp_url:
                    streamer, src = transform_rtmp_url(streamer, src)
                    formats[-1].update({
            src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src)
            src_url = src_url.strip()
            # HLS: expand the variant playlist into individual formats.
            if proto == 'm3u8' or src_ext == 'm3u8':
                m3u8_formats = self._extract_m3u8_formats(
                    src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False)
                if len(m3u8_formats) == 1:
                    m3u8_formats[0].update({
                        'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate),
                formats.extend(m3u8_formats)
            # HDS: append f4m_params to the manifest URL and expand.
            if src_ext == 'f4m':
                        'plugin': 'flowplayer-3.2.0.1',
                f4m_url += '&' if '?' in f4m_url else '?'
                f4m_url += compat_urllib_parse_urlencode(f4m_params)
                formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False))
            # Plain progressive HTTP download.
            if src_url.startswith('http') and self._is_valid_url(src, video_id):
                    'ext': ext or src_ext or 'flv',
                    'format_id': 'http-%d' % (bitrate or http_count),
                    'filesize': filesize,
    def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'):
        # Collect subtitle tracks from <textstream> elements, keyed by
        # language (falling back to subtitles_lang).
        # NOTE(review): the accumulator setup and return are not visible in
        # this excerpt.
        for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))):
            src = textstream.get('src')
            # Skip missing and duplicate sources.
            if not src or src in urls:
            ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src)
            lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang
            subtitles.setdefault(lang, []).append({
    def _extract_xspf_playlist(self, xspf_url, playlist_id, fatal=True):
        # Download an XSPF playlist and delegate parsing to _parse_xspf().
        # NOTE(review): the failed-download guard is not visible in this
        # excerpt; also note the 'xpsf' typo in the note string is preserved
        # as in the original.
        xspf = self._download_xml(
            xspf_url, playlist_id, 'Downloading xpsf playlist',
            'Unable to download xspf manifest', fatal=fatal)
        return self._parse_xspf(
            xspf, playlist_id, xspf_url=xspf_url,
            xspf_base_url=base_url(xspf_url))
    def _parse_xspf(self, xspf_doc, playlist_id, xspf_url=None, xspf_base_url=None):
        # Turn an XSPF playlist XML tree into a list of entry info dicts.
        # NOTE(review): several interior lines (NS_MAP opening, entries list,
        # format dict openings, return) are missing from this excerpt.
        # Namespaces: standard XSPF plus StreamOne's extension attributes.
            'xspf': 'http://xspf.org/ns/0/',
            's1': 'http://static.streamone.nl/player/ns/0',
        for track in xspf_doc.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)):
                track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id)
            description = xpath_text(
                track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description')
            thumbnail = xpath_text(
                track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail')
            # <duration> is in milliseconds; scale to seconds.
            duration = float_or_none(
                xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000)
            # Each <location> is one downloadable format of the track.
            for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP)):
                format_url = urljoin(xspf_base_url, location.text)
                    'manifest_url': xspf_url,
                    'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)),
                    'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))),
                    'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))),
            self._sort_formats(formats)
                'description': description,
                'thumbnail': thumbnail,
                'duration': duration,
    def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}):
        # Download a DASH MPD manifest and delegate to _parse_mpd_formats().
        # NOTE(review): formats_dict={} is a mutable default argument; it
        # appears to be only read downstream, but replacing it with
        # formats_dict=None would be safer — confirm against callers.
        # NOTE(review): the failed-download guard and the res unpacking are
        # not visible in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading MPD manifest',
            errnote=errnote or 'Failed to download MPD manifest',
        # Resolve relative BaseURLs against the post-redirect manifest URL.
        mpd_base_url = base_url(urlh.geturl())
        return self._parse_mpd_formats(
            mpd_doc, mpd_id=mpd_id, mpd_base_url=mpd_base_url,
            formats_dict=formats_dict, mpd_url=mpd_url)
    def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}, mpd_url=None):
        """
        Parse formats from MPD manifest.
        References:
         1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E),
            http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip
         2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP
        """
        # NOTE(review): formats_dict={} is a mutable default argument (read
        # via .get below, so likely benign) — consider None + fallback.
        # NOTE(review): many interior lines of this method are missing from
        # this excerpt; comments describe only the visible code.
        # Live (type="dynamic") manifests are not supported here.
        if mpd_doc.get('type') == 'dynamic':
        namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None)
            return self._xpath_ns(path, namespace)

        def is_drm_protected(element):
            # A ContentProtection child marks the element as DRM'd.
            return element.find(_add_ns('ContentProtection')) is not None

        def extract_multisegment_info(element, ms_parent_info):
            # Merge this element's segment info over its parent's
            # (Period -> AdaptationSet -> Representation inheritance).
            ms_info = ms_parent_info.copy()

            # As per [1, 5.3.9.2.2] SegmentList and SegmentTemplate share some
            # common attributes and elements.  We will only extract relevant
            def extract_common(source):
                segment_timeline = source.find(_add_ns('SegmentTimeline'))
                if segment_timeline is not None:
                    s_e = segment_timeline.findall(_add_ns('S'))
                        ms_info['total_number'] = 0
                            # @r is the repeat count; each S yields 1 + r segments.
                            r = int(s.get('r', 0))
                            ms_info['total_number'] += 1 + r
                            ms_info['s'].append({
                                't': int(s.get('t', 0)),
                                # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60])
                                'd': int(s.attrib['d']),
                start_number = source.get('startNumber')
                    ms_info['start_number'] = int(start_number)
                timescale = source.get('timescale')
                    ms_info['timescale'] = int(timescale)
                segment_duration = source.get('duration')
                if segment_duration:
                    ms_info['segment_duration'] = float(segment_duration)

            def extract_Initialization(source):
                initialization = source.find(_add_ns('Initialization'))
                if initialization is not None:
                    ms_info['initialization_url'] = initialization.attrib['sourceURL']

            segment_list = element.find(_add_ns('SegmentList'))
            if segment_list is not None:
                extract_common(segment_list)
                extract_Initialization(segment_list)
                segment_urls_e = segment_list.findall(_add_ns('SegmentURL'))
                    ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e]
                segment_template = element.find(_add_ns('SegmentTemplate'))
                if segment_template is not None:
                    extract_common(segment_template)
                    media = segment_template.get('media')
                        ms_info['media'] = media
                    initialization = segment_template.get('initialization')
                        ms_info['initialization'] = initialization
                        extract_Initialization(segment_template)

        mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration'))
        for period in mpd_doc.findall(_add_ns('Period')):
            period_duration = parse_duration(period.get('duration')) or mpd_duration
            period_ms_info = extract_multisegment_info(period, {
            for adaptation_set in period.findall(_add_ns('AdaptationSet')):
                # Skip DRM-protected content entirely.
                if is_drm_protected(adaptation_set):
                adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info)
                for representation in adaptation_set.findall(_add_ns('Representation')):
                    if is_drm_protected(representation):
                    # Representation attributes override AdaptationSet ones.
                    representation_attrib = adaptation_set.attrib.copy()
                    representation_attrib.update(representation.attrib)
                    # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory
                    mime_type = representation_attrib['mimeType']
                    content_type = mime_type.split('/')[0]
                    if content_type == 'text':
                        # TODO implement WebVTT downloading
                    elif content_type in ('video', 'audio'):
                        # Accumulate BaseURL fragments from the innermost
                        # element outward, stopping at an absolute URL.
                        for element in (representation, adaptation_set, period, mpd_doc):
                            base_url_e = element.find(_add_ns('BaseURL'))
                            if base_url_e is not None:
                                base_url = base_url_e.text + base_url
                                if re.match(r'^https?://', base_url):
                        if mpd_base_url and not re.match(r'^https?://', base_url):
                            if not mpd_base_url.endswith('/') and not base_url.startswith('/'):
                            base_url = mpd_base_url + base_url
                        representation_id = representation_attrib.get('id')
                        lang = representation_attrib.get('lang')
                        url_el = representation.find(_add_ns('BaseURL'))
                        # YouTube-specific extension attribute for file size.
                        filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None)
                        bandwidth = int_or_none(representation_attrib.get('bandwidth'))
                            'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id,
                            'manifest_url': mpd_url,
                            'ext': mimetype2ext(mime_type),
                            'width': int_or_none(representation_attrib.get('width')),
                            'height': int_or_none(representation_attrib.get('height')),
                            'tbr': float_or_none(bandwidth, 1000),
                            'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
                            'fps': int_or_none(representation_attrib.get('frameRate')),
                            # Suppress placeholder language codes.
                            'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
                            'format_note': 'DASH %s' % content_type,
                            'filesize': filesize,
                            'container': mimetype2ext(mime_type) + '_dash',
                        f.update(parse_codecs(representation_attrib.get('codecs')))
                        representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info)

                        def prepare_template(template_name, identifiers):
                            # Convert a DASH URL template ($Number$, $Time$,
                            # $Bandwidth$, optional %-format) into a Python
                            # %-style format string.
                            t = representation_ms_info[template_name]
                            t = t.replace('$RepresentationID$', representation_id)
                            t = re.sub(r'\$(%s)\$' % '|'.join(identifiers), r'%(\1)d', t)
                            t = re.sub(r'\$(%s)%%([^$]+)\$' % '|'.join(identifiers), r'%(\1)\2', t)
                            # NOTE(review): str.replace's return value is
                            # discarded here, so '$$' escapes are never
                            # collapsed — likely should be
                            # t = t.replace('$$', '$'). Cannot fix safely in
                            # this excerpt; flagging for follow-up.
                            t.replace('$$', '$')

                        # @initialization is a regular template like @media one
                        # so it should be handled just the same way (see
                        # https://github.com/rg3/youtube-dl/issues/11605)
                        if 'initialization' in representation_ms_info:
                            initialization_template = prepare_template(
                                # As per [1, 5.3.9.4.2, Table 15, page 54] $Number$ and
                                # $Time$ shall not be included for @initialization thus
                                # only $Bandwidth$ remains
                            representation_ms_info['initialization_url'] = initialization_template % {
                                'Bandwidth': bandwidth,

                        def location_key(location):
                            # Absolute URLs use 'url', relative ones 'path'.
                            return 'url' if re.match(r'^https?://', location) else 'path'

                        if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
                            media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
                            media_location_key = location_key(media_template)
                            # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
                            # can't be used at the same time
                            if '%(Number' in media_template and 's' not in representation_ms_info:
                                segment_duration = None
                                if 'total_number' not in representation_ms_info and 'segment_duration' in representation_ms_info:
                                    segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
                                    representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
                                representation_ms_info['fragments'] = [{
                                    media_location_key: media_template % {
                                        'Number': segment_number,
                                        'Bandwidth': bandwidth,
                                    'duration': segment_duration,
                                } for segment_number in range(
                                    representation_ms_info['start_number'],
                                    representation_ms_info['total_number'] + representation_ms_info['start_number'])]
                                # $Number*$ or $Time$ in media template with S list available
                                # Example $Number*$: http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg
                                # Example $Time$: https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411
                                representation_ms_info['fragments'] = []
                                segment_number = representation_ms_info['start_number']

                                def add_segment_url():
                                    segment_url = media_template % {
                                        'Time': segment_time,
                                        'Bandwidth': bandwidth,
                                        'Number': segment_number,
                                    representation_ms_info['fragments'].append({
                                        media_location_key: segment_url,
                                        'duration': float_or_none(segment_d, representation_ms_info['timescale']),

                                for num, s in enumerate(representation_ms_info['s']):
                                    segment_time = s.get('t') or segment_time
                                    # Expand @r repeats into individual segments.
                                    for r in range(s.get('r', 0)):
                                        segment_time += segment_d
                                    segment_time += segment_d
                        elif 'segment_urls' in representation_ms_info and 's' in representation_ms_info:
                            # Example: https://www.youtube.com/watch?v=iXZV5uAYMJI
                            # or any YouTube dashsegments video
                            timescale = representation_ms_info['timescale']
                            for s in representation_ms_info['s']:
                                duration = float_or_none(s['d'], timescale)
                                for r in range(s.get('r', 0) + 1):
                                    segment_uri = representation_ms_info['segment_urls'][segment_index]
                                        location_key(segment_uri): segment_uri,
                                        'duration': duration,
                            representation_ms_info['fragments'] = fragments
                        elif 'segment_urls' in representation_ms_info:
                            # Segment URLs with no SegmentTimeline
                            # Example: https://www.seznam.cz/zpravy/clanek/cesko-zasahne-vitr-o-sile-vichrice-muze-byt-i-zivotu-nebezpecny-39091
                            # https://github.com/rg3/youtube-dl/pull/14844
                            segment_duration = float_or_none(
                                representation_ms_info['segment_duration'],
                                representation_ms_info['timescale']) if 'segment_duration' in representation_ms_info else None
                            for segment_url in representation_ms_info['segment_urls']:
                                    location_key(segment_url): segment_url,
                                if segment_duration:
                                    fragment['duration'] = segment_duration
                                fragments.append(fragment)
                            representation_ms_info['fragments'] = fragments
                        # NB: MPD manifest may contain direct URLs to unfragmented media.
                        # No fragments key is present in this case.
                        if 'fragments' in representation_ms_info:
                                'fragment_base_url': base_url,
                                'protocol': 'http_dash_segments',
                            if 'initialization_url' in representation_ms_info:
                                initialization_url = representation_ms_info['initialization_url']
                                if not f.get('url'):
                                    f['url'] = initialization_url
                                f['fragments'].append({location_key(initialization_url): initialization_url})
                            f['fragments'].extend(representation_ms_info['fragments'])
                        # According to [1, 5.3.5.2, Table 7, page 35] @id of Representation
                        # is not necessarily unique within a Period thus formats with
                        # the same `format_id` are quite possible. There are numerous examples
                        # of such manifests (see https://github.com/rg3/youtube-dl/issues/15111,
                        # https://github.com/rg3/youtube-dl/issues/13919)
                        full_info = formats_dict.get(representation_id, {}).copy()
                        formats.append(full_info)
                        self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type)
    def _extract_ism_formats(self, ism_url, video_id, ism_id=None, note=None, errnote=None, fatal=True):
        # Download a Smooth Streaming (ISM) manifest and delegate parsing to
        # _parse_ism_formats().
        # NOTE(review): the failed-download guard and res unpacking are not
        # visible in this excerpt.
        res = self._download_xml_handle(
            note=note or 'Downloading ISM manifest',
            errnote=errnote or 'Failed to download ISM manifest',
        return self._parse_ism_formats(ism_doc, urlh.geturl(), ism_id)
    def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
        """
        Parse formats from ISM manifest.
        References:
         1. [MS-SSTR]: Smooth Streaming Protocol,
            https://msdn.microsoft.com/en-us/library/ff469518.aspx
        """
        # NOTE(review): several lines appear to be missing from this excerpt
        # (e.g. the accumulator initializations and return statement).
        # Live and DRM-protected ISM streams are not supported.
        if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
        duration = int(ism_doc.attrib['Duration'])
        # Default timescale per [MS-SSTR] is 10,000,000 ticks per second.
        timescale = int_or_none(ism_doc.get('TimeScale')) or 10000000
        for stream in ism_doc.findall('StreamIndex'):
            stream_type = stream.get('Type')
            if stream_type not in ('video', 'audio'):
            url_pattern = stream.attrib['Url']
            stream_timescale = int_or_none(stream.get('TimeScale')) or timescale
            stream_name = stream.get('Name')
            for track in stream.findall('QualityLevel'):
                # AudioTag 255 denotes AAC when FourCC is absent.
                fourcc = track.get('FourCC', 'AACL' if track.get('AudioTag') == '255' else None)
                # TODO: add support for WVC1 and WMAP
                if fourcc not in ('H264', 'AVC1', 'AACL'):
                    self.report_warning('%s is not a supported codec' % fourcc)
                tbr = int(track.attrib['Bitrate']) // 1000
                # [1] does not mention Width and Height attributes. However,
                # they're often present while MaxWidth and MaxHeight are
                # missing, so should be used as fallbacks
                width = int_or_none(track.get('MaxWidth') or track.get('Width'))
                height = int_or_none(track.get('MaxHeight') or track.get('Height'))
                sampling_rate = int_or_none(track.get('SamplingRate'))
                # Substitute the bitrate into the URL template, then resolve
                # the result against the manifest URL.
                track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
                track_url_pattern = compat_urlparse.urljoin(ism_url, track_url_pattern)
                stream_fragments = stream.findall('c')
                for stream_fragment_index, stream_fragment in enumerate(stream_fragments):
                    # 't' = start time, 'd' = duration (stream timescale ticks);
                    # 'r' repeats a run of equal-duration fragments.
                    fragment_ctx['time'] = int_or_none(stream_fragment.get('t')) or fragment_ctx['time']
                    fragment_repeat = int_or_none(stream_fragment.get('r')) or 1
                    fragment_ctx['duration'] = int_or_none(stream_fragment.get('d'))
                    if not fragment_ctx['duration']:
                        # Derive the duration from the next fragment's start
                        # (or the total duration for the final fragment).
                            next_fragment_time = int(stream_fragment[stream_fragment_index + 1].attrib['t'])
                            next_fragment_time = duration
                        fragment_ctx['duration'] = (next_fragment_time - fragment_ctx['time']) / fragment_repeat
                    for _ in range(fragment_repeat):
                            'url': re.sub(r'{start[ _]time}', compat_str(fragment_ctx['time']), track_url_pattern),
                            'duration': fragment_ctx['duration'] / stream_timescale,
                        fragment_ctx['time'] += fragment_ctx['duration']
                    format_id.append(ism_id)
                    format_id.append(stream_name)
                format_id.append(compat_str(tbr))
                    'format_id': '-'.join(format_id),
                    'manifest_url': ism_url,
                    'ext': 'ismv' if stream_type == 'video' else 'isma',
                    'asr': sampling_rate,
                    'vcodec': 'none' if stream_type == 'audio' else fourcc,
                    'acodec': 'none' if stream_type == 'video' else fourcc,
                    'fragments': fragments,
                    # Parameters consumed by the ISM downloader.
                    '_download_params': {
                        'duration': duration,
                        'timescale': stream_timescale,
                        'width': width or 0,
                        'height': height or 0,
                        'codec_private_data': track.get('CodecPrivateData'),
                        'sampling_rate': sampling_rate,
                        'channels': int_or_none(track.get('Channels', 2)),
                        'bits_per_sample': int_or_none(track.get('BitsPerSample', 16)),
                        'nal_unit_length_field': int_or_none(track.get('NALUnitLengthField', 4)),
    def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None):
        # Extract <video>/<audio> (and AMP equivalents) media entries from an
        # HTML page: formats from src/<source> tags and subtitles from <track>.
        # NOTE(review): this excerpt appears to be missing lines (entries/
        # media_info initialization, several guards); confirm vs. full source.
        def absolute_url(item_url):
            # Resolve a possibly-relative URL against the page URL.
            return urljoin(base_url, item_url)
        def parse_content_type(content_type):
            # Split a MIME "type/subtype;codecs=..." value into ext + codecs.
            if not content_type:
            ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type)
                mimetype, codecs = ctr.groups()
                f = parse_codecs(codecs)
                f['ext'] = mimetype2ext(mimetype)
        # NOTE(review): mutable default argument `type_info={}`; safe only as
        # long as it is never mutated - verify before refactoring.
        def _media_formats(src, cur_media_type, type_info={}):
            full_url = absolute_url(src)
            ext = type_info.get('ext') or determine_ext(full_url)
                # Manifest URLs expand into multiple formats.
                is_plain_url = False
                formats = self._extract_m3u8_formats(
                    full_url, video_id, ext='mp4',
                    entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
                    preference=preference, fatal=False)
                is_plain_url = False
                formats = self._extract_mpd_formats(
                    full_url, video_id, mpd_id=mpd_id, fatal=False)
                    'vcodec': 'none' if cur_media_type == 'audio' else None,
            return is_plain_url, formats
        # amp-video and amp-audio are very similar to their HTML5 counterparts
        # so we will include them right here (see
        # https://www.ampproject.org/docs/reference/components/amp-video)
        media_tags = [(media_tag, media_type, '')
                      for media_tag, media_type
                      in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
        media_tags.extend(re.findall(
            # We only allow video|audio followed by a whitespace or '>'.
            # Allowing more characters may end up in significant slow down (see
            # https://github.com/rg3/youtube-dl/issues/11979, example URL:
            # http://www.porntrex.com/maps/videositemap.xml).
            r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
        for media_tag, media_type, media_content in media_tags:
            media_attributes = extract_attributes(media_tag)
            src = media_attributes.get('src')
                _, formats = _media_formats(src, media_type)
                media_info['formats'].extend(formats)
            media_info['thumbnail'] = absolute_url(media_attributes.get('poster'))
                for source_tag in re.findall(r'<source[^>]+>', media_content):
                    source_attributes = extract_attributes(source_tag)
                    src = source_attributes.get('src')
                    f = parse_content_type(source_attributes.get('type'))
                    is_plain_url, formats = _media_formats(src, media_type, f)
                        # res attribute is not standard but seen several times
                            'height': int_or_none(source_attributes.get('res')),
                            'format_id': source_attributes.get('label'),
                        f.update(formats[0])
                        media_info['formats'].append(f)
                        media_info['formats'].extend(formats)
                for track_tag in re.findall(r'<track[^>]+>', media_content):
                    track_attributes = extract_attributes(track_tag)
                    kind = track_attributes.get('kind')
                    if not kind or kind in ('subtitles', 'captions'):
                        src = track_attributes.get('src')
                        # Language key: srclang, then lang, then label.
                        lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label')
                        media_info['subtitles'].setdefault(lang, []).append({
                            'url': absolute_url(src),
            # Only keep entries that produced something usable.
            if media_info['formats'] or media_info['subtitles']:
                entries.append(media_info)
    def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
        # Extract both HDS (f4m) and HLS (m3u8) formats from an Akamai
        # manifest URL by rewriting between the /i/ (HLS) and /z/ (HDS)
        # path conventions.
        # NOTE(review): mutable default `hosts={}` - read-only here, but a
        # refactor should use None. Excerpt also appears to be missing the
        # `formats = []` initialization, host guards and the return statement.
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
            # Swap in the dedicated HDS host when one is configured.
            f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        for entry in f4m_formats:
            # The hdcore parameter must also be appended to segment URLs.
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)
        m3u8_url = re.sub(r'(https?://[^/]+)/z/', r'\1/i/', manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
            m3u8_url = re.sub(r'(https?://)[^/]+', r'\1' + hls_host, m3u8_url)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))
    def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
        # Probe a Wowza streaming endpoint for every protocol variant
        # (HLS, HDS, DASH, SMIL/RTMP/RTSP) not listed in skip_protocols.
        # NOTE(review): mutable default `skip_protocols=[]` - read-only here.
        # Excerpt appears to be missing lines (mobj = re.search(...),
        # `formats = []`, dict literals, return statement).
        query = compat_urlparse.urlparse(url).query
        # Strip a trailing manifest filename to get the endpoint base.
        url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
            r'(?:(?:http|rtmp|rtsp)(?P<s>s)?:)?(?P<url>//[^?]+)', url)
        url_base = mobj.group('url')
        # Keep the (in)secure flag from the original scheme.
        http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base)
        def manifest_url(manifest):
            # Build the per-protocol manifest URL, re-attaching the query.
            m_url = '%s/%s' % (http_base_url, manifest)
                m_url += '?%s' % query
        if 'm3u8' not in skip_protocols:
            formats.extend(self._extract_m3u8_formats(
                manifest_url('playlist.m3u8'), video_id, 'mp4',
                m3u8_entry_protocol, m3u8_id='hls', fatal=False))
        if 'f4m' not in skip_protocols:
            formats.extend(self._extract_f4m_formats(
                manifest_url('manifest.f4m'),
                video_id, f4m_id='hds', fatal=False))
        if 'dash' not in skip_protocols:
            formats.extend(self._extract_mpd_formats(
                manifest_url('manifest.mpd'),
                video_id, mpd_id='dash', fatal=False))
        if re.search(r'(?:/smil:|\.smil)', url_base):
            if 'smil' not in skip_protocols:
                rtmp_formats = self._extract_smil_formats(
                    manifest_url('jwplayer.smil'),
                    video_id, fatal=False)
                for rtmp_format in rtmp_formats:
                    # Derive an RTSP twin for every RTMP format.
                    rtsp_format = rtmp_format.copy()
                    rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path'])
                    del rtsp_format['play_path']
                    del rtsp_format['ext']
                    rtsp_format.update({
                        'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'),
                        'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'),
                    formats.extend([rtmp_format, rtsp_format])
            for protocol in ('rtmp', 'rtsp'):
                if protocol not in skip_protocols:
                        'url': '%s:%s' % (protocol, url_base),
                        'format_id': protocol,
                        'protocol': protocol,
    def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
        # Locate a jwplayer("...").setup({...}) call in the page and parse
        # its options object as JSON (after transform_source normalization).
        # NOTE(review): excerpt appears truncated - the re.search/loop
        # scaffolding around these lines is not visible.
            r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
                jwplayer_data = self._parse_json(mobj.group('options'),
                                                 transform_source=transform_source)
            except ExtractorError:
                # Only a JSON object is a valid jwplayer config.
                if isinstance(jwplayer_data, dict):
                    return jwplayer_data
2444     def _extract_jwplayer_data(self, webpage, video_id, *args, **kwargs): 
2445         jwplayer_data = self._find_jwplayer_data( 
2446             webpage, video_id, transform_source=js_to_json) 
2447         return self._parse_jwplayer_data( 
2448             jwplayer_data, video_id, *args, **kwargs) 
    def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True,
                             m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Turn a JWPlayer config dict into one entry (single item) or a
        # playlist result, handling several legacy config layouts.
        # NOTE(review): excerpt appears to be missing lines (entries/
        # subtitles initialization, several dict literals, guards).
        # JWPlayer backward compatibility: flattened playlists
        # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96
        if 'playlist' not in jwplayer_data:
            jwplayer_data = {'playlist': [jwplayer_data]}
        # JWPlayer backward compatibility: single playlist item
        # https://github.com/jwplayer/jwplayer/blob/v7.7.0/src/js/playlist/playlist.js#L10
        if not isinstance(jwplayer_data['playlist'], list):
            jwplayer_data['playlist'] = [jwplayer_data['playlist']]
        for video_data in jwplayer_data['playlist']:
            # JWPlayer backward compatibility: flattened sources
            # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35
            if 'sources' not in video_data:
                video_data['sources'] = [video_data]
            this_video_id = video_id or video_data['mediaid']
            formats = self._parse_jwplayer_formats(
                video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id,
                mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url)
            # Collect caption/subtitle tracks keyed by label.
            tracks = video_data.get('tracks')
            if tracks and isinstance(tracks, list):
                for track in tracks:
                    if not isinstance(track, dict):
                    track_kind = track.get('kind')
                    if not track_kind or not isinstance(track_kind, compat_str):
                    if track_kind.lower() not in ('captions', 'subtitles'):
                    track_url = urljoin(base_url, track.get('file'))
                    subtitles.setdefault(track.get('label') or 'en', []).append({
                        'url': self._proto_relative_url(track_url)
                'id': this_video_id,
                'title': unescapeHTML(video_data['title'] if require_title else video_data.get('title')),
                'description': video_data.get('description'),
                'thumbnail': self._proto_relative_url(video_data.get('image')),
                'timestamp': int_or_none(video_data.get('pubdate')),
                'duration': float_or_none(jwplayer_data.get('duration') or video_data.get('duration')),
                'subtitles': subtitles,
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/utils/validator.js#L32
            if len(formats) == 1 and re.search(r'^(?:http|//).*(?:youtube\.com|youtu\.be)/.+', formats[0]['url']):
                # A lone YouTube URL is delegated to the YouTube extractor.
                    '_type': 'url_transparent',
                    'url': formats[0]['url'],
                self._sort_formats(formats)
                entry['formats'] = formats
            entries.append(entry)
        if len(entries) == 1:
            return self.playlist_result(entries)
    def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
                                m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
        # Convert a JWPlayer "sources" list into youtube-dl format dicts,
        # dispatching on source type/extension (HLS, DASH, SMIL, audio, ...).
        # NOTE(review): excerpt appears to be missing lines (urls/formats
        # initialization, guards, a_format construction, return statement).
        for source in jwplayer_sources_data:
            if not isinstance(source, dict):
            source_url = self._proto_relative_url(source.get('file'))
                source_url = compat_urlparse.urljoin(base_url, source_url)
            # Deduplicate sources by URL.
            if source_url in urls:
            urls.append(source_url)
            source_type = source.get('type') or ''
            ext = mimetype2ext(source_type) or determine_ext(source_url)
            if source_type == 'hls' or ext == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    source_url, video_id, 'mp4', entry_protocol='m3u8_native',
                    m3u8_id=m3u8_id, fatal=False))
            elif source_type == 'dash' or ext == 'mpd':
                formats.extend(self._extract_mpd_formats(
                    source_url, video_id, mpd_id=mpd_id, fatal=False))
                formats.extend(self._extract_smil_formats(
                    source_url, video_id, fatal=False))
            # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67
            elif source_type.startswith('audio') or ext in (
                    'oga', 'aac', 'mp3', 'mpeg', 'vorbis'):
                height = int_or_none(source.get('height'))
                    # Often no height is provided but there is a label in
                    # format like "1080p", "720p SD", or 1080.
                    height = int_or_none(self._search_regex(
                        r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''),
                        'height', default=None))
                    'width': int_or_none(source.get('width')),
                    'tbr': int_or_none(source.get('bitrate')),
                if source_url.startswith('rtmp'):
                    a_format['ext'] = 'flv'
                    # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as
                    # of jwplayer.flash.swf
                    rtmp_url_parts = re.split(
                        r'((?:mp4|mp3|flv):)', source_url, 1)
                    if len(rtmp_url_parts) == 3:
                        rtmp_url, prefix, play_path = rtmp_url_parts
                            'play_path': prefix + play_path,
                        a_format.update(rtmp_params)
                formats.append(a_format)
2585     def _live_title(self, name): 
2586         """ Generate the title 
for a live video 
""" 
2587         now = datetime.datetime.now() 
2588         now_str = now.strftime('%Y-%m-%d %H:%M') 
2589         return name + ' ' + now_str 
2591     def _int(self, v, name, fatal=False, **kwargs): 
2592         res = int_or_none(v, **kwargs) 
2593         if 'get_attr' in kwargs: 
2594             print(getattr(v, kwargs['get_attr'])) 
2596             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
2598                 raise ExtractorError(msg) 
2600                 self._downloader.report_warning(msg) 
2603     def _float(self, v, name, fatal=False, **kwargs): 
2604         res = float_or_none(v, **kwargs) 
2606             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
2608                 raise ExtractorError(msg) 
2610                 self._downloader.report_warning(msg) 
2613     def _set_cookie(self, domain, name, value, expire_time=None, port=None, 
2614                     path='/', secure=False, discard=False, rest={}, **kwargs): 
2615         cookie = compat_cookiejar.Cookie( 
2616             0, name, value, port, port is not None, domain, True, 
2617             domain.startswith('.'), path, True, secure, expire_time, 
2618             discard, None, None, rest) 
2619         self._downloader.cookiejar.set_cookie(cookie) 
2621     def _get_cookies(self, url): 
2622         """ Return a compat_cookies
.SimpleCookie 
with the cookies 
for the url 
""" 
2623         req = sanitized_Request(url) 
2624         self._downloader.cookiejar.add_cookie_header(req) 
2625         return compat_cookies.SimpleCookie(req.get_header('Cookie')) 
    def get_testcases(self, include_onlymatching=False):
        # Yield this extractor's test cases from _TEST (single) or _TESTS
        # (list); an extractor must not define both.
        # NOTE(review): excerpt appears truncated - the branch/loop/yield
        # scaffolding around these lines is not visible.
        t = getattr(self, '_TEST', None)
            assert not hasattr(self, '_TESTS'), \
                '%s has _TEST and _TESTS' % type(self).__name__
            tests = getattr(self, '_TESTS', [])
            # only_matching test cases are skipped unless explicitly requested.
            if not include_onlymatching and t.get('only_matching', False):
            # Derive the test name from the class name minus the 'IE' suffix.
            t['name'] = type(self).__name__[:-len('IE')]
    def is_suitable(self, age_limit):
        """ Test whether the extractor is generally suitable for the given
        age limit (i.e. pornographic sites are not, all others usually are) """
        # An extractor is unsuitable only when every inspected test case is
        # age-restricted beyond the given limit.
        # NOTE(review): excerpt appears to be missing a line after the
        # `if not is_restricted:` guard (presumably `continue`).
        any_restricted = False
        for tc in self.get_testcases(include_onlymatching=False):
            # For playlist tests, judge by the first playlist entry.
            if tc.get('playlist', []):
                tc = tc['playlist'][0]
            is_restricted = age_restricted(
                tc.get('info_dict', {}).get('age_limit'), age_limit)
            if not is_restricted:
            any_restricted = any_restricted or is_restricted
        return not any_restricted
2656     def extract_subtitles(self, *args, **kwargs): 
2657         if (self._downloader.params.get('writesubtitles', False) or 
2658                 self._downloader.params.get('listsubtitles')): 
2659             return self._get_subtitles(*args, **kwargs) 
2662     def _get_subtitles(self, *args, **kwargs): 
2663         raise NotImplementedError('This method must be implemented by subclasses') 
2666     def _merge_subtitle_items(subtitle_list1, subtitle_list2): 
2667         """ Merge subtitle items 
for one language
. Items 
with duplicated URLs
 
2668         will be dropped
. """ 
2669         list1_urls = set([item['url'] for item in subtitle_list1]) 
2670         ret = list(subtitle_list1) 
2671         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) 
2675     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): 
2676         """ Merge two subtitle dictionaries
, language by language
. """ 
2677         ret = dict(subtitle_dict1) 
2678         for lang in subtitle_dict2: 
2679             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) 
2682     def extract_automatic_captions(self, *args, **kwargs): 
2683         if (self._downloader.params.get('writeautomaticsub', False) or 
2684                 self._downloader.params.get('listsubtitles')): 
2685             return self._get_automatic_captions(*args, **kwargs) 
2688     def _get_automatic_captions(self, *args, **kwargs): 
2689         raise NotImplementedError('This method must be implemented by subclasses') 
2691     def mark_watched(self, *args, **kwargs): 
2692         if (self._downloader.params.get('mark_watched', False) and 
2693                 (self._get_login_info()[0] is not None or 
2694                     self._downloader.params.get('cookiefile') is not None)): 
2695             self._mark_watched(*args, **kwargs) 
2697     def _mark_watched(self, *args, **kwargs): 
2698         raise NotImplementedError('This method must be implemented by subclasses') 
2700     def geo_verification_headers(self): 
2702         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') 
2703         if geo_verification_proxy: 
2704             headers['Ytdl-request-proxy'] = geo_verification_proxy 
2707     def _generic_id(self, url): 
2708         return compat_urllib_parse_unquote(os.path.splitext(url.rstrip('/').split('/')[-1])[0]) 
2710     def _generic_title(self, url): 
2711         return compat_urllib_parse_unquote(os.path.splitext(url_basename(url))[0]) 
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """
    def _make_valid_url(cls):
        # Build the URL "scheme" this extractor matches:
        # "<key>:query", "<key>all:query" or "<key>N:query" (N > 0).
        # NOTE(review): receives `cls` - presumably a @classmethod upstream;
        # the decorator is not visible in this excerpt.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
    def suitable(cls, url):
        # A query URL is suitable iff it matches this extractor's search scheme.
        # NOTE(review): receives `cls` - presumably a @classmethod upstream;
        # the decorator is not visible in this excerpt.
        return re.match(cls._make_valid_url(), url) is not None
    def _real_extract(self, query):
        # Dispatch on the query prefix: '' -> 1 result, 'all' -> the maximum,
        # a number N -> min(N, _MAX_RESULTS) results.
        # NOTE(review): excerpt appears to be missing lines (the mobj None
        # check, the '' branch and the `n = int(prefix)` branch header).
        mobj = re.match(self._make_valid_url(), query)
            raise ExtractorError('Invalid search query "%s"' % query)
        prefix = mobj.group('prefix')
        query = mobj.group('query')
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the extractor's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)
2749     def _get_n_results(self, query, n): 
2750         """Get a specified number of results 
for a query
""" 
2751         raise NotImplementedError('This method must be implemented by subclasses') 
    def SEARCH_KEY(self):
        # Public accessor for the extractor's search key.
        # NOTE(review): presumably decorated with @property upstream; the
        # decorator is not visible in this excerpt.
        return self._SEARCH_KEY