1 from __future__ 
import unicode_literals
 
  15 from ..compat 
import ( 
  18     compat_etree_fromstring
, 
  24     compat_urllib_parse_urlencode
, 
  25     compat_urllib_request
, 
  28 from ..downloader
.f4m 
import remove_encrypted_media
 
  57     parse_m3u8_attributes
, 
  63 class InfoExtractor(object): 
  64     """Information Extractor class. 
  66     Information extractors are the classes that, given a URL, extract 
  67     information about the video (or videos) the URL refers to. This 
  68     information includes the real video URL, the video title, author and 
  69     others. The information is stored in a dictionary which is then 
  70     passed to the YoutubeDL. The YoutubeDL processes this 
  71     information possibly downloading the video to the file system, among 
  72     other possible outcomes. 
  74     The type field determines the type of the result. 
  75     By far the most common value (and the default if _type is missing) is 
  76     "video", which indicates a single video. 
  78     For a video, the dictionaries must include the following fields: 
  81     title:          Video title, unescaped. 
  83     Additionally, it must contain either a formats entry or a url one: 
  85     formats:        A list of dictionaries for each format available, ordered 
  86                     from worst to best quality. 
  89                     * url        Mandatory. The URL of the video file 
  90                     * ext        Will be calculated from URL if missing 
  91                     * format     A human-readable description of the format 
  92                                  ("mp4 container with h264/opus"). 
  93                                  Calculated from the format_id, width, height,
  94                                  and format_note fields if missing. 
  95                     * format_id  A short description of the format 
  96                                  ("mp4_h264_opus" or "19"). 
  97                                 Technically optional, but strongly recommended. 
  98                     * format_note Additional info about the format 
  99                                  ("3D" or "DASH video") 
 100                     * width      Width of the video, if known 
 101                     * height     Height of the video, if known 
 102                     * resolution Textual description of width and height 
 103                     * tbr        Average bitrate of audio and video in KBit/s 
 104                     * abr        Average audio bitrate in KBit/s 
 105                     * acodec     Name of the audio codec in use 
 106                     * asr        Audio sampling rate in Hertz 
 107                     * vbr        Average video bitrate in KBit/s 
 109                     * vcodec     Name of the video codec in use 
 110                     * container  Name of the container format 
 111                     * filesize   The number of bytes, if known in advance 
 112                     * filesize_approx  An estimate for the number of bytes 
 113                     * player_url SWF Player URL (used for rtmpdump). 
 114                     * protocol   The protocol that will be used for the actual 
 115                                  download, lower-case. 
 116                                  "http", "https", "rtsp", "rtmp", "rtmpe", 
 117                                  "m3u8", "m3u8_native" or "http_dash_segments". 
 118                     * preference Order number of this format. If this field is 
 119                                  present and not None, the formats get sorted 
 120                                  by this field, regardless of all other values. 
 121                                  -1 for default (order by other properties), 
 122                                  -2 or smaller for less than default. 
 123                                  < -1000 to hide the format (if there is 
 124                                     another one which is strictly better) 
 125                     * language   Language code, e.g. "de" or "en-US". 
 126                     * language_preference  Is this in the language mentioned in 
 128                                  10 if it's what the URL is about, 
 129                                  -1 for default (don't know), 
 130                                  -10 otherwise, other values reserved for now. 
 131                     * quality    Order number of the video quality of this 
 132                                  format, irrespective of the file format. 
 133                                  -1 for default (order by other properties), 
 134                                  -2 or smaller for less than default. 
 135                     * source_preference  Order number for this video source 
 136                                   (quality takes higher priority) 
 137                                  -1 for default (order by other properties), 
 138                                  -2 or smaller for less than default. 
 139                     * http_headers  A dictionary of additional HTTP headers 
 140                                  to add to the request. 
 141                     * stretched_ratio  If given and not 1, indicates that the 
 142                                  video's pixels are not square. 
 143                                  width : height ratio as float. 
 144                     * no_resume  The server does not support resuming the 
 145                                  (HTTP or RTMP) download. Boolean. 
 147     url:            Final video URL. 
 148     ext:            Video filename extension. 
 149     format:         The video format, defaults to ext (used for --get-format) 
 150     player_url:     SWF Player URL (used for rtmpdump). 
 152     The following fields are optional: 
 154     alt_title:      A secondary title of the video. 
 155     display_id      An alternative identifier for the video, not necessarily 
 156                     unique, but available before title. Typically, id is 
 157                     something like "4234987", title "Dancing naked mole rats", 
 158                     and display_id "dancing-naked-mole-rats" 
 159     thumbnails:     A list of dictionaries, with the following entries: 
 160                         * "id" (optional, string) - Thumbnail format ID 
 162                         * "preference" (optional, int) - quality of the image 
 163                         * "width" (optional, int) 
 164                         * "height" (optional, int) 
  165                         * "resolution" (optional, string "{width}x{height}",
 167                         * "filesize" (optional, int) 
 168     thumbnail:      Full URL to a video thumbnail image. 
 169     description:    Full video description. 
 170     uploader:       Full name of the video uploader. 
 171     license:        License name the video is licensed under. 
 172     creator:        The creator of the video. 
 173     release_date:   The date (YYYYMMDD) when the video was released. 
 174     timestamp:      UNIX timestamp of the moment the video became available. 
 175     upload_date:    Video upload date (YYYYMMDD). 
 176                     If not explicitly set, calculated from timestamp. 
 177     uploader_id:    Nickname or id of the video uploader. 
 178     uploader_url:   Full URL to a personal webpage of the video uploader. 
 179     location:       Physical location where the video was filmed. 
 180     subtitles:      The available subtitles as a dictionary in the format 
 181                     {language: subformats}. "subformats" is a list sorted from 
 182                     lower to higher preference, each element is a dictionary 
 183                     with the "ext" entry and one of: 
 184                         * "data": The subtitles file contents 
 185                         * "url": A URL pointing to the subtitles file 
 186                     "ext" will be calculated from URL if missing 
 187     automatic_captions: Like 'subtitles', used by the YoutubeIE for 
 188                     automatically generated captions 
 189     duration:       Length of the video in seconds, as an integer or float. 
 190     view_count:     How many users have watched the video on the platform. 
 191     like_count:     Number of positive ratings of the video 
 192     dislike_count:  Number of negative ratings of the video 
 193     repost_count:   Number of reposts of the video 
  194     average_rating: Average rating given by users, the scale used depends on the webpage
 195     comment_count:  Number of comments on the video 
 196     comments:       A list of comments, each with one or more of the following 
 197                     properties (all but one of text or html optional): 
 198                         * "author" - human-readable name of the comment author 
 199                         * "author_id" - user ID of the comment author 
 201                         * "html" - Comment as HTML 
 202                         * "text" - Plain text of the comment 
 203                         * "timestamp" - UNIX timestamp of comment 
 204                         * "parent" - ID of the comment this one is replying to. 
 205                                      Set to "root" to indicate that this is a 
 206                                      comment to the original video. 
 207     age_limit:      Age restriction for the video, as an integer (years) 
 208     webpage_url:    The URL to the video webpage, if given to youtube-dl it 
 209                     should allow to get the same result again. (It will be set 
 210                     by YoutubeDL if it's missing) 
 211     categories:     A list of categories that the video falls in, for example 
 213     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"] 
 214     is_live:        True, False, or None (=unknown). Whether this video is a 
 215                     live stream that goes on instead of a fixed-length video. 
 216     start_time:     Time in seconds where the reproduction should start, as 
 217                     specified in the URL. 
 218     end_time:       Time in seconds where the reproduction should end, as 
 219                     specified in the URL. 
 221     The following fields should only be used when the video belongs to some logical 
 224     chapter:        Name or title of the chapter the video belongs to. 
 225     chapter_number: Number of the chapter the video belongs to, as an integer. 
 226     chapter_id:     Id of the chapter the video belongs to, as a unicode string. 
 228     The following fields should only be used when the video is an episode of some 
 231     series:         Title of the series or programme the video episode belongs to. 
 232     season:         Title of the season the video episode belongs to. 
 233     season_number:  Number of the season the video episode belongs to, as an integer. 
 234     season_id:      Id of the season the video episode belongs to, as a unicode string. 
 235     episode:        Title of the video episode. Unlike mandatory video title field, 
 236                     this field should denote the exact title of the video episode 
 237                     without any kind of decoration. 
 238     episode_number: Number of the video episode within a season, as an integer. 
 239     episode_id:     Id of the video episode, as a unicode string. 
 241     The following fields should only be used when the media is a track or a part of 
 244     track:          Title of the track. 
 245     track_number:   Number of the track within an album or a disc, as an integer. 
 246     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii), 
 248     artist:         Artist(s) of the track. 
 249     genre:          Genre(s) of the track. 
 250     album:          Title of the album the track belongs to. 
 251     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). 
 252     album_artist:   List of all artists appeared on the album (e.g. 
 253                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits 
 255     disc_number:    Number of the disc or other physical medium the track belongs to, 
 257     release_year:   Year (YYYY) when the album was released. 
 259     Unless mentioned otherwise, the fields should be Unicode strings. 
 261     Unless mentioned otherwise, None is equivalent to absence of information. 
 264     _type "playlist" indicates multiple videos. 
 265     There must be a key "entries", which is a list, an iterable, or a PagedList 
 266     object, each element of which is a valid dictionary by this specification. 
 268     Additionally, playlists can have "title", "description" and "id" attributes 
 269     with the same semantics as videos (see above). 
 272     _type "multi_video" indicates that there are multiple videos that 
 273     form a single show, for examples multiple acts of an opera or TV episode. 
 274     It must have an entries key like a playlist and contain all the keys 
 275     required for a video at the same time. 
 278     _type "url" indicates that the video must be extracted from another 
 279     location, possibly by a different extractor. Its only required key is: 
 280     "url" - the next URL to extract. 
 281     The key "ie_key" can be set to the class name (minus the trailing "IE", 
 282     e.g. "Youtube") if the extractor class is known in advance. 
 283     Additionally, the dictionary may have any properties of the resolved entity 
 284     known in advance, for example "title" if the title of the referred video is 
 288     _type "url_transparent" entities have the same specification as "url", but 
 289     indicate that the given additional information is more precise than the one 
 290     associated with the resolved URL. 
 291     This is useful when a site employs a video service that hosts the video and 
 292     its technical metadata, but that video service does not embed a useful 
 293     title, description etc. 
 296     Subclasses of this one should re-define the _real_initialize() and 
 297     _real_extract() methods and define a _VALID_URL regexp. 
 298     Probably, they should also be added to the list of extractors. 
 300     Finally, the _WORKING attribute should be set to False for broken IEs 
 301     in order to warn the users and skip the tests. 
 308     def __init__(self
, downloader
=None): 
 309         """Constructor. Receives an optional downloader.""" 
 311         self
.set_downloader(downloader
) 
 314     def suitable(cls
, url
): 
 315         """Receives a URL and returns True if suitable for this IE.""" 
 317         # This does not use has/getattr intentionally - we want to know whether 
 318         # we have cached the regexp for *this* class, whereas getattr would also 
 319         # match the superclass 
 320         if '_VALID_URL_RE' not in cls
.__dict
__: 
 321             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 322         return cls
._VALID
_URL
_RE
.match(url
) is not None 
 325     def _match_id(cls
, url
): 
 326         if '_VALID_URL_RE' not in cls
.__dict
__: 
 327             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 328         m 
= cls
._VALID
_URL
_RE
.match(url
) 
 334         """Getter method for _WORKING.""" 
 337     def initialize(self
): 
 338         """Initializes an instance (authentication, etc).""" 
 340             self
._real
_initialize
() 
 343     def extract(self
, url
): 
 344         """Extracts URL information and returns it in list of dicts.""" 
 347             return self
._real
_extract
(url
) 
 348         except ExtractorError
: 
 350         except compat_http_client
.IncompleteRead 
as e
: 
 351             raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True) 
 352         except (KeyError, StopIteration) as e
: 
 353             raise ExtractorError('An extractor error has occurred.', cause
=e
) 
 355     def set_downloader(self
, downloader
): 
 356         """Sets the downloader for this IE.""" 
 357         self
._downloader 
= downloader
 
 359     def _real_initialize(self
): 
 360         """Real initialization process. Redefine in subclasses.""" 
 363     def _real_extract(self
, url
): 
 364         """Real extraction process. Redefine in subclasses.""" 
 369         """A string for getting the InfoExtractor with get_info_extractor""" 
 370         return compat_str(cls
.__name
__[:-2]) 
 374         return compat_str(type(self
).__name
__[:-2]) 
 376     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query
={}): 
 377         """ Returns the response handle """ 
 379             self
.report_download_webpage(video_id
) 
 380         elif note 
is not False: 
 382                 self
.to_screen('%s' % (note
,)) 
 384                 self
.to_screen('%s: %s' % (video_id
, note
)) 
 385         if isinstance(url_or_request
, compat_urllib_request
.Request
): 
 386             url_or_request 
= update_Request( 
 387                 url_or_request
, data
=data
, headers
=headers
, query
=query
) 
 390                 url_or_request 
= update_url_query(url_or_request
, query
) 
 391             if data 
is not None or headers
: 
 392                 url_or_request 
= sanitized_Request(url_or_request
, data
, headers
) 
 394             return self
._downloader
.urlopen(url_or_request
) 
 395         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 399                 errnote 
= 'Unable to download webpage' 
 401             errmsg 
= '%s: %s' % (errnote
, error_to_compat_str(err
)) 
 403                 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
) 
 405                 self
._downloader
.report_warning(errmsg
) 
 408     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query
={}): 
 409         """ Returns a tuple (page content as string, URL handle) """ 
 410         # Strip hashes from the URL (#1038) 
 411         if isinstance(url_or_request
, (compat_str
, str)): 
 412             url_or_request 
= url_or_request
.partition('#')[0] 
 414         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
) 
 418         content 
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
) 
 419         return (content
, urlh
) 
 422     def _guess_encoding_from_content(content_type
, webpage_bytes
): 
 423         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 425             encoding 
= m
.group(1) 
 427             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 428                           webpage_bytes[:1024]) 
 430                 encoding = m.group(1).decode('ascii') 
 431             elif webpage_bytes.startswith(b'\xff\xfe'): 
 438     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 
 439         content_type = urlh.headers.get('Content-Type', '') 
 440         webpage_bytes = urlh.read() 
 441         if prefix is not None: 
 442             webpage_bytes = prefix + webpage_bytes 
 444             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 
 445         if self._downloader.params.get('dump_intermediate_pages', False): 
 447                 url = url_or_request.get_full_url() 
 448             except AttributeError: 
 450             self.to_screen('Dumping request to ' + url) 
 451             dump = base64.b64encode(webpage_bytes).decode('ascii') 
 452             self._downloader.to_screen(dump) 
 453         if self._downloader.params.get('write_pages', False): 
 455                 url = url_or_request.get_full_url() 
 456             except AttributeError: 
 458             basen = '%s_%s' % (video_id, url) 
 460                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() 
 461                 basen = basen[:240 - len(h)] + h 
 462             raw_filename = basen + '.dump' 
 463             filename = sanitize_filename(raw_filename, restricted=True) 
 464             self.to_screen('Saving request to ' + filename) 
 465             # Working around MAX_PATH limitation on Windows (see 
 466             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 
 467             if compat_os_name == 'nt': 
 468                 absfilepath = os.path.abspath(filename) 
 469                 if len(absfilepath) > 259: 
 470                     filename = '\\\\?\\' + absfilepath 
 471             with open(filename, 'wb') as outf: 
 472                 outf.write(webpage_bytes) 
 475             content = webpage_bytes.decode(encoding, 'replace') 
 477             content = webpage_bytes.decode('utf-8', 'replace') 
 479         if ('<title>Access to this site is blocked</title>' in content and 
 480                 'Websense' in content[:512]): 
 481             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
 482             blocked_iframe = self._html_search_regex( 
 483                 r'<iframe src="([^
"]+)"', content, 
 484                 'Websense information URL
', default=None) 
 486                 msg += ' Visit 
%s for more details
' % blocked_iframe 
 487             raise ExtractorError(msg, expected=True) 
 488         if '<title
>The URL you requested has been blocked
</title
>' in content[:512]: 
 490                 'Access to this webpage has been blocked by Indian censorship
. ' 
 491                 'Use a VPN 
or proxy 
server (with --proxy
) to route around it
.') 
 492             block_msg = self._html_search_regex( 
 493                 r'</h1
><p
>(.*?
)</p
>', 
 494                 content, 'block message
', default=None) 
 496                 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ') 
 497             raise ExtractorError(msg, expected=True) 
 501     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): 
 502         """ Returns the data of the page as a string """ 
 505         while success is False: 
 507                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) 
 509             except compat_http_client.IncompleteRead as e: 
 511                 if try_count >= tries: 
 513                 self._sleep(timeout, video_id) 
 520     def _download_xml(self, url_or_request, video_id, 
 521                       note='Downloading XML
', errnote='Unable to download XML
', 
 522                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): 
 523         """Return the xml as an xml.etree.ElementTree.Element""" 
 524         xml_string = self._download_webpage( 
 525             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) 
 526         if xml_string is False: 
 529             xml_string = transform_source(xml_string) 
 530         return compat_etree_fromstring(xml_string.encode('utf
-8')) 
 532     def _download_json(self, url_or_request, video_id, 
 533                        note='Downloading JSON metadata
', 
 534                        errnote='Unable to download JSON metadata
', 
 535                        transform_source=None, 
 536                        fatal=True, encoding=None, data=None, headers={}, query={}): 
 537         json_string = self._download_webpage( 
 538             url_or_request, video_id, note, errnote, fatal=fatal, 
 539             encoding=encoding, data=data, headers=headers, query=query) 
 540         if (not fatal) and json_string is False: 
 542         return self._parse_json( 
 543             json_string, video_id, transform_source=transform_source, fatal=fatal) 
 545     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): 
 547             json_string = transform_source(json_string) 
 549             return json.loads(json_string) 
 550         except ValueError as ve: 
 551             errmsg = '%s: Failed to parse JSON 
' % video_id 
 553                 raise ExtractorError(errmsg, cause=ve) 
 555                 self.report_warning(errmsg + str(ve)) 
 557     def report_warning(self, msg, video_id=None): 
 558         idstr = '' if video_id is None else '%s: ' % video_id 
 559         self._downloader.report_warning( 
 560             '[%s] %s%s' % (self.IE_NAME, idstr, msg)) 
 562     def to_screen(self, msg): 
 563         """Print msg to screen, prefixing it with '[ie_name
]'""" 
 564         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) 
 566     def report_extraction(self, id_or_name): 
 567         """Report information extraction.""" 
 568         self.to_screen('%s: Extracting information
' % id_or_name) 
 570     def report_download_webpage(self, video_id): 
 571         """Report webpage download.""" 
 572         self.to_screen('%s: Downloading webpage
' % video_id) 
 574     def report_age_confirmation(self): 
 575         """Report attempt to confirm age.""" 
 576         self.to_screen('Confirming age
') 
 578     def report_login(self): 
 579         """Report attempt to log in.""" 
 580         self.to_screen('Logging 
in') 
 583     def raise_login_required(msg='This video 
is only available 
for registered users
'): 
 584         raise ExtractorError( 
 585             '%s. Use 
--username 
and --password 
or --netrc to provide account credentials
.' % msg, 
 589     def raise_geo_restricted(msg='This video 
is not available 
from your location due to geo restriction
'): 
 590         raise ExtractorError( 
 591             '%s. You might want to use 
--proxy to workaround
.' % msg, 
 594     # Methods for following #608 
 596     def url_result(url, ie=None, video_id=None, video_title=None): 
 597         """Returns a URL that points to a page that should be processed""" 
 598         # TODO: ie should be the class used for getting the info 
 599         video_info = {'_type
': 'url
', 
 602         if video_id is not None: 
 603             video_info['id'] = video_id 
 604         if video_title is not None: 
 605             video_info['title
'] = video_title 
 609     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 
 610         """Returns a playlist""" 
 611         video_info = {'_type
': 'playlist
', 
 614             video_info['id'] = playlist_id 
 616             video_info['title
'] = playlist_title 
 617         if playlist_description: 
 618             video_info['description
'] = playlist_description 
 621     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 623         Perform a regex search on the given string, using a single or a list of 
 624         patterns returning the first matching group. 
 625         In case of failure return a default value or raise a WARNING or a 
 626         RegexNotFoundError, depending on fatal, specifying the field name. 
 628         if isinstance(pattern, (str, compat_str, compiled_regex_type)): 
 629             mobj = re.search(pattern, string, flags) 
 632                 mobj = re.search(p, string, flags) 
 636         if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty(): 
 637             _name = '\033[0;34m
%s\033[0m
' % name 
 643                 # return the first matching group 
 644                 return next(g for g in mobj.groups() if g is not None) 
 646                 return mobj.group(group) 
 647         elif default is not NO_DEFAULT: 
 650             raise RegexNotFoundError('Unable to extract 
%s' % _name) 
 652             self._downloader.report_warning('unable to extract 
%s' % _name + bug_reports_message()) 
 655     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 657         Like _search_regex, but strips HTML tags and unescapes entities. 
 659         res = self._search_regex(pattern, string, name, default, fatal, flags, group) 
 661             return clean_html(res).strip() 
 665     def _get_netrc_login_info(self, netrc_machine=None): 
 668         netrc_machine = netrc_machine or self._NETRC_MACHINE 
 670         if self._downloader.params.get('usenetrc
', False): 
 672                 info = netrc.netrc().authenticators(netrc_machine) 
 677                     raise netrc.NetrcParseError('No authenticators 
for %s' % netrc_machine) 
 678             except (IOError, netrc.NetrcParseError) as err: 
 679                 self._downloader.report_warning('parsing 
.netrc
: %s' % error_to_compat_str(err)) 
 681         return (username, password) 
 683     def _get_login_info(self): 
 685         Get the login info as (username, password) 
 686         It will look in the netrc file using the _NETRC_MACHINE value 
 687         If there's no info available
, return (None, None) 
 689         if self._downloader is None: 
 694         downloader_params = self._downloader.params 
 696         # Attempt to use provided username and password or .netrc data 
 697         if downloader_params.get('username') is not None: 
 698             username = downloader_params['username'] 
 699             password = downloader_params['password'] 
 701             username, password = self._get_netrc_login_info() 
 703         return (username, password) 
 705     def _get_tfa_info(self, note='two-factor verification code'): 
 707         Get the two
-factor authentication info
 
 708         TODO 
- asking the user will be required 
for sms
/phone verify
 
 709         currently just uses the command line option
 
 710         If there
's no info available, return None 
 712         if self._downloader is None: 
 714         downloader_params = self._downloader.params 
 716         if downloader_params.get('twofactor
') is not None: 
 717             return downloader_params['twofactor
'] 
 719         return compat_getpass('Type 
%s and press 
[Return
]: ' % note) 
 721     # Helper functions for extracting OpenGraph info 
 723     def _og_regexes(prop): 
 724         content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))' 
 725         property_re = (r'(?
:name|
property)=(?
:\'og
:%(prop)s\'|
"og:%(prop)s"|\s
*og
:%(prop)s\b)' 
 726                        % {'prop
': re.escape(prop)}) 
 727         template = r'<meta
[^
>]+?
%s[^
>]+?
%s' 
 729             template % (property_re, content_re), 
 730             template % (content_re, property_re), 
 734     def _meta_regex(prop): 
 735         return r'''(?isx)<meta 
 736                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
 737                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) 
 739     def _og_search_property(self, prop, html, name=None, **kargs): 
 740         if not isinstance(prop, (list, tuple)): 
 743             name = 'OpenGraph 
%s' % prop[0] 
 746             og_regexes.extend(self._og_regexes(p)) 
 747         escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) 
 750         return unescapeHTML(escaped) 
 752     def _og_search_thumbnail(self, html, **kargs): 
 753         return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs) 
 755     def _og_search_description(self, html, **kargs): 
 756         return self._og_search_property('description
', html, fatal=False, **kargs) 
 758     def _og_search_title(self, html, **kargs): 
 759         return self._og_search_property('title
', html, **kargs) 
 761     def _og_search_video_url(self, html, name='video url
', secure=True, **kargs): 
 762         regexes = self._og_regexes('video
') + self._og_regexes('video
:url
') 
 764             regexes = self._og_regexes('video
:secure_url
') + regexes 
 765         return self._html_search_regex(regexes, html, name, **kargs) 
 767     def _og_search_url(self, html, **kargs): 
 768         return self._og_search_property('url
', html, **kargs) 
 770     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): 
 771         if not isinstance(name, (list, tuple)): 
 773         if display_name is None: 
 774             display_name = name[0] 
 775         return self._html_search_regex( 
 776             [self._meta_regex(n) for n in name], 
 777             html, display_name, fatal=fatal, group='content
', **kwargs) 
 779     def _dc_search_uploader(self, html): 
 780         return self._html_search_meta('dc
.creator
', html, 'uploader
') 
 782     def _rta_search(self, html): 
 783         # See http://www.rtalabel.org/index.php?content=howtofaq#single 
 784         if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+' 
 785                      r'     content
="RTA-5042-1996-1400-1577-RTA"', 
 790     def _media_rating_search(self, html): 
 791         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ 
 792         rating = self._html_search_meta('rating
', html) 
 804         return RATING_TABLE.get(rating.lower()) 
 806     def _family_friendly_search(self, html): 
 807         # See http://schema.org/VideoObject 
 808         family_friendly = self._html_search_meta('isFamilyFriendly
', html) 
 810         if not family_friendly: 
 819         return RATING_TABLE.get(family_friendly.lower()) 
 821     def _twitter_search_player(self, html): 
 822         return self._html_search_meta('twitter
:player
', html, 
 823                                       'twitter card player
') 
 825     def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): 
 826         json_ld = self._search_regex( 
 827             r'(?s
)<script
[^
>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', 
 828             html, 'JSON-LD', group='json_ld', **kwargs) 
 829         default = kwargs.get('default', NO_DEFAULT) 
 831             return default if default is not NO_DEFAULT else {} 
 832         # JSON-LD may be malformed and thus `fatal` should be respected. 
 833         # At the same time `default` may be passed that assumes `fatal=False` 
 834         # for _search_regex. Let's simulate the same behavior here as well. 
 835         fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False 
 836         return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) 
 838     def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): 
 839         if isinstance(json_ld, compat_str): 
 840             json_ld = self._parse_json(json_ld, video_id, fatal=fatal) 
 844         if not isinstance(json_ld, (list, tuple, dict)): 
 846         if isinstance(json_ld, dict): 
 849             if e.get('@context') == 'http://schema.org': 
 850                 item_type = e.get('@type') 
 851                 if expected_type is not None and expected_type != item_type: 
 853                 if item_type == 'TVEpisode': 
 855                         'episode': unescapeHTML(e.get('name')), 
 856                         'episode_number': int_or_none(e.get('episodeNumber')), 
 857                         'description': unescapeHTML(e.get('description')), 
 859                     part_of_season = e.get('partOfSeason') 
 860                     if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': 
 861                         info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) 
 862                     part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') 
 863                     if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': 
 864                         info['series'] = unescapeHTML(part_of_series.get('name')) 
 865                 elif item_type == 'Article': 
 867                         'timestamp': parse_iso8601(e.get('datePublished')), 
 868                         'title': unescapeHTML(e.get('headline')), 
 869                         'description': unescapeHTML(e.get('articleBody')), 
 871                 elif item_type == 'VideoObject': 
 873                         'url': e.get('contentUrl'), 
 874                         'title': unescapeHTML(e.get('name')), 
 875                         'description': unescapeHTML(e.get('description')), 
 876                         'thumbnail': e.get('thumbnailUrl'), 
 877                         'duration': parse_duration(e.get('duration')), 
 878                         'timestamp': unified_timestamp(e.get('uploadDate')), 
 879                         'filesize': float_or_none(e.get('contentSize')), 
 880                         'tbr': int_or_none(e.get('bitrate')), 
 881                         'width': int_or_none(e.get('width')), 
 882                         'height': int_or_none(e.get('height')), 
 885         return dict((k, v) for k, v in info.items() if v is not None) 
 888     def _hidden_inputs(html): 
 889         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) 
 891         for input in re.findall(r'(?i)<input([^>]+)>', html): 
 892             if not re.search(r'type=(["\'])(?
:hidden|submit
)\
1', input): 
 894             name = re.search(r'(?
:name|
id)=(["\'])(?P<value>.+?)\1', input) 
 897             value = re.search(r'value=(["\'])(?P
<value
>.*?
)\
1', input) 
 900             hidden_inputs[name.group('value
')] = value.group('value
') 
 903     def _form_hidden_inputs(self, form_id, html): 
 904         form = self._search_regex( 
 905             r'(?
is)<form
[^
>]+?
id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, 
 906             html, '%s form' % form_id, group='form') 
 907         return self._hidden_inputs(form) 
 909     def _sort_formats(self, formats, field_preference=None): 
 911             raise ExtractorError('No video formats found') 
 914             # Automatically determine tbr when missing based on abr and vbr (improves 
 915             # formats sorting in some cases) 
 916             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: 
 917                 f['tbr'] = f['abr'] + f['vbr'] 
 920             # TODO remove the following workaround 
 921             from ..utils import determine_ext 
 922             if not f.get('ext') and 'url' in f: 
 923                 f['ext'] = determine_ext(f['url']) 
 925             if isinstance(field_preference, (list, tuple)): 
 928                     if f.get(field) is not None 
 929                     else ('' if field == 'format_id' else -1) 
 930                     for field in field_preference) 
 932             preference = f.get('preference') 
 933             if preference is None: 
 935                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported 
 938             protocol = f.get('protocol') or determine_protocol(f) 
 939             proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) 
 941             if f.get('vcodec') == 'none':  # audio only 
 943                 if self._downloader.params.get('prefer_free_formats'): 
 944                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] 
 946                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a'] 
 949                     audio_ext_preference = ORDER.index(f['ext']) 
 951                     audio_ext_preference = -1 
 953                 if f.get('acodec') == 'none':  # video only 
 955                 if self._downloader.params.get('prefer_free_formats'): 
 956                     ORDER = ['flv', 'mp4', 'webm'] 
 958                     ORDER = ['webm', 'flv', 'mp4'] 
 960                     ext_preference = ORDER.index(f['ext']) 
 963                 audio_ext_preference = 0 
 967                 f.get('language_preference') if f.get('language_preference') is not None else -1, 
 968                 f.get('quality') if f.get('quality') is not None else -1, 
 969                 f.get('tbr') if f.get('tbr') is not None else -1, 
 970                 f.get('filesize') if f.get('filesize') is not None else -1, 
 971                 f.get('vbr') if f.get('vbr') is not None else -1, 
 972                 f.get('height') if f.get('height') is not None else -1, 
 973                 f.get('width') if f.get('width') is not None else -1, 
 976                 f.get('abr') if f.get('abr') is not None else -1, 
 977                 audio_ext_preference, 
 978                 f.get('fps') if f.get('fps') is not None else -1, 
 979                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, 
 980                 f.get('source_preference') if f.get('source_preference') is not None else -1, 
 981                 f.get('format_id') if f.get('format_id') is not None else '', 
 983         formats.sort(key=_formats_key) 
 985     def _check_formats(self, formats, video_id): 
 988                 lambda f: self._is_valid_url( 
 990                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), 
 994     def _remove_duplicate_formats(formats): 
 998             if f['url'] not in format_urls: 
 999                 format_urls.add(f['url']) 
1000                 unique_formats.append(f) 
1001         formats[:] = unique_formats 
1003     def _is_valid_url(self, url, video_id, item='video'): 
1004         url = self._proto_relative_url(url, scheme='http:') 
1005         # For now assume non HTTP(S) URLs always valid 
1006         if not (url.startswith('http://') or url.startswith('https://')): 
1009             self._request_webpage(url, video_id, 'Checking %s URL' % item) 
1011         except ExtractorError as e: 
1012             if isinstance(e.cause, compat_urllib_error.URLError): 
1014                     '%s: %s URL is invalid, skipping' % (video_id, item)) 
1018     def http_scheme(self): 
1019         """ Either "http
:" or "https
:", depending on the user's preferences """ 
1022             if self._downloader.params.get('prefer_insecure', False) 
1025     def _proto_relative_url(self, url, scheme=None): 
1028         if url.startswith('//'): 
1030                 scheme = self.http_scheme() 
1035     def _sleep(self, timeout, video_id, msg_template=None): 
1036         if msg_template is None: 
1037             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' 
1038         msg = msg_template % {'video_id': video_id, 'timeout': timeout} 
1042     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, 
1043                              transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1044                              fatal=True, m3u8_id=None): 
1045         manifest = self._download_xml( 
1046             manifest_url, video_id, 'Downloading f4m manifest', 
1047             'Unable to download f4m manifest', 
1048             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests 
1049             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) 
1050             transform_source=transform_source, 
1053         if manifest is False: 
1056         return self._parse_f4m_formats( 
1057             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1058             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) 
1060     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, 
1061                            transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1062                            fatal=True, m3u8_id=None): 
1063         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy 
1064         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') 
1065         if akamai_pv is not None and ';' in akamai_pv.text: 
1066             playerVerificationChallenge = akamai_pv.text.split(';')[0] 
1067             if playerVerificationChallenge.strip() != '': 
1071         manifest_version = '1.0' 
1072         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') 
1074             manifest_version = '2.0' 
1075             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') 
1076         # Remove unsupported DRM protected media from final formats 
1077         # rendition (see https://github.com/rg3/youtube-dl/issues/8573). 
1078         media_nodes = remove_encrypted_media(media_nodes) 
1081         base_url = xpath_text( 
1082             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 
1083             'base URL', default=None) 
1085             base_url = base_url.strip() 
1087         bootstrap_info = xpath_element( 
1088             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 
1089             'bootstrap info', default=None) 
1091         for i, media_el in enumerate(media_nodes): 
1092             tbr = int_or_none(media_el.attrib.get('bitrate')) 
1093             width = int_or_none(media_el.attrib.get('width')) 
1094             height = int_or_none(media_el.attrib.get('height')) 
1095             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) 
1096             # If <bootstrapInfo> is present, the specified f4m is a 
1097             # stream-level manifest, and only set-level manifests may refer to 
1098             # external resources.  See section 11.4 and section 4 of F4M spec 
1099             if bootstrap_info is None: 
1101                 # @href is introduced in 2.0, see section 11.6 of F4M spec 
1102                 if manifest_version == '2.0': 
1103                     media_url = media_el.attrib.get('href') 
1104                 if media_url is None: 
1105                     media_url = media_el.attrib.get('url') 
1109                     media_url if media_url.startswith('http://') or media_url.startswith('https://') 
1110                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) 
1111                 # If media_url is itself a f4m manifest do the recursive extraction 
1112                 # since bitrates in parent manifest (this one) and media_url manifest 
1113                 # may differ leading to inability to resolve the format by requested 
1114                 # bitrate in f4m downloader 
1115                 ext = determine_ext(manifest_url) 
1117                     f4m_formats = self._extract_f4m_formats( 
1118                         manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1119                         transform_source=transform_source, fatal=fatal) 
1120                     # Sometimes stream-level manifest contains single media entry that 
1121                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). 
1122                     # At the same time parent's media entry in set-level manifest may 
1123                     # contain it. We will copy it from parent in such cases. 
1124                     if len(f4m_formats) == 1: 
1127                             'tbr': f.get('tbr') or tbr, 
1128                             'width': f.get('width') or width, 
1129                             'height': f.get('height') or height, 
1130                             'format_id': f.get('format_id') if not tbr else format_id, 
1132                     formats.extend(f4m_formats) 
1135                     formats.extend(self._extract_m3u8_formats( 
1136                         manifest_url, video_id, 'mp4', preference=preference, 
1137                         m3u8_id=m3u8_id, fatal=fatal)) 
1140                 'format_id': format_id, 
1141                 'url': manifest_url, 
1142                 'ext': 'flv' if bootstrap_info is not None else None, 
1146                 'preference': preference, 
1150     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): 
1152             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 
1156             'preference': preference - 100 if preference else -100, 
1157             'resolution': 'multiple', 
1158             'format_note': 'Quality selection URL', 
1161     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, 
1162                               entry_protocol='m3u8', preference=None, 
1163                               m3u8_id=None, note=None, errnote=None, 
1164                               fatal=True, live=False): 
1166         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] 
1168         format_url = lambda u: ( 
1170             if re.match(r'^https?://', u) 
1171             else compat_urlparse.urljoin(m3u8_url, u)) 
1173         res = self._download_webpage_handle( 
1175             note=note or 'Downloading m3u8 information', 
1176             errnote=errnote or 'Failed to download m3u8 information', 
1180         m3u8_doc, urlh = res 
1181         m3u8_url = urlh.geturl() 
1183         # We should try extracting formats only from master playlists [1], i.e. 
1184         # playlists that describe available qualities. On the other hand media 
1185         # playlists [2] should be returned as is since they contain just the media 
1186         # without qualities renditions. 
1187         # Fortunately, master playlist can be easily distinguished from media 
1188         # playlist based on particular tags availability. As of [1, 2] master 
1189         # playlist tags MUST NOT appear in a media playist and vice versa. 
1190         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist 
1191         # and MUST NOT appear in master playlist thus we can clearly detect media 
1192         # playlist with this criterion. 
1193         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 
1194         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 
1195         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 
1196         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is 
1199                 'format_id': m3u8_id, 
1201                 'protocol': entry_protocol, 
1202                 'preference': preference, 
1206         for line in m3u8_doc.splitlines(): 
1207             if line.startswith('#EXT-X-STREAM-INF:'): 
1208                 last_info = parse_m3u8_attributes(line) 
1209             elif line.startswith('#EXT-X-MEDIA:'): 
1210                 last_media = parse_m3u8_attributes(line) 
1211             elif line.startswith('#') or not line.strip(): 
1214                 if last_info is None: 
1215                     formats.append({'url': format_url(line)}) 
1217                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) 
1220                     format_id.append(m3u8_id) 
1221                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None 
1222                 # Despite specification does not mention NAME attribute for 
1223                 # EXT-X-STREAM-INF it still sometimes may be present 
1224                 stream_name = last_info.get('NAME') or last_media_name 
1225                 # Bandwidth of live streams may differ over time thus making 
1226                 # format_id unpredictable. So it's better to keep provided 
1229                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) 
1231                     'format_id': '-'.join(format_id), 
1232                     'url': format_url(line.strip()), 
1235                     'fps': float_or_none(last_info.get('FRAME-RATE')), 
1236                     'protocol': entry_protocol, 
1237                     'preference': preference, 
1239                 resolution = last_info.get('RESOLUTION') 
1241                     width_str, height_str = resolution.split('x') 
1242                     f['width'] = int(width_str) 
1243                     f['height'] = int(height_str) 
1244                 # Unified Streaming Platform 
1246                     r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) 
1248                     abr, vbr = mobj.groups() 
1249                     abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) 
1254                 f.update(parse_codecs(last_info.get('CODECS'))) 
1255                 if last_media is not None: 
1256                     f['m3u8_media'] = last_media 
1263     def _xpath_ns(path, namespace=None): 
1267         for c in path.split('/'): 
1268             if not c or c == '.': 
1271                 out.append('{%s}%s' % (namespace, c)) 
1272         return '/'.join(out) 
1274     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): 
1275         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) 
1281         namespace = self._parse_smil_namespace(smil) 
1283         return self._parse_smil_formats( 
1284             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) 
1286     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): 
1287         smil = self._download_smil(smil_url, video_id, fatal=fatal) 
1290         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) 
1292     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): 
1293         return self._download_xml( 
1294             smil_url, video_id, 'Downloading SMIL file', 
1295             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) 
1297     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): 
1298         namespace = self._parse_smil_namespace(smil) 
1300         formats = self._parse_smil_formats( 
1301             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) 
1302         subtitles = self._parse_smil_subtitles(smil, namespace=namespace) 
1304         video_id = os.path.splitext(url_basename(smil_url))[0] 
1308         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): 
1309             name = meta.attrib.get('name') 
1310             content = meta.attrib.get('content') 
1311             if not name or not content: 
1313             if not title and name == 'title': 
1315             elif not description and name in ('description', 'abstract'): 
1316                 description = content 
1317             elif not upload_date and name == 'date': 
1318                 upload_date = unified_strdate(content) 
1321             'id': image.get('type'), 
1322             'url': image.get('src'), 
1323             'width': int_or_none(image.get('width')), 
1324             'height': int_or_none(image.get('height')), 
1325         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] 
1329             'title': title or video_id, 
1330             'description': description, 
1331             'upload_date': upload_date, 
1332             'thumbnails': thumbnails, 
1334             'subtitles': subtitles, 
1337     def _parse_smil_namespace(self, smil): 
1338         return self._search_regex( 
1339             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) 
1341     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): 
1343         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): 
1344             b = meta.get('base') or meta.get('httpBase') 
1355         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) 
1356         for medium in media: 
1357             src = medium.get('src') 
1358             if not src or src in srcs: 
1362             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) 
1363             filesize = int_or_none(medium.get('size') or medium.get('fileSize')) 
1364             width = int_or_none(medium.get('width')) 
1365             height = int_or_none(medium.get('height')) 
1366             proto = medium.get('proto') 
1367             ext = medium.get('ext') 
1368             src_ext = determine_ext(src) 
1369             streamer = medium.get('streamer') or base 
1371             if proto == 'rtmp' or streamer.startswith('rtmp'): 
1377                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), 
1379                     'filesize': filesize, 
1383                 if transform_rtmp_url: 
1384                     streamer, src = transform_rtmp_url(streamer, src) 
1385                     formats[-1].update({ 
1391             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) 
1392             src_url = src_url.strip() 
1394             if proto == 'm3u8' or src_ext == 'm3u8': 
1395                 m3u8_formats = self._extract_m3u8_formats( 
1396                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) 
1397                 if len(m3u8_formats) == 1: 
1399                     m3u8_formats[0].update({ 
1400                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), 
1405                 formats.extend(m3u8_formats) 
1408             if src_ext == 'f4m': 
1413                         'plugin': 'flowplayer-3.2.0.1', 
1415                 f4m_url += '&' if '?' in f4m_url else '?' 
1416                 f4m_url += compat_urllib_parse_urlencode(f4m_params) 
1417                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) 
1420             if src_url.startswith('http') and self._is_valid_url(src, video_id): 
1424                     'ext': ext or src_ext or 'flv', 
1425                     'format_id': 'http-%d' % (bitrate or http_count), 
1427                     'filesize': filesize, 
1435     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): 
1438         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): 
1439             src = textstream.get('src') 
1440             if not src or src in urls: 
1443             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) 
1444             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang 
1445             subtitles.setdefault(lang, []).append({ 
1451     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): 
1452         xspf = self._download_xml( 
1453             playlist_url, playlist_id, 'Downloading xpsf playlist', 
1454             'Unable to download xspf manifest', fatal=fatal) 
1457         return self._parse_xspf(xspf, playlist_id) 
1459     def _parse_xspf(self, playlist, playlist_id): 
1461             'xspf': 'http://xspf.org/ns/0/', 
1462             's1': 'http://static.streamone.nl/player/ns/0', 
1466         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): 
1468                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) 
1469             description = xpath_text( 
1470                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') 
1471             thumbnail = xpath_text( 
1472                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') 
1473             duration = float_or_none( 
1474                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) 
1477                 'url': location.text, 
1478                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), 
1479                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 
1480                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), 
1481             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] 
1482             self._sort_formats(formats) 
1487                 'description': description, 
1488                 'thumbnail': thumbnail, 
1489                 'duration': duration, 
1494     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): 
1495         res = self._download_webpage_handle( 
1497             note=note or 'Downloading MPD manifest', 
1498             errnote=errnote or 'Failed to download MPD manifest', 
1503         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() 
1505         return self._parse_mpd_formats( 
1506             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) 
1508     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): 
1510         Parse formats from MPD manifest. 
1512          1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), 
1513             http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip 
1514          2. https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP 
1516         if mpd_doc.get('type') == 'dynamic': 
1519         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) 
1522             return self._xpath_ns(path, namespace) 
1524         def is_drm_protected(element): 
1525             return element.find(_add_ns('ContentProtection')) is not None 
1527         def extract_multisegment_info(element, ms_parent_info): 
1528             ms_info = ms_parent_info.copy() 
1529             segment_list = element.find(_add_ns('SegmentList')) 
1530             if segment_list is not None: 
1531                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) 
1533                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] 
1534                 initialization = segment_list.find(_add_ns('Initialization')) 
1535                 if initialization is not None: 
1536                     ms_info['initialization_url'] = initialization.attrib['sourceURL'] 
1538                 segment_template = element.find(_add_ns('SegmentTemplate')) 
1539                 if segment_template is not None: 
1540                     start_number = segment_template.get('startNumber') 
1542                         ms_info['start_number'] = int(start_number) 
1543                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) 
1544                     if segment_timeline is not None: 
1545                         s_e = segment_timeline.findall(_add_ns('S')) 
1547                             ms_info['total_number'] = 0 
1550                                 r = int(s.get('r', 0)) 
1551                                 ms_info['total_number'] += 1 + r 
1552                                 ms_info['s'].append({ 
1553                                     't': int(s.get('t', 0)), 
1554                                     # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) 
1555                                     'd': int(s.attrib['d']), 
1559                         timescale = segment_template.get('timescale') 
1561                             ms_info['timescale'] = int(timescale) 
1562                         segment_duration = segment_template.get('duration') 
1563                         if segment_duration: 
1564                             ms_info['segment_duration'] = int(segment_duration) 
1565                     media_template = segment_template.get('media') 
1567                         ms_info['media_template'] = media_template 
1568                     initialization = segment_template.get('initialization') 
1570                         ms_info['initialization_url'] = initialization 
1572                         initialization = segment_template.find(_add_ns('Initialization')) 
1573                         if initialization is not None: 
1574                             ms_info['initialization_url'] = initialization.attrib['sourceURL'] 
1577         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) 
1579         for period in mpd_doc.findall(_add_ns('Period')): 
1580             period_duration = parse_duration(period.get('duration')) or mpd_duration 
1581             period_ms_info = extract_multisegment_info(period, { 
1585             for adaptation_set in period.findall(_add_ns('AdaptationSet')): 
1586                 if is_drm_protected(adaptation_set): 
1588                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) 
1589                 for representation in adaptation_set.findall(_add_ns('Representation')): 
1590                     if is_drm_protected(representation): 
1592                     representation_attrib = adaptation_set.attrib.copy() 
1593                     representation_attrib.update(representation.attrib) 
1594                     # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory 
1595                     mime_type = representation_attrib['mimeType'] 
1596                     content_type = mime_type.split('/')[0] 
1597                     if content_type == 'text': 
1598                         # TODO implement WebVTT downloading 
1600                     elif content_type == 'video' or content_type == 'audio': 
1602                         for element in (representation, adaptation_set, period, mpd_doc): 
1603                             base_url_e = element.find(_add_ns('BaseURL')) 
1604                             if base_url_e is not None: 
1605                                 base_url = base_url_e.text + base_url 
1606                                 if re.match(r'^https?://', base_url): 
1608                         if mpd_base_url and not re.match(r'^https?://', base_url): 
1609                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'): 
1611                             base_url = mpd_base_url + base_url 
1612                         representation_id = representation_attrib.get('id') 
1613                         lang = representation_attrib.get('lang') 
1614                         url_el = representation.find(_add_ns('BaseURL')) 
1615                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) 
1617                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 
1619                             'ext': mimetype2ext(mime_type), 
1620                             'width': int_or_none(representation_attrib.get('width')), 
1621                             'height': int_or_none(representation_attrib.get('height')), 
1622                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), 
1623                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 
1624                             'fps': int_or_none(representation_attrib.get('frameRate')), 
1625                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), 
1626                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 
1627                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 
1628                             'format_note': 'DASH %s' % content_type, 
1629                             'filesize': filesize, 
1631                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) 
1632                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: 
1633                             if 'total_number' not in representation_ms_info and 'segment_duration': 
1634                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) 
1635                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) 
1636                             media_template = representation_ms_info['media_template'] 
1637                             media_template = media_template.replace('$RepresentationID$', representation_id) 
1638                             media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) 
1639                             media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) 
1640                             media_template.replace('$$', '$') 
1642                             # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ 
1643                             # can't be used at the same time 
1644                             if '%(Number' in media_template: 
1645                                 representation_ms_info['segment_urls'] = [ 
1647                                         'Number': segment_number, 
1648                                         'Bandwidth': representation_attrib.get('bandwidth'), 
1650                                     for segment_number in range( 
1651                                         representation_ms_info['start_number'], 
1652                                         representation_ms_info['total_number'] + representation_ms_info['start_number'])] 
1654                                 representation_ms_info['segment_urls'] = [] 
1657                                 def add_segment_url(): 
1658                                     representation_ms_info['segment_urls'].append( 
1660                                             'Time': segment_time, 
1661                                             'Bandwidth': representation_attrib.get('bandwidth'), 
1665                                 for num, s in enumerate(representation_ms_info['s']): 
1666                                     segment_time = s.get('t') or segment_time 
1668                                     for r in range(s.get('r', 0)): 
1669                                         segment_time += s['d'] 
1671                                     segment_time += s['d'] 
1672                         if 'segment_urls' in representation_ms_info: 
1674                                 'segment_urls': representation_ms_info['segment_urls'], 
1675                                 'protocol': 'http_dash_segments', 
1677                             if 'initialization_url' in representation_ms_info: 
1678                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) 
1680                                     'initialization_url': initialization_url, 
1682                                 if not f.get('url'): 
1683                                     f['url'] = initialization_url 
1685                             existing_format = next( 
1686                                 fo for fo in formats 
1687                                 if fo['format_id'] == representation_id) 
1688                         except StopIteration: 
1689                             full_info = formats_dict.get(representation_id, {}).copy() 
1691                             formats.append(full_info) 
1693                             existing_format.update(f) 
1695                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) 
1698     def _parse_html5_media_entries(self, base_url, webpage): 
1699         def absolute_url(video_url): 
1700             return compat_urlparse.urljoin(base_url, video_url) 
1702         def parse_content_type(content_type): 
1703             if not content_type: 
1705             ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?
(?P
<codecs
>[^
"]+))?', content_type) 
1707                 mimetype, codecs = ctr.groups() 
1708                 f = parse_codecs(codecs) 
1709                 f['ext'] = mimetype2ext(mimetype) 
1714         for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): 
1719             media_attributes = extract_attributes(media_tag) 
1720             src = media_attributes.get('src') 
1722                 media_info['formats'].append({ 
1723                     'url': absolute_url(src), 
1724                     'vcodec': 'none' if media_type == 'audio' else None, 
1726             media_info['thumbnail'] = media_attributes.get('poster') 
1728                 for source_tag in re.findall(r'<source[^>]+>', media_content): 
1729                     source_attributes = extract_attributes(source_tag) 
1730                     src = source_attributes.get('src') 
1733                     f = parse_content_type(source_attributes.get('type')) 
1735                         'url': absolute_url(src), 
1736                         'vcodec': 'none' if media_type == 'audio' else None, 
1738                     media_info['formats'].append(f) 
1739                 for track_tag in re.findall(r'<track[^>]+>', media_content): 
1740                     track_attributes = extract_attributes(track_tag) 
1741                     kind = track_attributes.get('kind') 
1742                     if not kind or kind == 'subtitles': 
1743                         src = track_attributes.get('src') 
1746                         lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') 
1747                         media_info['subtitles'].setdefault(lang, []).append({ 
1748                             'url': absolute_url(src), 
1750             if media_info['formats']: 
1751                 entries.append(media_info) 
1754     def _live_title(self, name): 
1755         """ Generate the title for a live video """ 
1756         now = datetime.datetime.now() 
1757         now_str = now.strftime('%Y-%m-%d %H:%M') 
1758         return name + ' ' + now_str 
1760     def _int(self, v, name, fatal=False, **kwargs): 
1761         res = int_or_none(v, **kwargs) 
1762         if 'get_attr' in kwargs: 
1763             print(getattr(v, kwargs['get_attr'])) 
1765             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
1767                 raise ExtractorError(msg) 
1769                 self._downloader.report_warning(msg) 
1772     def _float(self, v, name, fatal=False, **kwargs): 
1773         res = float_or_none(v, **kwargs) 
1775             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
1777                 raise ExtractorError(msg) 
1779                 self._downloader.report_warning(msg) 
1782     def _set_cookie(self, domain, name, value, expire_time=None): 
1783         cookie = compat_cookiejar.Cookie( 
1784             0, name, value, None, None, domain, None, 
1785             None, '/', True, False, expire_time, '', None, None, None) 
1786         self._downloader.cookiejar.set_cookie(cookie) 
1788     def _get_cookies(self, url): 
1789         """ Return a compat_cookies.SimpleCookie with the cookies for the url """ 
1790         req = sanitized_Request(url) 
1791         self._downloader.cookiejar.add_cookie_header(req) 
1792         return compat_cookies.SimpleCookie(req.get_header('Cookie')) 
1794     def get_testcases(self, include_onlymatching=False): 
1795         t = getattr(self, '_TEST', None) 
1797             assert not hasattr(self, '_TESTS'), \ 
1798                 '%s has _TEST and _TESTS' % type(self).__name__ 
1801             tests = getattr(self, '_TESTS', []) 
1803             if not include_onlymatching and t.get('only_matching', False): 
1805             t['name'] = type(self).__name__[:-len('IE')] 
1808     def is_suitable(self, age_limit): 
1809         """ Test whether the extractor is generally suitable for the given 
1810         age limit (i.e. pornographic sites are not, all others usually are) """ 
1812         any_restricted = False 
1813         for tc in self.get_testcases(include_onlymatching=False): 
1814             if tc.get('playlist', []): 
1815                 tc = tc['playlist'][0] 
1816             is_restricted = age_restricted( 
1817                 tc.get('info_dict', {}).get('age_limit'), age_limit) 
1818             if not is_restricted: 
1820             any_restricted = any_restricted or is_restricted 
1821         return not any_restricted 
1823     def extract_subtitles(self, *args, **kwargs): 
1824         if (self._downloader.params.get('writesubtitles', False) or 
1825                 self._downloader.params.get('listsubtitles')): 
1826             return self._get_subtitles(*args, **kwargs) 
1829     def _get_subtitles(self, *args, **kwargs): 
1830         raise NotImplementedError('This method must be implemented by subclasses') 
1833     def _merge_subtitle_items(subtitle_list1, subtitle_list2): 
1834         """ Merge subtitle items for one language. Items with duplicated URLs 
1835         will be dropped. """ 
1836         list1_urls = set([item['url'] for item in subtitle_list1]) 
1837         ret = list(subtitle_list1) 
1838         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) 
1842     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): 
1843         """ Merge two subtitle dictionaries, language by language. """ 
1844         ret = dict(subtitle_dict1) 
1845         for lang in subtitle_dict2: 
1846             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) 
1849     def extract_automatic_captions(self, *args, **kwargs): 
1850         if (self._downloader.params.get('writeautomaticsub', False) or 
1851                 self._downloader.params.get('listsubtitles')): 
1852             return self._get_automatic_captions(*args, **kwargs) 
1855     def _get_automatic_captions(self, *args, **kwargs): 
1856         raise NotImplementedError('This method must be implemented by subclasses') 
1858     def mark_watched(self, *args, **kwargs): 
1859         if (self._downloader.params.get('mark_watched', False) and 
1860                 (self._get_login_info()[0] is not None or 
1861                     self._downloader.params.get('cookiefile') is not None)): 
1862             self._mark_watched(*args, **kwargs) 
1864     def _mark_watched(self, *args, **kwargs): 
1865         raise NotImplementedError('This method must be implemented by subclasses') 
1867     def geo_verification_headers(self): 
1869         geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') 
1870         if geo_verification_proxy: 
1871             headers['Ytdl-request-proxy'] = geo_verification_proxy 
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # Matches '<key>:<query>', '<key>N:<query>' (N >= 1) or '<key>all:<query>'.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Dispatch the search query to _get_n_results with the right count."""
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            # Bare '<key>:' means a single result.
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp to the extractor's maximum rather than failing.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY