1 from __future__ 
import unicode_literals
 
  15 from ..compat 
import ( 
  18     compat_etree_fromstring
, 
  24     compat_urllib_parse_urlencode
, 
  25     compat_urllib_request
, 
  28 from ..downloader
.f4m 
import remove_encrypted_media
 
  56     parse_m3u8_attributes
, 
  60 class InfoExtractor(object): 
  61     """Information Extractor class. 
  63     Information extractors are the classes that, given a URL, extract 
  64     information about the video (or videos) the URL refers to. This 
  65     information includes the real video URL, the video title, author and 
  66     others. The information is stored in a dictionary which is then 
  67     passed to the YoutubeDL. The YoutubeDL processes this 
  68     information possibly downloading the video to the file system, among 
  69     other possible outcomes. 
  71     The type field determines the type of the result. 
  72     By far the most common value (and the default if _type is missing) is 
  73     "video", which indicates a single video. 
  75     For a video, the dictionaries must include the following fields: 
  78     title:          Video title, unescaped. 
  80     Additionally, it must contain either a formats entry or a url one: 
  82     formats:        A list of dictionaries for each format available, ordered 
  83                     from worst to best quality. 
  86                     * url        Mandatory. The URL of the video file 
  87                     * ext        Will be calculated from URL if missing 
  88                     * format     A human-readable description of the format 
  89                                  ("mp4 container with h264/opus"). 
  90                                  Calculated from the format_id, width, height. 
  91                                  and format_note fields if missing. 
  92                     * format_id  A short description of the format 
  93                                  ("mp4_h264_opus" or "19"). 
  94                                 Technically optional, but strongly recommended. 
  95                     * format_note Additional info about the format 
  96                                  ("3D" or "DASH video") 
  97                     * width      Width of the video, if known 
  98                     * height     Height of the video, if known 
  99                     * resolution Textual description of width and height 
 100                     * tbr        Average bitrate of audio and video in KBit/s 
 101                     * abr        Average audio bitrate in KBit/s 
 102                     * acodec     Name of the audio codec in use 
 103                     * asr        Audio sampling rate in Hertz 
 104                     * vbr        Average video bitrate in KBit/s 
 106                     * vcodec     Name of the video codec in use 
 107                     * container  Name of the container format 
 108                     * filesize   The number of bytes, if known in advance 
 109                     * filesize_approx  An estimate for the number of bytes 
 110                     * player_url SWF Player URL (used for rtmpdump). 
 111                     * protocol   The protocol that will be used for the actual 
 112                                  download, lower-case. 
 113                                  "http", "https", "rtsp", "rtmp", "rtmpe", 
 114                                  "m3u8", "m3u8_native" or "http_dash_segments". 
 115                     * preference Order number of this format. If this field is 
 116                                  present and not None, the formats get sorted 
 117                                  by this field, regardless of all other values. 
 118                                  -1 for default (order by other properties), 
 119                                  -2 or smaller for less than default. 
 120                                  < -1000 to hide the format (if there is 
 121                                     another one which is strictly better) 
 122                     * language   Language code, e.g. "de" or "en-US". 
 123                     * language_preference  Is this in the language mentioned in 
 125                                  10 if it's what the URL is about, 
 126                                  -1 for default (don't know), 
 127                                  -10 otherwise, other values reserved for now. 
 128                     * quality    Order number of the video quality of this 
 129                                  format, irrespective of the file format. 
 130                                  -1 for default (order by other properties), 
 131                                  -2 or smaller for less than default. 
 132                     * source_preference  Order number for this video source 
 133                                   (quality takes higher priority) 
 134                                  -1 for default (order by other properties), 
 135                                  -2 or smaller for less than default. 
 136                     * http_headers  A dictionary of additional HTTP headers 
 137                                  to add to the request. 
 138                     * stretched_ratio  If given and not 1, indicates that the 
 139                                  video's pixels are not square. 
 140                                  width : height ratio as float. 
 141                     * no_resume  The server does not support resuming the 
 142                                  (HTTP or RTMP) download. Boolean. 
 144     url:            Final video URL. 
 145     ext:            Video filename extension. 
 146     format:         The video format, defaults to ext (used for --get-format) 
 147     player_url:     SWF Player URL (used for rtmpdump). 
 149     The following fields are optional: 
 151     alt_title:      A secondary title of the video. 
 152     display_id      An alternative identifier for the video, not necessarily 
 153                     unique, but available before title. Typically, id is 
 154                     something like "4234987", title "Dancing naked mole rats", 
 155                     and display_id "dancing-naked-mole-rats" 
 156     thumbnails:     A list of dictionaries, with the following entries: 
 157                         * "id" (optional, string) - Thumbnail format ID 
 159                         * "preference" (optional, int) - quality of the image 
 160                         * "width" (optional, int) 
 161                         * "height" (optional, int) 
 162                         * "resolution" (optional, string "{width}x{height"}, 
 164     thumbnail:      Full URL to a video thumbnail image. 
 165     description:    Full video description. 
 166     uploader:       Full name of the video uploader. 
 167     license:        License name the video is licensed under. 
 168     creator:        The creator of the video. 
 169     release_date:   The date (YYYYMMDD) when the video was released. 
 170     timestamp:      UNIX timestamp of the moment the video became available. 
 171     upload_date:    Video upload date (YYYYMMDD). 
 172                     If not explicitly set, calculated from timestamp. 
 173     uploader_id:    Nickname or id of the video uploader. 
 174     uploader_url:   Full URL to a personal webpage of the video uploader. 
 175     location:       Physical location where the video was filmed. 
 176     subtitles:      The available subtitles as a dictionary in the format 
 177                     {language: subformats}. "subformats" is a list sorted from 
 178                     lower to higher preference, each element is a dictionary 
 179                     with the "ext" entry and one of: 
 180                         * "data": The subtitles file contents 
 181                         * "url": A URL pointing to the subtitles file 
 182                     "ext" will be calculated from URL if missing 
 183     automatic_captions: Like 'subtitles', used by the YoutubeIE for 
 184                     automatically generated captions 
 185     duration:       Length of the video in seconds, as an integer or float. 
 186     view_count:     How many users have watched the video on the platform. 
 187     like_count:     Number of positive ratings of the video 
 188     dislike_count:  Number of negative ratings of the video 
 189     repost_count:   Number of reposts of the video 
 190     average_rating: Average rating give by users, the scale used depends on the webpage 
 191     comment_count:  Number of comments on the video 
 192     comments:       A list of comments, each with one or more of the following 
 193                     properties (all but one of text or html optional): 
 194                         * "author" - human-readable name of the comment author 
 195                         * "author_id" - user ID of the comment author 
 197                         * "html" - Comment as HTML 
 198                         * "text" - Plain text of the comment 
 199                         * "timestamp" - UNIX timestamp of comment 
 200                         * "parent" - ID of the comment this one is replying to. 
 201                                      Set to "root" to indicate that this is a 
 202                                      comment to the original video. 
 203     age_limit:      Age restriction for the video, as an integer (years) 
 204     webpage_url:    The URL to the video webpage, if given to youtube-dl it 
 205                     should allow to get the same result again. (It will be set 
 206                     by YoutubeDL if it's missing) 
 207     categories:     A list of categories that the video falls in, for example 
 209     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"] 
 210     is_live:        True, False, or None (=unknown). Whether this video is a 
 211                     live stream that goes on instead of a fixed-length video. 
 212     start_time:     Time in seconds where the reproduction should start, as 
 213                     specified in the URL. 
 214     end_time:       Time in seconds where the reproduction should end, as 
 215                     specified in the URL. 
 217     The following fields should only be used when the video belongs to some logical 
 220     chapter:        Name or title of the chapter the video belongs to. 
 221     chapter_number: Number of the chapter the video belongs to, as an integer. 
 222     chapter_id:     Id of the chapter the video belongs to, as a unicode string. 
 224     The following fields should only be used when the video is an episode of some 
 227     series:         Title of the series or programme the video episode belongs to. 
 228     season:         Title of the season the video episode belongs to. 
 229     season_number:  Number of the season the video episode belongs to, as an integer. 
 230     season_id:      Id of the season the video episode belongs to, as a unicode string. 
 231     episode:        Title of the video episode. Unlike mandatory video title field, 
 232                     this field should denote the exact title of the video episode 
 233                     without any kind of decoration. 
 234     episode_number: Number of the video episode within a season, as an integer. 
 235     episode_id:     Id of the video episode, as a unicode string. 
 237     The following fields should only be used when the media is a track or a part of 
 240     track:          Title of the track. 
 241     track_number:   Number of the track within an album or a disc, as an integer. 
 242     track_id:       Id of the track (useful in case of custom indexing, e.g. 6.iii), 
 244     artist:         Artist(s) of the track. 
 245     genre:          Genre(s) of the track. 
 246     album:          Title of the album the track belongs to. 
 247     album_type:     Type of the album (e.g. "Demo", "Full-length", "Split", "Compilation", etc). 
 248     album_artist:   List of all artists appeared on the album (e.g. 
 249                     "Ash Borer / Fell Voices" or "Various Artists", useful for splits 
 251     disc_number:    Number of the disc or other physical medium the track belongs to, 
 253     release_year:   Year (YYYY) when the album was released. 
 255     Unless mentioned otherwise, the fields should be Unicode strings. 
 257     Unless mentioned otherwise, None is equivalent to absence of information. 
 260     _type "playlist" indicates multiple videos. 
 261     There must be a key "entries", which is a list, an iterable, or a PagedList 
 262     object, each element of which is a valid dictionary by this specification. 
 264     Additionally, playlists can have "title", "description" and "id" attributes 
 265     with the same semantics as videos (see above). 
 268     _type "multi_video" indicates that there are multiple videos that 
 269     form a single show, for examples multiple acts of an opera or TV episode. 
 270     It must have an entries key like a playlist and contain all the keys 
 271     required for a video at the same time. 
 274     _type "url" indicates that the video must be extracted from another 
 275     location, possibly by a different extractor. Its only required key is: 
 276     "url" - the next URL to extract. 
 277     The key "ie_key" can be set to the class name (minus the trailing "IE", 
 278     e.g. "Youtube") if the extractor class is known in advance. 
 279     Additionally, the dictionary may have any properties of the resolved entity 
 280     known in advance, for example "title" if the title of the referred video is 
 284     _type "url_transparent" entities have the same specification as "url", but 
 285     indicate that the given additional information is more precise than the one 
 286     associated with the resolved URL. 
 287     This is useful when a site employs a video service that hosts the video and 
 288     its technical metadata, but that video service does not embed a useful 
 289     title, description etc. 
 292     Subclasses of this one should re-define the _real_initialize() and 
 293     _real_extract() methods and define a _VALID_URL regexp. 
 294     Probably, they should also be added to the list of extractors. 
 296     Finally, the _WORKING attribute should be set to False for broken IEs 
 297     in order to warn the users and skip the tests. 
 304     def __init__(self
, downloader
=None): 
 305         """Constructor. Receives an optional downloader.""" 
 307         self
.set_downloader(downloader
) 
 310     def suitable(cls
, url
): 
 311         """Receives a URL and returns True if suitable for this IE.""" 
 313         # This does not use has/getattr intentionally - we want to know whether 
 314         # we have cached the regexp for *this* class, whereas getattr would also 
 315         # match the superclass 
 316         if '_VALID_URL_RE' not in cls
.__dict
__: 
 317             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 318         return cls
._VALID
_URL
_RE
.match(url
) is not None 
 321     def _match_id(cls
, url
): 
 322         if '_VALID_URL_RE' not in cls
.__dict
__: 
 323             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 324         m 
= cls
._VALID
_URL
_RE
.match(url
) 
 330         """Getter method for _WORKING.""" 
 333     def initialize(self
): 
 334         """Initializes an instance (authentication, etc).""" 
 336             self
._real
_initialize
() 
 339     def extract(self
, url
): 
 340         """Extracts URL information and returns it in list of dicts.""" 
 343             return self
._real
_extract
(url
) 
 344         except ExtractorError
: 
 346         except compat_http_client
.IncompleteRead 
as e
: 
 347             raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True) 
 348         except (KeyError, StopIteration) as e
: 
 349             raise ExtractorError('An extractor error has occurred.', cause
=e
) 
 351     def set_downloader(self
, downloader
): 
 352         """Sets the downloader for this IE.""" 
 353         self
._downloader 
= downloader
 
 355     def _real_initialize(self
): 
 356         """Real initialization process. Redefine in subclasses.""" 
 359     def _real_extract(self
, url
): 
 360         """Real extraction process. Redefine in subclasses.""" 
 365         """A string for getting the InfoExtractor with get_info_extractor""" 
 366         return compat_str(cls
.__name
__[:-2]) 
 370         return compat_str(type(self
).__name
__[:-2]) 
 372     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, data
=None, headers
={}, query
={}): 
 373         """ Returns the response handle """ 
 375             self
.report_download_webpage(video_id
) 
 376         elif note 
is not False: 
 378                 self
.to_screen('%s' % (note
,)) 
 380                 self
.to_screen('%s: %s' % (video_id
, note
)) 
 381         if isinstance(url_or_request
, compat_urllib_request
.Request
): 
 382             url_or_request 
= update_Request( 
 383                 url_or_request
, data
=data
, headers
=headers
, query
=query
) 
 386                 url_or_request 
= update_url_query(url_or_request
, query
) 
 387             if data 
is not None or headers
: 
 388                 url_or_request 
= sanitized_Request(url_or_request
, data
, headers
) 
 390             return self
._downloader
.urlopen(url_or_request
) 
 391         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 395                 errnote 
= 'Unable to download webpage' 
 397             errmsg 
= '%s: %s' % (errnote
, error_to_compat_str(err
)) 
 399                 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
) 
 401                 self
._downloader
.report_warning(errmsg
) 
 404     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None, data
=None, headers
={}, query
={}): 
 405         """ Returns a tuple (page content as string, URL handle) """ 
 406         # Strip hashes from the URL (#1038) 
 407         if isinstance(url_or_request
, (compat_str
, str)): 
 408             url_or_request 
= url_or_request
.partition('#')[0] 
 410         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
, data
=data
, headers
=headers
, query
=query
) 
 414         content 
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
) 
 415         return (content
, urlh
) 
 418     def _guess_encoding_from_content(content_type
, webpage_bytes
): 
 419         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 421             encoding 
= m
.group(1) 
 423             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 424                           webpage_bytes[:1024]) 
 426                 encoding = m.group(1).decode('ascii') 
 427             elif webpage_bytes.startswith(b'\xff\xfe'): 
 434     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 
 435         content_type = urlh.headers.get('Content-Type', '') 
 436         webpage_bytes = urlh.read() 
 437         if prefix is not None: 
 438             webpage_bytes = prefix + webpage_bytes 
 440             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 
 441         if self._downloader.params.get('dump_intermediate_pages', False): 
 443                 url = url_or_request.get_full_url() 
 444             except AttributeError: 
 446             self.to_screen('Dumping request to ' + url) 
 447             dump = base64.b64encode(webpage_bytes).decode('ascii') 
 448             self._downloader.to_screen(dump) 
 449         if self._downloader.params.get('write_pages', False): 
 451                 url = url_or_request.get_full_url() 
 452             except AttributeError: 
 454             basen = '%s_%s' % (video_id, url) 
 456                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() 
 457                 basen = basen[:240 - len(h)] + h 
 458             raw_filename = basen + '.dump' 
 459             filename = sanitize_filename(raw_filename, restricted=True) 
 460             self.to_screen('Saving request to ' + filename) 
 461             # Working around MAX_PATH limitation on Windows (see 
 462             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 
 463             if compat_os_name == 'nt': 
 464                 absfilepath = os.path.abspath(filename) 
 465                 if len(absfilepath) > 259: 
 466                     filename = '\\\\?\\' + absfilepath 
 467             with open(filename, 'wb') as outf: 
 468                 outf.write(webpage_bytes) 
 471             content = webpage_bytes.decode(encoding, 'replace') 
 473             content = webpage_bytes.decode('utf-8', 'replace') 
 475         if ('<title>Access to this site is blocked</title>' in content and 
 476                 'Websense' in content[:512]): 
 477             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
 478             blocked_iframe = self._html_search_regex( 
 479                 r'<iframe src="([^
"]+)"', content, 
 480                 'Websense information URL
', default=None) 
 482                 msg += ' Visit 
%s for more details
' % blocked_iframe 
 483             raise ExtractorError(msg, expected=True) 
 484         if '<title
>The URL you requested has been blocked
</title
>' in content[:512]: 
 486                 'Access to this webpage has been blocked by Indian censorship
. ' 
 487                 'Use a VPN 
or proxy 
server (with --proxy
) to route around it
.') 
 488             block_msg = self._html_search_regex( 
 489                 r'</h1
><p
>(.*?
)</p
>', 
 490                 content, 'block message
', default=None) 
 492                 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ') 
 493             raise ExtractorError(msg, expected=True) 
 497     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None, data=None, headers={}, query={}): 
 498         """ Returns the data of the page as a string """ 
 501         while success is False: 
 503                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding, data=data, headers=headers, query=query) 
 505             except compat_http_client.IncompleteRead as e: 
 507                 if try_count >= tries: 
 509                 self._sleep(timeout, video_id) 
 516     def _download_xml(self, url_or_request, video_id, 
 517                       note='Downloading XML
', errnote='Unable to download XML
', 
 518                       transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}): 
 519         """Return the xml as an xml.etree.ElementTree.Element""" 
 520         xml_string = self._download_webpage( 
 521             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query) 
 522         if xml_string is False: 
 525             xml_string = transform_source(xml_string) 
 526         return compat_etree_fromstring(xml_string.encode('utf
-8')) 
 528     def _download_json(self, url_or_request, video_id, 
 529                        note='Downloading JSON metadata
', 
 530                        errnote='Unable to download JSON metadata
', 
 531                        transform_source=None, 
 532                        fatal=True, encoding=None, data=None, headers={}, query={}): 
 533         json_string = self._download_webpage( 
 534             url_or_request, video_id, note, errnote, fatal=fatal, 
 535             encoding=encoding, data=data, headers=headers, query=query) 
 536         if (not fatal) and json_string is False: 
 538         return self._parse_json( 
 539             json_string, video_id, transform_source=transform_source, fatal=fatal) 
 541     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): 
 543             json_string = transform_source(json_string) 
 545             return json.loads(json_string) 
 546         except ValueError as ve: 
 547             errmsg = '%s: Failed to parse JSON 
' % video_id 
 549                 raise ExtractorError(errmsg, cause=ve) 
 551                 self.report_warning(errmsg + str(ve)) 
 553     def report_warning(self, msg, video_id=None): 
 554         idstr = '' if video_id is None else '%s: ' % video_id 
 555         self._downloader.report_warning( 
 556             '[%s] %s%s' % (self.IE_NAME, idstr, msg)) 
 558     def to_screen(self, msg): 
 559         """Print msg to screen, prefixing it with '[ie_name
]'""" 
 560         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) 
 562     def report_extraction(self, id_or_name): 
 563         """Report information extraction.""" 
 564         self.to_screen('%s: Extracting information
' % id_or_name) 
 566     def report_download_webpage(self, video_id): 
 567         """Report webpage download.""" 
 568         self.to_screen('%s: Downloading webpage
' % video_id) 
 570     def report_age_confirmation(self): 
 571         """Report attempt to confirm age.""" 
 572         self.to_screen('Confirming age
') 
 574     def report_login(self): 
 575         """Report attempt to log in.""" 
 576         self.to_screen('Logging 
in') 
 579     def raise_login_required(msg='This video 
is only available 
for registered users
'): 
 580         raise ExtractorError( 
 581             '%s. Use 
--username 
and --password 
or --netrc to provide account credentials
.' % msg, 
 585     def raise_geo_restricted(msg='This video 
is not available 
from your location due to geo restriction
'): 
 586         raise ExtractorError( 
 587             '%s. You might want to use 
--proxy to workaround
.' % msg, 
    # Methods for following #608
 592     def url_result(url, ie=None, video_id=None, video_title=None): 
 593         """Returns a URL that points to a page that should be processed""" 
 594         # TODO: ie should be the class used for getting the info 
 595         video_info = {'_type
': 'url
', 
 598         if video_id is not None: 
 599             video_info['id'] = video_id 
 600         if video_title is not None: 
 601             video_info['title
'] = video_title 
 605     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 
 606         """Returns a playlist""" 
 607         video_info = {'_type
': 'playlist
', 
 610             video_info['id'] = playlist_id 
 612             video_info['title
'] = playlist_title 
 613         if playlist_description: 
 614             video_info['description
'] = playlist_description 
 617     def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 619         Perform a regex search on the given string, using a single or a list of 
 620         patterns returning the first matching group. 
 621         In case of failure return a default value or raise a WARNING or a 
 622         RegexNotFoundError, depending on fatal, specifying the field name. 
 624         if isinstance(pattern, (str, compat_str, compiled_regex_type)): 
 625             mobj = re.search(pattern, string, flags) 
 628                 mobj = re.search(p, string, flags) 
 632         if not self._downloader.params.get('no_color
') and compat_os_name != 'nt
' and sys.stderr.isatty(): 
 633             _name = '\033[0;34m
%s\033[0m
' % name 
 639                 # return the first matching group 
 640                 return next(g for g in mobj.groups() if g is not None) 
 642                 return mobj.group(group) 
 643         elif default is not NO_DEFAULT: 
 646             raise RegexNotFoundError('Unable to extract 
%s' % _name) 
 648             self._downloader.report_warning('unable to extract 
%s' % _name + bug_reports_message()) 
 651     def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): 
 653         Like _search_regex, but strips HTML tags and unescapes entities. 
 655         res = self._search_regex(pattern, string, name, default, fatal, flags, group) 
 657             return clean_html(res).strip() 
 661     def _get_login_info(self): 
 663         Get the login info as (username, password) 
 664         It will look in the netrc file using the _NETRC_MACHINE value 
 665         If there's no info available
, return (None, None) 
 667         if self._downloader is None: 
 672         downloader_params = self._downloader.params 
 674         # Attempt to use provided username and password or .netrc data 
 675         if downloader_params.get('username') is not None: 
 676             username = downloader_params['username'] 
 677             password = downloader_params['password'] 
 678         elif downloader_params.get('usenetrc', False): 
 680                 info = netrc.netrc().authenticators(self._NETRC_MACHINE) 
 685                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) 
 686             except (IOError, netrc.NetrcParseError) as err: 
 687                 self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) 
 689         return (username, password) 
 691     def _get_tfa_info(self, note='two-factor verification code'): 
 693         Get the two
-factor authentication info
 
 694         TODO 
- asking the user will be required 
for sms
/phone verify
 
 695         currently just uses the command line option
 
 696         If there
's no info available, return None 
 698         if self._downloader is None: 
 700         downloader_params = self._downloader.params 
 702         if downloader_params.get('twofactor
') is not None: 
 703             return downloader_params['twofactor
'] 
 705         return compat_getpass('Type 
%s and press 
[Return
]: ' % note) 
    # Helper functions for extracting OpenGraph info
 709     def _og_regexes(prop): 
 710         content_re = r'content
=(?
:"([^"]+?
)"|\'([^\']+?)\'|\s*([^\s"\'=<>`
]+?
))' 
 711         property_re = (r'(?
:name|
property)=(?
:\'og
:%(prop)s\'|
"og:%(prop)s"|\s
*og
:%(prop)s\b)' 
 712                        % {'prop
': re.escape(prop)}) 
 713         template = r'<meta
[^
>]+?
%s[^
>]+?
%s' 
 715             template % (property_re, content_re), 
 716             template % (content_re, property_re), 
 720     def _meta_regex(prop): 
 721         return r'''(?isx)<meta 
 722                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
 723                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) 
 725     def _og_search_property(self, prop, html, name=None, **kargs): 
 727             name = 'OpenGraph 
%s' % prop 
 728         escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) 
 731         return unescapeHTML(escaped) 
 733     def _og_search_thumbnail(self, html, **kargs): 
 734         return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs) 
 736     def _og_search_description(self, html, **kargs): 
 737         return self._og_search_property('description
', html, fatal=False, **kargs) 
 739     def _og_search_title(self, html, **kargs): 
 740         return self._og_search_property('title
', html, **kargs) 
 742     def _og_search_video_url(self, html, name='video url
', secure=True, **kargs): 
 743         regexes = self._og_regexes('video
') + self._og_regexes('video
:url
') 
 745             regexes = self._og_regexes('video
:secure_url
') + regexes 
 746         return self._html_search_regex(regexes, html, name, **kargs) 
 748     def _og_search_url(self, html, **kargs): 
 749         return self._og_search_property('url
', html, **kargs) 
 751     def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): 
 752         if display_name is None: 
 754         return self._html_search_regex( 
 755             self._meta_regex(name), 
 756             html, display_name, fatal=fatal, group='content
', **kwargs) 
 758     def _dc_search_uploader(self, html): 
 759         return self._html_search_meta('dc
.creator
', html, 'uploader
') 
 761     def _rta_search(self, html): 
 762         # See http://www.rtalabel.org/index.php?content=howtofaq#single 
 763         if re.search(r'(?ix
)<meta\s
+name
="rating"\s
+' 
 764                      r'     content
="RTA-5042-1996-1400-1577-RTA"', 
 769     def _media_rating_search(self, html): 
 770         # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/ 
 771         rating = self._html_search_meta('rating
', html) 
 783         return RATING_TABLE.get(rating.lower()) 
 785     def _family_friendly_search(self, html): 
 786         # See http://schema.org/VideoObject 
 787         family_friendly = self._html_search_meta('isFamilyFriendly
', html) 
 789         if not family_friendly: 
 798         return RATING_TABLE.get(family_friendly.lower()) 
 800     def _twitter_search_player(self, html): 
 801         return self._html_search_meta('twitter
:player
', html, 
 802                                       'twitter card player
') 
 804     def _search_json_ld(self, html, video_id, **kwargs): 
 805         json_ld = self._search_regex( 
 806             r'(?s
)<script
[^
>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', 
 807             html, 'JSON-LD', group='json_ld', **kwargs) 
 810         return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) 
 812     def _json_ld(self, json_ld, video_id, fatal=True): 
 813         if isinstance(json_ld, compat_str): 
 814             json_ld = self._parse_json(json_ld, video_id, fatal=fatal) 
 818         if json_ld.get('@context') == 'http://schema.org': 
 819             item_type = json_ld.get('@type') 
 820             if item_type == 'TVEpisode': 
 822                     'episode': unescapeHTML(json_ld.get('name')), 
 823                     'episode_number': int_or_none(json_ld.get('episodeNumber')), 
 824                     'description': unescapeHTML(json_ld.get('description')), 
 826                 part_of_season = json_ld.get('partOfSeason') 
 827                 if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': 
 828                     info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) 
 829                 part_of_series = json_ld.get('partOfSeries') 
 830                 if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': 
 831                     info['series'] = unescapeHTML(part_of_series.get('name')) 
 832             elif item_type == 'Article': 
 834                     'timestamp': parse_iso8601(json_ld.get('datePublished')), 
 835                     'title': unescapeHTML(json_ld.get('headline')), 
 836                     'description': unescapeHTML(json_ld.get('articleBody')), 
 838         return dict((k, v) for k, v in info.items() if v is not None) 
 841     def _hidden_inputs(html): 
 842         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) 
 844         for input in re.findall(r'(?i)<input([^>]+)>', html): 
 845             if not re.search(r'type=(["\'])(?
:hidden|submit
)\
1', input): 
 847             name = re.search(r'(?
:name|
id)=(["\'])(?P<value>.+?)\1', input) 
 850             value = re.search(r'value=(["\'])(?P
<value
>.*?
)\
1', input) 
 853             hidden_inputs[name.group('value
')] = value.group('value
') 
 856     def _form_hidden_inputs(self, form_id, html): 
 857         form = self._search_regex( 
 858             r'(?
is)<form
[^
>]+?
id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, 
 859             html, '%s form' % form_id, group='form') 
 860         return self._hidden_inputs(form) 
 862     def _sort_formats(self, formats, field_preference=None): 
 864             raise ExtractorError('No video formats found') 
 867             # Automatically determine tbr when missing based on abr and vbr (improves 
 868             # formats sorting in some cases) 
 869             if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None: 
 870                 f['tbr'] = f['abr'] + f['vbr'] 
 873             # TODO remove the following workaround 
 874             from ..utils import determine_ext 
 875             if not f.get('ext') and 'url' in f: 
 876                 f['ext'] = determine_ext(f['url']) 
 878             if isinstance(field_preference, (list, tuple)): 
 879                 return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) 
 881             preference = f.get('preference') 
 882             if preference is None: 
 884                 if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported 
 887             proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 
 889             if f.get('vcodec') == 'none':  # audio only 
 891                 if self._downloader.params.get('prefer_free_formats'): 
 892                     ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus'] 
 894                     ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a'] 
 897                     audio_ext_preference = ORDER.index(f['ext']) 
 899                     audio_ext_preference = -1 
 901                 if f.get('acodec') == 'none':  # video only 
 903                 if self._downloader.params.get('prefer_free_formats'): 
 904                     ORDER = ['flv', 'mp4', 'webm'] 
 906                     ORDER = ['webm', 'flv', 'mp4'] 
 908                     ext_preference = ORDER.index(f['ext']) 
 911                 audio_ext_preference = 0 
 915                 f.get('language_preference') if f.get('language_preference') is not None else -1, 
 916                 f.get('quality') if f.get('quality') is not None else -1, 
 917                 f.get('tbr') if f.get('tbr') is not None else -1, 
 918                 f.get('filesize') if f.get('filesize') is not None else -1, 
 919                 f.get('vbr') if f.get('vbr') is not None else -1, 
 920                 f.get('height') if f.get('height') is not None else -1, 
 921                 f.get('width') if f.get('width') is not None else -1, 
 924                 f.get('abr') if f.get('abr') is not None else -1, 
 925                 audio_ext_preference, 
 926                 f.get('fps') if f.get('fps') is not None else -1, 
 927                 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, 
 928                 f.get('source_preference') if f.get('source_preference') is not None else -1, 
 929                 f.get('format_id') if f.get('format_id') is not None else '', 
 931         formats.sort(key=_formats_key) 
 933     def _check_formats(self, formats, video_id): 
 936                 lambda f: self._is_valid_url( 
 938                     item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), 
 942     def _remove_duplicate_formats(formats): 
 946             if f['url'] not in format_urls: 
 947                 format_urls.add(f['url']) 
 948                 unique_formats.append(f) 
 949         formats[:] = unique_formats 
 951     def _is_valid_url(self, url, video_id, item='video'): 
 952         url = self._proto_relative_url(url, scheme='http:') 
 953         # For now assume non HTTP(S) URLs always valid 
 954         if not (url.startswith('http://') or url.startswith('https://')): 
 957             self._request_webpage(url, video_id, 'Checking %s URL' % item) 
 959         except ExtractorError as e: 
 960             if isinstance(e.cause, compat_urllib_error.URLError): 
 962                     '%s: %s URL is invalid, skipping' % (video_id, item)) 
 966     def http_scheme(self): 
 967         """ Either "http
:" or "https
:", depending on the user's preferences """ 
 970             if self._downloader.params.get('prefer_insecure', False) 
 973     def _proto_relative_url(self, url, scheme=None): 
 976         if url.startswith('//'): 
 978                 scheme = self.http_scheme() 
 983     def _sleep(self, timeout, video_id, msg_template=None): 
 984         if msg_template is None: 
 985             msg_template = '%(video_id)s: Waiting for %(timeout)s seconds' 
 986         msg = msg_template % {'video_id': video_id, 'timeout': timeout} 
 990     def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, 
 991                              transform_source=lambda s: fix_xml_ampersands(s).strip(), 
 992                              fatal=True, m3u8_id=None): 
 993         manifest = self._download_xml( 
 994             manifest_url, video_id, 'Downloading f4m manifest', 
 995             'Unable to download f4m manifest', 
 996             # Some manifests may be malformed, e.g. prosiebensat1 generated manifests 
 997             # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) 
 998             transform_source=transform_source, 
1001         if manifest is False: 
1004         return self._parse_f4m_formats( 
1005             manifest, manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1006             transform_source=transform_source, fatal=fatal, m3u8_id=m3u8_id) 
1008     def _parse_f4m_formats(self, manifest, manifest_url, video_id, preference=None, f4m_id=None, 
1009                            transform_source=lambda s: fix_xml_ampersands(s).strip(), 
1010                            fatal=True, m3u8_id=None): 
1011         # currently youtube-dl cannot decode the playerVerificationChallenge as Akamai uses Adobe Alchemy 
1012         akamai_pv = manifest.find('{http://ns.adobe.com/f4m/1.0}pv-2.0') 
1013         if akamai_pv is not None and ';' in akamai_pv.text: 
1014             playerVerificationChallenge = akamai_pv.text.split(';')[0] 
1015             if playerVerificationChallenge.strip() != '': 
1019         manifest_version = '1.0' 
1020         media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') 
1022             manifest_version = '2.0' 
1023             media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') 
1024         # Remove unsupported DRM protected media from final formats 
1025         # rendition (see https://github.com/rg3/youtube-dl/issues/8573). 
1026         media_nodes = remove_encrypted_media(media_nodes) 
1029         base_url = xpath_text( 
1030             manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], 
1031             'base URL', default=None) 
1033             base_url = base_url.strip() 
1035         bootstrap_info = xpath_element( 
1036             manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], 
1037             'bootstrap info', default=None) 
1039         for i, media_el in enumerate(media_nodes): 
1040             tbr = int_or_none(media_el.attrib.get('bitrate')) 
1041             width = int_or_none(media_el.attrib.get('width')) 
1042             height = int_or_none(media_el.attrib.get('height')) 
1043             format_id = '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])) 
1044             # If <bootstrapInfo> is present, the specified f4m is a 
1045             # stream-level manifest, and only set-level manifests may refer to 
1046             # external resources.  See section 11.4 and section 4 of F4M spec 
1047             if bootstrap_info is None: 
1049                 # @href is introduced in 2.0, see section 11.6 of F4M spec 
1050                 if manifest_version == '2.0': 
1051                     media_url = media_el.attrib.get('href') 
1052                 if media_url is None: 
1053                     media_url = media_el.attrib.get('url') 
1057                     media_url if media_url.startswith('http://') or media_url.startswith('https://') 
1058                     else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) 
1059                 # If media_url is itself a f4m manifest do the recursive extraction 
1060                 # since bitrates in parent manifest (this one) and media_url manifest 
1061                 # may differ leading to inability to resolve the format by requested 
1062                 # bitrate in f4m downloader 
1063                 ext = determine_ext(manifest_url) 
1065                     f4m_formats = self._extract_f4m_formats( 
1066                         manifest_url, video_id, preference=preference, f4m_id=f4m_id, 
1067                         transform_source=transform_source, fatal=fatal) 
1068                     # Sometimes stream-level manifest contains single media entry that 
1069                     # does not contain any quality metadata (e.g. http://matchtv.ru/#live-player). 
1070                     # At the same time parent's media entry in set-level manifest may 
1071                     # contain it. We will copy it from parent in such cases. 
1072                     if len(f4m_formats) == 1: 
1075                             'tbr': f.get('tbr') or tbr, 
1076                             'width': f.get('width') or width, 
1077                             'height': f.get('height') or height, 
1078                             'format_id': f.get('format_id') if not tbr else format_id, 
1080                     formats.extend(f4m_formats) 
1083                     formats.extend(self._extract_m3u8_formats( 
1084                         manifest_url, video_id, 'mp4', preference=preference, 
1085                         m3u8_id=m3u8_id, fatal=fatal)) 
1088                 'format_id': format_id, 
1089                 'url': manifest_url, 
1090                 'ext': 'flv' if bootstrap_info is not None else None, 
1094                 'preference': preference, 
1098     def _m3u8_meta_format(self, m3u8_url, ext=None, preference=None, m3u8_id=None): 
1100             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 
1104             'preference': preference - 1 if preference else -1, 
1105             'resolution': 'multiple', 
1106             'format_note': 'Quality selection URL', 
1109     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, 
1110                               entry_protocol='m3u8', preference=None, 
1111                               m3u8_id=None, note=None, errnote=None, 
1112                               fatal=True, live=False): 
1114         formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] 
1116         format_url = lambda u: ( 
1118             if re.match(r'^https?://', u) 
1119             else compat_urlparse.urljoin(m3u8_url, u)) 
1121         res = self._download_webpage_handle( 
1123             note=note or 'Downloading m3u8 information', 
1124             errnote=errnote or 'Failed to download m3u8 information', 
1128         m3u8_doc, urlh = res 
1129         m3u8_url = urlh.geturl() 
1131         # We should try extracting formats only from master playlists [1], i.e. 
1132         # playlists that describe available qualities. On the other hand media 
1133         # playlists [2] should be returned as is since they contain just the media 
1134         # without qualities renditions. 
1135         # Fortunately, master playlist can be easily distinguished from media 
1136         # playlist based on particular tags availability. As of [1, 2] master 
1137         # playlist tags MUST NOT appear in a media playist and vice versa. 
1138         # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist 
1139         # and MUST NOT appear in master playlist thus we can clearly detect media 
1140         # playlist with this criterion. 
1141         # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 
1142         # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 
1143         # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 
1144         if '#EXT-X-TARGETDURATION' in m3u8_doc:  # media playlist, return as is 
1147                 'format_id': m3u8_id, 
1149                 'protocol': entry_protocol, 
1150                 'preference': preference, 
1154         for line in m3u8_doc.splitlines(): 
1155             if line.startswith('#EXT-X-STREAM-INF:'): 
1156                 last_info = parse_m3u8_attributes(line) 
1157             elif line.startswith('#EXT-X-MEDIA:'): 
1158                 last_media = parse_m3u8_attributes(line) 
1159             elif line.startswith('#') or not line.strip(): 
1162                 if last_info is None: 
1163                     formats.append({'url': format_url(line)}) 
1165                 tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) 
1168                     format_id.append(m3u8_id) 
1169                 last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') not in ('SUBTITLES', 'CLOSED-CAPTIONS') else None 
1170                 # Despite specification does not mention NAME attribute for 
1171                 # EXT-X-STREAM-INF it still sometimes may be present 
1172                 stream_name = last_info.get('NAME') or last_media_name 
1173                 # Bandwidth of live streams may differ over time thus making 
1174                 # format_id unpredictable. So it's better to keep provided 
1177                     format_id.append(stream_name if stream_name else '%d' % (tbr if tbr else len(formats))) 
1179                     'format_id': '-'.join(format_id), 
1180                     'url': format_url(line.strip()), 
1183                     'protocol': entry_protocol, 
1184                     'preference': preference, 
1186                 resolution = last_info.get('RESOLUTION') 
1188                     width_str, height_str = resolution.split('x') 
1189                     f['width'] = int(width_str) 
1190                     f['height'] = int(height_str) 
1191                 codecs = last_info.get('CODECS') 
1193                     vcodec, acodec = [None] * 2 
1194                     va_codecs = codecs.split(',') 
1195                     if len(va_codecs) == 1: 
1196                         # Audio only entries usually come with single codec and 
1197                         # no resolution. For more robustness we also check it to 
1199                         if not resolution and va_codecs[0].startswith('mp4a'): 
1200                             vcodec, acodec = 'none', va_codecs[0] 
1202                             vcodec = va_codecs[0] 
1204                         vcodec, acodec = va_codecs[:2] 
1209                 if last_media is not None: 
1210                     f['m3u8_media'] = last_media 
1217     def _xpath_ns(path, namespace=None): 
1221         for c in path.split('/'): 
1222             if not c or c == '.': 
1225                 out.append('{%s}%s' % (namespace, c)) 
1226         return '/'.join(out) 
1228     def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None, transform_source=None): 
1229         smil = self._download_smil(smil_url, video_id, fatal=fatal, transform_source=transform_source) 
1235         namespace = self._parse_smil_namespace(smil) 
1237         return self._parse_smil_formats( 
1238             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) 
1240     def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): 
1241         smil = self._download_smil(smil_url, video_id, fatal=fatal) 
1244         return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) 
1246     def _download_smil(self, smil_url, video_id, fatal=True, transform_source=None): 
1247         return self._download_xml( 
1248             smil_url, video_id, 'Downloading SMIL file', 
1249             'Unable to download SMIL file', fatal=fatal, transform_source=transform_source) 
1251     def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): 
1252         namespace = self._parse_smil_namespace(smil) 
1254         formats = self._parse_smil_formats( 
1255             smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) 
1256         subtitles = self._parse_smil_subtitles(smil, namespace=namespace) 
1258         video_id = os.path.splitext(url_basename(smil_url))[0] 
1262         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): 
1263             name = meta.attrib.get('name') 
1264             content = meta.attrib.get('content') 
1265             if not name or not content: 
1267             if not title and name == 'title': 
1269             elif not description and name in ('description', 'abstract'): 
1270                 description = content 
1271             elif not upload_date and name == 'date': 
1272                 upload_date = unified_strdate(content) 
1275             'id': image.get('type'), 
1276             'url': image.get('src'), 
1277             'width': int_or_none(image.get('width')), 
1278             'height': int_or_none(image.get('height')), 
1279         } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] 
1283             'title': title or video_id, 
1284             'description': description, 
1285             'upload_date': upload_date, 
1286             'thumbnails': thumbnails, 
1288             'subtitles': subtitles, 
1291     def _parse_smil_namespace(self, smil): 
1292         return self._search_regex( 
1293             r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) 
1295     def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): 
1297         for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): 
1298             b = meta.get('base') or meta.get('httpBase') 
1309         media = smil.findall(self._xpath_ns('.//video', namespace)) + smil.findall(self._xpath_ns('.//audio', namespace)) 
1310         for medium in media: 
1311             src = medium.get('src') 
1312             if not src or src in srcs: 
1316             bitrate = float_or_none(medium.get('system-bitrate') or medium.get('systemBitrate'), 1000) 
1317             filesize = int_or_none(medium.get('size') or medium.get('fileSize')) 
1318             width = int_or_none(medium.get('width')) 
1319             height = int_or_none(medium.get('height')) 
1320             proto = medium.get('proto') 
1321             ext = medium.get('ext') 
1322             src_ext = determine_ext(src) 
1323             streamer = medium.get('streamer') or base 
1325             if proto == 'rtmp' or streamer.startswith('rtmp'): 
1331                     'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), 
1333                     'filesize': filesize, 
1337                 if transform_rtmp_url: 
1338                     streamer, src = transform_rtmp_url(streamer, src) 
1339                     formats[-1].update({ 
1345             src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) 
1346             src_url = src_url.strip() 
1348             if proto == 'm3u8' or src_ext == 'm3u8': 
1349                 m3u8_formats = self._extract_m3u8_formats( 
1350                     src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) 
1351                 if len(m3u8_formats) == 1: 
1353                     m3u8_formats[0].update({ 
1354                         'format_id': 'hls-%d' % (m3u8_count if bitrate is None else bitrate), 
1359                 formats.extend(m3u8_formats) 
1362             if src_ext == 'f4m': 
1367                         'plugin': 'flowplayer-3.2.0.1', 
1369                 f4m_url += '&' if '?' in f4m_url else '?' 
1370                 f4m_url += compat_urllib_parse_urlencode(f4m_params) 
1371                 formats.extend(self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False)) 
1374             if src_url.startswith('http') and self._is_valid_url(src, video_id): 
1378                     'ext': ext or src_ext or 'flv', 
1379                     'format_id': 'http-%d' % (bitrate or http_count), 
1381                     'filesize': filesize, 
1389     def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): 
1392         for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): 
1393             src = textstream.get('src') 
1394             if not src or src in urls: 
1397             ext = textstream.get('ext') or mimetype2ext(textstream.get('type')) or determine_ext(src) 
1398             lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang 
1399             subtitles.setdefault(lang, []).append({ 
1405     def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): 
1406         xspf = self._download_xml( 
1407             playlist_url, playlist_id, 'Downloading xpsf playlist', 
1408             'Unable to download xspf manifest', fatal=fatal) 
1411         return self._parse_xspf(xspf, playlist_id) 
1413     def _parse_xspf(self, playlist, playlist_id): 
1415             'xspf': 'http://xspf.org/ns/0/', 
1416             's1': 'http://static.streamone.nl/player/ns/0', 
1420         for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): 
1422                 track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) 
1423             description = xpath_text( 
1424                 track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') 
1425             thumbnail = xpath_text( 
1426                 track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') 
1427             duration = float_or_none( 
1428                 xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) 
1431                 'url': location.text, 
1432                 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), 
1433                 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), 
1434                 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), 
1435             } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] 
1436             self._sort_formats(formats) 
1441                 'description': description, 
1442                 'thumbnail': thumbnail, 
1443                 'duration': duration, 
1448     def _extract_mpd_formats(self, mpd_url, video_id, mpd_id=None, note=None, errnote=None, fatal=True, formats_dict={}): 
1449         res = self._download_webpage_handle( 
1451             note=note or 'Downloading MPD manifest', 
1452             errnote=errnote or 'Failed to download MPD manifest', 
1457         mpd_base_url = re.match(r'https?://.+/', urlh.geturl()).group() 
1459         return self._parse_mpd_formats( 
1460             compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) 
1462     def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): 
1463         if mpd_doc.get('type') == 'dynamic': 
1466         namespace = self._search_regex(r'(?i)^{([^}]+)?}MPD$', mpd_doc.tag, 'namespace', default=None) 
1469             return self._xpath_ns(path, namespace) 
1471         def is_drm_protected(element): 
1472             return element.find(_add_ns('ContentProtection')) is not None 
1474         def extract_multisegment_info(element, ms_parent_info): 
1475             ms_info = ms_parent_info.copy() 
1476             segment_list = element.find(_add_ns('SegmentList')) 
1477             if segment_list is not None: 
1478                 segment_urls_e = segment_list.findall(_add_ns('SegmentURL')) 
1480                     ms_info['segment_urls'] = [segment.attrib['media'] for segment in segment_urls_e] 
1481                 initialization = segment_list.find(_add_ns('Initialization')) 
1482                 if initialization is not None: 
1483                     ms_info['initialization_url'] = initialization.attrib['sourceURL'] 
1485                 segment_template = element.find(_add_ns('SegmentTemplate')) 
1486                 if segment_template is not None: 
1487                     start_number = segment_template.get('startNumber') 
1489                         ms_info['start_number'] = int(start_number) 
1490                     segment_timeline = segment_template.find(_add_ns('SegmentTimeline')) 
1491                     if segment_timeline is not None: 
1492                         s_e = segment_timeline.findall(_add_ns('S')) 
1494                             ms_info['total_number'] = 0 
1496                                 ms_info['total_number'] += 1 + int(s.get('r', '0')) 
1498                         timescale = segment_template.get('timescale') 
1500                             ms_info['timescale'] = int(timescale) 
1501                         segment_duration = segment_template.get('duration') 
1502                         if segment_duration: 
1503                             ms_info['segment_duration'] = int(segment_duration) 
1504                     media_template = segment_template.get('media') 
1506                         ms_info['media_template'] = media_template 
1507                     initialization = segment_template.get('initialization') 
1509                         ms_info['initialization_url'] = initialization 
1511                         initialization = segment_template.find(_add_ns('Initialization')) 
1512                         if initialization is not None: 
1513                             ms_info['initialization_url'] = initialization.attrib['sourceURL'] 
1516         mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) 
1518         for period in mpd_doc.findall(_add_ns('Period')): 
1519             period_duration = parse_duration(period.get('duration')) or mpd_duration 
1520             period_ms_info = extract_multisegment_info(period, { 
1524             for adaptation_set in period.findall(_add_ns('AdaptationSet')): 
1525                 if is_drm_protected(adaptation_set): 
1527                 adaption_set_ms_info = extract_multisegment_info(adaptation_set, period_ms_info) 
1528                 for representation in adaptation_set.findall(_add_ns('Representation')): 
1529                     if is_drm_protected(representation): 
1531                     representation_attrib = adaptation_set.attrib.copy() 
1532                     representation_attrib.update(representation.attrib) 
1533                     # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory 
1534                     mime_type = representation_attrib['mimeType'] 
1535                     content_type = mime_type.split('/')[0] 
1536                     if content_type == 'text': 
1537                         # TODO implement WebVTT downloading 
1539                     elif content_type == 'video' or content_type == 'audio': 
1541                         for element in (representation, adaptation_set, period, mpd_doc): 
1542                             base_url_e = element.find(_add_ns('BaseURL')) 
1543                             if base_url_e is not None: 
1544                                 base_url = base_url_e.text + base_url 
1545                                 if re.match(r'^https?://', base_url): 
1547                         if mpd_base_url and not re.match(r'^https?://', base_url): 
1548                             if not mpd_base_url.endswith('/') and not base_url.startswith('/'): 
1550                             base_url = mpd_base_url + base_url 
1551                         representation_id = representation_attrib.get('id') 
1552                         lang = representation_attrib.get('lang') 
1553                         url_el = representation.find(_add_ns('BaseURL')) 
1554                         filesize = int_or_none(url_el.attrib.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el is not None else None) 
1556                             'format_id': '%s-%s' % (mpd_id, representation_id) if mpd_id else representation_id, 
1558                             'ext': mimetype2ext(mime_type), 
1559                             'width': int_or_none(representation_attrib.get('width')), 
1560                             'height': int_or_none(representation_attrib.get('height')), 
1561                             'tbr': int_or_none(representation_attrib.get('bandwidth'), 1000), 
1562                             'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 
1563                             'fps': int_or_none(representation_attrib.get('frameRate')), 
1564                             'vcodec': 'none' if content_type == 'audio' else representation_attrib.get('codecs'), 
1565                             'acodec': 'none' if content_type == 'video' else representation_attrib.get('codecs'), 
1566                             'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, 
1567                             'format_note': 'DASH %s' % content_type, 
1568                             'filesize': filesize, 
1570                         representation_ms_info = extract_multisegment_info(representation, adaption_set_ms_info) 
1571                         if 'segment_urls' not in representation_ms_info and 'media_template' in representation_ms_info: 
1572                             if 'total_number' not in representation_ms_info and 'segment_duration': 
1573                                 segment_duration = float(representation_ms_info['segment_duration']) / float(representation_ms_info['timescale']) 
1574                                 representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) 
1575                             media_template = representation_ms_info['media_template'] 
1576                             media_template = media_template.replace('$RepresentationID$', representation_id) 
1577                             media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) 
1578                             media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) 
1579                             media_template.replace('$$', '$') 
1580                             representation_ms_info['segment_urls'] = [ 
1582                                     'Number': segment_number, 
1583                                     'Bandwidth': representation_attrib.get('bandwidth')} 
1584                                 for segment_number in range( 
1585                                     representation_ms_info['start_number'], 
1586                                     representation_ms_info['total_number'] + representation_ms_info['start_number'])] 
1587                         if 'segment_urls' in representation_ms_info: 
1589                                 'segment_urls': representation_ms_info['segment_urls'], 
1590                                 'protocol': 'http_dash_segments', 
1592                             if 'initialization_url' in representation_ms_info: 
1593                                 initialization_url = representation_ms_info['initialization_url'].replace('$RepresentationID$', representation_id) 
1595                                     'initialization_url': initialization_url, 
1597                                 if not f.get('url'): 
1598                                     f['url'] = initialization_url 
1600                             existing_format = next( 
1601                                 fo for fo in formats 
1602                                 if fo['format_id'] == representation_id) 
1603                         except StopIteration: 
1604                             full_info = formats_dict.get(representation_id, {}).copy() 
1606                             formats.append(full_info) 
1608                             existing_format.update(f) 
1610                         self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) 
1613     def _live_title(self, name): 
1614         """ Generate the title for a live video """ 
1615         now = datetime.datetime.now() 
1616         now_str = now.strftime('%Y-%m-%d %H:%M') 
1617         return name + ' ' + now_str 
1619     def _int(self, v, name, fatal=False, **kwargs): 
1620         res = int_or_none(v, **kwargs) 
1621         if 'get_attr' in kwargs: 
1622             print(getattr(v, kwargs['get_attr'])) 
1624             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
1626                 raise ExtractorError(msg) 
1628                 self._downloader.report_warning(msg) 
1631     def _float(self, v, name, fatal=False, **kwargs): 
1632         res = float_or_none(v, **kwargs) 
1634             msg = 'Failed to extract %s: Could not parse value %r' % (name, v) 
1636                 raise ExtractorError(msg) 
1638                 self._downloader.report_warning(msg) 
1641     def _set_cookie(self, domain, name, value, expire_time=None): 
1642         cookie = compat_cookiejar.Cookie( 
1643             0, name, value, None, None, domain, None, 
1644             None, '/', True, False, expire_time, '', None, None, None) 
1645         self._downloader.cookiejar.set_cookie(cookie) 
1647     def _get_cookies(self, url): 
1648         """ Return a compat_cookies.SimpleCookie with the cookies for the url """ 
1649         req = sanitized_Request(url) 
1650         self._downloader.cookiejar.add_cookie_header(req) 
1651         return compat_cookies.SimpleCookie(req.get_header('Cookie')) 
1653     def get_testcases(self, include_onlymatching=False): 
1654         t = getattr(self, '_TEST', None) 
1656             assert not hasattr(self, '_TESTS'), \ 
1657                 '%s has _TEST and _TESTS' % type(self).__name__ 
1660             tests = getattr(self, '_TESTS', []) 
1662             if not include_onlymatching and t.get('only_matching', False): 
1664             t['name'] = type(self).__name__[:-len('IE')] 
1667     def is_suitable(self, age_limit): 
1668         """ Test whether the extractor is generally suitable for the given 
1669         age limit (i.e. pornographic sites are not, all others usually are) """ 
1671         any_restricted = False 
1672         for tc in self.get_testcases(include_onlymatching=False): 
1673             if 'playlist' in tc: 
1674                 tc = tc['playlist'][0] 
1675             is_restricted = age_restricted( 
1676                 tc.get('info_dict', {}).get('age_limit'), age_limit) 
1677             if not is_restricted: 
1679             any_restricted = any_restricted or is_restricted 
1680         return not any_restricted 
1682     def extract_subtitles(self, *args, **kwargs): 
1683         if (self._downloader.params.get('writesubtitles', False) or 
1684                 self._downloader.params.get('listsubtitles')): 
1685             return self._get_subtitles(*args, **kwargs) 
    def _get_subtitles(self, *args, **kwargs):
        # Hook for subclasses: fetch the actual subtitle tracks for a video.
        raise NotImplementedError('This method must be implemented by subclasses')
1692     def _merge_subtitle_items(subtitle_list1, subtitle_list2): 
1693         """ Merge subtitle items for one language. Items with duplicated URLs 
1694         will be dropped. """ 
1695         list1_urls = set([item['url'] for item in subtitle_list1]) 
1696         ret = list(subtitle_list1) 
1697         ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) 
1701     def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): 
1702         """ Merge two subtitle dictionaries, language by language. """ 
1703         ret = dict(subtitle_dict1) 
1704         for lang in subtitle_dict2: 
1705             ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) 
1708     def extract_automatic_captions(self, *args, **kwargs): 
1709         if (self._downloader.params.get('writeautomaticsub', False) or 
1710                 self._downloader.params.get('listsubtitles')): 
1711             return self._get_automatic_captions(*args, **kwargs) 
    def _get_automatic_captions(self, *args, **kwargs):
        # Hook for subclasses: fetch automatically generated captions.
        raise NotImplementedError('This method must be implemented by subclasses')
1717     def mark_watched(self, *args, **kwargs): 
1718         if (self._downloader.params.get('mark_watched', False) and 
1719                 (self._get_login_info()[0] is not None or 
1720                     self._downloader.params.get('cookiefile') is not None)): 
1721             self._mark_watched(*args, **kwargs) 
    def _mark_watched(self, *args, **kwargs):
        # Hook for subclasses: perform the site-specific "mark as watched" call.
        raise NotImplementedError('This method must be implemented by subclasses')
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    # Fixed throughout this class: restored the @classmethod/@property
    # decorators and the conditional branches (`if mobj is None:`,
    # `if prefix == '':`, the numeric-prefix `else:` branch) that had
    # been lost, leaving the class syntactically broken.

    @classmethod
    def _make_valid_url(cls):
        # prefix selects the result count: '' -> 1, 'all' -> _MAX_RESULTS,
        # a positive integer -> that many results.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                # Clamp oversized requests to the site's maximum.
                self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY