1 from __future__ 
import unicode_literals
 
  15 from ..compat 
import ( 
  24     compat_etree_fromstring
, 
  53 class InfoExtractor(object): 
  54     """Information Extractor class. 
  56     Information extractors are the classes that, given a URL, extract 
  57     information about the video (or videos) the URL refers to. This 
  58     information includes the real video URL, the video title, author and 
  59     others. The information is stored in a dictionary which is then 
  60     passed to the YoutubeDL. The YoutubeDL processes this 
  61     information possibly downloading the video to the file system, among 
  62     other possible outcomes. 
  64     The type field determines the type of the result. 
  65     By far the most common value (and the default if _type is missing) is 
  66     "video", which indicates a single video. 
  68     For a video, the dictionaries must include the following fields: 
  71     title:          Video title, unescaped. 
  73     Additionally, it must contain either a formats entry or a url one: 
  75     formats:        A list of dictionaries for each format available, ordered 
  76                     from worst to best quality. 
  79                     * url        Mandatory. The URL of the video file 
  80                     * ext        Will be calculated from URL if missing 
  81                     * format     A human-readable description of the format 
  82                                  ("mp4 container with h264/opus"). 
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
  85                     * format_id  A short description of the format 
  86                                  ("mp4_h264_opus" or "19"). 
  87                                 Technically optional, but strongly recommended. 
  88                     * format_note Additional info about the format 
  89                                  ("3D" or "DASH video") 
  90                     * width      Width of the video, if known 
  91                     * height     Height of the video, if known 
  92                     * resolution Textual description of width and height 
  93                     * tbr        Average bitrate of audio and video in KBit/s 
  94                     * abr        Average audio bitrate in KBit/s 
  95                     * acodec     Name of the audio codec in use 
  96                     * asr        Audio sampling rate in Hertz 
  97                     * vbr        Average video bitrate in KBit/s 
  99                     * vcodec     Name of the video codec in use 
 100                     * container  Name of the container format 
 101                     * filesize   The number of bytes, if known in advance 
 102                     * filesize_approx  An estimate for the number of bytes 
 103                     * player_url SWF Player URL (used for rtmpdump). 
 104                     * protocol   The protocol that will be used for the actual 
 105                                  download, lower-case. 
 106                                  "http", "https", "rtsp", "rtmp", "rtmpe", 
 107                                  "m3u8", or "m3u8_native". 
 108                     * preference Order number of this format. If this field is 
 109                                  present and not None, the formats get sorted 
 110                                  by this field, regardless of all other values. 
 111                                  -1 for default (order by other properties), 
 112                                  -2 or smaller for less than default. 
 113                                  < -1000 to hide the format (if there is 
 114                                     another one which is strictly better) 
 115                     * language   Language code, e.g. "de" or "en-US". 
 116                     * language_preference  Is this in the language mentioned in 
 118                                  10 if it's what the URL is about, 
 119                                  -1 for default (don't know), 
 120                                  -10 otherwise, other values reserved for now. 
 121                     * quality    Order number of the video quality of this 
 122                                  format, irrespective of the file format. 
 123                                  -1 for default (order by other properties), 
 124                                  -2 or smaller for less than default. 
 125                     * source_preference  Order number for this video source 
 126                                   (quality takes higher priority) 
 127                                  -1 for default (order by other properties), 
 128                                  -2 or smaller for less than default. 
 129                     * http_headers  A dictionary of additional HTTP headers 
 130                                  to add to the request. 
 131                     * stretched_ratio  If given and not 1, indicates that the 
 132                                  video's pixels are not square. 
 133                                  width : height ratio as float. 
 134                     * no_resume  The server does not support resuming the 
 135                                  (HTTP or RTMP) download. Boolean. 
 137     url:            Final video URL. 
 138     ext:            Video filename extension. 
 139     format:         The video format, defaults to ext (used for --get-format) 
 140     player_url:     SWF Player URL (used for rtmpdump). 
 142     The following fields are optional: 
 144     alt_title:      A secondary title of the video. 
 145     display_id      An alternative identifier for the video, not necessarily 
 146                     unique, but available before title. Typically, id is 
 147                     something like "4234987", title "Dancing naked mole rats", 
 148                     and display_id "dancing-naked-mole-rats" 
 149     thumbnails:     A list of dictionaries, with the following entries: 
 150                         * "id" (optional, string) - Thumbnail format ID 
 152                         * "preference" (optional, int) - quality of the image 
 153                         * "width" (optional, int) 
 154                         * "height" (optional, int) 
                        * "resolution" (optional, string "{width}x{height}",
 157     thumbnail:      Full URL to a video thumbnail image. 
 158     description:    Full video description. 
 159     uploader:       Full name of the video uploader. 
 160     creator:        The main artist who created the video. 
 161     release_date:   The date (YYYYMMDD) when the video was released. 
 162     timestamp:      UNIX timestamp of the moment the video became available. 
 163     upload_date:    Video upload date (YYYYMMDD). 
 164                     If not explicitly set, calculated from timestamp. 
 165     uploader_id:    Nickname or id of the video uploader. 
 166     location:       Physical location where the video was filmed. 
 167     subtitles:      The available subtitles as a dictionary in the format 
 168                     {language: subformats}. "subformats" is a list sorted from 
 169                     lower to higher preference, each element is a dictionary 
 170                     with the "ext" entry and one of: 
 171                         * "data": The subtitles file contents 
 172                         * "url": A URL pointing to the subtitles file 
 173                     "ext" will be calculated from URL if missing 
 174     automatic_captions: Like 'subtitles', used by the YoutubeIE for 
 175                     automatically generated captions 
 176     duration:       Length of the video in seconds, as an integer or float. 
 177     view_count:     How many users have watched the video on the platform. 
 178     like_count:     Number of positive ratings of the video 
 179     dislike_count:  Number of negative ratings of the video 
 180     repost_count:   Number of reposts of the video 
    average_rating: Average rating given by users, the scale used depends on the webpage
 182     comment_count:  Number of comments on the video 
 183     comments:       A list of comments, each with one or more of the following 
 184                     properties (all but one of text or html optional): 
 185                         * "author" - human-readable name of the comment author 
 186                         * "author_id" - user ID of the comment author 
 188                         * "html" - Comment as HTML 
 189                         * "text" - Plain text of the comment 
 190                         * "timestamp" - UNIX timestamp of comment 
 191                         * "parent" - ID of the comment this one is replying to. 
 192                                      Set to "root" to indicate that this is a 
 193                                      comment to the original video. 
 194     age_limit:      Age restriction for the video, as an integer (years) 
 195     webpage_url:    The URL to the video webpage, if given to youtube-dl it 
 196                     should allow to get the same result again. (It will be set 
 197                     by YoutubeDL if it's missing) 
 198     categories:     A list of categories that the video falls in, for example 
 200     tags:           A list of tags assigned to the video, e.g. ["sweden", "pop music"] 
 201     is_live:        True, False, or None (=unknown). Whether this video is a 
 202                     live stream that goes on instead of a fixed-length video. 
 203     start_time:     Time in seconds where the reproduction should start, as 
 204                     specified in the URL. 
 205     end_time:       Time in seconds where the reproduction should end, as 
 206                     specified in the URL. 
 208     The following fields should only be used when the video belongs to some logical 
 211     chapter:        Name or title of the chapter the video belongs to. 
 212     chapter_number: Number of the chapter the video belongs to, as an integer. 
 213     chapter_id:     Id of the chapter the video belongs to, as a unicode string. 
 215     The following fields should only be used when the video is an episode of some 
 218     series:         Title of the series or programme the video episode belongs to. 
 219     season:         Title of the season the video episode belongs to. 
 220     season_number:  Number of the season the video episode belongs to, as an integer. 
 221     season_id:      Id of the season the video episode belongs to, as a unicode string. 
 222     episode:        Title of the video episode. Unlike mandatory video title field, 
 223                     this field should denote the exact title of the video episode 
 224                     without any kind of decoration. 
 225     episode_number: Number of the video episode within a season, as an integer. 
 226     episode_id:     Id of the video episode, as a unicode string. 
 228     Unless mentioned otherwise, the fields should be Unicode strings. 
 230     Unless mentioned otherwise, None is equivalent to absence of information. 
 233     _type "playlist" indicates multiple videos. 
 234     There must be a key "entries", which is a list, an iterable, or a PagedList 
 235     object, each element of which is a valid dictionary by this specification. 
 237     Additionally, playlists can have "title", "description" and "id" attributes 
 238     with the same semantics as videos (see above). 
 241     _type "multi_video" indicates that there are multiple videos that 
    form a single show, for example multiple acts of an opera or TV episode.
 243     It must have an entries key like a playlist and contain all the keys 
 244     required for a video at the same time. 
 247     _type "url" indicates that the video must be extracted from another 
 248     location, possibly by a different extractor. Its only required key is: 
 249     "url" - the next URL to extract. 
 250     The key "ie_key" can be set to the class name (minus the trailing "IE", 
 251     e.g. "Youtube") if the extractor class is known in advance. 
 252     Additionally, the dictionary may have any properties of the resolved entity 
 253     known in advance, for example "title" if the title of the referred video is 
 257     _type "url_transparent" entities have the same specification as "url", but 
 258     indicate that the given additional information is more precise than the one 
 259     associated with the resolved URL. 
 260     This is useful when a site employs a video service that hosts the video and 
 261     its technical metadata, but that video service does not embed a useful 
 262     title, description etc. 
 265     Subclasses of this one should re-define the _real_initialize() and 
 266     _real_extract() methods and define a _VALID_URL regexp. 
 267     Probably, they should also be added to the list of extractors. 
 269     Finally, the _WORKING attribute should be set to False for broken IEs 
 270     in order to warn the users and skip the tests. 
 277     def __init__(self
, downloader
=None): 
 278         """Constructor. Receives an optional downloader.""" 
 280         self
.set_downloader(downloader
) 
 283     def suitable(cls
, url
): 
 284         """Receives a URL and returns True if suitable for this IE.""" 
 286         # This does not use has/getattr intentionally - we want to know whether 
 287         # we have cached the regexp for *this* class, whereas getattr would also 
 288         # match the superclass 
 289         if '_VALID_URL_RE' not in cls
.__dict
__: 
 290             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 291         return cls
._VALID
_URL
_RE
.match(url
) is not None 
 294     def _match_id(cls
, url
): 
 295         if '_VALID_URL_RE' not in cls
.__dict
__: 
 296             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 297         m 
= cls
._VALID
_URL
_RE
.match(url
) 
 303         """Getter method for _WORKING.""" 
 306     def initialize(self
): 
 307         """Initializes an instance (authentication, etc).""" 
 309             self
._real
_initialize
() 
 312     def extract(self
, url
): 
 313         """Extracts URL information and returns it in list of dicts.""" 
 316             return self
._real
_extract
(url
) 
 317         except ExtractorError
: 
 319         except compat_http_client
.IncompleteRead 
as e
: 
 320             raise ExtractorError('A network error has occurred.', cause
=e
, expected
=True) 
 321         except (KeyError, StopIteration) as e
: 
 322             raise ExtractorError('An extractor error has occurred.', cause
=e
) 
 324     def set_downloader(self
, downloader
): 
 325         """Sets the downloader for this IE.""" 
 326         self
._downloader 
= downloader
 
 328     def _real_initialize(self
): 
 329         """Real initialization process. Redefine in subclasses.""" 
 332     def _real_extract(self
, url
): 
 333         """Real extraction process. Redefine in subclasses.""" 
 338         """A string for getting the InfoExtractor with get_info_extractor""" 
 339         return compat_str(cls
.__name
__[:-2]) 
 343         return compat_str(type(self
).__name
__[:-2]) 
 345     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True): 
 346         """ Returns the response handle """ 
 348             self
.report_download_webpage(video_id
) 
 349         elif note 
is not False: 
 351                 self
.to_screen('%s' % (note
,)) 
 353                 self
.to_screen('%s: %s' % (video_id
, note
)) 
 355             return self
._downloader
.urlopen(url_or_request
) 
 356         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 360                 errnote 
= 'Unable to download webpage' 
 362             errmsg 
= '%s: %s' % (errnote
, error_to_compat_str(err
)) 
 364                 raise ExtractorError(errmsg
, sys
.exc_info()[2], cause
=err
) 
 366                 self
._downloader
.report_warning(errmsg
) 
 369     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None, fatal
=True, encoding
=None): 
 370         """ Returns a tuple (page content as string, URL handle) """ 
 371         # Strip hashes from the URL (#1038) 
 372         if isinstance(url_or_request
, (compat_str
, str)): 
 373             url_or_request 
= url_or_request
.partition('#')[0] 
 375         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
, fatal
) 
 379         content 
= self
._webpage
_read
_content
(urlh
, url_or_request
, video_id
, note
, errnote
, fatal
, encoding
=encoding
) 
 380         return (content
, urlh
) 
 383     def _guess_encoding_from_content(content_type
, webpage_bytes
): 
 384         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 386             encoding 
= m
.group(1) 
 388             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 389                           webpage_bytes[:1024]) 
 391                 encoding = m.group(1).decode('ascii') 
 392             elif webpage_bytes.startswith(b'\xff\xfe'): 
 399     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): 
 400         content_type = urlh.headers.get('Content-Type', '') 
 401         webpage_bytes = urlh.read() 
 402         if prefix is not None: 
 403             webpage_bytes = prefix + webpage_bytes 
 405             encoding = self._guess_encoding_from_content(content_type, webpage_bytes) 
 406         if self._downloader.params.get('dump_intermediate_pages', False): 
 408                 url = url_or_request.get_full_url() 
 409             except AttributeError: 
 411             self.to_screen('Dumping request to ' + url) 
 412             dump = base64.b64encode(webpage_bytes).decode('ascii') 
 413             self._downloader.to_screen(dump) 
 414         if self._downloader.params.get('write_pages', False): 
 416                 url = url_or_request.get_full_url() 
 417             except AttributeError: 
 419             basen = '%s_%s' % (video_id, url) 
 421                 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() 
 422                 basen = basen[:240 - len(h)] + h 
 423             raw_filename = basen + '.dump' 
 424             filename = sanitize_filename(raw_filename, restricted=True) 
 425             self.to_screen('Saving request to ' + filename) 
 426             # Working around MAX_PATH limitation on Windows (see 
 427             # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) 
 429                 absfilepath = os.path.abspath(filename) 
 430                 if len(absfilepath) > 259: 
 431                     filename = '\\\\?\\' + absfilepath 
 432             with open(filename, 'wb') as outf: 
 433                 outf.write(webpage_bytes) 
 436             content = webpage_bytes.decode(encoding, 'replace') 
 438             content = webpage_bytes.decode('utf-8', 'replace') 
 440         if ('<title>Access to this site is blocked</title>' in content and 
 441                 'Websense' in content[:512]): 
 442             msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' 
 443             blocked_iframe = self._html_search_regex( 
 444                 r'<iframe src="([^
"]+)"', content, 
 445                 'Websense information URL
', default=None) 
 447                 msg += ' Visit 
%s for more details
' % blocked_iframe 
 448             raise ExtractorError(msg, expected=True) 
 449         if '<title
>The URL you requested has been blocked
</title
>' in content[:512]: 
 451                 'Access to this webpage has been blocked by Indian censorship
. ' 
 452                 'Use a VPN 
or proxy 
server (with --proxy
) to route around it
.') 
 453             block_msg = self._html_search_regex( 
 454                 r'</h1
><p
>(.*?
)</p
>', 
 455                 content, 'block message
', default=None) 
 457                 msg += ' (Message
: "%s")' % block_msg.replace('\n', ' ') 
 458             raise ExtractorError(msg, expected=True) 
 462     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): 
 463         """ Returns the data of the page as a string """ 
 466         while success is False: 
 468                 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) 
 470             except compat_http_client.IncompleteRead as e: 
 472                 if try_count >= tries: 
 474                 self._sleep(timeout, video_id) 
 481     def _download_xml(self, url_or_request, video_id, 
 482                       note='Downloading XML
', errnote='Unable to download XML
', 
 483                       transform_source=None, fatal=True, encoding=None): 
 484         """Return the xml as an xml.etree.ElementTree.Element""" 
 485         xml_string = self._download_webpage( 
 486             url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) 
 487         if xml_string is False: 
 490             xml_string = transform_source(xml_string) 
 491         return compat_etree_fromstring(xml_string.encode('utf
-8')) 
 493     def _download_json(self, url_or_request, video_id, 
 494                        note='Downloading JSON metadata
', 
 495                        errnote='Unable to download JSON metadata
', 
 496                        transform_source=None, 
 497                        fatal=True, encoding=None): 
 498         json_string = self._download_webpage( 
 499             url_or_request, video_id, note, errnote, fatal=fatal, 
 501         if (not fatal) and json_string is False: 
 503         return self._parse_json( 
 504             json_string, video_id, transform_source=transform_source, fatal=fatal) 
 506     def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): 
 508             json_string = transform_source(json_string) 
 510             return json.loads(json_string) 
 511         except ValueError as ve: 
 512             errmsg = '%s: Failed to parse JSON 
' % video_id 
 514                 raise ExtractorError(errmsg, cause=ve) 
 516                 self.report_warning(errmsg + str(ve)) 
 518     def report_warning(self, msg, video_id=None): 
 519         idstr = '' if video_id is None else '%s: ' % video_id 
 520         self._downloader.report_warning( 
 521             '[%s] %s%s' % (self.IE_NAME, idstr, msg)) 
 523     def to_screen(self, msg): 
 524         """Print msg to screen, prefixing it with '[ie_name
]'""" 
 525         self._downloader.to_screen('[%s] %s' % (self.IE_NAME, msg)) 
 527     def report_extraction(self, id_or_name): 
 528         """Report information extraction.""" 
 529         self.to_screen('%s: Extracting information
' % id_or_name) 
 531     def report_download_webpage(self, video_id): 
 532         """Report webpage download.""" 
 533         self.to_screen('%s: Downloading webpage
' % video_id) 
 535     def report_age_confirmation(self): 
 536         """Report attempt to confirm age.""" 
 537         self.to_screen('Confirming age
') 
 539     def report_login(self): 
 540         """Report attempt to log in.""" 
 541         self.to_screen('Logging 
in') 
 544     def raise_login_required(msg='This video 
is only available 
for registered users
'): 
 545         raise ExtractorError( 
 546             '%s. Use 
--username 
and --password 
or --netrc to provide account credentials
.' % msg, 
    @staticmethod
    def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'):
        # Raise an expected ExtractorError suggesting --proxy as a workaround
        # for geo-restricted content.
        raise ExtractorError(
            '%s. You might want to use --proxy to workaround.' % msg,
            expected=True)
 555     # Methods for following #608 
 557     def url_result(url, ie=None, video_id=None, video_title=None): 
 558         """Returns a URL that points to a page that should be processed""" 
 559         # TODO: ie should be the class used for getting the info 
 560         video_info = {'_type
': 'url
', 
 563         if video_id is not None: 
 564             video_info['id'] = video_id 
 565         if video_title is not None: 
 566             video_info['title
'] = video_title 
 570     def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): 
 571         """Returns a playlist""" 
 572         video_info = {'_type
': 'playlist
', 
 575             video_info['id'] = playlist_id 
 577             video_info['title
'] = playlist_title 
 578         if playlist_description: 
 579             video_info['description
'] = playlist_description 
    def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # A list of patterns: take the first one that matches.
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colorize the field name (ANSI blue) only on capable terminals.
        if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty():
            _name = '\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            if group is None:
                # return the first matching group
                return next(g for g in mobj.groups() if g is not None)
            else:
                return mobj.group(group)
        elif default is not NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError('Unable to extract %s' % _name)
        else:
            self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message())
            return None
    def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags, group)
        if res:
            return clean_html(res).strip()
        else:
            # Pass through None / empty results unchanged.
            return res
    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username') is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                # .netrc problems are non-fatal: warn and fall through with
                # whatever credentials (possibly none) we have.
                self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err))

        return (username, password)
    def _get_tfa_info(self, note='two-factor verification code'):
        """
        Get the two-factor authentication info
        TODO - asking the user will be required for sms/phone verify
        currently just uses the command line option
        If there's no info available, return None
        """
        if self._downloader is None:
            return None
        downloader_params = self._downloader.params

        # Prefer the value supplied on the command line.
        if downloader_params.get('twofactor') is not None:
            return downloader_params['twofactor']

        # Otherwise prompt interactively (no echo).
        return compat_getpass('Type %s and press [Return]: ' % note)
    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        # Matches the content attribute value: double-quoted, single-quoted,
        # or a bare (unquoted) token.
        content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))'
        # Matches name=/property= with the og:<prop> value in any quoting style.
        property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)'
                       % {'prop': re.escape(prop)})
        template = r'<meta[^>]+?%s[^>]+?%s'
        # Attribute order inside <meta> is not fixed, so try both orderings.
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]
 685     def _meta_regex(prop): 
 686         return r'''(?isx)<meta 
 687                     (?=[^>]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) 
 688                     [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) 
    def _og_search_property(self, prop, html, name=None, **kargs):
        # Extract the og:<prop> value from html; the display name defaults
        # to 'OpenGraph <prop>' for error/warning messages.
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        # The raw attribute value may contain HTML entities.
        return unescapeHTML(escaped)
 698     def _og_search_thumbnail(self, html, **kargs): 
 699         return self._og_search_property('image
', html, 'thumbnail URL
', fatal=False, **kargs) 
 701     def _og_search_description(self, html, **kargs): 
 702         return self._og_search_property('description
', html, fatal=False, **kargs) 
 704     def _og_search_title(self, html, **kargs): 
 705         return self._og_search_property('title
', html, **kargs) 
    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        # Try og:video and og:video:url; when secure is set, prefer
        # og:video:secure_url by putting its patterns first.
        regexes = self._og_regexes('video') + self._og_regexes('video:url')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
 713     def _og_search_url(self, html, **kargs): 
 714         return self._og_search_property('url
', html, **kargs) 
    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        # Extract the content attribute of a <meta> tag identified by name.
        # display_name is only used in error/warning messages.
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            self._meta_regex(name),
            html, display_name, fatal=fatal, group='content', **kwargs)
 723     def _dc_search_uploader(self, html): 
 724         return self._html_search_meta('dc
.creator
', html, 'uploader
') 
    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        # Pages carrying the RTA label are adult content -> age limit 18.
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0
    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        # Map the ICRA-style textual rating to an age limit; unknown
        # values yield None via dict.get.
        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower())
    def _family_friendly_search(self, html):
        # See http://schema.org/VideoObject
        family_friendly = self._html_search_meta('isFamilyFriendly', html)

        if not family_friendly:
            return None

        # schema.org boolean -> age limit (0 when family friendly, 18 otherwise).
        RATING_TABLE = {
            '1': 0,
            'true': 0,
            '0': 18,
            'false': 18,
        }
        return RATING_TABLE.get(family_friendly.lower())
 765     def _twitter_search_player(self, html): 
 766         return self._html_search_meta('twitter
:player
', html, 
 767                                       'twitter card player
') 
    def _search_json_ld(self, html, video_id, **kwargs):
        # Locate an application/ld+json <script> block and hand its payload
        # to _json_ld for interpretation.
        json_ld = self._search_regex(
            r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>',
            html, 'JSON-LD', group='json_ld', **kwargs)
        if not json_ld:
            return {}
        return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True))
    def _json_ld(self, json_ld, video_id, fatal=True):
        # Interpret a (string or already-parsed) JSON-LD document and map
        # recognized schema.org types to info-dict fields.
        if isinstance(json_ld, compat_str):
            json_ld = self._parse_json(json_ld, video_id, fatal=fatal)
        if not json_ld:
            return {}
        info = {}
        if json_ld.get('@context') == 'http://schema.org':
            item_type = json_ld.get('@type')
            if item_type == 'TVEpisode':
                info.update({
                    'episode': unescapeHTML(json_ld.get('name')),
                    'episode_number': int_or_none(json_ld.get('episodeNumber')),
                    'description': unescapeHTML(json_ld.get('description')),
                })
                part_of_season = json_ld.get('partOfSeason')
                if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason':
                    info['season_number'] = int_or_none(part_of_season.get('seasonNumber'))
                part_of_series = json_ld.get('partOfSeries')
                if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries':
                    info['series'] = unescapeHTML(part_of_series.get('name'))
            elif item_type == 'Article':
                info.update({
                    'timestamp': parse_iso8601(json_ld.get('datePublished')),
                    'title': unescapeHTML(json_ld.get('headline')),
                    'description': unescapeHTML(json_ld.get('articleBody')),
                })
        # Drop fields whose value could not be determined.
        return dict((k, v) for k, v in info.items() if v is not None)
 806     def _hidden_inputs(html): 
 807         html = re.sub(r'<!--(?:(?!<!--).)*-->', '', html) 
 809         for input in re.findall(r'(?i)<input([^>]+)>', html): 
 810             if not re.search(r'type=(["\'])(?
:hidden|submit
)\
1', input): 
 812             name = re.search(r'name
=(["\'])(?P<value>.+?)\1', input) 
 815             value = re.search(r'value=(["\'])(?P
<value
>.*?
)\
1', input) 
 818             hidden_inputs[name.group('value
')] = value.group('value
') 
 821     def _form_hidden_inputs(self, form_id, html): 
 822         form = self._search_regex( 
 823             r'(?
is)<form
[^
>]+?
id=(["\'])%s\1[^>]*>(?P<form>.+?)</form>' % form_id, 
 824             html, '%s form' % form_id, group='form') 
 825         return self._hidden_inputs(form) 
    def _sort_formats(self, formats, field_preference=None):
        # Sort formats in place from worst to best quality.  When
        # field_preference is a list/tuple, sort only by those fields.
        if not formats:
            raise ExtractorError('No video formats found')

        for f in formats:
            # Automatically determine tbr when missing based on abr and vbr (improves
            # formats sorting in some cases)
            if 'tbr' not in f and f.get('abr') is not None and f.get('vbr') is not None:
                f['tbr'] = f['abr'] + f['vbr']

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            if isinstance(field_preference, (list, tuple)):
                # Missing fields sort lowest (-1).
                return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference)

            preference = f.get('preference')
            if preference is None:
                preference = 0
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            # Prefer plain HTTP(S) over other protocols.
            proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
                else:
                    ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = ['flv', 'mp4', 'webm']
                else:
                    ORDER = ['webm', 'flv', 'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # Tuple comparison: earlier entries dominate later ones.
            return (
                preference,
                f.get('language_preference') if f.get('language_preference') is not None else -1,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                proto_preference,
                ext_preference,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('fps') if f.get('fps') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('source_preference') if f.get('source_preference') is not None else -1,
                f.get('format_id') if f.get('format_id') is not None else '',
            )
        formats.sort(key=_formats_key)
    def _check_formats(self, formats, video_id):
        # Filter out, in place, formats whose URL does not respond.
        if formats:
            formats[:] = filter(
                lambda f: self._is_valid_url(
                    f['url'], video_id,
                    item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'),
                formats)
 904     def _remove_duplicate_formats(formats): 
 908             if f['url'] not in format_urls: 
 909                 format_urls.add(f['url']) 
 910                 unique_formats.append(f) 
 911         formats[:] = unique_formats 
    def _is_valid_url(self, url, video_id, item='video'):
        url = self._proto_relative_url(url, scheme='http:')
        # For now assume non HTTP(S) URLs always valid
        if not (url.startswith('http://') or url.startswith('https://')):
            return True
        try:
            self._request_webpage(url, video_id, 'Checking %s URL' % item)
            return True
        except ExtractorError as e:
            # Network-level failures mean the URL is dead; anything else is
            # re-raised as a genuine extraction error.
            if isinstance(e.cause, compat_urllib_error.URLError):
                self.to_screen(
                    '%s: %s URL is invalid, skipping' % (video_id, item))
                return False
            raise
    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')
    def _proto_relative_url(self, url, scheme=None):
        # Resolve protocol-relative URLs ('//host/...') against the given
        # scheme, defaulting to the user's preferred HTTP scheme.
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url
    def _sleep(self, timeout, video_id, msg_template=None):
        # Announce the wait on screen, then block for `timeout` seconds.
        if msg_template is None:
            msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
    def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None,
                             transform_source=lambda s: fix_xml_ampersands(s).strip(),
                             fatal=True):
        # Download and parse an Adobe HDS (f4m) manifest into format dicts.
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest',
            # Some manifests may be malformed, e.g. prosiebensat1 generated manifests
            # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244)
            transform_source=transform_source,
            fatal=fatal)

        if manifest is False:
            return []

        formats = []
        manifest_version = '1.0'
        media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
        if not media_nodes:
            manifest_version = '2.0'
            media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media')
        base_url = xpath_text(
            manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'],
            'base URL', default=None)
        if base_url:
            base_url = base_url.strip()
        for i, media_el in enumerate(media_nodes):
            if manifest_version == '2.0':
                media_url = media_el.attrib.get('href') or media_el.attrib.get('url')
                if not media_url:
                    continue
                manifest_url = (
                    media_url if media_url.startswith('http://') or media_url.startswith('https://')
                    else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url))
                # If media_url is itself a f4m manifest do the recursive extraction
                # since bitrates in parent manifest (this one) and media_url manifest
                # may differ leading to inability to resolve the format by requested
                # bitrate in f4m downloader
                if determine_ext(manifest_url) == 'f4m':
                    formats.extend(self._extract_f4m_formats(
                        manifest_url, video_id, preference, f4m_id, fatal=fatal))
                    continue
            tbr = int_or_none(media_el.attrib.get('bitrate'))
            formats.append({
                'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])),
                'url': manifest_url,
                'ext': 'flv',
                'tbr': tbr,
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
                'preference': preference,
            })
        self._sort_formats(formats)

        return formats
1007     def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, 
1008                               entry_protocol='m3u8', preference=None, 
1009                               m3u8_id=None, note=None, errnote=None, 
1013             'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 
1017             'preference': preference - 1 if preference else -1, 
1018             'resolution': 'multiple', 
1019             'format_note': 'Quality selection URL', 
1022         format_url = lambda u: ( 
1024             if re.match(r'^https?://', u) 
1025             else compat_urlparse.urljoin(m3u8_url, u)) 
1027         res = self._download_webpage_handle( 
1029             note=note or 'Downloading m3u8 information', 
1030             errnote=errnote or 'Failed to download m3u8 information', 
1034         m3u8_doc, urlh = res 
1035         m3u8_url = urlh.geturl() 
1036         # A Media Playlist Tag MUST NOT appear in a Master Playlist 
1037         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 
1038         # The EXT-X-TARGETDURATION tag is REQUIRED for every M3U8 Media Playlists 
1039         # https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 
1040         if '#EXT-X-TARGETDURATION' in m3u8_doc: 
1043                 'format_id': m3u8_id, 
1045                 'protocol': entry_protocol, 
1046                 'preference': preference, 
1050         kv_rex = re.compile( 
1051             r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^
"]+"|
[^
",]+)(?:,|$)') 
1052         for line in m3u8_doc.splitlines(): 
1053             if line.startswith('#EXT-X-STREAM-INF:'): 
1055                 for m in kv_rex.finditer(line): 
1057                     if v.startswith('"'): 
1059                     last_info[m.group('key
')] = v 
1060             elif line.startswith('#EXT-X-MEDIA:'): 
1062                 for m 
in kv_rex
.finditer(line
): 
1064                     if v
.startswith('"'): 
1066                     last_media
[m
.group('key')] = v
 
1067             elif line
.startswith('#') or not line
.strip(): 
1070                 if last_info 
is None: 
1071                     formats
.append({'url': format_url(line
)}) 
1073                 tbr 
= int_or_none(last_info
.get('BANDWIDTH'), scale
=1000) 
1076                     format_id
.append(m3u8_id
) 
1077                 last_media_name 
= last_media
.get('NAME') if last_media 
and last_media
.get('TYPE') != 'SUBTITLES' else None 
1078                 format_id
.append(last_media_name 
if last_media_name 
else '%d' % (tbr 
if tbr 
else len(formats
))) 
1080                     'format_id': '-'.join(format_id
), 
1081                     'url': format_url(line
.strip()), 
1084                     'protocol': entry_protocol
, 
1085                     'preference': preference
, 
1087                 codecs 
= last_info
.get('CODECS') 
1089                     # TODO: looks like video codec is not always necessarily goes first 
1090                     va_codecs 
= codecs
.split(',') 
1092                         f
['vcodec'] = va_codecs
[0] 
1093                     if len(va_codecs
) > 1 and va_codecs
[1]: 
1094                         f
['acodec'] = va_codecs
[1] 
1095                 resolution 
= last_info
.get('RESOLUTION') 
1097                     width_str
, height_str 
= resolution
.split('x') 
1098                     f
['width'] = int(width_str
) 
1099                     f
['height'] = int(height_str
) 
1100                 if last_media 
is not None: 
1101                     f
['m3u8_media'] = last_media
 
1105         self
._sort
_formats
(formats
) 
1109     def _xpath_ns(path
, namespace
=None): 
1113         for c 
in path
.split('/'): 
1114             if not c 
or c 
== '.': 
1117                 out
.append('{%s}%s' % (namespace
, c
)) 
1118         return '/'.join(out
) 
    def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and return just its formats.
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        if smil is False:
            assert not fatal
            return []

        namespace = self._parse_smil_namespace(smil)

        return self._parse_smil_formats(
            smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params)
    def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None):
        # Download a SMIL document and return a full info dict.
        smil = self._download_smil(smil_url, video_id, fatal=fatal)

        if smil is False:
            return {}

        return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params)
1138     def _download_smil(self
, smil_url
, video_id
, fatal
=True): 
1139         return self
._download
_xml
( 
1140             smil_url
, video_id
, 'Downloading SMIL file', 
1141             'Unable to download SMIL file', fatal
=fatal
) 
1143     def _parse_smil(self
, smil
, smil_url
, video_id
, f4m_params
=None): 
1144         namespace 
= self
._parse
_smil
_namespace
(smil
) 
1146         formats 
= self
._parse
_smil
_formats
( 
1147             smil
, smil_url
, video_id
, namespace
=namespace
, f4m_params
=f4m_params
) 
1148         subtitles 
= self
._parse
_smil
_subtitles
(smil
, namespace
=namespace
) 
1150         video_id 
= os
.path
.splitext(url_basename(smil_url
))[0] 
1154         for meta 
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)): 
1155             name 
= meta
.attrib
.get('name') 
1156             content 
= meta
.attrib
.get('content') 
1157             if not name 
or not content
: 
1159             if not title 
and name 
== 'title': 
1161             elif not description 
and name 
in ('description', 'abstract'): 
1162                 description 
= content
 
1163             elif not upload_date 
and name 
== 'date': 
1164                 upload_date 
= unified_strdate(content
) 
1167             'id': image
.get('type'), 
1168             'url': image
.get('src'), 
1169             'width': int_or_none(image
.get('width')), 
1170             'height': int_or_none(image
.get('height')), 
1171         } for image 
in smil
.findall(self
._xpath
_ns
('.//image', namespace
)) if image
.get('src')] 
1175             'title': title 
or video_id
, 
1176             'description': description
, 
1177             'upload_date': upload_date
, 
1178             'thumbnails': thumbnails
, 
1180             'subtitles': subtitles
, 
1183     def _parse_smil_namespace(self
, smil
): 
1184         return self
._search
_regex
( 
1185             r
'(?i)^{([^}]+)?}smil$', smil
.tag
, 'namespace', default
=None) 
1187     def _parse_smil_formats(self
, smil
, smil_url
, video_id
, namespace
=None, f4m_params
=None, transform_rtmp_url
=None): 
1189         for meta 
in smil
.findall(self
._xpath
_ns
('./head/meta', namespace
)): 
1190             b 
= meta
.get('base') or meta
.get('httpBase') 
1201         videos 
= smil
.findall(self
._xpath
_ns
('.//video', namespace
)) 
1202         for video 
in videos
: 
1203             src 
= video
.get('src') 
1204             if not src 
or src 
in srcs
: 
1208             bitrate 
= float_or_none(video
.get('system-bitrate') or video
.get('systemBitrate'), 1000) 
1209             filesize 
= int_or_none(video
.get('size') or video
.get('fileSize')) 
1210             width 
= int_or_none(video
.get('width')) 
1211             height 
= int_or_none(video
.get('height')) 
1212             proto 
= video
.get('proto') 
1213             ext 
= video
.get('ext') 
1214             src_ext 
= determine_ext(src
) 
1215             streamer 
= video
.get('streamer') or base
 
1217             if proto 
== 'rtmp' or streamer
.startswith('rtmp'): 
1223                     'format_id': 'rtmp-%d' % (rtmp_count 
if bitrate 
is None else bitrate
), 
1225                     'filesize': filesize
, 
1229                 if transform_rtmp_url
: 
1230                     streamer
, src 
= transform_rtmp_url(streamer
, src
) 
1231                     formats
[-1].update({ 
1237             src_url 
= src 
if src
.startswith('http') else compat_urlparse
.urljoin(base
, src
) 
1238             src_url 
= src_url
.strip() 
1240             if proto 
== 'm3u8' or src_ext 
== 'm3u8': 
1241                 m3u8_formats 
= self
._extract
_m
3u8_formats
( 
1242                     src_url
, video_id
, ext 
or 'mp4', m3u8_id
='hls', fatal
=False) 
1243                 if len(m3u8_formats
) == 1: 
1245                     m3u8_formats
[0].update({ 
1246                         'format_id': 'hls-%d' % (m3u8_count 
if bitrate 
is None else bitrate
), 
1251                 formats
.extend(m3u8_formats
) 
1254             if src_ext 
== 'f4m': 
1259                         'plugin': 'flowplayer-3.2.0.1', 
1261                 f4m_url 
+= '&' if '?' in f4m_url 
else '?' 
1262                 f4m_url 
+= compat_urllib_parse
.urlencode(f4m_params
) 
1263                 formats
.extend(self
._extract
_f
4m
_formats
(f4m_url
, video_id
, f4m_id
='hds', fatal
=False)) 
1266             if src_url
.startswith('http') and self
._is
_valid
_url
(src
, video_id
): 
1270                     'ext': ext 
or src_ext 
or 'flv', 
1271                     'format_id': 'http-%d' % (bitrate 
or http_count
), 
1273                     'filesize': filesize
, 
1279         self
._sort
_formats
(formats
) 
1283     def _parse_smil_subtitles(self
, smil
, namespace
=None, subtitles_lang
='en'): 
1286         for num
, textstream 
in enumerate(smil
.findall(self
._xpath
_ns
('.//textstream', namespace
))): 
1287             src 
= textstream
.get('src') 
1288             if not src 
or src 
in urls
: 
1291             ext 
= textstream
.get('ext') or determine_ext(src
) or mimetype2ext(textstream
.get('type')) 
1292             lang 
= textstream
.get('systemLanguage') or textstream
.get('systemLanguageName') or textstream
.get('lang') or subtitles_lang
 
1293             subtitles
.setdefault(lang
, []).append({ 
1299     def _extract_xspf_playlist(self
, playlist_url
, playlist_id
, fatal
=True): 
1300         xspf 
= self
._download
_xml
( 
1301             playlist_url
, playlist_id
, 'Downloading xpsf playlist', 
1302             'Unable to download xspf manifest', fatal
=fatal
) 
1305         return self
._parse
_xspf
(xspf
, playlist_id
) 
1307     def _parse_xspf(self
, playlist
, playlist_id
): 
1309             'xspf': 'http://xspf.org/ns/0/', 
1310             's1': 'http://static.streamone.nl/player/ns/0', 
1314         for track 
in playlist
.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP
)): 
1316                 track
, xpath_with_ns('./xspf:title', NS_MAP
), 'title', default
=playlist_id
) 
1317             description 
= xpath_text( 
1318                 track
, xpath_with_ns('./xspf:annotation', NS_MAP
), 'description') 
1319             thumbnail 
= xpath_text( 
1320                 track
, xpath_with_ns('./xspf:image', NS_MAP
), 'thumbnail') 
1321             duration 
= float_or_none( 
1322                 xpath_text(track
, xpath_with_ns('./xspf:duration', NS_MAP
), 'duration'), 1000) 
1325                 'url': location
.text
, 
1326                 'format_id': location
.get(xpath_with_ns('s1:label', NS_MAP
)), 
1327                 'width': int_or_none(location
.get(xpath_with_ns('s1:width', NS_MAP
))), 
1328                 'height': int_or_none(location
.get(xpath_with_ns('s1:height', NS_MAP
))), 
1329             } for location 
in track
.findall(xpath_with_ns('./xspf:location', NS_MAP
))] 
1330             self
._sort
_formats
(formats
) 
1335                 'description': description
, 
1336                 'thumbnail': thumbnail
, 
1337                 'duration': duration
, 
1342     def _extract_mpd_formats(self
, mpd_url
, video_id
, mpd_id
=None, note
=None, errnote
=None, fatal
=True, formats_dict
={}): 
1343         res 
= self
._download
_webpage
_handle
( 
1345             note
=note 
or 'Downloading MPD manifest', 
1346             errnote
=errnote 
or 'Failed to download MPD manifest', 
1351         mpd_base_url 
= re
.match(r
'https?://.+/', urlh
.geturl()).group() 
1353         return self
._parse
_mpd
_formats
( 
1354             compat_etree_fromstring(mpd
.encode('utf-8')), mpd_id
, mpd_base_url
, formats_dict
=formats_dict
) 
1356     def _parse_mpd_formats(self
, mpd_doc
, mpd_id
=None, mpd_base_url
='', formats_dict
={}): 
1357         if mpd_doc
.get('type') == 'dynamic': 
1360         namespace 
= self
._search
_regex
(r
'(?i)^{([^}]+)?}MPD$', mpd_doc
.tag
, 'namespace', default
=None) 
1363             return self
._xpath
_ns
(path
, namespace
) 
1365         def is_drm_protected(element
): 
1366             return element
.find(_add_ns('ContentProtection')) is not None 
1368         def extract_multisegment_info(element
, ms_parent_info
): 
1369             ms_info 
= ms_parent_info
.copy() 
1370             segment_list 
= element
.find(_add_ns('SegmentList')) 
1371             if segment_list 
is not None: 
1372                 segment_urls_e 
= segment_list
.findall(_add_ns('SegmentURL')) 
1374                     ms_info
['segment_urls'] = [segment
.attrib
['media'] for segment 
in segment_urls_e
] 
1375                 initialization 
= segment_list
.find(_add_ns('Initialization')) 
1376                 if initialization 
is not None: 
1377                     ms_info
['initialization_url'] = initialization
.attrib
['sourceURL'] 
1379                 segment_template 
= element
.find(_add_ns('SegmentTemplate')) 
1380                 if segment_template 
is not None: 
1381                     start_number 
= segment_template
.get('startNumber') 
1383                         ms_info
['start_number'] = int(start_number
) 
1384                     segment_timeline 
= segment_template
.find(_add_ns('SegmentTimeline')) 
1385                     if segment_timeline 
is not None: 
1386                         s_e 
= segment_timeline
.findall(_add_ns('S')) 
1388                             ms_info
['total_number'] = 0 
1390                                 ms_info
['total_number'] += 1 + int(s
.get('r', '0')) 
1392                         timescale 
= segment_template
.get('timescale') 
1394                             ms_info
['timescale'] = int(timescale
) 
1395                         segment_duration 
= segment_template
.get('duration') 
1396                         if segment_duration
: 
1397                             ms_info
['segment_duration'] = int(segment_duration
) 
1398                     media_template 
= segment_template
.get('media') 
1400                         ms_info
['media_template'] = media_template
 
1401                     initialization 
= segment_template
.get('initialization') 
1403                         ms_info
['initialization_url'] = initialization
 
1405                         initialization 
= segment_template
.find(_add_ns('Initialization')) 
1406                         if initialization 
is not None: 
1407                             ms_info
['initialization_url'] = initialization
.attrib
['sourceURL'] 
1410         mpd_duration 
= parse_duration(mpd_doc
.get('mediaPresentationDuration')) 
1412         for period 
in mpd_doc
.findall(_add_ns('Period')): 
1413             period_duration 
= parse_duration(period
.get('duration')) or mpd_duration
 
1414             period_ms_info 
= extract_multisegment_info(period
, { 
1418             for adaptation_set 
in period
.findall(_add_ns('AdaptationSet')): 
1419                 if is_drm_protected(adaptation_set
): 
1421                 adaption_set_ms_info 
= extract_multisegment_info(adaptation_set
, period_ms_info
) 
1422                 for representation 
in adaptation_set
.findall(_add_ns('Representation')): 
1423                     if is_drm_protected(representation
): 
1425                     representation_attrib 
= adaptation_set
.attrib
.copy() 
1426                     representation_attrib
.update(representation
.attrib
) 
1427                     mime_type 
= representation_attrib
.get('mimeType') 
1428                     content_type 
= mime_type
.split('/')[0] if mime_type 
else representation_attrib
.get('contentType') 
1429                     if content_type 
== 'text': 
1430                         # TODO implement WebVTT downloading 
1432                     elif content_type 
== 'video' or content_type 
== 'audio': 
1434                         for element 
in (representation
, adaptation_set
, period
, mpd_doc
): 
1435                             base_url_e 
= element
.find(_add_ns('BaseURL')) 
1436                             if base_url_e 
is not None: 
1437                                 base_url 
= base_url_e
.text 
+ base_url
 
1438                                 if re
.match(r
'^https?://', base_url
): 
1440                         if mpd_base_url 
and not re
.match(r
'^https?://', base_url
): 
1441                             if not mpd_base_url
.endswith('/') and not base_url
.startswith('/'): 
1443                             base_url 
= mpd_base_url 
+ base_url
 
1444                         representation_id 
= representation_attrib
.get('id') 
1445                         lang 
= representation_attrib
.get('lang') 
1446                         url_el 
= representation
.find(_add_ns('BaseURL')) 
1447                         filesize 
= int_or_none(url_el
.attrib
.get('{http://youtube.com/yt/2012/10/10}contentLength') if url_el 
is not None else None) 
1449                             'format_id': '%s-%s' % (mpd_id
, representation_id
) if mpd_id 
else representation_id
, 
1451                             'width': int_or_none(representation_attrib
.get('width')), 
1452                             'height': int_or_none(representation_attrib
.get('height')), 
1453                             'tbr': int_or_none(representation_attrib
.get('bandwidth'), 1000), 
1454                             'asr': int_or_none(representation_attrib
.get('audioSamplingRate')), 
1455                             'fps': int_or_none(representation_attrib
.get('frameRate')), 
1456                             'vcodec': 'none' if content_type 
== 'audio' else representation_attrib
.get('codecs'), 
1457                             'acodec': 'none' if content_type 
== 'video' else representation_attrib
.get('codecs'), 
1458                             'language': lang 
if lang 
not in ('mul', 'und', 'zxx', 'mis') else None, 
1459                             'format_note': 'DASH %s' % content_type
, 
1460                             'filesize': filesize
, 
1462                         representation_ms_info 
= extract_multisegment_info(representation
, adaption_set_ms_info
) 
1463                         if 'segment_urls' not in representation_ms_info 
and 'media_template' in representation_ms_info
: 
1464                             if 'total_number' not in representation_ms_info 
and 'segment_duration': 
1465                                 segment_duration 
= float(representation_ms_info
['segment_duration']) / float(representation_ms_info
['timescale']) 
1466                                 representation_ms_info
['total_number'] = int(math
.ceil(float(period_duration
) / segment_duration
)) 
1467                             media_template 
= representation_ms_info
['media_template'] 
1468                             media_template 
= media_template
.replace('$RepresentationID$', representation_id
) 
1469                             media_template 
= re
.sub(r
'\$(Number|Bandwidth)(?:%(0\d+)d)?\$', r
'%(\1)\2d', media_template
) 
1470                             media_template
.replace('$$', '$') 
1471                             representation_ms_info
['segment_urls'] = [media_template 
% {'Number': segment_number
, 'Bandwidth': representation_attrib
.get('bandwidth')} for segment_number 
in range(representation_ms_info
['start_number'], representation_ms_info
['total_number'] + representation_ms_info
['start_number'])] 
1472                         if 'segment_urls' in representation_ms_info
: 
1474                                 'segment_urls': representation_ms_info
['segment_urls'], 
1475                                 'protocol': 'http_dash_segments', 
1477                             if 'initialization_url' in representation_ms_info
: 
1478                                 initialization_url 
= representation_ms_info
['initialization_url'].replace('$RepresentationID$', representation_id
) 
1480                                     'initialization_url': initialization_url
, 
1482                                 if not f
.get('url'): 
1483                                     f
['url'] = initialization_url
 
1485                             existing_format 
= next( 
1486                                 fo 
for fo 
in formats
 
1487                                 if fo
['format_id'] == representation_id
) 
1488                         except StopIteration: 
1489                             full_info 
= formats_dict
.get(representation_id
, {}).copy() 
1491                             formats
.append(full_info
) 
1493                             existing_format
.update(f
) 
1495                         self
.report_warning('Unknown MIME type %s in DASH manifest' % mime_type
) 
1496         self
._sort
_formats
(formats
) 
1499     def _live_title(self
, name
): 
1500         """ Generate the title for a live video """ 
1501         now 
= datetime
.datetime
.now() 
1502         now_str 
= now
.strftime('%Y-%m-%d %H:%M') 
1503         return name 
+ ' ' + now_str
 
1505     def _int(self
, v
, name
, fatal
=False, **kwargs
): 
1506         res 
= int_or_none(v
, **kwargs
) 
1507         if 'get_attr' in kwargs
: 
1508             print(getattr(v
, kwargs
['get_attr'])) 
1510             msg 
= 'Failed to extract %s: Could not parse value %r' % (name
, v
) 
1512                 raise ExtractorError(msg
) 
1514                 self
._downloader
.report_warning(msg
) 
1517     def _float(self
, v
, name
, fatal
=False, **kwargs
): 
1518         res 
= float_or_none(v
, **kwargs
) 
1520             msg 
= 'Failed to extract %s: Could not parse value %r' % (name
, v
) 
1522                 raise ExtractorError(msg
) 
1524                 self
._downloader
.report_warning(msg
) 
    def _set_cookie(self, domain, name, value, expire_time=None):
        """Store a cookie in the downloader's cookie jar for the given domain.

        expire_time is an epoch timestamp; None makes it a session cookie.
        """
        # NOTE(review): the positional args are assumed to follow
        # http.cookiejar.Cookie's signature — version, name, value, port,
        # port_specified, domain, domain_specified, domain_initial_dot,
        # path, path_specified, secure, expires, discard, comment,
        # comment_url, rest.  Path is fixed to '/', secure is False.
        cookie = compat_cookiejar.Cookie(
            0, name, value, None, None, domain, None,
            None, '/', True, False, expire_time, '', None, None, None)
        self._downloader.cookiejar.set_cookie(cookie)
1533     def _get_cookies(self
, url
): 
1534         """ Return a compat_cookies.SimpleCookie with the cookies for the url """ 
1535         req 
= sanitized_Request(url
) 
1536         self
._downloader
.cookiejar
.add_cookie_header(req
) 
1537         return compat_cookies
.SimpleCookie(req
.get_header('Cookie')) 
1539     def get_testcases(self
, include_onlymatching
=False): 
1540         t 
= getattr(self
, '_TEST', None) 
1542             assert not hasattr(self
, '_TESTS'), \
 
1543                 '%s has _TEST and _TESTS' % type(self
).__name
__ 
1546             tests 
= getattr(self
, '_TESTS', []) 
1548             if not include_onlymatching 
and t
.get('only_matching', False): 
1550             t
['name'] = type(self
).__name
__[:-len('IE')] 
1553     def is_suitable(self
, age_limit
): 
1554         """ Test whether the extractor is generally suitable for the given 
1555         age limit (i.e. pornographic sites are not, all others usually are) """ 
1557         any_restricted 
= False 
1558         for tc 
in self
.get_testcases(include_onlymatching
=False): 
1559             if 'playlist' in tc
: 
1560                 tc 
= tc
['playlist'][0] 
1561             is_restricted 
= age_restricted( 
1562                 tc
.get('info_dict', {}).get('age_limit'), age_limit
) 
1563             if not is_restricted
: 
1565             any_restricted 
= any_restricted 
or is_restricted
 
1566         return not any_restricted
 
1568     def extract_subtitles(self
, *args
, **kwargs
): 
1569         if (self
._downloader
.params
.get('writesubtitles', False) or 
1570                 self
._downloader
.params
.get('listsubtitles')): 
1571             return self
._get
_subtitles
(*args
, **kwargs
) 
1574     def _get_subtitles(self
, *args
, **kwargs
): 
1575         raise NotImplementedError('This method must be implemented by subclasses') 
1578     def _merge_subtitle_items(subtitle_list1
, subtitle_list2
): 
1579         """ Merge subtitle items for one language. Items with duplicated URLs 
1580         will be dropped. """ 
1581         list1_urls 
= set([item
['url'] for item 
in subtitle_list1
]) 
1582         ret 
= list(subtitle_list1
) 
1583         ret
.extend([item 
for item 
in subtitle_list2 
if item
['url'] not in list1_urls
]) 
1587     def _merge_subtitles(cls
, subtitle_dict1
, subtitle_dict2
): 
1588         """ Merge two subtitle dictionaries, language by language. """ 
1589         ret 
= dict(subtitle_dict1
) 
1590         for lang 
in subtitle_dict2
: 
1591             ret
[lang
] = cls
._merge
_subtitle
_items
(subtitle_dict1
.get(lang
, []), subtitle_dict2
[lang
]) 
1594     def extract_automatic_captions(self
, *args
, **kwargs
): 
1595         if (self
._downloader
.params
.get('writeautomaticsub', False) or 
1596                 self
._downloader
.params
.get('listsubtitles')): 
1597             return self
._get
_automatic
_captions
(*args
, **kwargs
) 
1600     def _get_automatic_captions(self
, *args
, **kwargs
): 
1601         raise NotImplementedError('This method must be implemented by subclasses') 
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        # "<key><n|all|empty>:<query>" — an empty prefix means one result.
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        # Guard-clause dispatch on the requested result count.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError('This method must be implemented by subclasses')

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY