import base64
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,
    compat_urllib_parse_urlparse,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)

_NO_DEFAULT = object()


class InfoExtractor(object):
  28     """Information Extractor class. 
  30     Information extractors are the classes that, given a URL, extract 
  31     information about the video (or videos) the URL refers to. This 
  32     information includes the real video URL, the video title, author and 
  33     others. The information is stored in a dictionary which is then 
  34     passed to the FileDownloader. The FileDownloader processes this 
  35     information possibly downloading the video to the file system, among 
  36     other possible outcomes. 
  38     The dictionaries must include the following fields: 
  41     title:          Video title, unescaped. 
  43     Additionally, it must contain either a formats entry or a url one: 
  45     formats:        A list of dictionaries for each format available, ordered 
  46                     from worst to best quality. 
  49                     * url        Mandatory. The URL of the video file 
  50                     * ext        Will be calculated from url if missing 
  51                     * format     A human-readable description of the format 
  52                                  ("mp4 container with h264/opus"). 
  53                                  Calculated from the format_id, width, height. 
  54                                  and format_note fields if missing. 
  55                     * format_id  A short description of the format 
  56                                  ("mp4_h264_opus" or "19"). 
  57                                 Technically optional, but strongly recommended. 
  58                     * format_note Additional info about the format 
  59                                  ("3D" or "DASH video") 
  60                     * width      Width of the video, if known 
  61                     * height     Height of the video, if known 
  62                     * resolution Textual description of width and height 
  63                     * tbr        Average bitrate of audio and video in KBit/s 
  64                     * abr        Average audio bitrate in KBit/s 
  65                     * acodec     Name of the audio codec in use 
  66                     * asr        Audio sampling rate in Hertz 
  67                     * vbr        Average video bitrate in KBit/s 
  68                     * vcodec     Name of the video codec in use 
  69                     * container  Name of the container format 
  70                     * filesize   The number of bytes, if known in advance 
  71                     * player_url SWF Player URL (used for rtmpdump). 
  72                     * protocol   The protocol that will be used for the actual 
  74                                  "http", "https", "rtsp", "rtmp", "m3u8" or so. 
  75                     * preference Order number of this format. If this field is 
  76                                  present and not None, the formats get sorted 
  77                                  by this field, regardless of all other values. 
  78                                  -1 for default (order by other properties), 
  79                                  -2 or smaller for less than default. 
  80                     * quality    Order number of the video quality of this 
  81                                  format, irrespective of the file format. 
  82                                  -1 for default (order by other properties), 
  83                                  -2 or smaller for less than default. 
  85     ext:            Video filename extension. 
  86     format:         The video format, defaults to ext (used for --get-format) 
  87     player_url:     SWF Player URL (used for rtmpdump). 
  89     The following fields are optional: 
  91     display_id      An alternative identifier for the video, not necessarily 
  92                     unique, but available before title. Typically, id is 
  93                     something like "4234987", title "Dancing naked mole rats", 
  94                     and display_id "dancing-naked-mole-rats" 
  95     thumbnails:     A list of dictionaries, with the following entries: 
  97                         * "width" (optional, int) 
  98                         * "height" (optional, int) 
  99                         * "resolution" (optional, string "{width}x{height"}, 
 101     thumbnail:      Full URL to a video thumbnail image. 
 102     description:    One-line video description. 
 103     uploader:       Full name of the video uploader. 
 104     timestamp:      UNIX timestamp of the moment the video became available. 
 105     upload_date:    Video upload date (YYYYMMDD). 
 106                     If not explicitly set, calculated from timestamp. 
 107     uploader_id:    Nickname or id of the video uploader. 
 108     location:       Physical location of the video. 
 109     subtitles:      The subtitle file contents as a dictionary in the format 
 110                     {language: subtitles}. 
 111     duration:       Length of the video in seconds, as an integer. 
 112     view_count:     How many users have watched the video on the platform. 
 113     like_count:     Number of positive ratings of the video 
 114     dislike_count:  Number of negative ratings of the video 
 115     comment_count:  Number of comments on the video 
 116     age_limit:      Age restriction for the video, as an integer (years) 
 117     webpage_url:    The url to the video webpage, if given to youtube-dl it 
 118                     should allow to get the same result again. (It will be set 
 119                     by YoutubeDL if it's missing) 
 120     categories:     A list of categories that the video falls in, for example 
 123     Unless mentioned otherwise, the fields should be Unicode strings. 
 125     Subclasses of this one should re-define the _real_initialize() and 
 126     _real_extract() methods and define a _VALID_URL regexp. 
 127     Probably, they should also be added to the list of extractors. 
 129     Finally, the _WORKING attribute should be set to False for broken IEs 
 130     in order to warn the users and skip the tests. 
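
    # The class docstring above describes the dictionary that extractors
    # return. As an illustration only (not an extractor shipped with
    # youtube-dl), a minimal _real_extract() could look like the sketch
    # below; the URL, id and title values are made up.
    #
    #     def _real_extract(self, url):
    #         video_id = re.match(self._VALID_URL, url).group('id')
    #         return {
    #             'id': video_id,
    #             'title': 'Example title',
    #             'url': 'http://example.com/video.mp4',
    #             'ext': 'mp4',
    #         }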

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None
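
    # For illustration only: a subclass typically just sets _VALID_URL and
    # lets suitable() do the matching. The extractor name and URL pattern
    # below are made up.
    #
    #     class ExampleIE(InfoExtractor):
    #         _VALID_URL = r'https?://(?:www\.)?example\.com/watch/(?P<id>[0-9]+)'
    #
    #     ExampleIE.suitable('http://example.com/watch/1234')  # -> True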
 155         """Getter method for _WORKING.""" 

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass
 183         """A string for getting the InfoExtractor with get_info_extractor""" 
 184         return cls
.__name
__[:-2] 
 188         return type(self
).__name
__[:-2] 

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content
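
    # Typical use from a subclass (illustrative only; the URL and id are made
    # up): fetch the page once, then run the _search_regex helpers over the
    # returned string.
    #
    #     webpage = self._download_webpage(
    #         'http://example.com/watch/1234', '1234',
    #         note=u'Downloading video page')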

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None):
        json_string = self._download_webpage(url_or_request, video_id, note, errnote)
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)
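
    # Illustrative only (the endpoint is made up): transform_source lets a
    # caller strip a JSONP wrapper or similar junk before parsing.
    #
    #     data = self._download_json(
    #         'http://example.com/api/video/1234', '1234',
    #         transform_source=lambda s: s[s.index('{'):s.rindex('}') + 1])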

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None
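
    # Illustrative only (the pattern and page content are made up): pull a
    # single named field out of a downloaded page, falling back to None
    # instead of raising when it is absent.
    #
    #     title = self._html_search_regex(
    #         r'<h1 class="title">(.+?)</h1>', webpage, u'title',
    #         default=None, fatal=False)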

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
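
    # For reference, the standard ~/.netrc entry format this reads (the
    # machine name below is only an example and must match _NETRC_MACHINE):
    #
    #     machine example login myuser password mypass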

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)
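
    # The two regexes per property cover both attribute orders commonly seen
    # in the wild, e.g. (markup is illustrative):
    #
    #     <meta property="og:title" content="Some title">
    #     <meta content="Some title" property="og:title">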

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
            )
        formats.sort(key=_formats_key)
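
    # Illustrative use from an extractor (field values are made up): building
    # the formats list worst-to-best by hand is not required; _sort_formats()
    # orders it using the key above before the info dict is returned.
    #
    #     formats = [
    #         {'url': 'http://example.com/low.mp4', 'ext': 'mp4', 'height': 240},
    #         {'url': 'http://example.com/high.mp4', 'ext': 'mp4', 'height': 720},
    #     ]
    #     self._sort_formats(formats)
    #     return {'id': video_id, 'title': title, 'formats': formats}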

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY
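
    # Illustrative queries this pattern accepts, assuming a subclass with
    # _SEARCH_KEY = 'examplesearch' (the key itself is made up):
    #
    #     examplesearch:funny cats       -> first result only
    #     examplesearch5:funny cats      -> first 5 results
    #     examplesearchall:funny cats    -> up to _MAX_RESULTS results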

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY