11     compat_urllib_request
, 
  22 class InfoExtractor(object): 
  23     """Information Extractor class. 
  25     Information extractors are the classes that, given a URL, extract 
  26     information about the video (or videos) the URL refers to. This 
  27     information includes the real video URL, the video title, author and 
  28     others. The information is stored in a dictionary which is then 
  29     passed to the FileDownloader. The FileDownloader processes this 
  30     information possibly downloading the video to the file system, among 
  31     other possible outcomes. 
  33     The dictionaries must include the following fields: 
  37     title:          Video title, unescaped. 
  38     ext:            Video filename extension. 
  40     Instead of url and ext, formats can also be specified. 
  42     The following fields are optional: 
  44     format:         The video format, defaults to ext (used for --get-format) 
  45     thumbnails:     A list of dictionaries (with the entries "resolution" and 
  46                     "url") for the varying thumbnails 
  47     thumbnail:      Full URL to a video thumbnail image. 
  48     description:    One-line video description. 
  49     uploader:       Full name of the video uploader. 
  50     upload_date:    Video upload date (YYYYMMDD). 
  51     uploader_id:    Nickname or id of the video uploader. 
  52     location:       Physical location of the video. 
  53     player_url:     SWF Player URL (used for rtmpdump). 
  54     subtitles:      The subtitle file contents as a dictionary in the format 
  55                     {language: subtitles}. 
  56     view_count:     How many users have watched the video on the platform. 
  57     urlhandle:      [internal] The urlHandle to be used to download the file, 
  58                     like returned by urllib.request.urlopen 
  59     age_limit:      Age restriction for the video, as an integer (years) 
  60     formats:        A list of dictionaries for each format available, it must 
  61                     be ordered from worst to best quality. Potential fields: 
  62                     * url       Mandatory. The URL of the video file 
  63                     * ext       Will be calculated from url if missing 
  64                     * format    A human-readable description of the format 
  65                                 ("mp4 container with h264/opus"). 
  66                                 Calculated from the format_id, width, height, 
  67                                 and format_note fields if missing. 
  68                     * format_id A short description of the format 
  69                                 ("mp4_h264_opus" or "19") 
  70                     * format_note Additional info about the format 
  71                                 ("3D" or "DASH video") 
  72                     * width     Width of the video, if known 
  73                     * height    Height of the video, if known 
  74     webpage_url:    The url to the video webpage, if given to youtube-dl it 
  75                     should allow to get the same result again. (It will be set 
  76                     by YoutubeDL if it's missing) 
  78     Unless mentioned otherwise, the fields should be Unicode strings. 
  80     Subclasses of this one should re-define the _real_initialize() and 
  81     _real_extract() methods and define a _VALID_URL regexp. 
  82     Probably, they should also be added to the list of extractors. 
  84     _real_extract() must return a *list* of information dictionaries as 
  87     Finally, the _WORKING attribute should be set to False for broken IEs 
  88     in order to warn the users and skip the tests. 
  95     def __init__(self
, downloader
=None): 
  96         """Constructor. Receives an optional downloader.""" 
  98         self
.set_downloader(downloader
) 
 101     def suitable(cls
, url
): 
 102         """Receives a URL and returns True if suitable for this IE.""" 
 104         # This does not use has/getattr intentionally - we want to know whether 
 105         # we have cached the regexp for *this* class, whereas getattr would also 
 106         # match the superclass 
 107         if '_VALID_URL_RE' not in cls
.__dict
__: 
 108             cls
._VALID
_URL
_RE 
= re
.compile(cls
._VALID
_URL
) 
 109         return cls
._VALID
_URL
_RE
.match(url
) is not None 
 113         """Getter method for _WORKING.""" 
 116     def initialize(self
): 
 117         """Initializes an instance (authentication, etc).""" 
 119             self
._real
_initialize
() 
 122     def extract(self
, url
): 
 123         """Extracts URL information and returns it in list of dicts.""" 
 125         return self
._real
_extract
(url
) 
 127     def set_downloader(self
, downloader
): 
 128         """Sets the downloader for this IE.""" 
 129         self
._downloader 
= downloader
 
 131     def _real_initialize(self
): 
 132         """Real initialization process. Redefine in subclasses.""" 
 135     def _real_extract(self
, url
): 
 136         """Real extraction process. Redefine in subclasses.""" 
 141         """A string for getting the InfoExtractor with get_info_extractor""" 
 142         return cls
.__name
__[:-2] 
 146         return type(self
).__name
__[:-2] 
 148     def _request_webpage(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 149         """ Returns the response handle """ 
 151             self
.report_download_webpage(video_id
) 
 152         elif note 
is not False: 
 153             self
.to_screen(u
'%s: %s' % (video_id
, note
)) 
 155             return compat_urllib_request
.urlopen(url_or_request
) 
 156         except (compat_urllib_error
.URLError
, compat_http_client
.HTTPException
, socket
.error
) as err
: 
 158                 errnote 
= u
'Unable to download webpage' 
 159             raise ExtractorError(u
'%s: %s' % (errnote
, compat_str(err
)), sys
.exc_info()[2], cause
=err
) 
 161     def _download_webpage_handle(self
, url_or_request
, video_id
, note
=None, errnote
=None): 
 162         """ Returns a tuple (page content as string, URL handle) """ 
 164         # Strip hashes from the URL (#1038) 
 165         if isinstance(url_or_request
, (compat_str
, str)): 
 166             url_or_request 
= url_or_request
.partition('#')[0] 
 168         urlh 
= self
._request
_webpage
(url_or_request
, video_id
, note
, errnote
) 
 169         content_type 
= urlh
.headers
.get('Content-Type', '') 
 170         webpage_bytes 
= urlh
.read() 
 171         m 
= re
.match(r
'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type
) 
 173             encoding 
= m
.group(1) 
 175             m 
= re
.search(br
'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]', 
 176                           webpage_bytes[:1024]) 
 178                 encoding = m.group(1).decode('ascii') 
 181         if self._downloader.params.get('dump_intermediate_pages', False): 
 183                 url = url_or_request.get_full_url() 
 184             except AttributeError: 
 186             self.to_screen(u'Dumping request to ' + url) 
 187             dump = base64.b64encode(webpage_bytes).decode('ascii') 
 188             self._downloader.to_screen(dump) 
 189         if self._downloader.params.get('write_pages', False): 
 191                 url = url_or_request.get_full_url() 
 192             except AttributeError: 
 194             raw_filename = ('%s_%s.dump' % (video_id, url)) 
 195             filename = sanitize_filename(raw_filename, restricted=True) 
 196             self.to_screen(u'Saving request to ' + filename) 
 197             with open(filename, 'wb') as outf: 
 198                 outf.write(webpage_bytes) 
 200         content = webpage_bytes.decode(encoding, 'replace') 
 201         return (content, urlh) 
 203     def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): 
 204         """ Returns the data of the page as a string """ 
 205         return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] 
 207     def to_screen(self, msg): 
 208         """Print msg to screen, prefixing it with '[ie_name]'""" 
 209         self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) 
 211     def report_extraction(self, id_or_name): 
 212         """Report information extraction.""" 
 213         self.to_screen(u'%s: Extracting information' % id_or_name) 
 215     def report_download_webpage(self, video_id): 
 216         """Report webpage download.""" 
 217         self.to_screen(u'%s: Downloading webpage' % video_id) 
 219     def report_age_confirmation(self): 
 220         """Report attempt to confirm age.""" 
 221         self.to_screen(u'Confirming age') 
 223     def report_login(self): 
 224         """Report attempt to log in.""" 
 225         self.to_screen(u'Logging in') 
 227     #Methods for following #608 
 228     def url_result(self, url, ie=None): 
 229         """Returns a url that points to a page that should be processed""" 
 230         #TODO: ie should be the class used for getting the info 
 231         video_info = {'_type': 'url', 
 235     def playlist_result(self, entries, playlist_id=None, playlist_title=None): 
 236         """Returns a playlist""" 
 237         video_info = {'_type': 'playlist', 
 240             video_info['id'] = playlist_id 
 242             video_info['title'] = playlist_title 
 245     def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): 
 247         Perform a regex search on the given string, using a single or a list of 
 248         patterns returning the first matching group. 
 249         In case of failure return a default value or raise a WARNING or a 
 250         RegexNotFoundError, depending on fatal, specifying the field name. 
 252         if isinstance(pattern, (str, compat_str, compiled_regex_type)): 
 253             mobj = re.search(pattern, string, flags) 
 256                 mobj = re.search(p, string, flags) 
 259         if sys.stderr.isatty() and os.name != 'nt': 
 260             _name = u'\033[0;34m%s\033[0m' % name 
 265             # return the first matching group 
 266             return next(g for g in mobj.groups() if g is not None) 
 267         elif default is not None: 
 270             raise RegexNotFoundError(u'Unable to extract %s' % _name) 
 272             self._downloader.report_warning(u'unable to extract %s; ' 
 273                 u'please report this issue on http://yt-dl.org/bug' % _name) 
 276     def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0): 
 278         Like _search_regex, but strips HTML tags and unescapes entities. 
 280         res = self._search_regex(pattern, string, name, default, fatal, flags) 
 282             return clean_html(res).strip() 
 286     def _get_login_info(self): 
 288         Get the login info as (username, password) 
 289         It will look in the netrc file using the _NETRC_MACHINE value 
 290         If there's no info available, return (None, None) 
 292         if self._downloader is None: 
 297         downloader_params = self._downloader.params 
 299         # Attempt to use provided username and password or .netrc data 
 300         if downloader_params.get('username', None) is not None: 
 301             username = downloader_params['username'] 
 302             password = downloader_params['password'] 
 303         elif downloader_params.get('usenetrc', False): 
 305                 info = netrc.netrc().authenticators(self._NETRC_MACHINE) 
 310                     raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) 
 311             except (IOError, netrc.NetrcParseError) as err: 
 312                 self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) 
 314         return (username, password) 
 316     # Helper functions for extracting OpenGraph info 
 319         return r'<meta.+?property=[\'"]og
:%s[\'"].+?content=(?:"(.+?
)"|\'(.+?)\')' % re.escape(prop) 
 321     def _og_search_property(self, prop, html, name=None, **kargs): 
 323             name = 'OpenGraph %s' % prop 
 324         escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs) 
 325         if not escaped is None: 
 326             return unescapeHTML(escaped) 
 329     def _og_search_thumbnail(self, html, **kargs): 
 330         return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs) 
 332     def _og_search_description(self, html, **kargs): 
 333         return self._og_search_property('description', html, fatal=False, **kargs) 
 335     def _og_search_title(self, html, **kargs): 
 336         return self._og_search_property('title', html, **kargs) 
 338     def _og_search_video_url(self, html, name='video url', secure=True, **kargs): 
 339         regexes = [self._og_regex('video')] 
 340         if secure: regexes.insert(0, self._og_regex('video:secure_url')) 
 341         return self._html_search_regex(regexes, html, name, **kargs) 
 343     def _rta_search(self, html): 
 344         # See http://www.rtalabel.org/index.php?content=howtofaq#single 
 345         if re.search(r'(?ix)<meta\s+name="rating
"\s+' 
 346                      r'     content="RTA
-5042-1996-1400-1577-RTA
"', 
 352 class SearchInfoExtractor(InfoExtractor): 
 354     Base class for paged search queries extractors. 
 355     They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} 
 356     Instances should define _SEARCH_KEY and _MAX_RESULTS. 
 360     def _make_valid_url(cls): 
 361         return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY 
 364     def suitable(cls, url): 
 365         return re.match(cls._make_valid_url(), url) is not None 
 367     def _real_extract(self, query): 
 368         mobj = re.match(self._make_valid_url(), query) 
 370             raise ExtractorError(u'Invalid search query "%s"' % query) 
 372         prefix = mobj.group('prefix') 
 373         query = mobj.group('query') 
 375             return self._get_n_results(query, 1) 
 376         elif prefix == 'all': 
 377             return self._get_n_results(query, self._MAX_RESULTS) 
 381                 raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) 
 382             elif n > self._MAX_RESULTS: 
 383                 self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) 
 384                 n = self._MAX_RESULTS 
 385             return self._get_n_results(query, n) 
 387     def _get_n_results(self, query, n): 
 388         """Get a specified number of results for a query""" 
 389         raise NotImplementedError("This method must be implemented by subclasses
") 
 392     def SEARCH_KEY(self): 
 393         return self._SEARCH_KEY