import base64
import netrc
import os
import re
import socket
import sys
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_str,
    compat_urllib_error,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    url:            Final video URL.
    title:          Video title, unescaped.
    ext:            Video filename extension.

    Instead of url and ext, formats can also be specified.

    The following fields are optional:

    format:         The video format, defaults to ext (used for --get-format)
    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    player_url:     SWF Player URL (used for rtmpdump).
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    view_count:     How many users have watched the video on the platform.
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen
    age_limit:      Age restriction for the video, as an integer (years)
    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url       Mandatory. The URL of the video file
                    * ext       Will be calculated from url if missing
                    * format    A human-readable description of the format
                                ("mp4 container with h264/opus").
                                Calculated from the format_id, width, height,
                                and format_note fields if missing.
                    * format_id A short description of the format
                                ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                ("3D" or "DASH video")
                    * width     Width of the video, if known
                    * height    Height of the video, if known
                    * abr       Average audio bitrate in KBit/s
                    * acodec    Name of the audio codec in use
                    * vbr       Average video bitrate in KBit/s
                    * vcodec    Name of the video codec in use
                    * filesize  The number of bytes, if known in advance
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True
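
    # Illustration (not part of the original file): the kind of dictionary a
    # subclass's _real_extract() is expected to return; all values below are
    # made up for the example.
    #
    #     {
    #         'id': u'42',
    #         'url': u'https://video.example.com/42.mp4',
    #         'ext': u'mp4',
    #         'title': u'An example title',
    #         'uploader': u'Example Uploader',
    #         'upload_date': u'20131201',
    #     }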

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is None:
                errnote = u'Unable to download webpage'
            raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2], cause=err)

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote)
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        content = webpage_bytes.decode(encoding, 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None):
        """ Returns the data of the page as a string """
        return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0]

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML'):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(url_or_request, video_id, note, errnote)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
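
    # Illustration (not part of the original file): how a subclass typically
    # combines the download helpers from its _real_extract(); the regexes and
    # field values below are hypothetical.
    #
    #     def _real_extract(self, url):
    #         video_id = re.match(self._VALID_URL, url).group('id')
    #         webpage = self._download_webpage(url, video_id)
    #         title = self._html_search_regex(r'<h1>([^<]+)</h1>', webpage, u'title')
    #         video_url = self._search_regex(r'file:\s*"([^"]+)"', webpage, u'video URL')
    #         return {'id': video_id, 'url': video_url, 'title': title, 'ext': u'mp4'}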

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    def url_result(self, url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    def playlist_result(self, entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or
        a list of patterns, and return the first matching group.
        On failure, return the default value, emit a warning, or raise a
        RegexNotFoundError, depending on default and fatal; name labels the field.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if sys.stderr.isatty() and os.name != 'nt':
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not None:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
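
    # Illustration (not part of the original file): _search_regex accepts a
    # single pattern or a list of patterns tried in order; with fatal=False a
    # failed match only emits a warning and returns None. The patterns below
    # are hypothetical.
    #
    #     uploader = self._search_regex(
    #         [r'by <b>([^<]+)</b>', r'"uploader":\s*"([^"]+)"'],
    #         webpage, u'uploader', fatal=False)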

    def _html_search_regex(self, pattern, string, name, default=None, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password).
        It will look in the netrc file using the _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
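
    # Illustration (not part of the original file): a subclass that sets
    # _NETRC_MACHINE typically calls this from _real_initialize(); the machine
    # name below is hypothetical.
    #
    #     _NETRC_MACHINE = 'example'
    #
    #     def _real_initialize(self):
    #         (username, password) = self._get_login_info()
    #         if username is None:
    #             return
    #         # ... submit the credentials to the site's login form ...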

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)
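
    # Illustration (not part of the original file): _og_regexes() generates
    # both attribute orders, so property-before-content and content-before-
    # property markup both match. For example:
    #
    #     webpage = u'<meta property="og:title" content="An example title"/>'
    #     self._og_search_title(webpage)   # -> u'An example title'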

    def _html_search_meta(self, name, html, display_name=None):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)
        if not rating:
            return None
        # RATING_TABLE maps the lowercased "rating" value to an age limit
        return RATING_TABLE.get(rating.lower(), None)


class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
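

# Illustration only, not part of the original file: a minimal sketch of a
# SearchInfoExtractor subclass. The search key, site and URL template below
# are hypothetical; a real extractor would parse actual result pages.
class ExampleSearchIE(SearchInfoExtractor):
    _SEARCH_KEY = 'examplesearch'  # enables queries like "examplesearch5:kittens"
    _MAX_RESULTS = 50
    IE_NAME = 'example:search'

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query (hypothetical URLs)."""
        entries = [
            self.url_result('https://video.example.com/watch/%d' % i, video_id=str(i))
            for i in range(n)
        ]
        return self.playlist_result(entries, playlist_title=query)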