Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
ba46a7bc77d17ed4bcf4dcf7764b1d39f4799958
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import os
3 import re
4 import socket
5 import sys
6 import netrc
7 import xml.etree.ElementTree
8
9 from ..utils import (
10 compat_http_client,
11 compat_urllib_error,
12 compat_str,
13
14 clean_html,
15 compiled_regex_type,
16 ExtractorError,
17 RegexNotFoundError,
18 sanitize_filename,
19 unescapeHTML,
20 )
# Sentinel used as the default value of the ``default`` parameter of
# _search_regex() / _html_search_regex(), so that an explicit default of None
# can be told apart from "no default given".
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or url and ext:

    formats:        A list of dictionaries for each format available, it must
                    be ordered from worst to best quality. Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19")
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).
    urlhandle:      [internal] The urlHandle to be used to download the file,
                    like returned by urllib.request.urlopen

    The following fields are optional:

    thumbnails:     A list of dictionaries (with the entries "resolution" and
                    "url") for the varying thumbnails
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    upload_date:    Video upload date (YYYYMMDD).
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    _real_extract() must return a *list* of information dictionaries as
    described above.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is only run the first time this is called.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class names end in "IE"; the key is the name without that suffix
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """The class name without its two-character ("IE") suffix."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Open url_or_request through the downloader and return the response handle.

        note is printed before the request: None prints the default
        "Downloading webpage" message, False prints nothing.

        On a download error, False is returned if errnote is False (silently)
        or if fatal is false (after reporting a warning); otherwise an
        ExtractorError is raised.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Download a page and return a tuple (page content as string, URL handle).

        Returns False instead of a tuple when _request_webpage() returns
        False, i.e. on a non-fatal download error or when errnote is False.

        The page is decoded with the charset from the Content-Type header,
        else the charset of a <meta> tag found in the first 1024 bytes, else
        UTF-8. An encoding name unknown to Python also falls back to UTF-8.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            # Not asserting "not fatal" here: _request_webpage() also returns
            # False for errnote=False while fatal is still true.
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                # Drop non-ASCII bytes rather than failing on a malformed tag
                encoding = m.group(1).decode('ascii', 'ignore')
            else:
                encoding = 'utf-8'

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # url_or_request is either a request object or a plain URL string
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            raw_filename = ('%s_%s.dump' % (video_id, url))
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The declared charset is not a codec Python knows about
            content = webpage_bytes.decode('utf-8', 'replace')
        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Returns the data of the page as a string, or False if the download failed."""
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, if given, is called with the downloaded string and
        its return value is parsed instead.
        Returns False if the download failed without raising.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # Stays None when the list of patterns is empty
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        if default is not _NO_DEFAULT:
            return default

        # Only the failure messages need the field name; it is wrapped in
        # ANSI colour escapes when stderr is a terminal on a non-Windows system
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # A username may be supplied without a password entry
            password = downloader_params.get('password', None)
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return the regexes matching the <meta> tag of OpenGraph property prop.

        Two patterns are returned, one for each order of the property and
        content attributes.
        """
        content_re = r'content=(?:"([^>]+?)"|\'(.+?)\')'
        property_re = r'property=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Search html for the OpenGraph property prop and return it unescaped.

        name is the field name used in error messages; extra keyword
        arguments are passed on to _search_regex().
        """
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Search for the og:image property (non-fatal)."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Search for the og:description property (non-fatal)."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Search for the og:title property."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Search for the og:video property.

        When secure is true, og:video:secure_url is tried first.
        """
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _html_search_meta(self, name, html, display_name=None):
        """Return the content of the <meta> tag whose itemprop, name or
        property attribute equals name (non-fatal).

        display_name is the field name used in warnings; defaults to name.
        """
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=False)

    def _dc_search_uploader(self, html):
        """Search for the uploader in the dc.creator <meta> tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if html carries the RTA rating <meta> label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the "rating" <meta> tag of html to a number via RATING_TABLE.

        Returns None if the tag is missing or its value is not in the table.
        """
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)
428
429
430
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that handle paged search queries.

    A query has the form <_SEARCH_KEY><prefix>:<query>, where the prefix is
    empty (fetch a single result), a positive integer (fetch that many
    results) or "all" (fetch _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regular expression recognizing this extractor's queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Tell whether url is a search query handled by this extractor."""
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the number of results it asks for."""
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = match.group('prefix'), match.group('query')

        # No count given: a single result
        if prefix == '':
            return self._get_n_results(query, 1)
        # "all": as many results as the extractor supports
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        # Explicit count, capped at _MAX_RESULTS (with a warning)
        n = int(prefix)
        if n <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the _SEARCH_KEY of this extractor."""
        return self._SEARCH_KEY