]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Imported Upstream version 2014.07.11
[youtubedl] / youtube_dl / extractor / common.py
1 import base64
2 import hashlib
3 import json
4 import os
5 import re
6 import socket
7 import sys
8 import netrc
9 import xml.etree.ElementTree
10
11 from ..utils import (
12 compat_http_client,
13 compat_urllib_error,
14 compat_urllib_parse_urlparse,
15 compat_str,
16
17 clean_html,
18 compiled_regex_type,
19 ExtractorError,
20 RegexNotFoundError,
21 sanitize_filename,
22 unescapeHTML,
23 )
# Sentinel that lets None be passed as a legitimate explicit default value.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc).

        _real_initialize() is run at most once per instance; later calls
        are no-ops.
        """
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Initializes the IE if needed and returns _real_extract(url)."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Class name without its last two characters (the "IE" suffix).
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Name used in screen output: the class name minus its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Open url_or_request through the downloader and return the response handle.

        note:    Progress message. None prints the default "Downloading
                 webpage" message, False prints nothing.
        errnote: Error message prefix. None selects a default message; False
                 makes the method return False silently on failure.
        fatal:   If true, a failed request raises ExtractorError; otherwise
                 a warning is reported and False is returned.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Guess the text encoding of a downloaded page.

        The charset of the Content-Type header wins; otherwise a
        <meta ... charset=...> declaration within the first 1024 bytes is
        used, then a UTF-16 little-endian byte order mark, and finally
        UTF-8 is assumed.
        """
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            return m.group(1)
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            return m.group(1).decode('ascii')
        if webpage_bytes.startswith(b'\xff\xfe'):
            return 'utf-16'
        return 'utf-8'

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle)

        Returns False instead when the request failed and was not fatal.
        Honours the 'dump_intermediate_pages' and 'write_pages' downloader
        params, and raises an expected ExtractorError when the page is a
        Websense block page.
        """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Request objects expose get_full_url(); plain strings are
            # already the URL.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the file name short: truncate and append a hash of
                # the full name so distinct pages do not collide.
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # Unknown encoding name announced by the server or the page.
            content = webpage_bytes.decode('utf-8', 'replace')

        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string

        Returns False when the download failed and was not fatal.
        """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        content, _ = res
        return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element

        transform_source, if given, is applied to the downloaded string
        before parsing. Returns False when the download failed and was not
        fatal.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None, fatal=True):
        """Download a page and return it parsed as JSON.

        transform_source, if given, is applied to the downloaded string
        before parsing. When fatal is false and the download fails, None
        is returned. Raises ExtractorError if the data is not valid JSON.
        """
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
            return None
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)

    def report_warning(self, msg, video_id=None):
        """Report a warning through the downloader, prefixed with '[ie_name]'."""
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            # An empty pattern list must count as "no match" rather than
            # leaving mobj unbound.
            mobj = None
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Highlight the field name when stderr is a terminal (not on Windows).
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            # A username without a password yields (username, None)
            # instead of a KeyError.
            password = downloader_params.get('password')
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return regexes matching the <meta> tag of OpenGraph property prop.

        Two patterns are returned, one for each attribute order
        (property before content, and content before property).
        """
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the unescaped content of OpenGraph property prop, or None.

        Extra keyword arguments are passed on to _search_regex.
        """
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Return the og:image value (non-fatal lookup)."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Return the og:description value (non-fatal lookup)."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Return the og:title value."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Return the og:video value, trying og:video:secure_url first if secure."""
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        """Return the og:url value."""
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False):
        """Return the content of the <meta> tag whose itemprop, name or
        property attribute equals name."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal)

    def _dc_search_uploader(self, html):
        """Return the uploader from the dc.creator <meta> tag."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA rating label, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the "rating" <meta> tag to an age limit.

        Returns None when the tag is missing or its value is unknown.
        """
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        """Return the content of the twitter:player <meta> tag."""
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
        """Sort formats in place, from worst to best.

        Raises ExtractorError if formats is empty. A missing 'ext' is
        filled in from the format's 'url' when one is present.
        """
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            if not f.get('ext') and 'url' in f:
                # Imported only when actually needed.
                from ..utils import determine_ext
                f['ext'] = determine_ext(f['url'])

            # May still be None when the format has neither ext nor url.
            ext = f.get('ext')

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if ext in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(ext)
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(ext)
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            # None cannot be ordered against a string on Python 3; an empty
            # string sorts before every real id, as None did on Python 2.
            format_id = f.get('format_id')

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                format_id if format_id is not None else '',
            )
        formats.sort(key=_formats_key)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        """Prefix a protocol-relative URL ("//host/path") with a scheme.

        scheme defaults to http_scheme(). None and URLs that do not start
        with "//" are returned unchanged.
        """
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url
577
578
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for extractors that answer paged search queries.

    Queries have the form _SEARCH_KEY<prefix>:<query>, where <prefix> is
    empty (first result only), a positive number (that many results) or
    "all" (up to _MAX_RESULTS results).
    Subclasses should define _SEARCH_KEY and _MAX_RESULTS and implement
    _get_n_results().
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regexp source that search queries must match."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and delegate to _get_n_results().

        Raises ExtractorError if the query does not match the expected
        format. Requests for more than _MAX_RESULTS results are capped,
        with a warning.
        """
        match = re.match(self._make_valid_url(), query)
        if match is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = match.group('prefix')
        query = match.group('query')

        # No prefix: only the first result is wanted.
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to _SEARCH_KEY."""
        return self._SEARCH_KEY
620 return self._SEARCH_KEY
621