# Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
# Imported Upstream version 2014.07.15
# [youtubedl] / youtube_dl / extractor / common.py
import base64
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    determine_ext,
    ExtractorError,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)
# Sentinel meaning "no default supplied"; lets callers pass default=None
# explicitly to the regex helpers below.
_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id      An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The url to the video webpage, if given to youtube-dl it
                    should allow to get the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        # Drops the last two characters of the class name (the "IE" suffix
        # by naming convention).
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        """Class name without its last two characters."""
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """Open url_or_request through the downloader and return the handle.

        note:    progress message; None prints the default "Downloading
                 webpage" line, False prints nothing.
        errnote: error message prefix; False makes a failure return False
                 silently, whatever the value of fatal.
        fatal:   raise ExtractorError on failure instead of warning and
                 returning False.
        """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    @staticmethod
    def _guess_encoding_from_content(content_type, webpage_bytes):
        """Pick the text encoding of a downloaded page.

        Tried in order: the charset of the Content-Type header, a
        <meta ... charset=...> declaration within the first 1024 bytes,
        a UTF-16 little-endian byte order mark, and finally UTF-8.
        """
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            return m.group(1)
        m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                      webpage_bytes[:1024])
        if m:
            return m.group(1).decode('ascii')
        if webpage_bytes.startswith(b'\xff\xfe'):
            return 'utf-16'
        return 'utf-8'

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        encoding = self._guess_encoding_from_content(content_type, webpage_bytes)

        params = self._downloader.params
        dump_pages = params.get('dump_intermediate_pages', False)
        write_pages = params.get('write_pages', False)
        if dump_pages or write_pages:
            # Both debugging aids need the plain URL; a request object
            # exposes it through get_full_url(), a string is used as is.
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
        if dump_pages:
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if write_pages:
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                # Keep the name within 240 characters while staying unique
                # by replacing the tail with a digest of the full name.
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            # The page announced an encoding Python does not know
            content = webpage_bytes.decode('utf-8', 'replace')

        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element

        Returns False when the download failed non-fatally. transform_source,
        if given, is applied to the page text before parsing.
        """
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None, fatal=True):
        """Download a page and return it parsed as JSON.

        transform_source, if given, is applied to the page text before
        parsing. With fatal=False a failed download or unparsable document
        yields None (plus a warning for parse errors); with fatal=True
        (the default) an ExtractorError is raised instead.
        """
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if json_string is False:
            # Download failed without raising; there is nothing to parse.
            return None
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            if fatal:
                raise ExtractorError(
                    u'%s: Failed to parse JSON' % video_id, cause=ve)
            self.report_warning(
                u'Failed to parse JSON: %s' % compat_str(ve), video_id)
            return None

    def report_warning(self, msg, video_id=None):
        """Forward a warning to the downloader, prefixed with '[ie_name]'
        and, if given, the video id."""
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single or a list of
        patterns returning the first matching group.
        In case of failure return a default value or raise a WARNING or a
        RegexNotFoundError, depending on fatal, specifying the field name.
        """
        # Start from "no match" so that an empty pattern list falls through
        # to the default/fatal handling instead of leaving mobj unbound.
        mobj = None
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        # Colour the field name blue on non-Windows terminals
        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password)
        It will look in the netrc file using the _NETRC_MACHINE value
        If there's no info available, return (None, None)
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        """Return regexes matching the <meta> tag of OpenGraph property
        prop, one for each ordering of the property and content attributes."""
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        """Return the unescaped content of OpenGraph property prop, or None
        when the search yields None. Extra keyword arguments are passed on
        to _search_regex."""
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        """Non-fatal lookup of the og:image property."""
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        """Non-fatal lookup of the og:description property."""
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        """Lookup of the og:title property."""
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        """Lookup of the og:video property; with secure=True the
        og:video:secure_url property is tried first."""
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        """Lookup of the og:url property."""
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        """Return the content attribute of the <meta> tag whose itemprop,
        name or property attribute equals name (non-fatal by default)."""
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']%s["\'])
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        """Lookup of the dc.creator meta tag, reported as 'uploader'."""
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        """Return 18 if the page carries the RTA rating meta tag, else 0."""
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r'     content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        """Map the 'rating' meta tag to a number via RATING_TABLE; None if
        the tag is missing or its value is not in the table."""
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        """Lookup of the twitter:player meta tag."""
        return self._html_search_meta('twitter:player', html,
            'twitter card player')

    def _sort_formats(self, formats):
        """Sort formats in place, from worst to best.

        Formats lacking 'ext' get it filled in from their 'url' first.
        Raises ExtractorError if formats is empty.
        """
        if not formats:
            raise ExtractorError(u'No video formats found')

        # TODO remove the following workaround
        for f in formats:
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

        # Later entries rank higher (the list index is the sort value).
        if self._downloader.params.get('prefer_free_formats'):
            audio_order = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
            video_order = [u'flv', u'mp4', u'webm']
        else:
            audio_order = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
            video_order = [u'webm', u'flv', u'mp4']

        def _rank(order, ext):
            # Position of ext in order, or -1 for unknown/missing extensions
            try:
                return order.index(ext)
            except ValueError:
                return -1

        def _known(value, fallback=-1):
            # None cannot be ordered against numbers or strings on Python 3
            return value if value is not None else fallback

        def _formats_key(f):
            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                ext_preference = 0
                audio_ext_preference = _rank(audio_order, f.get('ext'))
            else:
                ext_preference = _rank(video_order, f.get('ext'))
                audio_ext_preference = 0

            return (
                preference,
                _known(f.get('quality')),
                _known(f.get('height')),
                _known(f.get('width')),
                ext_preference,
                _known(f.get('tbr')),
                _known(f.get('vbr')),
                _known(f.get('abr')),
                audio_ext_preference,
                _known(f.get('filesize')),
                # Final tie-breaker; '' keeps missing ids comparable to strings
                _known(f.get('format_id'), u''),
            )
        formats.sort(key=_formats_key)

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        """Prefix a protocol-relative URL ("//host/path") with scheme, or
        with http_scheme() if none is given; other values pass through."""
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        """Announce a wait on screen, then sleep for timeout seconds.

        msg_template may use the %(video_id)s and %(timeout)s keys.
        """
        if msg_template is None:
            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)
585
586
class SearchInfoExtractor(InfoExtractor):
    """
    Common base for extractors that run paged search queries.
    Queries look like _SEARCH_KEY(|all|[0-9]):{query}
    Concrete classes supply _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the query regex: search key, optional count or 'all',
        a colon, then the query text."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """True if url is a search query addressed to this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many (capped at _MAX_RESULTS with a warning).
        """
        parsed = re.match(self._make_valid_url(), query)
        if parsed is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix, query = parsed.group('prefix', 'query')

        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        wanted = int(prefix)
        if wanted <= 0:
            raise ExtractorError(u'invalid download number %s for query "%s"' % (wanted, query))
        if wanted > self._MAX_RESULTS:
            self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, wanted))
            wanted = self._MAX_RESULTS
        return self._get_n_results(query, wanted)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Read-only access to the class's _SEARCH_KEY."""
        return self._SEARCH_KEY