import base64
import hashlib
import json
import netrc
import os
import re
import socket
import sys
import time
import xml.etree.ElementTree

from ..utils import (
    compat_http_client,
    compat_urllib_error,
    compat_urllib_parse_urlparse,
    compat_str,

    clean_html,
    compiled_regex_type,
    ExtractorError,
    int_or_none,
    RegexNotFoundError,
    sanitize_filename,
    unescapeHTML,
)

_NO_DEFAULT = object()


class InfoExtractor(object):
    """Information Extractor class.

    Information extractors are the classes that, given a URL, extract
    information about the video (or videos) the URL refers to. This
    information includes the real video URL, the video title, author and
    others. The information is stored in a dictionary which is then
    passed to the FileDownloader. The FileDownloader processes this
    information, possibly downloading the video to the file system, among
    other possible outcomes.

    The dictionaries must include the following fields:

    id:             Video identifier.
    title:          Video title, unescaped.

    Additionally, it must contain either a formats entry or a url one:

    formats:        A list of dictionaries for each format available, ordered
                    from worst to best quality.

                    Potential fields:
                    * url        Mandatory. The URL of the video file
                    * ext        Will be calculated from url if missing
                    * format     A human-readable description of the format
                                 ("mp4 container with h264/opus").
                                 Calculated from the format_id, width, height,
                                 and format_note fields if missing.
                    * format_id  A short description of the format
                                 ("mp4_h264_opus" or "19").
                                 Technically optional, but strongly recommended.
                    * format_note  Additional info about the format
                                 ("3D" or "DASH video")
                    * width      Width of the video, if known
                    * height     Height of the video, if known
                    * resolution Textual description of width and height
                    * tbr        Average bitrate of audio and video in KBit/s
                    * abr        Average audio bitrate in KBit/s
                    * acodec     Name of the audio codec in use
                    * asr        Audio sampling rate in Hertz
                    * vbr        Average video bitrate in KBit/s
                    * vcodec     Name of the video codec in use
                    * container  Name of the container format
                    * filesize   The number of bytes, if known in advance
                    * filesize_approx  An estimate for the number of bytes
                    * player_url SWF Player URL (used for rtmpdump).
                    * protocol   The protocol that will be used for the actual
                                 download, lower-case.
                                 "http", "https", "rtsp", "rtmp", "m3u8" or so.
                    * preference Order number of this format. If this field is
                                 present and not None, the formats get sorted
                                 by this field, regardless of all other values.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
                    * quality    Order number of the video quality of this
                                 format, irrespective of the file format.
                                 -1 for default (order by other properties),
                                 -2 or smaller for less than default.
    url:            Final video URL.
    ext:            Video filename extension.
    format:         The video format, defaults to ext (used for --get-format)
    player_url:     SWF Player URL (used for rtmpdump).

    The following fields are optional:

    display_id:     An alternative identifier for the video, not necessarily
                    unique, but available before title. Typically, id is
                    something like "4234987", title "Dancing naked mole rats",
                    and display_id "dancing-naked-mole-rats"
    thumbnails:     A list of dictionaries, with the following entries:
                        * "url"
                        * "width" (optional, int)
                        * "height" (optional, int)
                        * "resolution" (optional, string "{width}x{height}",
                                        deprecated)
    thumbnail:      Full URL to a video thumbnail image.
    description:    One-line video description.
    uploader:       Full name of the video uploader.
    timestamp:      UNIX timestamp of the moment the video became available.
    upload_date:    Video upload date (YYYYMMDD).
                    If not explicitly set, calculated from timestamp.
    uploader_id:    Nickname or id of the video uploader.
    location:       Physical location of the video.
    subtitles:      The subtitle file contents as a dictionary in the format
                    {language: subtitles}.
    duration:       Length of the video in seconds, as an integer.
    view_count:     How many users have watched the video on the platform.
    like_count:     Number of positive ratings of the video
    dislike_count:  Number of negative ratings of the video
    comment_count:  Number of comments on the video
    age_limit:      Age restriction for the video, as an integer (years)
    webpage_url:    The URL to the video webpage; if given to youtube-dl it
                    should allow getting the same result again. (It will be set
                    by YoutubeDL if it's missing)
    categories:     A list of categories that the video falls in, for example
                    ["Sports", "Berlin"]

    Unless mentioned otherwise, the fields should be Unicode strings.

    Subclasses of this one should re-define the _real_initialize() and
    _real_extract() methods and define a _VALID_URL regexp.
    Probably, they should also be added to the list of extractors.

    Finally, the _WORKING attribute should be set to False for broken IEs
    in order to warn the users and skip the tests.
    """

    _ready = False
    _downloader = None
    _WORKING = True

    def __init__(self, downloader=None):
        """Constructor. Receives an optional downloader."""
        self._ready = False
        self.set_downloader(downloader)

    @classmethod
    def suitable(cls, url):
        """Receives a URL and returns True if suitable for this IE."""

        # This does not use has/getattr intentionally - we want to know whether
        # we have cached the regexp for *this* class, whereas getattr would also
        # match the superclass
        if '_VALID_URL_RE' not in cls.__dict__:
            cls._VALID_URL_RE = re.compile(cls._VALID_URL)
        return cls._VALID_URL_RE.match(url) is not None

    @classmethod
    def working(cls):
        """Getter method for _WORKING."""
        return cls._WORKING

    def initialize(self):
        """Initializes an instance (authentication, etc)."""
        if not self._ready:
            self._real_initialize()
            self._ready = True

    def extract(self, url):
        """Extracts URL information and returns it in list of dicts."""
        self.initialize()
        return self._real_extract(url)

    def set_downloader(self, downloader):
        """Sets the downloader for this IE."""
        self._downloader = downloader

    def _real_initialize(self):
        """Real initialization process. Redefine in subclasses."""
        pass

    def _real_extract(self, url):
        """Real extraction process. Redefine in subclasses."""
        pass

    @classmethod
    def ie_key(cls):
        """A string for getting the InfoExtractor with get_info_extractor"""
        return cls.__name__[:-2]

    @property
    def IE_NAME(self):
        return type(self).__name__[:-2]

    def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the response handle """
        if note is None:
            self.report_download_webpage(video_id)
        elif note is not False:
            if video_id is None:
                self.to_screen(u'%s' % (note,))
            else:
                self.to_screen(u'%s: %s' % (video_id, note))
        try:
            return self._downloader.urlopen(url_or_request)
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            if errnote is False:
                return False
            if errnote is None:
                errnote = u'Unable to download webpage'
            errmsg = u'%s: %s' % (errnote, compat_str(err))
            if fatal:
                raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
            else:
                self._downloader.report_warning(errmsg)
                return False

    def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns a tuple (page content as string, URL handle) """

        # Strip hashes from the URL (#1038)
        if isinstance(url_or_request, (compat_str, str)):
            url_or_request = url_or_request.partition('#')[0]

        urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
        if urlh is False:
            assert not fatal
            return False
        content_type = urlh.headers.get('Content-Type', '')
        webpage_bytes = urlh.read()
        m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
        if m:
            encoding = m.group(1)
        else:
            m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
                          webpage_bytes[:1024])
            if m:
                encoding = m.group(1).decode('ascii')
            elif webpage_bytes.startswith(b'\xff\xfe'):
                encoding = 'utf-16'
            else:
                encoding = 'utf-8'
        if self._downloader.params.get('dump_intermediate_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            self.to_screen(u'Dumping request to ' + url)
            dump = base64.b64encode(webpage_bytes).decode('ascii')
            self._downloader.to_screen(dump)
        if self._downloader.params.get('write_pages', False):
            try:
                url = url_or_request.get_full_url()
            except AttributeError:
                url = url_or_request
            basen = '%s_%s' % (video_id, url)
            if len(basen) > 240:
                h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
                basen = basen[:240 - len(h)] + h
            raw_filename = basen + '.dump'
            filename = sanitize_filename(raw_filename, restricted=True)
            self.to_screen(u'Saving request to ' + filename)
            with open(filename, 'wb') as outf:
                outf.write(webpage_bytes)

        try:
            content = webpage_bytes.decode(encoding, 'replace')
        except LookupError:
            content = webpage_bytes.decode('utf-8', 'replace')

        if (u'<title>Access to this site is blocked</title>' in content and
                u'Websense' in content[:512]):
            msg = u'Access to this webpage has been blocked by Websense filtering software in your network.'
            blocked_iframe = self._html_search_regex(
                r'<iframe src="([^"]+)"', content,
                u'Websense information URL', default=None)
            if blocked_iframe:
                msg += u' Visit %s for more details' % blocked_iframe
            raise ExtractorError(msg, expected=True)

        return (content, urlh)

    def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
        """ Returns the data of the page as a string """
        res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
        if res is False:
            return res
        else:
            content, _ = res
            return content

    def _download_xml(self, url_or_request, video_id,
                      note=u'Downloading XML', errnote=u'Unable to download XML',
                      transform_source=None, fatal=True):
        """Return the xml as an xml.etree.ElementTree.Element"""
        xml_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if xml_string is False:
            return xml_string
        if transform_source:
            xml_string = transform_source(xml_string)
        return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))

    def _download_json(self, url_or_request, video_id,
                       note=u'Downloading JSON metadata',
                       errnote=u'Unable to download JSON metadata',
                       transform_source=None,
                       fatal=True):
        json_string = self._download_webpage(
            url_or_request, video_id, note, errnote, fatal=fatal)
        if (not fatal) and json_string is False:
            return None
        if transform_source:
            json_string = transform_source(json_string)
        try:
            return json.loads(json_string)
        except ValueError as ve:
            raise ExtractorError('Failed to download JSON', cause=ve)

    def report_warning(self, msg, video_id=None):
        idstr = u'' if video_id is None else u'%s: ' % video_id
        self._downloader.report_warning(
            u'[%s] %s%s' % (self.IE_NAME, idstr, msg))

    def to_screen(self, msg):
        """Print msg to screen, prefixing it with '[ie_name]'"""
        self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))

    def report_extraction(self, id_or_name):
        """Report information extraction."""
        self.to_screen(u'%s: Extracting information' % id_or_name)

    def report_download_webpage(self, video_id):
        """Report webpage download."""
        self.to_screen(u'%s: Downloading webpage' % video_id)

    def report_age_confirmation(self):
        """Report attempt to confirm age."""
        self.to_screen(u'Confirming age')

    def report_login(self):
        """Report attempt to log in."""
        self.to_screen(u'Logging in')

    # Methods for following #608
    @staticmethod
    def url_result(url, ie=None, video_id=None):
        """Returns a url that points to a page that should be processed"""
        # TODO: ie should be the class used for getting the info
        video_info = {'_type': 'url',
                      'url': url,
                      'ie_key': ie}
        if video_id is not None:
            video_info['id'] = video_id
        return video_info

    @staticmethod
    def playlist_result(entries, playlist_id=None, playlist_title=None):
        """Returns a playlist"""
        video_info = {'_type': 'playlist',
                      'entries': entries}
        if playlist_id:
            video_info['id'] = playlist_id
        if playlist_title:
            video_info['title'] = playlist_title
        return video_info
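
    # Illustrative use of the two helpers above (the URLs are hypothetical):
    # a playlist extractor would typically return something along the lines of
    #     self.playlist_result(
    #         [self.url_result('http://www.example.com/videos/%s' % vid)
    #          for vid in video_ids],
    #         playlist_id, playlist_title)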

    def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Perform a regex search on the given string, using a single pattern or a
        list of patterns, and return the first matching group.
        In case of failure, return the default value, report a warning, or
        raise a RegexNotFoundError, depending on default and fatal, mentioning
        the field name.
        """
        if isinstance(pattern, (str, compat_str, compiled_regex_type)):
            mobj = re.search(pattern, string, flags)
        else:
            for p in pattern:
                mobj = re.search(p, string, flags)
                if mobj:
                    break

        if os.name != 'nt' and sys.stderr.isatty():
            _name = u'\033[0;34m%s\033[0m' % name
        else:
            _name = name

        if mobj:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        elif default is not _NO_DEFAULT:
            return default
        elif fatal:
            raise RegexNotFoundError(u'Unable to extract %s' % _name)
        else:
            self._downloader.report_warning(u'unable to extract %s; '
                u'please report this issue on http://yt-dl.org/bug' % _name)
            return None

    def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
        """
        Like _search_regex, but strips HTML tags and unescapes entities.
        """
        res = self._search_regex(pattern, string, name, default, fatal, flags)
        if res:
            return clean_html(res).strip()
        else:
            return res

    def _get_login_info(self):
        """
        Get the login info as (username, password).
        It will look in the netrc file using the _NETRC_MACHINE value.
        If there's no info available, return (None, None).
        """
        if self._downloader is None:
            return (None, None)

        username = None
        password = None
        downloader_params = self._downloader.params

        # Attempt to use provided username and password or .netrc data
        if downloader_params.get('username', None) is not None:
            username = downloader_params['username']
            password = downloader_params['password']
        elif downloader_params.get('usenetrc', False):
            try:
                info = netrc.netrc().authenticators(self._NETRC_MACHINE)
                if info is not None:
                    username = info[0]
                    password = info[2]
                else:
                    raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
            except (IOError, netrc.NetrcParseError) as err:
                self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err))

        return (username, password)
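
    # For reference, the .netrc entry looked up above follows the standard
    # netrc format, with the machine name equal to the extractor's
    # _NETRC_MACHINE value, e.g. (hypothetical credentials):
    #     machine example login myuser password mypassword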

    # Helper functions for extracting OpenGraph info
    @staticmethod
    def _og_regexes(prop):
        content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
        property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
        template = r'<meta[^>]+?%s[^>]+?%s'
        return [
            template % (property_re, content_re),
            template % (content_re, property_re),
        ]

    def _og_search_property(self, prop, html, name=None, **kargs):
        if name is None:
            name = 'OpenGraph %s' % prop
        escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
        if escaped is None:
            return None
        return unescapeHTML(escaped)

    def _og_search_thumbnail(self, html, **kargs):
        return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)

    def _og_search_description(self, html, **kargs):
        return self._og_search_property('description', html, fatal=False, **kargs)

    def _og_search_title(self, html, **kargs):
        return self._og_search_property('title', html, **kargs)

    def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
        regexes = self._og_regexes('video')
        if secure:
            regexes = self._og_regexes('video:secure_url') + regexes
        return self._html_search_regex(regexes, html, name, **kargs)

    def _og_search_url(self, html, **kargs):
        return self._og_search_property('url', html, **kargs)

    def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
        if display_name is None:
            display_name = name
        return self._html_search_regex(
            r'''(?ix)<meta
                    (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
                    [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
            html, display_name, fatal=fatal, **kwargs)

    def _dc_search_uploader(self, html):
        return self._html_search_meta('dc.creator', html, 'uploader')

    def _rta_search(self, html):
        # See http://www.rtalabel.org/index.php?content=howtofaq#single
        if re.search(r'(?ix)<meta\s+name="rating"\s+'
                     r' content="RTA-5042-1996-1400-1577-RTA"',
                     html):
            return 18
        return 0

    def _media_rating_search(self, html):
        # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
        rating = self._html_search_meta('rating', html)

        if not rating:
            return None

        RATING_TABLE = {
            'safe for kids': 0,
            'general': 8,
            '14 years': 14,
            'mature': 17,
            'restricted': 19,
        }
        return RATING_TABLE.get(rating.lower(), None)

    def _twitter_search_player(self, html):
        return self._html_search_meta('twitter:player', html,
                                      'twitter card player')

    def _sort_formats(self, formats):
        if not formats:
            raise ExtractorError(u'No video formats found')

        def _formats_key(f):
            # TODO remove the following workaround
            from ..utils import determine_ext
            if not f.get('ext') and 'url' in f:
                f['ext'] = determine_ext(f['url'])

            preference = f.get('preference')
            if preference is None:
                proto = f.get('protocol')
                if proto is None:
                    proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

                preference = 0 if proto in ['http', 'https'] else -0.1
                if f.get('ext') in ['f4f', 'f4m']:  # Not yet supported
                    preference -= 0.5

            if f.get('vcodec') == 'none':  # audio only
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'aac', u'mp3', u'm4a', u'webm', u'ogg', u'opus']
                else:
                    ORDER = [u'webm', u'opus', u'ogg', u'mp3', u'aac', u'm4a']
                ext_preference = 0
                try:
                    audio_ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    audio_ext_preference = -1
            else:
                if self._downloader.params.get('prefer_free_formats'):
                    ORDER = [u'flv', u'mp4', u'webm']
                else:
                    ORDER = [u'webm', u'flv', u'mp4']
                try:
                    ext_preference = ORDER.index(f['ext'])
                except ValueError:
                    ext_preference = -1
                audio_ext_preference = 0

            return (
                preference,
                f.get('quality') if f.get('quality') is not None else -1,
                f.get('height') if f.get('height') is not None else -1,
                f.get('width') if f.get('width') is not None else -1,
                ext_preference,
                f.get('tbr') if f.get('tbr') is not None else -1,
                f.get('vbr') if f.get('vbr') is not None else -1,
                f.get('abr') if f.get('abr') is not None else -1,
                audio_ext_preference,
                f.get('filesize') if f.get('filesize') is not None else -1,
                f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
                f.get('format_id'),
            )
        formats.sort(key=_formats_key)
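
    # Note on the ordering produced above: formats end up sorted worst-to-best,
    # with explicit 'preference' and 'quality' dominating, then resolution,
    # extension preference, bitrates and filesize as tie-breakers. For two
    # hypothetical entries
    #     {'format_id': 'sd', 'url': 'http://example.com/v.mp4', 'height': 360}
    #     {'format_id': 'hd', 'url': 'http://example.com/v.mp4', 'height': 720}
    # the 'hd' one sorts later, so callers take the best format from the end
    # of the list.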

    def http_scheme(self):
        """ Either "http:" or "https:", depending on the user's preferences """
        return (
            'http:'
            if self._downloader.params.get('prefer_insecure', False)
            else 'https:')

    def _proto_relative_url(self, url, scheme=None):
        if url is None:
            return url
        if url.startswith('//'):
            if scheme is None:
                scheme = self.http_scheme()
            return scheme + url
        else:
            return url

    def _sleep(self, timeout, video_id, msg_template=None):
        if msg_template is None:
            msg_template = u'%(video_id)s: Waiting for %(timeout)s seconds'
        msg = msg_template % {'video_id': video_id, 'timeout': timeout}
        self.to_screen(msg)
        time.sleep(timeout)

    def _extract_f4m_formats(self, manifest_url, video_id):
        manifest = self._download_xml(
            manifest_url, video_id, 'Downloading f4m manifest',
            'Unable to download f4m manifest')

        formats = []
        for media_el in manifest.findall('{http://ns.adobe.com/f4m/1.0}media'):
            formats.append({
                'url': manifest_url,
                'ext': 'flv',
                'tbr': int_or_none(media_el.attrib.get('bitrate')),
                'width': int_or_none(media_el.attrib.get('width')),
                'height': int_or_none(media_el.attrib.get('height')),
            })
        self._sort_formats(formats)

        return formats
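

# A minimal, illustrative sketch of an InfoExtractor subclass (not part of
# youtube-dl; the site, URL pattern and page markup assumed below are
# hypothetical, and the class is not registered in the extractor list).
# It shows the typical flow: match the URL, download the page, pull fields
# out with the search helpers above, and return the info dict documented in
# the InfoExtractor docstring.
class _ExampleVideoIE(InfoExtractor):
    _VALID_URL = r'https?://(?:www\.)?example\.com/videos/(?P<id>[0-9]+)'

    def _real_extract(self, url):
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        # _og_search_title raises RegexNotFoundError if the tag is missing,
        # while the description and thumbnail helpers are non-fatal and may
        # return None.
        formats = [{
            'url': self._og_search_video_url(webpage),
            'format_id': 'http',
        }]
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._og_search_title(webpage),
            'description': self._og_search_description(webpage),
            'thumbnail': self._og_search_thumbnail(webpage),
            'formats': formats,
        }

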
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search query extractors.
    They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        return re.match(cls._make_valid_url(), url) is not None

    def _real_extract(self, query):
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError(u'Invalid search query "%s"' % query)

        prefix = mobj.group('prefix')
        query = mobj.group('query')
        if prefix == '':
            return self._get_n_results(query, 1)
        elif prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)
        else:
            n = int(prefix)
            if n <= 0:
                raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query))
            elif n > self._MAX_RESULTS:
                self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
                n = self._MAX_RESULTS
            return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        return self._SEARCH_KEY
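

# A minimal, illustrative SearchInfoExtractor subclass (hypothetical: the
# search key, site and markup below are made up, and the class is not
# registered). It shows how _SEARCH_KEY, _MAX_RESULTS and _get_n_results fit
# together: the base class parses queries such as "examplesearch5:kittens"
# and this class answers with a playlist of url results.
class _ExampleSearchIE(SearchInfoExtractor):
    _SEARCH_KEY = 'examplesearch'
    _MAX_RESULTS = 50

    def _get_n_results(self, query, n):
        # A real extractor would URL-encode the query and page through results.
        search_url = 'http://www.example.com/search?q=' + query
        webpage = self._download_webpage(search_url, u'query "%s"' % query)
        paths = re.findall(r'href="(/videos/[0-9]+)"', webpage)[:n]
        entries = [
            self.url_result('http://www.example.com' + path)
            for path in paths
        ]
        return self.playlist_result(entries, playlist_title=query)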