]> Raphaƫl G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
e80a2dad0b2e12c5e9d14a486feb0b600de66823
[youtubedl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..compat import (
16 compat_cookiejar,
17 compat_http_client,
18 compat_urllib_error,
19 compat_urllib_parse_urlparse,
20 compat_urlparse,
21 compat_str,
22 )
23 from ..utils import (
24 clean_html,
25 compiled_regex_type,
26 ExtractorError,
27 float_or_none,
28 int_or_none,
29 RegexNotFoundError,
30 sanitize_filename,
31 unescapeHTML,
32 )
# Unique sentinel meaning "the caller supplied no default".  It is compared
# by identity, so None remains usable as a real default value.
_NO_DEFAULT = object()
34
35
36 class InfoExtractor(object):
37 """Information Extractor class.
38
39 Information extractors are the classes that, given a URL, extract
40 information about the video (or videos) the URL refers to. This
41 information includes the real video URL, the video title, author and
42 others. The information is stored in a dictionary which is then
43 passed to the FileDownloader. The FileDownloader processes this
44 information possibly downloading the video to the file system, among
45 other possible outcomes.
46
47 The type field determines the type of the result.
48 By far the most common value (and the default if _type is missing) is
49 "video", which indicates a single video.
50
51 For a video, the dictionaries must include the following fields:
52
53 id: Video identifier.
54 title: Video title, unescaped.
55
56 Additionally, it must contain either a formats entry or a url one:
57
58 formats: A list of dictionaries for each format available, ordered
59 from worst to best quality.
60
61 Potential fields:
62 * url Mandatory. The URL of the video file
63 * ext Will be calculated from url if missing
64 * format A human-readable description of the format
65 ("mp4 container with h264/opus").
66 Calculated from the format_id, width, height.
67 and format_note fields if missing.
68 * format_id A short description of the format
69 ("mp4_h264_opus" or "19").
70 Technically optional, but strongly recommended.
71 * format_note Additional info about the format
72 ("3D" or "DASH video")
73 * width Width of the video, if known
74 * height Height of the video, if known
75 * resolution Textual description of width and height
76 * tbr Average bitrate of audio and video in KBit/s
77 * abr Average audio bitrate in KBit/s
78 * acodec Name of the audio codec in use
79 * asr Audio sampling rate in Hertz
80 * vbr Average video bitrate in KBit/s
81 * fps Frame rate
82 * vcodec Name of the video codec in use
83 * container Name of the container format
84 * filesize The number of bytes, if known in advance
85 * filesize_approx An estimate for the number of bytes
86 * player_url SWF Player URL (used for rtmpdump).
87 * protocol The protocol that will be used for the actual
88 download, lower-case.
89 "http", "https", "rtsp", "rtmp", "m3u8" or so.
90 * preference Order number of this format. If this field is
91 present and not None, the formats get sorted
92 by this field, regardless of all other values.
93 -1 for default (order by other properties),
94 -2 or smaller for less than default.
95 * language_preference Is this in the correct requested
96 language?
97 10 if it's what the URL is about,
98 -1 for default (don't know),
99 -10 otherwise, other values reserved for now.
100 * quality Order number of the video quality of this
101 format, irrespective of the file format.
102 -1 for default (order by other properties),
103 -2 or smaller for less than default.
104 * source_preference Order number for this video source
105 (quality takes higher priority)
106 -1 for default (order by other properties),
107 -2 or smaller for less than default.
108 * http_referer HTTP Referer header value to set.
109 * http_method HTTP method to use for the download.
110 * http_headers A dictionary of additional HTTP headers
111 to add to the request.
112 * http_post_data Additional data to send with a POST
113 request.
114 url: Final video URL.
115 ext: Video filename extension.
116 format: The video format, defaults to ext (used for --get-format)
117 player_url: SWF Player URL (used for rtmpdump).
118
119 The following fields are optional:
120
121 display_id An alternative identifier for the video, not necessarily
122 unique, but available before title. Typically, id is
123 something like "4234987", title "Dancing naked mole rats",
124 and display_id "dancing-naked-mole-rats"
125 thumbnails: A list of dictionaries, with the following entries:
126 * "url"
127 * "width" (optional, int)
128 * "height" (optional, int)
129 * "resolution" (optional, string "{width}x{height}",
130 deprecated)
131 thumbnail: Full URL to a video thumbnail image.
132 description: One-line video description.
133 uploader: Full name of the video uploader.
134 timestamp: UNIX timestamp of the moment the video became available.
135 upload_date: Video upload date (YYYYMMDD).
136 If not explicitly set, calculated from timestamp.
137 uploader_id: Nickname or id of the video uploader.
138 location: Physical location where the video was filmed.
139 subtitles: The subtitle file contents as a dictionary in the format
140 {language: subtitles}.
141 duration: Length of the video in seconds, as an integer.
142 view_count: How many users have watched the video on the platform.
143 like_count: Number of positive ratings of the video
144 dislike_count: Number of negative ratings of the video
145 comment_count: Number of comments on the video
146 age_limit: Age restriction for the video, as an integer (years)
147 webpage_url: The url to the video webpage, if given to youtube-dl it
148 should allow to get the same result again. (It will be set
149 by YoutubeDL if it's missing)
150 categories: A list of categories that the video falls in, for example
151 ["Sports", "Berlin"]
152 is_live: True, False, or None (=unknown). Whether this video is a
153 live stream that goes on instead of a fixed-length video.
154
155 Unless mentioned otherwise, the fields should be Unicode strings.
156
157 Unless mentioned otherwise, None is equivalent to absence of information.
158
159
160 _type "playlist" indicates multiple videos.
161 There must be a key "entries", which is a list or a PagedList object, each
162 element of which is a valid dictionary under this specification.
163
164 Additionally, playlists can have "title" and "id" attributes with the same
165 semantics as videos (see above).
166
167
168 _type "multi_video" indicates that there are multiple videos that
169 form a single show, for example multiple acts of an opera or TV episode.
170 It must have an entries key like a playlist and contain all the keys
171 required for a video at the same time.
172
173
174 _type "url" indicates that the video must be extracted from another
175 location, possibly by a different extractor. Its only required key is:
176 "url" - the next URL to extract.
177
178 Additionally, it may have properties believed to be identical to the
179 resolved entity, for example "title" if the title of the referred video is
180 known ahead of time.
181
182
183 _type "url_transparent" entities have the same specification as "url", but
184 indicate that the given additional information is more precise than the one
185 associated with the resolved URL.
186 This is useful when a site employs a video service that hosts the video and
187 its technical metadata, but that video service does not embed a useful
188 title, description etc.
189
190
191 Subclasses of this one should re-define the _real_initialize() and
192 _real_extract() methods and define a _VALID_URL regexp.
193 Probably, they should also be added to the list of extractors.
194
195 Finally, the _WORKING attribute should be set to False for broken IEs
196 in order to warn the users and skip the tests.
197 """
198
# True once initialize() has run _real_initialize().
_ready = False
# Downloader object stored by set_downloader(); None until one is set.
_downloader = None
# Value returned by working(); set to False for broken extractors.
_WORKING = True
202
203 def __init__(self, downloader=None):
204 """Constructor. Receives an optional downloader."""
205 self._ready = False
206 self.set_downloader(downloader)
207
@classmethod
def suitable(cls, url):
    """Return True if this extractor's _VALID_URL matches the given URL."""
    # Look in cls.__dict__ on purpose instead of using hasattr/getattr: the
    # compiled pattern has to be cached for *this* class, and normal attribute
    # lookup would also find one cached on a superclass.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
218
219 @classmethod
220 def _match_id(cls, url):
221 if '_VALID_URL_RE' not in cls.__dict__:
222 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
223 m = cls._VALID_URL_RE.match(url)
224 assert m
225 return m.group('id')
226
@classmethod
def working(cls):
    """Return the class's _WORKING flag."""
    is_working = cls._WORKING
    return is_working
231
def initialize(self):
    """Run _real_initialize() (authentication, etc.) once per instance."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
237
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url)."""
    self.initialize()
    result = self._real_extract(url)
    return result
242
def set_downloader(self, downloader):
    """Attach the given downloader (or None) to this extractor."""
    self._downloader = downloader
246
247 def _real_initialize(self):
248 """Real initialization process. Redefine in subclasses."""
249 pass
250
251 def _real_extract(self, url):
252 """Real extraction process. Redefine in subclasses."""
253 pass
254
@classmethod
def ie_key(cls):
    """Return the key for getting this InfoExtractor with get_info_extractor.

    The key is the class name with its last two characters removed.
    """
    class_name = cls.__name__
    return class_name[:-2]
259
@property
def IE_NAME(self):
    """Name of this extractor: its class name minus the last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
263
264 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
265 """ Returns the response handle """
266 if note is None:
267 self.report_download_webpage(video_id)
268 elif note is not False:
269 if video_id is None:
270 self.to_screen('%s' % (note,))
271 else:
272 self.to_screen('%s: %s' % (video_id, note))
273 try:
274 return self._downloader.urlopen(url_or_request)
275 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
276 if errnote is False:
277 return False
278 if errnote is None:
279 errnote = 'Unable to download webpage'
280 errmsg = '%s: %s' % (errnote, compat_str(err))
281 if fatal:
282 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
283 else:
284 self._downloader.report_warning(errmsg)
285 return False
286
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Download a page and return the tuple (page content as string, URL handle).

    Returns False instead when the request failed without raising.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request, _, _ = url_or_request.partition('#')

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        assert not fatal
        return False
    page = self._webpage_read_content(handle, url_or_request, video_id, note, errnote, fatal)
    return (page, handle)
299
300 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None):
301 content_type = urlh.headers.get('Content-Type', '')
302 webpage_bytes = urlh.read()
303 if prefix is not None:
304 webpage_bytes = prefix + webpage_bytes
305 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
306 if m:
307 encoding = m.group(1)
308 else:
309 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
310 webpage_bytes[:1024])
311 if m:
312 encoding = m.group(1).decode('ascii')
313 elif webpage_bytes.startswith(b'\xff\xfe'):
314 encoding = 'utf-16'
315 else:
316 encoding = 'utf-8'
317 if self._downloader.params.get('dump_intermediate_pages', False):
318 try:
319 url = url_or_request.get_full_url()
320 except AttributeError:
321 url = url_or_request
322 self.to_screen('Dumping request to ' + url)
323 dump = base64.b64encode(webpage_bytes).decode('ascii')
324 self._downloader.to_screen(dump)
325 if self._downloader.params.get('write_pages', False):
326 try:
327 url = url_or_request.get_full_url()
328 except AttributeError:
329 url = url_or_request
330 basen = '%s_%s' % (video_id, url)
331 if len(basen) > 240:
332 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
333 basen = basen[:240 - len(h)] + h
334 raw_filename = basen + '.dump'
335 filename = sanitize_filename(raw_filename, restricted=True)
336 self.to_screen('Saving request to ' + filename)
337 # Working around MAX_PATH limitation on Windows (see
338 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
339 if os.name == 'nt':
340 absfilepath = os.path.abspath(filename)
341 if len(absfilepath) > 259:
342 filename = '\\\\?\\' + absfilepath
343 with open(filename, 'wb') as outf:
344 outf.write(webpage_bytes)
345
346 try:
347 content = webpage_bytes.decode(encoding, 'replace')
348 except LookupError:
349 content = webpage_bytes.decode('utf-8', 'replace')
350
351 if ('<title>Access to this site is blocked</title>' in content and
352 'Websense' in content[:512]):
353 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
354 blocked_iframe = self._html_search_regex(
355 r'<iframe src="([^"]+)"', content,
356 'Websense information URL', default=None)
357 if blocked_iframe:
358 msg += ' Visit %s for more details' % blocked_iframe
359 raise ExtractorError(msg, expected=True)
360
361 return content
362
363 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
364 """ Returns the data of the page as a string """
365 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
366 if res is False:
367 return res
368 else:
369 content, _ = res
370 return content
371
372 def _download_xml(self, url_or_request, video_id,
373 note='Downloading XML', errnote='Unable to download XML',
374 transform_source=None, fatal=True):
375 """Return the xml as an xml.etree.ElementTree.Element"""
376 xml_string = self._download_webpage(
377 url_or_request, video_id, note, errnote, fatal=fatal)
378 if xml_string is False:
379 return xml_string
380 if transform_source:
381 xml_string = transform_source(xml_string)
382 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
383
384 def _download_json(self, url_or_request, video_id,
385 note='Downloading JSON metadata',
386 errnote='Unable to download JSON metadata',
387 transform_source=None,
388 fatal=True):
389 json_string = self._download_webpage(
390 url_or_request, video_id, note, errnote, fatal=fatal)
391 if (not fatal) and json_string is False:
392 return None
393 if transform_source:
394 json_string = transform_source(json_string)
395 try:
396 return json.loads(json_string)
397 except ValueError as ve:
398 errmsg = '%s: Failed to parse JSON ' % video_id
399 if fatal:
400 raise ExtractorError(errmsg, cause=ve)
401 else:
402 self.report_warning(errmsg + str(ve))
403
def report_warning(self, msg, video_id=None):
    """Forward a warning to the downloader, prefixed with '[ie_name]'.

    When video_id is given, it is inserted before the message.
    """
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    text = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(text)
408
def to_screen(self, msg):
    """Print msg through the downloader, prefixing it with '[ie_name]'."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
412
def report_extraction(self, id_or_name):
    """Print a message saying that information is being extracted."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
416
def report_download_webpage(self, video_id):
    """Print a message saying that the webpage is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
420
def report_age_confirmation(self):
    """Print a message saying that age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
424
def report_login(self):
    """Print a message saying that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
428
429 # Methods for following #608
430 @staticmethod
431 def url_result(url, ie=None, video_id=None):
432 """Returns a url that points to a page that should be processed"""
433 # TODO: ie should be the class used for getting the info
434 video_info = {'_type': 'url',
435 'url': url,
436 'ie_key': ie}
437 if video_id is not None:
438 video_info['id'] = video_id
439 return video_info
440
441 @staticmethod
442 def playlist_result(entries, playlist_id=None, playlist_title=None):
443 """Returns a playlist"""
444 video_info = {'_type': 'playlist',
445 'entries': entries}
446 if playlist_id:
447 video_info['id'] = playlist_id
448 if playlist_title:
449 video_info['title'] = playlist_title
450 return video_info
451
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern: a string, a compiled regex, or an iterable of those; the
        first one that matches is used.
    group: group to return; None selects the first group that is not None.
    default: returned when nothing matches (takes precedence over fatal).
    """
    # Start from "no match" so that an empty iterable of patterns goes down
    # the regular failure path instead of hitting an unbound local.
    mobj = None
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name in blue on terminals, except on Windows.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        if group is None:
            # return the first matching group
            return next(g for g in mobj.groups() if g is not None)
        else:
            return mobj.group(group)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
        return None
486
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None):
    """
    Like _search_regex, but strips HTML tags and unescapes entities.

    Falsy results (None, an empty match, a falsy default) are returned
    unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags, group)
    if not found:
        return found
    return clean_html(found).strip()
496
497 def _get_login_info(self):
498 """
499 Get the the login info as (username, password)
500 It will look in the netrc file using the _NETRC_MACHINE value
501 If there's no info available, return (None, None)
502 """
503 if self._downloader is None:
504 return (None, None)
505
506 username = None
507 password = None
508 downloader_params = self._downloader.params
509
510 # Attempt to use provided username and password or .netrc data
511 if downloader_params.get('username', None) is not None:
512 username = downloader_params['username']
513 password = downloader_params['password']
514 elif downloader_params.get('usenetrc', False):
515 try:
516 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
517 if info is not None:
518 username = info[0]
519 password = info[2]
520 else:
521 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
522 except (IOError, netrc.NetrcParseError) as err:
523 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
524
525 return (username, password)
526
527 def _get_tfa_info(self):
528 """
529 Get the two-factor authentication info
530 TODO - asking the user will be required for sms/phone verify
531 currently just uses the command line option
532 If there's no info available, return None
533 """
534 if self._downloader is None:
535 return None
536 downloader_params = self._downloader.params
537
538 if downloader_params.get('twofactor', None) is not None:
539 return downloader_params['twofactor']
540
541 return None
542
543 # Helper functions for extracting OpenGraph info
544 @staticmethod
545 def _og_regexes(prop):
546 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
547 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
548 template = r'<meta[^>]+?%s[^>]+?%s'
549 return [
550 template % (property_re, content_re),
551 template % (content_re, property_re),
552 ]
553
554 def _og_search_property(self, prop, html, name=None, **kargs):
555 if name is None:
556 name = 'OpenGraph %s' % prop
557 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
558 if escaped is None:
559 return None
560 return unescapeHTML(escaped)
561
562 def _og_search_thumbnail(self, html, **kargs):
563 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
564
565 def _og_search_description(self, html, **kargs):
566 return self._og_search_property('description', html, fatal=False, **kargs)
567
568 def _og_search_title(self, html, **kargs):
569 return self._og_search_property('title', html, **kargs)
570
571 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
572 regexes = self._og_regexes('video') + self._og_regexes('video:url')
573 if secure:
574 regexes = self._og_regexes('video:secure_url') + regexes
575 return self._html_search_regex(regexes, html, name, **kargs)
576
577 def _og_search_url(self, html, **kargs):
578 return self._og_search_property('url', html, **kargs)
579
580 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
581 if display_name is None:
582 display_name = name
583 return self._html_search_regex(
584 r'''(?ix)<meta
585 (?=[^>]+(?:itemprop|name|property)=(["\']?)%s\1)
586 [^>]+content=(["\'])(?P<content>.*?)\1''' % re.escape(name),
587 html, display_name, fatal=fatal, group='content', **kwargs)
588
589 def _dc_search_uploader(self, html):
590 return self._html_search_meta('dc.creator', html, 'uploader')
591
592 def _rta_search(self, html):
593 # See http://www.rtalabel.org/index.php?content=howtofaq#single
594 if re.search(r'(?ix)<meta\s+name="rating"\s+'
595 r' content="RTA-5042-1996-1400-1577-RTA"',
596 html):
597 return 18
598 return 0
599
600 def _media_rating_search(self, html):
601 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
602 rating = self._html_search_meta('rating', html)
603
604 if not rating:
605 return None
606
607 RATING_TABLE = {
608 'safe for kids': 0,
609 'general': 8,
610 '14 years': 14,
611 'mature': 17,
612 'restricted': 19,
613 }
614 return RATING_TABLE.get(rating.lower(), None)
615
616 def _twitter_search_player(self, html):
617 return self._html_search_meta('twitter:player', html,
618 'twitter card player')
619
def _sort_formats(self, formats):
    """Sort the list of format dictionaries in place.

    Formats are ordered by preference, language_preference, quality,
    height, width, extension preference, tbr, vbr, abr, audio extension
    preference, fps, filesize, filesize_approx, source_preference and
    finally format_id.  Numeric fields that are missing or None count as
    -1.  Formats without an 'ext' get one derived from their 'url'.

    Raises ExtractorError when formats is empty.
    """
    if not formats:
        raise ExtractorError('No video formats found')

    def _or_default(f, field, default=-1):
        # Missing and None are equivalent: both yield the default.
        value = f.get(field)
        return default if value is None else value

    def _formats_key(f):
        # TODO remove the following workaround
        from ..utils import determine_ext
        if not f.get('ext') and 'url' in f:
            f['ext'] = determine_ext(f['url'])
        # 'ext' can still be missing (a format with neither ext nor url);
        # such a format simply gets the "unknown extension" rank below.
        ext = f.get('ext')

        preference = f.get('preference')
        if preference is None:
            proto = f.get('protocol')
            if proto is None:
                proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme

            preference = 0 if proto in ['http', 'https'] else -0.1
            if ext in ['f4f', 'f4m']:  # Not yet supported
                preference -= 0.5

        prefer_free = self._downloader.params.get('prefer_free_formats')
        if f.get('vcodec') == 'none':  # audio only
            if prefer_free:
                ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
            else:
                ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
            ext_preference = 0
            try:
                audio_ext_preference = ORDER.index(ext)
            except ValueError:
                audio_ext_preference = -1
        else:
            if prefer_free:
                ORDER = ['flv', 'mp4', 'webm']
            else:
                ORDER = ['webm', 'flv', 'mp4']
            try:
                ext_preference = ORDER.index(ext)
            except ValueError:
                ext_preference = -1
            audio_ext_preference = 0

        return (
            preference,
            _or_default(f, 'language_preference'),
            _or_default(f, 'quality'),
            _or_default(f, 'height'),
            _or_default(f, 'width'),
            ext_preference,
            _or_default(f, 'tbr'),
            _or_default(f, 'vbr'),
            _or_default(f, 'abr'),
            audio_ext_preference,
            _or_default(f, 'fps'),
            _or_default(f, 'filesize'),
            _or_default(f, 'filesize_approx'),
            _or_default(f, 'source_preference'),
            # None cannot be ordered against a string on Python 3, so a
            # missing format_id is compared as the empty string.
            _or_default(f, 'format_id', ''),
        )
    formats.sort(key=_formats_key)
679
def http_scheme(self):
    """ Either "http:" or "https:", depending on the user's preferences """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
686
687 def _proto_relative_url(self, url, scheme=None):
688 if url is None:
689 return url
690 if url.startswith('//'):
691 if scheme is None:
692 scheme = self.http_scheme()
693 return scheme + url
694 else:
695 return url
696
697 def _sleep(self, timeout, video_id, msg_template=None):
698 if msg_template is None:
699 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
700 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
701 self.to_screen(msg)
702 time.sleep(timeout)
703
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an f4m manifest and return its formats, sorted.

    One format is produced per <media> element of the manifest; each one
    uses the manifest URL itself as its 'url'.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    def media_format(index, media_el):
        attrs = media_el.attrib
        tbr = int_or_none(attrs.get('bitrate'))
        return {
            # Named after the bitrate, or after the position when the
            # bitrate is unknown.
            'format_id': 'f4m-%d' % (index if tbr is None else tbr),
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
        }

    media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media')
    formats = [
        media_format(index, media_el)
        for index, media_el in enumerate(media_nodes)]
    self._sort_formats(formats)

    return formats
725
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Download an m3u8 playlist and return the formats it lists, sorted.

    The first format always refers to the playlist itself ('m3u8-meta').
    Every URI line then becomes a format: lines seen before the first
    #EXT-X-STREAM-INF tag only get a 'url', later ones also get the
    bitrate, codecs and resolution from the attributes of the tag in
    effect.

    ext: extension stored in the formats.
    entry_protocol: 'protocol' stored in the per-variant formats.
    preference: 'preference' stored in the per-variant formats.
    """
    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Absolute http(s) URLs are used as they are; anything else is
        # resolved against the playlist URL.
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            # Collect the KEY=value attributes, unquoting quoted values.
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Strip the URI in every case so that surrounding whitespace
            # never ends up in a format URL.
            media_url = format_url(line.strip())
            if last_info is None:
                formats.append({'url': media_url})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': media_url,
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
792
793 def _live_title(self, name):
794 """ Generate the title for a live video """
795 now = datetime.datetime.now()
796 now_str = now.strftime("%Y-%m-%d %H:%M")
797 return name + ' ' + now_str
798
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none, reporting a failed conversion.

    name is the field name used in the message.  Extra keyword arguments
    are passed on to int_or_none.  When the result is None, an
    ExtractorError is raised if fatal is true; otherwise a warning is
    reported and None is returned.
    """
    # The original printed getattr(v, kwargs['get_attr']) here, a leftover
    # debugging aid that wrote to stdout and could itself raise
    # AttributeError; it has been removed.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
810
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none, reporting a failed conversion.

    name is the field name used in the message.  Extra keyword arguments
    are passed on to float_or_none.  When the result is None, an
    ExtractorError is raised if fatal is true; otherwise a warning is
    reported and None is returned.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return parsed
820
def _set_cookie(self, domain, name, value, expire_time=None):
    """Add a cookie for domain (path '/') to the downloader's cookie jar."""
    cookie = compat_cookiejar.Cookie(
        version=0, name=name, value=value,
        port=None, port_specified=None,
        domain=domain, domain_specified=None, domain_initial_dot=None,
        path='/', path_specified=True,
        secure=False, expires=expire_time, discard='',
        comment=None, comment_url=None, rest=None)
    self._downloader.cookiejar.set_cookie(cookie)
825
826
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Return the regex source matching this extractor's search queries."""
        return r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)' % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True if url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search query and fetch the requested number of results.

        An empty prefix asks for one result, 'all' for _MAX_RESULTS, and a
        number for that many (capped at _MAX_RESULTS, with a warning).
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        requested = int(prefix)
        if requested <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (requested, query))
        if requested > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, requested))
            requested = self._MAX_RESULTS
        return self._get_n_results(query, requested)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """The _SEARCH_KEY of this extractor."""
        return self._SEARCH_KEY