]> Raphaël G. Git Repositories - youtubedl/blob - youtube_dl/extractor/common.py
Prepare for upload.
[youtubedl] / youtube_dl / extractor / common.py
1 from __future__ import unicode_literals
2
3 import base64
4 import datetime
5 import hashlib
6 import json
7 import netrc
8 import os
9 import re
10 import socket
11 import sys
12 import time
13 import xml.etree.ElementTree
14
15 from ..utils import (
16 compat_http_client,
17 compat_urllib_error,
18 compat_urllib_parse_urlparse,
19 compat_urlparse,
20 compat_str,
21
22 clean_html,
23 compiled_regex_type,
24 ExtractorError,
25 float_or_none,
26 int_or_none,
27 RegexNotFoundError,
28 sanitize_filename,
29 unescapeHTML,
30 )
# Unique sentinel used as the default of "default" parameters, so that an
# explicit default of None can be told apart from "no default supplied".
_NO_DEFAULT = object()
32
33
34 class InfoExtractor(object):
35 """Information Extractor class.
36
37 Information extractors are the classes that, given a URL, extract
38 information about the video (or videos) the URL refers to. This
39 information includes the real video URL, the video title, author and
40 others. The information is stored in a dictionary which is then
41 passed to the FileDownloader. The FileDownloader processes this
42 information possibly downloading the video to the file system, among
43 other possible outcomes.
44
45 The dictionaries must include the following fields:
46
47 id: Video identifier.
48 title: Video title, unescaped.
49
50 Additionally, it must contain either a formats entry or a url one:
51
52 formats: A list of dictionaries for each format available, ordered
53 from worst to best quality.
54
55 Potential fields:
56 * url Mandatory. The URL of the video file
57 * ext Will be calculated from url if missing
58 * format A human-readable description of the format
59 ("mp4 container with h264/opus").
60 Calculated from the format_id, width, height
61 and format_note fields if missing.
62 * format_id A short description of the format
63 ("mp4_h264_opus" or "19").
64 Technically optional, but strongly recommended.
65 * format_note Additional info about the format
66 ("3D" or "DASH video")
67 * width Width of the video, if known
68 * height Height of the video, if known
69 * resolution Textual description of width and height
70 * tbr Average bitrate of audio and video in KBit/s
71 * abr Average audio bitrate in KBit/s
72 * acodec Name of the audio codec in use
73 * asr Audio sampling rate in Hertz
74 * vbr Average video bitrate in KBit/s
75 * fps Frame rate
76 * vcodec Name of the video codec in use
77 * container Name of the container format
78 * filesize The number of bytes, if known in advance
79 * filesize_approx An estimate for the number of bytes
80 * player_url SWF Player URL (used for rtmpdump).
81 * protocol The protocol that will be used for the actual
82 download, lower-case.
83 "http", "https", "rtsp", "rtmp", "m3u8" or so.
84 * preference Order number of this format. If this field is
85 present and not None, the formats get sorted
86 by this field, regardless of all other values.
87 -1 for default (order by other properties),
88 -2 or smaller for less than default.
89 * quality Order number of the video quality of this
90 format, irrespective of the file format.
91 -1 for default (order by other properties),
92 -2 or smaller for less than default.
93 * source_preference Order number for this video source
94 (quality takes higher priority)
95 -1 for default (order by other properties),
96 -2 or smaller for less than default.
97 * http_referer HTTP Referer header value to set.
98 * http_method HTTP method to use for the download.
99 * http_headers A dictionary of additional HTTP headers
100 to add to the request.
101 * http_post_data Additional data to send with a POST
102 request.
103 url: Final video URL.
104 ext: Video filename extension.
105 format: The video format, defaults to ext (used for --get-format)
106 player_url: SWF Player URL (used for rtmpdump).
107
108 The following fields are optional:
109
110 display_id An alternative identifier for the video, not necessarily
111 unique, but available before title. Typically, id is
112 something like "4234987", title "Dancing naked mole rats",
113 and display_id "dancing-naked-mole-rats"
114 thumbnails: A list of dictionaries, with the following entries:
115 * "url"
116 * "width" (optional, int)
117 * "height" (optional, int)
118 * "resolution" (optional, string "{width}x{height}",
119 deprecated)
120 thumbnail: Full URL to a video thumbnail image.
121 description: One-line video description.
122 uploader: Full name of the video uploader.
123 timestamp: UNIX timestamp of the moment the video became available.
124 upload_date: Video upload date (YYYYMMDD).
125 If not explicitly set, calculated from timestamp.
126 uploader_id: Nickname or id of the video uploader.
127 location: Physical location where the video was filmed.
128 subtitles: The subtitle file contents as a dictionary in the format
129 {language: subtitles}.
130 duration: Length of the video in seconds, as an integer.
131 view_count: How many users have watched the video on the platform.
132 like_count: Number of positive ratings of the video
133 dislike_count: Number of negative ratings of the video
134 comment_count: Number of comments on the video
135 age_limit: Age restriction for the video, as an integer (years)
136 webpage_url: The url to the video webpage, if given to youtube-dl it
137 should allow to get the same result again. (It will be set
138 by YoutubeDL if it's missing)
139 categories: A list of categories that the video falls in, for example
140 ["Sports", "Berlin"]
141 is_live: True, False, or None (=unknown). Whether this video is a
142 live stream that goes on instead of a fixed-length video.
143
144 Unless mentioned otherwise, the fields should be Unicode strings.
145
146 Unless mentioned otherwise, None is equivalent to absence of information.
147
148 Subclasses of this one should re-define the _real_initialize() and
149 _real_extract() methods and define a _VALID_URL regexp.
150 Probably, they should also be added to the list of extractors.
151
152 Finally, the _WORKING attribute should be set to False for broken IEs
153 in order to warn the users and skip the tests.
154 """
155
# Flipped to True by initialize() once _real_initialize() has run.
_ready = False
# Downloader object; assigned through set_downloader().
_downloader = None
# Value reported by working(); the class docstring asks broken IEs to set
# this to False.
_WORKING = True
159
160 def __init__(self, downloader=None):
161 """Constructor. Receives an optional downloader."""
162 self._ready = False
163 self.set_downloader(downloader)
164
@classmethod
def suitable(cls, url):
    """Return True when this class's _VALID_URL pattern matches url."""
    # Inspect cls.__dict__ directly rather than using hasattr()/getattr():
    # those would also find a pattern cached on a superclass, while each
    # class has to compile and cache its *own* _VALID_URL.
    try:
        compiled = cls.__dict__['_VALID_URL_RE']
    except KeyError:
        compiled = cls._VALID_URL_RE = re.compile(cls._VALID_URL)
    return compiled.match(url) is not None
175
176 @classmethod
177 def _match_id(cls, url):
178 if '_VALID_URL_RE' not in cls.__dict__:
179 cls._VALID_URL_RE = re.compile(cls._VALID_URL)
180 m = cls._VALID_URL_RE.match(url)
181 assert m
182 return m.group('id')
183
@classmethod
def working(cls):
    """Return the class-level _WORKING flag."""
    is_working = cls._WORKING
    return is_working
188
def initialize(self):
    """Run _real_initialize() (authentication, etc) once per instance."""
    if self._ready:
        # Already initialized; nothing to do.
        return
    self._real_initialize()
    self._ready = True
194
def extract(self, url):
    """Initialize the extractor if needed and return _real_extract(url)."""
    self.initialize()
    extracted = self._real_extract(url)
    return extracted
199
def set_downloader(self, downloader):
    """Attach downloader to this IE (stored as self._downloader)."""
    setattr(self, '_downloader', downloader)
203
204 def _real_initialize(self):
205 """Real initialization process. Redefine in subclasses."""
206 pass
207
208 def _real_extract(self, url):
209 """Real extraction process. Redefine in subclasses."""
210 pass
211
@classmethod
def ie_key(cls):
    """A string for getting the InfoExtractor with get_info_extractor.

    This is the class name with its last two characters removed
    (presumably the conventional "IE" suffix).
    """
    class_name = cls.__name__
    return class_name[:-2]
216
@property
def IE_NAME(self):
    """Name of this extractor: the instance's class name minus its last two characters."""
    class_name = type(self).__name__
    return class_name[:-2]
220
221 def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
222 """ Returns the response handle """
223 if note is None:
224 self.report_download_webpage(video_id)
225 elif note is not False:
226 if video_id is None:
227 self.to_screen('%s' % (note,))
228 else:
229 self.to_screen('%s: %s' % (video_id, note))
230 try:
231 return self._downloader.urlopen(url_or_request)
232 except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
233 if errnote is False:
234 return False
235 if errnote is None:
236 errnote = 'Unable to download webpage'
237 errmsg = '%s: %s' % (errnote, compat_str(err))
238 if fatal:
239 raise ExtractorError(errmsg, sys.exc_info()[2], cause=err)
240 else:
241 self._downloader.report_warning(errmsg)
242 return False
243
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
    """Return a tuple (page content as string, URL handle).

    Returns False instead when the request failed and fatal is false.
    """
    # Strip hashes from the URL (#1038)
    if isinstance(url_or_request, (compat_str, str)):
        url_or_request = url_or_request.split('#', 1)[0]

    handle = self._request_webpage(url_or_request, video_id, note, errnote, fatal)
    if handle is False:
        # _request_webpage only returns False for non-fatal requests.
        assert not fatal
        return False
    page = self._webpage_read_content(handle, url_or_request, video_id, note, errnote, fatal)
    return (page, handle)
256
257 def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True):
258 content_type = urlh.headers.get('Content-Type', '')
259 webpage_bytes = urlh.read()
260 m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
261 if m:
262 encoding = m.group(1)
263 else:
264 m = re.search(br'<meta[^>]+charset=[\'"]?([^\'")]+)[ /\'">]',
265 webpage_bytes[:1024])
266 if m:
267 encoding = m.group(1).decode('ascii')
268 elif webpage_bytes.startswith(b'\xff\xfe'):
269 encoding = 'utf-16'
270 else:
271 encoding = 'utf-8'
272 if self._downloader.params.get('dump_intermediate_pages', False):
273 try:
274 url = url_or_request.get_full_url()
275 except AttributeError:
276 url = url_or_request
277 self.to_screen('Dumping request to ' + url)
278 dump = base64.b64encode(webpage_bytes).decode('ascii')
279 self._downloader.to_screen(dump)
280 if self._downloader.params.get('write_pages', False):
281 try:
282 url = url_or_request.get_full_url()
283 except AttributeError:
284 url = url_or_request
285 basen = '%s_%s' % (video_id, url)
286 if len(basen) > 240:
287 h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest()
288 basen = basen[:240 - len(h)] + h
289 raw_filename = basen + '.dump'
290 filename = sanitize_filename(raw_filename, restricted=True)
291 self.to_screen('Saving request to ' + filename)
292 # Working around MAX_PATH limitation on Windows (see
293 # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx)
294 if os.name == 'nt':
295 absfilepath = os.path.abspath(filename)
296 if len(absfilepath) > 259:
297 filename = '\\\\?\\' + absfilepath
298 with open(filename, 'wb') as outf:
299 outf.write(webpage_bytes)
300
301 try:
302 content = webpage_bytes.decode(encoding, 'replace')
303 except LookupError:
304 content = webpage_bytes.decode('utf-8', 'replace')
305
306 if ('<title>Access to this site is blocked</title>' in content and
307 'Websense' in content[:512]):
308 msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
309 blocked_iframe = self._html_search_regex(
310 r'<iframe src="([^"]+)"', content,
311 'Websense information URL', default=None)
312 if blocked_iframe:
313 msg += ' Visit %s for more details' % blocked_iframe
314 raise ExtractorError(msg, expected=True)
315
316 return content
317
318 def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True):
319 """ Returns the data of the page as a string """
320 res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal)
321 if res is False:
322 return res
323 else:
324 content, _ = res
325 return content
326
327 def _download_xml(self, url_or_request, video_id,
328 note='Downloading XML', errnote='Unable to download XML',
329 transform_source=None, fatal=True):
330 """Return the xml as an xml.etree.ElementTree.Element"""
331 xml_string = self._download_webpage(
332 url_or_request, video_id, note, errnote, fatal=fatal)
333 if xml_string is False:
334 return xml_string
335 if transform_source:
336 xml_string = transform_source(xml_string)
337 return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8'))
338
339 def _download_json(self, url_or_request, video_id,
340 note='Downloading JSON metadata',
341 errnote='Unable to download JSON metadata',
342 transform_source=None,
343 fatal=True):
344 json_string = self._download_webpage(
345 url_or_request, video_id, note, errnote, fatal=fatal)
346 if (not fatal) and json_string is False:
347 return None
348 if transform_source:
349 json_string = transform_source(json_string)
350 try:
351 return json.loads(json_string)
352 except ValueError as ve:
353 errmsg = '%s: Failed to parse JSON ' % video_id
354 if fatal:
355 raise ExtractorError(errmsg, cause=ve)
356 else:
357 self.report_warning(errmsg + str(ve))
358
def report_warning(self, msg, video_id=None):
    """Forward msg to the downloader's report_warning, prefixed with '[ie_name]' and the video id if given."""
    if video_id is None:
        idstr = ''
    else:
        idstr = '%s: ' % video_id
    warning = '[%s] %s%s' % (self.IE_NAME, idstr, msg)
    self._downloader.report_warning(warning)
363
def to_screen(self, msg):
    """Print msg to screen, prefixing it with '[ie_name]'."""
    line = '[%s] %s' % (self.IE_NAME, msg)
    self._downloader.to_screen(line)
367
def report_extraction(self, id_or_name):
    """Tell the user that information extraction has started for id_or_name."""
    message = '%s: Extracting information' % id_or_name
    self.to_screen(message)
371
def report_download_webpage(self, video_id):
    """Tell the user that the webpage for video_id is being downloaded."""
    message = '%s: Downloading webpage' % video_id
    self.to_screen(message)
375
def report_age_confirmation(self):
    """Tell the user that an age confirmation is being attempted."""
    message = 'Confirming age'
    self.to_screen(message)
379
def report_login(self):
    """Tell the user that a login is being attempted."""
    message = 'Logging in'
    self.to_screen(message)
383
384 #Methods for following #608
@staticmethod
def url_result(url, ie=None, video_id=None):
    """Build a '_type': 'url' result pointing to a page that should be processed.

    ie is stored under 'ie_key'; 'id' is only included when video_id is
    not None.
    """
    #TODO: ie should be the class used for getting the info
    result = dict(_type='url', url=url, ie_key=ie)
    if video_id is not None:
        result['id'] = video_id
    return result
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None):
    """Build a '_type': 'playlist' result holding entries.

    'id' and 'title' are only included when the corresponding argument is
    truthy.
    """
    result = dict(_type='playlist', entries=entries)
    optional = (('id', playlist_id), ('title', playlist_title))
    for key, value in optional:
        if value:
            result[key] = value
    return result
405
def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Perform a regex search on the given string, using a single or a list of
    patterns returning the first matching group.
    In case of failure return a default value or raise a WARNING or a
    RegexNotFoundError, depending on fatal, specifying the field name.

    pattern -- a pattern string, a compiled pattern, or an iterable of
               either; the patterns are tried in order.
    default -- returned when nothing matches; takes precedence over fatal.
    fatal   -- when nothing matches and no default was given, raise
               RegexNotFoundError if true, otherwise warn and return None.
    """
    if isinstance(pattern, (str, compat_str, compiled_regex_type)):
        mobj = re.search(pattern, string, flags)
    else:
        # Start from "no match" so that an empty list of patterns is
        # reported like any other failed search instead of crashing with
        # UnboundLocalError.
        mobj = None
        for p in pattern:
            mobj = re.search(p, string, flags)
            if mobj:
                break

    # Highlight the field name on terminals that are likely to understand
    # ANSI colour codes.
    if os.name != 'nt' and sys.stderr.isatty():
        _name = '\033[0;34m%s\033[0m' % name
    else:
        _name = name

    if mobj:
        # return the first matching group
        return next(g for g in mobj.groups() if g is not None)
    elif default is not _NO_DEFAULT:
        return default
    elif fatal:
        raise RegexNotFoundError('Unable to extract %s' % _name)
    else:
        self._downloader.report_warning('unable to extract %s; '
            'please report this issue on http://yt-dl.org/bug' % _name)
        return None
437
def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0):
    """
    Like _search_regex, but passes a non-empty result through clean_html()
    and strips surrounding whitespace.  Falsy results (None, '', a falsy
    default) are returned unchanged.
    """
    found = self._search_regex(pattern, string, name, default, fatal, flags)
    if not found:
        return found
    return clean_html(found).strip()
447
448 def _get_login_info(self):
449 """
450 Get the the login info as (username, password)
451 It will look in the netrc file using the _NETRC_MACHINE value
452 If there's no info available, return (None, None)
453 """
454 if self._downloader is None:
455 return (None, None)
456
457 username = None
458 password = None
459 downloader_params = self._downloader.params
460
461 # Attempt to use provided username and password or .netrc data
462 if downloader_params.get('username', None) is not None:
463 username = downloader_params['username']
464 password = downloader_params['password']
465 elif downloader_params.get('usenetrc', False):
466 try:
467 info = netrc.netrc().authenticators(self._NETRC_MACHINE)
468 if info is not None:
469 username = info[0]
470 password = info[2]
471 else:
472 raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE)
473 except (IOError, netrc.NetrcParseError) as err:
474 self._downloader.report_warning('parsing .netrc: %s' % compat_str(err))
475
476 return (username, password)
477
478 def _get_tfa_info(self):
479 """
480 Get the two-factor authentication info
481 TODO - asking the user will be required for sms/phone verify
482 currently just uses the command line option
483 If there's no info available, return None
484 """
485 if self._downloader is None:
486 return None
487 downloader_params = self._downloader.params
488
489 if downloader_params.get('twofactor', None) is not None:
490 return downloader_params['twofactor']
491
492 return None
493
494 # Helper functions for extracting OpenGraph info
495 @staticmethod
496 def _og_regexes(prop):
497 content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')'
498 property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop)
499 template = r'<meta[^>]+?%s[^>]+?%s'
500 return [
501 template % (property_re, content_re),
502 template % (content_re, property_re),
503 ]
504
505 def _og_search_property(self, prop, html, name=None, **kargs):
506 if name is None:
507 name = 'OpenGraph %s' % prop
508 escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
509 if escaped is None:
510 return None
511 return unescapeHTML(escaped)
512
513 def _og_search_thumbnail(self, html, **kargs):
514 return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs)
515
516 def _og_search_description(self, html, **kargs):
517 return self._og_search_property('description', html, fatal=False, **kargs)
518
519 def _og_search_title(self, html, **kargs):
520 return self._og_search_property('title', html, **kargs)
521
522 def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
523 regexes = self._og_regexes('video') + self._og_regexes('video:url')
524 if secure:
525 regexes = self._og_regexes('video:secure_url') + regexes
526 return self._html_search_regex(regexes, html, name, **kargs)
527
528 def _og_search_url(self, html, **kargs):
529 return self._og_search_property('url', html, **kargs)
530
531 def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs):
532 if display_name is None:
533 display_name = name
534 return self._html_search_regex(
535 r'''(?ix)<meta
536 (?=[^>]+(?:itemprop|name|property)=["\']?%s["\']?)
537 [^>]+content=["\']([^"\']+)["\']''' % re.escape(name),
538 html, display_name, fatal=fatal, **kwargs)
539
540 def _dc_search_uploader(self, html):
541 return self._html_search_meta('dc.creator', html, 'uploader')
542
543 def _rta_search(self, html):
544 # See http://www.rtalabel.org/index.php?content=howtofaq#single
545 if re.search(r'(?ix)<meta\s+name="rating"\s+'
546 r' content="RTA-5042-1996-1400-1577-RTA"',
547 html):
548 return 18
549 return 0
550
551 def _media_rating_search(self, html):
552 # See http://www.tjg-designs.com/WP/metadata-code-examples-adding-metadata-to-your-web-pages/
553 rating = self._html_search_meta('rating', html)
554
555 if not rating:
556 return None
557
558 RATING_TABLE = {
559 'safe for kids': 0,
560 'general': 8,
561 '14 years': 14,
562 'mature': 17,
563 'restricted': 19,
564 }
565 return RATING_TABLE.get(rating.lower(), None)
566
567 def _twitter_search_player(self, html):
568 return self._html_search_meta('twitter:player', html,
569 'twitter card player')
570
571 def _sort_formats(self, formats):
572 if not formats:
573 raise ExtractorError('No video formats found')
574
575 def _formats_key(f):
576 # TODO remove the following workaround
577 from ..utils import determine_ext
578 if not f.get('ext') and 'url' in f:
579 f['ext'] = determine_ext(f['url'])
580
581 preference = f.get('preference')
582 if preference is None:
583 proto = f.get('protocol')
584 if proto is None:
585 proto = compat_urllib_parse_urlparse(f.get('url', '')).scheme
586
587 preference = 0 if proto in ['http', 'https'] else -0.1
588 if f.get('ext') in ['f4f', 'f4m']: # Not yet supported
589 preference -= 0.5
590
591 if f.get('vcodec') == 'none': # audio only
592 if self._downloader.params.get('prefer_free_formats'):
593 ORDER = ['aac', 'mp3', 'm4a', 'webm', 'ogg', 'opus']
594 else:
595 ORDER = ['webm', 'opus', 'ogg', 'mp3', 'aac', 'm4a']
596 ext_preference = 0
597 try:
598 audio_ext_preference = ORDER.index(f['ext'])
599 except ValueError:
600 audio_ext_preference = -1
601 else:
602 if self._downloader.params.get('prefer_free_formats'):
603 ORDER = ['flv', 'mp4', 'webm']
604 else:
605 ORDER = ['webm', 'flv', 'mp4']
606 try:
607 ext_preference = ORDER.index(f['ext'])
608 except ValueError:
609 ext_preference = -1
610 audio_ext_preference = 0
611
612 return (
613 preference,
614 f.get('quality') if f.get('quality') is not None else -1,
615 f.get('height') if f.get('height') is not None else -1,
616 f.get('width') if f.get('width') is not None else -1,
617 ext_preference,
618 f.get('tbr') if f.get('tbr') is not None else -1,
619 f.get('vbr') if f.get('vbr') is not None else -1,
620 f.get('abr') if f.get('abr') is not None else -1,
621 audio_ext_preference,
622 f.get('fps') if f.get('fps') is not None else -1,
623 f.get('filesize') if f.get('filesize') is not None else -1,
624 f.get('filesize_approx') if f.get('filesize_approx') is not None else -1,
625 f.get('source_preference') if f.get('source_preference') is not None else -1,
626 f.get('format_id'),
627 )
628 formats.sort(key=_formats_key)
629
def http_scheme(self):
    """ Either "http:" or "https:", depending on the 'prefer_insecure' downloader param """
    if self._downloader.params.get('prefer_insecure', False):
        return 'http:'
    return 'https:'
636
637 def _proto_relative_url(self, url, scheme=None):
638 if url is None:
639 return url
640 if url.startswith('//'):
641 if scheme is None:
642 scheme = self.http_scheme()
643 return scheme + url
644 else:
645 return url
646
647 def _sleep(self, timeout, video_id, msg_template=None):
648 if msg_template is None:
649 msg_template = '%(video_id)s: Waiting for %(timeout)s seconds'
650 msg = msg_template % {'video_id': video_id, 'timeout': timeout}
651 self.to_screen(msg)
652 time.sleep(timeout)
653
def _extract_f4m_formats(self, manifest_url, video_id):
    """Download an f4m manifest and return one format per <media> node, sorted via _sort_formats().

    Every format points at manifest_url itself; the format id is
    'f4m-<bitrate>', or 'f4m-<index>' when the node has no bitrate.
    """
    manifest = self._download_xml(
        manifest_url, video_id, 'Downloading f4m manifest',
        'Unable to download f4m manifest')

    formats = []
    for index, media_el in enumerate(
            manifest.findall('{http://ns.adobe.com/f4m/1.0}media')):
        attrs = media_el.attrib
        tbr = int_or_none(attrs.get('bitrate'))
        suffix = index if tbr is None else tbr
        formats.append({
            'format_id': 'f4m-%d' % suffix,
            'url': manifest_url,
            'ext': 'flv',
            'tbr': tbr,
            'width': int_or_none(attrs.get('width')),
            'height': int_or_none(attrs.get('height')),
        })
    self._sort_formats(formats)

    return formats
675
def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None,
                          entry_protocol='m3u8', preference=None):
    """Download an m3u8 playlist and return its formats, sorted via _sort_formats().

    The result always contains an 'm3u8-meta' entry for m3u8_url itself
    (preference -1), plus one entry per URI line of the playlist.  URI
    lines that follow an #EXT-X-STREAM-INF tag take tbr, codecs and
    resolution from that tag's attributes; URI lines seen before any such
    tag only get a 'url'.

    ext            -- value stored as 'ext' on the generated formats.
    entry_protocol -- 'protocol' of the per-variant formats.
    preference     -- 'preference' of the per-variant formats.
    """

    formats = [{
        'format_id': 'm3u8-meta',
        'url': m3u8_url,
        'ext': ext,
        'protocol': 'm3u8',
        'preference': -1,
        'resolution': 'multiple',
        'format_note': 'Quality selection URL',
    }]

    def format_url(u):
        # Absolute http(s) URLs are kept; anything else is resolved
        # against the playlist URL.
        if re.match(r'^https?://', u):
            return u
        return compat_urlparse.urljoin(m3u8_url, u)

    m3u8_doc = self._download_webpage(
        m3u8_url, video_id,
        note='Downloading m3u8 information',
        errnote='Failed to download m3u8 information')
    last_info = None
    kv_rex = re.compile(
        r'(?P<key>[a-zA-Z_-]+)=(?P<val>"[^"]+"|[^",]+)(?:,|$)')
    for line in m3u8_doc.splitlines():
        if line.startswith('#EXT-X-STREAM-INF:'):
            last_info = {}
            for m in kv_rex.finditer(line):
                v = m.group('val')
                if v.startswith('"'):
                    v = v[1:-1]
                last_info[m.group('key')] = v
        elif line.startswith('#') or not line.strip():
            continue
        else:
            # Strip once for both branches: previously only the branch
            # below stripped, so a URI seen before any stream tag kept
            # its surrounding whitespace.
            uri = line.strip()
            if last_info is None:
                formats.append({'url': format_url(uri)})
                continue
            tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000)

            f = {
                'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)),
                'url': format_url(uri),
                'tbr': tbr,
                'ext': ext,
                'protocol': entry_protocol,
                'preference': preference,
            }
            codecs = last_info.get('CODECS')
            if codecs:
                # TODO: looks like video codec is not always necessarily goes first
                va_codecs = codecs.split(',')
                if va_codecs[0]:
                    f['vcodec'] = va_codecs[0].partition('.')[0]
                if len(va_codecs) > 1 and va_codecs[1]:
                    f['acodec'] = va_codecs[1].partition('.')[0]
            resolution = last_info.get('RESOLUTION')
            if resolution:
                width_str, height_str = resolution.split('x')
                f['width'] = int(width_str)
                f['height'] = int(height_str)
            formats.append(f)
            last_info = {}
    self._sort_formats(formats)
    return formats
742
743 def _live_title(self, name):
744 """ Generate the title for a live video """
745 now = datetime.datetime.now()
746 now_str = now.strftime("%Y-%m-%d %H:%M")
747 return name + ' ' + now_str
748
def _int(self, v, name, fatal=False, **kwargs):
    """Convert v with int_or_none(v, **kwargs) and report failures.

    When the conversion yields None, raise ExtractorError if fatal is
    true; otherwise emit a warning naming the field and return None.
    """
    # A leftover debugging print of getattr(v, kwargs['get_attr']) used to
    # live here; it wrote to stdout and could raise AttributeError, so it
    # has been dropped.
    res = int_or_none(v, **kwargs)
    if res is None:
        msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
        if fatal:
            raise ExtractorError(msg)
        else:
            self._downloader.report_warning(msg)
    return res
760
def _float(self, v, name, fatal=False, **kwargs):
    """Convert v with float_or_none(v, **kwargs) and report failures.

    When the conversion yields None, raise ExtractorError if fatal is
    true; otherwise emit a warning naming the field and return None.
    """
    parsed = float_or_none(v, **kwargs)
    if parsed is not None:
        return parsed
    msg = 'Failed to extract %s: Could not parse value %r' % (name, v)
    if fatal:
        raise ExtractorError(msg)
    self._downloader.report_warning(msg)
    return parsed
770
771
class SearchInfoExtractor(InfoExtractor):
    """
    Base class for paged search queries extractors.
    They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query}
    Instances should define _SEARCH_KEY and _MAX_RESULTS.
    """

    @classmethod
    def _make_valid_url(cls):
        """Build the regex for '<_SEARCH_KEY><prefix>:<query>' pseudo-URLs."""
        template = r'%s(?P<prefix>|[1-9][0-9]*|all):(?P<query>[\s\S]+)'
        return template % cls._SEARCH_KEY

    @classmethod
    def suitable(cls, url):
        """Return True when url is a search query for this extractor."""
        matched = re.match(cls._make_valid_url(), url)
        return matched is not None

    def _real_extract(self, query):
        """Parse the search pseudo-URL and fetch the requested number of results.

        An empty prefix asks for 1 result, 'all' for _MAX_RESULTS, and a
        number for that many (capped at _MAX_RESULTS, with a warning).
        Raises ExtractorError for an unparsable query.
        """
        mobj = re.match(self._make_valid_url(), query)
        if mobj is None:
            raise ExtractorError('Invalid search query "%s"' % query)

        prefix, query = mobj.group('prefix', 'query')
        if prefix == '':
            return self._get_n_results(query, 1)
        if prefix == 'all':
            return self._get_n_results(query, self._MAX_RESULTS)

        n = int(prefix)
        if n <= 0:
            raise ExtractorError('invalid download number %s for query "%s"' % (n, query))
        if n > self._MAX_RESULTS:
            self._downloader.report_warning('%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n))
            n = self._MAX_RESULTS
        return self._get_n_results(query, n)

    def _get_n_results(self, query, n):
        """Get a specified number of results for a query"""
        raise NotImplementedError("This method must be implemented by subclasses")

    @property
    def SEARCH_KEY(self):
        """Public read-only view of _SEARCH_KEY."""
        return self._SEARCH_KEY