X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/6d835d4d6903cf5e18ad01844736929e06d16004..432fd38466ba4fc2f31597488104ed0e729149a6:/youtube_dl/extractor/common.py diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7e41132..5e263f8 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -10,25 +10,39 @@ import re import socket import sys import time -import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( + compat_cookiejar, + compat_cookies, + compat_getpass, compat_http_client, compat_urllib_error, + compat_urllib_parse, compat_urllib_parse_urlparse, + compat_urllib_request, compat_urlparse, compat_str, - + compat_etree_fromstring, +) +from ..utils import ( + NO_DEFAULT, + age_restricted, + bug_reports_message, clean_html, compiled_regex_type, + determine_ext, ExtractorError, + fix_xml_ampersands, float_or_none, int_or_none, RegexNotFoundError, sanitize_filename, unescapeHTML, + unified_strdate, + url_basename, + xpath_text, + xpath_with_ns, ) -_NO_DEFAULT = object() class InfoExtractor(object): @@ -38,11 +52,15 @@ class InfoExtractor(object): information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this + passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. - The dictionaries must include the following fields: + The type field determines the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. @@ -54,7 +72,7 @@ class InfoExtractor(object): Potential fields: * url Mandatory. The URL of the video file - * ext Will be calculated from url if missing + * ext Will be calculated from URL if missing * format A human-readable description of the format ("mp4 container with h264/opus"). Calculated from the format_id, width, height. @@ -80,12 +98,20 @@ class InfoExtractor(object): * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp", "m3u8" or so. + "http", "https", "rtsp", "rtmp", "rtmpe", + "m3u8", or "m3u8_native". * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language_preference Is this in the correct requested + language? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), @@ -94,12 +120,14 @@ class InfoExtractor(object): (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. - * http_referer HTTP Referer header value to set. - * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. - * http_post_data Additional data to send with a POST - request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. + * no_resume The server does not support resuming the + (HTTP or RTMP) download. Boolean. + url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -107,44 +135,107 @@ class InfoExtractor(object): The following fields are optional: + alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries, with the following entries: + * "id" (optional, string) - Thumbnail format ID * "url" + * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + description: Full video description. uploader: Full name of the video uploader. + creator: The main artist who created the video. + release_date: The date (YYYYMMDD) when the video was released. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location where the video was filmed. - subtitles: The subtitle file contents as a dictionary in the format - {language: subtitles}. + subtitles: The available subtitles as a dictionary in the format + {language: subformats}. "subformats" is a list sorted from + lower to higher preference, each element is a dictionary + with the "ext" entry and one of: + * "data": The subtitles file contents + * "url": A URL pointing to the subtitles file + "ext" will be calculated from URL if missing + automatic_captions: Like 'subtitles', used by the YoutubeIE for + automatically generated captions duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video + repost_count: Number of reposts of the video + average_rating: Average rating give by users, the scale used depends on the webpage comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) - webpage_url: The url to the video webpage, if given to youtube-dl it + webpage_url: The URL to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) categories: A list of categories that the video falls in, for example ["Sports", "Berlin"] + tags: A list of tags assigned to the video, e.g. ["sweden", "pop music"] is_live: True, False, or None (=unknown). Whether this video is a live stream that goes on instead of a fixed-length video. + start_time: Time in seconds where the reproduction should start, as + specified in the URL. + end_time: Time in seconds where the reproduction should end, as + specified in the URL. Unless mentioned otherwise, the fields should be Unicode strings. Unless mentioned otherwise, None is equivalent to absence of information. + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "title", "description" and "id" attributes + with the same semantics as videos (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for examples multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@ -194,8 +285,15 @@ class InfoExtractor(object): def extract(self, url): """Extracts URL information and returns it in list of dicts.""" - self.initialize() - return self._real_extract(url) + try: + self.initialize() + return self._real_extract(url) + except ExtractorError: + raise + except compat_http_client.IncompleteRead as e: + raise ExtractorError('A network error has occured.', cause=e, expected=True) + except (KeyError, StopIteration) as e: + raise ExtractorError('An extractor error has occured.', cause=e) def set_downloader(self, downloader): """Sets the downloader for this IE.""" @@ -212,11 +310,11 @@ class InfoExtractor(object): @classmethod def ie_key(cls): """A string for getting the InfoExtractor with get_info_extractor""" - return cls.__name__[:-2] + return compat_str(cls.__name__[:-2]) @property def IE_NAME(self): - return type(self).__name__[:-2] + return compat_str(type(self).__name__[:-2]) def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ @@ -241,7 +339,7 @@ class InfoExtractor(object): self._downloader.report_warning(errmsg) return False - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, encoding=None): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): @@ -251,12 +349,11 @@ class InfoExtractor(object): if urlh is False: assert not fatal return False - content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal, encoding=encoding) return (content, urlh) - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): - content_type = urlh.headers.get('Content-Type', '') - webpage_bytes = urlh.read() + @staticmethod + def _guess_encoding_from_content(content_type, webpage_bytes): m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -269,6 +366,16 @@ class InfoExtractor(object): encoding = 'utf-16' else: encoding = 'utf-8' + + return encoding + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): + content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes + if not encoding: + encoding = self._guess_encoding_from_content(content_type, webpage_bytes) if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -312,12 +419,32 @@ class InfoExtractor(object): if blocked_iframe: msg += ' Visit %s for more details' % blocked_iframe raise ExtractorError(msg, expected=True) + if 'The URL you requested has been blocked' in content[:512]: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'

(.*?)

', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5, encoding=None): """ Returns the data of the page as a string """ - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = False + try_count = 0 + while success is False: + try: + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal, encoding=encoding) + success = True + except compat_http_client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) if res is False: return res else: @@ -326,25 +453,30 @@ class InfoExtractor(object): def _download_xml(self, url_or_request, video_id, note='Downloading XML', errnote='Unable to download XML', - transform_source=None, fatal=True): + transform_source=None, fatal=True, encoding=None): """Return the xml as an xml.etree.ElementTree.Element""" xml_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding) if xml_string is False: return xml_string if transform_source: xml_string = transform_source(xml_string) - return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + return compat_etree_fromstring(xml_string.encode('utf-8')) def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', errnote='Unable to download JSON metadata', transform_source=None, - fatal=True): + fatal=True, encoding=None): json_string = self._download_webpage( - url_or_request, video_id, note, errnote, fatal=fatal) + url_or_request, video_id, note, errnote, fatal=fatal, + encoding=encoding) if (not fatal) and json_string is False: return None + return self._parse_json( + json_string, video_id, transform_source=transform_source, fatal=fatal) + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: json_string = transform_source(json_string) try: @@ -381,19 +513,34 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') - #Methods for following #608 @staticmethod - def url_result(url, ie=None, video_id=None): - """Returns a url that points to a page that should be processed""" - #TODO: ie should be the class used for getting the info + def raise_login_required(msg='This video is only available for registered users'): + raise ExtractorError( + '%s. Use --username and --password or --netrc to provide account credentials.' % msg, + expected=True) + + @staticmethod + def raise_geo_restricted(msg='This video is not available from your location due to geo restriction'): + raise ExtractorError( + '%s. You might want to use --proxy to workaround.' % msg, + expected=True) + + # Methods for following #608 + @staticmethod + def url_result(url, ie=None, video_id=None, video_title=None): + """Returns a URL that points to a page that should be processed""" + # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} if video_id is not None: video_info['id'] = video_id + if video_title is not None: + video_info['title'] = video_title return video_info + @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} @@ -401,9 +548,11 @@ class InfoExtractor(object): video_info['id'] = playlist_id if playlist_title: video_info['title'] = playlist_title + if playlist_description: + video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -418,28 +567,30 @@ class InfoExtractor(object): if mobj: break - if os.name != 'nt' and sys.stderr.isatty(): + if not self._downloader.params.get('no_color') and os.name != 'nt' and sys.stderr.isatty(): _name = '\033[0;34m%s\033[0m' % name else: _name = name if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) - elif default is not _NO_DEFAULT: + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + else: + return mobj.group(group) + elif default is not NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: - self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + self._downloader.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ - res = self._search_regex(pattern, string, name, default, fatal, flags) + res = self._search_regex(pattern, string, name, default, fatal, flags, group) if res: return clean_html(res).strip() else: @@ -447,7 +598,7 @@ class InfoExtractor(object): def _get_login_info(self): """ - Get the the login info as (username, password) + Get the login info as (username, password) It will look in the netrc file using the _NETRC_MACHINE value If there's no info available, return (None, None) """ @@ -472,10 +623,10 @@ class InfoExtractor(object): raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) - + return (username, password) - def _get_tfa_info(self): + def _get_tfa_info(self, note='two-factor verification code'): """ Get the two-factor authentication info TODO - asking the user will be required for sms/phone verify @@ -489,19 +640,26 @@ class InfoExtractor(object): if downloader_params.get('twofactor', None) is not None: return downloader_params['twofactor'] - return None + return compat_getpass('Type %s and press [Return]: ' % note) # Helper functions for extracting OpenGraph info @staticmethod def _og_regexes(prop): - content_re = r'content=(?:"([^>]+?)"|\'([^>]+?)\')' - property_re = r'(?:name|property)=[\'"]og:%s[\'"]' % re.escape(prop) + content_re = r'content=(?:"([^"]+?)"|\'([^\']+?)\'|\s*([^\s"\'=<>`]+?))' + property_re = (r'(?:name|property)=(?:\'og:%(prop)s\'|"og:%(prop)s"|\s*og:%(prop)s\b)' + % {'prop': re.escape(prop)}) template = r']+?%s[^>]+?%s' return [ template % (property_re, content_re), template % (content_re, property_re), ] + @staticmethod + def _meta_regex(prop): + return r'''(?isx)]+(?:itemprop|name|property|id|http-equiv)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(prop) + def _og_search_property(self, prop, html, name=None, **kargs): if name is None: name = 'OpenGraph %s' % prop @@ -511,7 +669,7 @@ class InfoExtractor(object): return unescapeHTML(escaped) def _og_search_thumbnail(self, html, **kargs): - return self._og_search_property('image', html, 'thumbnail url', fatal=False, **kargs) + return self._og_search_property('image', html, 'thumbnail URL', fatal=False, **kargs) def _og_search_description(self, html, **kargs): return self._og_search_property('description', html, fatal=False, **kargs) @@ -532,10 +690,8 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)]+(?:itemprop|name|property)=["\']?%s["\']?) - [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal, **kwargs) + self._meta_regex(name), + html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') @@ -564,11 +720,48 @@ class InfoExtractor(object): } return RATING_TABLE.get(rating.lower(), None) + def _family_friendly_search(self, html): + # See http://schema.org/VideoObject + family_friendly = self._html_search_meta('isFamilyFriendly', html) + + if not family_friendly: + return None + + RATING_TABLE = { + '1': 0, + 'true': 0, + '0': 18, + 'false': 18, + } + return RATING_TABLE.get(family_friendly.lower(), None) + def _twitter_search_player(self, html): return self._html_search_meta('twitter:player', html, - 'twitter card player') + 'twitter card player') - def _sort_formats(self, formats): + @staticmethod + def _hidden_inputs(html): + html = re.sub(r'', '', html) + hidden_inputs = {} + for input in re.findall(r'(?i)]+)>', html): + if not re.search(r'type=(["\'])(?:hidden|submit)\1', input): + continue + name = re.search(r'name=(["\'])(?P.+?)\1', input) + if not name: + continue + value = re.search(r'value=(["\'])(?P.*?)\1', input) + if not value: + continue + hidden_inputs[name.group('value')] = value.group('value') + return hidden_inputs + + def _form_hidden_inputs(self, form_id, html): + form = self._search_regex( + r'(?is)]+?id=(["\'])%s\1[^>]*>(?P
.+?)
' % form_id, + html, '%s form' % form_id, group='form') + return self._hidden_inputs(form) + + def _sort_formats(self, formats, field_preference=None): if not formats: raise ExtractorError('No video formats found') @@ -578,6 +771,9 @@ class InfoExtractor(object): if not f.get('ext') and 'url' in f: f['ext'] = determine_ext(f['url']) + if isinstance(field_preference, (list, tuple)): + return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + preference = f.get('preference') if preference is None: proto = f.get('protocol') @@ -611,22 +807,46 @@ class InfoExtractor(object): return ( preference, + f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, + f.get('tbr') if f.get('tbr') is not None else -1, + f.get('filesize') if f.get('filesize') is not None else -1, + f.get('vbr') if f.get('vbr') is not None else -1, f.get('height') if f.get('height') is not None else -1, f.get('width') if f.get('width') is not None else -1, ext_preference, - f.get('tbr') if f.get('tbr') is not None else -1, - f.get('vbr') if f.get('vbr') is not None else -1, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('fps') if f.get('fps') is not None else -1, - f.get('filesize') if f.get('filesize') is not None else -1, f.get('filesize_approx') if f.get('filesize_approx') is not None else -1, f.get('source_preference') if f.get('source_preference') is not None else -1, - f.get('format_id'), + f.get('format_id') if f.get('format_id') is not None else '', ) formats.sort(key=_formats_key) + def _check_formats(self, formats, video_id): + if formats: + formats[:] = filter( + lambda f: self._is_valid_url( + f['url'], video_id, + item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), + formats) + + def _is_valid_url(self, url, video_id, item='video'): + url = self._proto_relative_url(url, scheme='http:') + # For now assume non HTTP(S) URLs always valid + if not (url.startswith('http://') or url.startswith('https://')): + return True + try: + self._request_webpage(url, video_id, 'Checking %s URL' % item) + return True + except ExtractorError as e: + if isinstance(e.cause, compat_urllib_error.URLError): + self.to_screen( + '%s: %s URL is invalid, skipping' % (video_id, item)) + return False + raise + def http_scheme(self): """ Either "http:" or "https:", depending on the user's preferences """ return ( @@ -651,37 +871,69 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None, + transform_source=lambda s: fix_xml_ampersands(s).strip(), + fatal=True): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', - 'Unable to download f4m manifest') + 'Unable to download f4m manifest', + # Some manifests may be malformed, e.g. prosiebensat1 generated manifests + # (see https://github.com/rg3/youtube-dl/issues/6215#issuecomment-121704244) + transform_source=transform_source, + fatal=fatal) + + if manifest is False: + return manifest formats = [] + manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') + if not media_nodes: + manifest_version = '2.0' + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): + if manifest_version == '2.0': + media_url = media_el.attrib.get('href') or media_el.attrib.get('url') + if not media_url: + continue + manifest_url = ( + media_url if media_url.startswith('http://') or media_url.startswith('https://') + else ('/'.join(manifest_url.split('/')[:-1]) + '/' + media_url)) + # If media_url is itself a f4m manifest do the recursive extraction + # since bitrates in parent manifest (this one) and media_url manifest + # may differ leading to inability to resolve the format by requested + # bitrate in f4m downloader + if determine_ext(manifest_url) == 'f4m': + f4m_formats = self._extract_f4m_formats( + manifest_url, video_id, preference, f4m_id, fatal=fatal) + if f4m_formats: + formats.extend(f4m_formats) + continue tbr = int_or_none(media_el.attrib.get('bitrate')) - format_id = 'f4m-%d' % (i if tbr is None else tbr) formats.append({ - 'format_id': format_id, + 'format_id': '-'.join(filter(None, [f4m_id, compat_str(i if tbr is None else tbr)])), 'url': manifest_url, 'ext': 'flv', 'tbr': tbr, 'width': int_or_none(media_el.attrib.get('width')), 'height': int_or_none(media_el.attrib.get('height')), + 'preference': preference, }) self._sort_formats(formats) return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None): + entry_protocol='m3u8', preference=None, + m3u8_id=None, note=None, errnote=None, + fatal=True): formats = [{ - 'format_id': 'm3u8-meta', + 'format_id': '-'.join(filter(None, [m3u8_id, 'meta'])), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': -1, + 'preference': preference - 1 if preference else -1, 'resolution': 'multiple', 'format_note': 'Quality selection URL', }] @@ -691,11 +943,17 @@ class InfoExtractor(object): if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - m3u8_doc = self._download_webpage( + res = self._download_webpage_handle( m3u8_url, video_id, - note='Downloading m3u8 information', - errnote='Failed to download m3u8 information') + note=note or 'Downloading m3u8 information', + errnote=errnote or 'Failed to download m3u8 information', + fatal=fatal) + if res is False: + return res + m3u8_doc, urlh = res + m3u8_url = urlh.geturl() last_info = None + last_media = None kv_rex = re.compile( r'(?P[a-zA-Z_-]+)=(?P"[^"]+"|[^",]+)(?:,|$)') for line in m3u8_doc.splitlines(): @@ -706,6 +964,13 @@ class InfoExtractor(object): if v.startswith('"'): v = v[1:-1] last_info[m.group('key')] = v + elif line.startswith('#EXT-X-MEDIA:'): + last_media = {} + for m in kv_rex.finditer(line): + v = m.group('val') + if v.startswith('"'): + v = v[1:-1] + last_media[m.group('key')] = v elif line.startswith('#') or not line.strip(): continue else: @@ -713,9 +978,13 @@ class InfoExtractor(object): formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) - + format_id = [] + if m3u8_id: + format_id.append(m3u8_id) + last_media_name = last_media.get('NAME') if last_media and last_media.get('TYPE') != 'SUBTITLES' else None + format_id.append(last_media_name if last_media_name else '%d' % (tbr if tbr else len(formats))) f = { - 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), + 'format_id': '-'.join(format_id), 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, @@ -735,11 +1004,246 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) + if last_media is not None: + f['m3u8_media'] = last_media + last_media = None formats.append(f) last_info = {} self._sort_formats(formats) return formats + @staticmethod + def _xpath_ns(path, namespace=None): + if not namespace: + return path + out = [] + for c in path.split('/'): + if not c or c == '.': + out.append(c) + else: + out.append('{%s}%s' % (namespace, c)) + return '/'.join(out) + + def _extract_smil_formats(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + + if smil is False: + assert not fatal + return [] + + namespace = self._parse_smil_namespace(smil) + + return self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + + def _extract_smil_info(self, smil_url, video_id, fatal=True, f4m_params=None): + smil = self._download_smil(smil_url, video_id, fatal=fatal) + if smil is False: + return {} + return self._parse_smil(smil, smil_url, video_id, f4m_params=f4m_params) + + def _download_smil(self, smil_url, video_id, fatal=True): + return self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', fatal=fatal) + + def _parse_smil(self, smil, smil_url, video_id, f4m_params=None): + namespace = self._parse_smil_namespace(smil) + + formats = self._parse_smil_formats( + smil, smil_url, video_id, namespace=namespace, f4m_params=f4m_params) + subtitles = self._parse_smil_subtitles(smil, namespace=namespace) + + video_id = os.path.splitext(url_basename(smil_url))[0] + title = None + description = None + upload_date = None + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + name = meta.attrib.get('name') + content = meta.attrib.get('content') + if not name or not content: + continue + if not title and name == 'title': + title = content + elif not description and name in ('description', 'abstract'): + description = content + elif not upload_date and name == 'date': + upload_date = unified_strdate(content) + + thumbnails = [{ + 'id': image.get('type'), + 'url': image.get('src'), + 'width': int_or_none(image.get('width')), + 'height': int_or_none(image.get('height')), + } for image in smil.findall(self._xpath_ns('.//image', namespace)) if image.get('src')] + + return { + 'id': video_id, + 'title': title or video_id, + 'description': description, + 'upload_date': upload_date, + 'thumbnails': thumbnails, + 'formats': formats, + 'subtitles': subtitles, + } + + def _parse_smil_namespace(self, smil): + return self._search_regex( + r'(?i)^{([^}]+)?}smil$', smil.tag, 'namespace', default=None) + + def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): + base = smil_url + for meta in smil.findall(self._xpath_ns('./head/meta', namespace)): + b = meta.get('base') or meta.get('httpBase') + if b: + base = b + break + + formats = [] + rtmp_count = 0 + http_count = 0 + + videos = smil.findall(self._xpath_ns('.//video', namespace)) + for video in videos: + src = video.get('src') + if not src: + continue + + bitrate = float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + filesize = int_or_none(video.get('size') or video.get('fileSize')) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + ext = video.get('ext') + src_ext = determine_ext(src) + streamer = video.get('streamer') or base + + if proto == 'rtmp' or streamer.startswith('rtmp'): + rtmp_count += 1 + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + if transform_rtmp_url: + streamer, src = transform_rtmp_url(streamer, src) + formats[-1].update({ + 'url': streamer, + 'play_path': src, + }) + continue + + src_url = src if src.startswith('http') else compat_urlparse.urljoin(base, src) + + if proto == 'm3u8' or src_ext == 'm3u8': + m3u8_formats = self._extract_m3u8_formats( + src_url, video_id, ext or 'mp4', m3u8_id='hls', fatal=False) + if m3u8_formats: + formats.extend(m3u8_formats) + continue + + if src_ext == 'f4m': + f4m_url = src_url + if not f4m_params: + f4m_params = { + 'hdcore': '3.2.0', + 'plugin': 'flowplayer-3.2.0.1', + } + f4m_url += '&' if '?' in f4m_url else '?' + f4m_url += compat_urllib_parse.urlencode(f4m_params) + f4m_formats = self._extract_f4m_formats(f4m_url, video_id, f4m_id='hds', fatal=False) + if f4m_formats: + formats.extend(f4m_formats) + continue + + if src_url.startswith('http') and self._is_valid_url(src, video_id): + http_count += 1 + formats.append({ + 'url': src_url, + 'ext': ext or src_ext or 'flv', + 'format_id': 'http-%d' % (bitrate or http_count), + 'tbr': bitrate, + 'filesize': filesize, + 'width': width, + 'height': height, + }) + continue + + self._sort_formats(formats) + + return formats + + def _parse_smil_subtitles(self, smil, namespace=None, subtitles_lang='en'): + subtitles = {} + for num, textstream in enumerate(smil.findall(self._xpath_ns('.//textstream', namespace))): + src = textstream.get('src') + if not src: + continue + ext = textstream.get('ext') or determine_ext(src) + if not ext: + type_ = textstream.get('type') + SUBTITLES_TYPES = { + 'text/vtt': 'vtt', + 'text/srt': 'srt', + 'application/smptett+xml': 'tt', + } + if type_ in SUBTITLES_TYPES: + ext = SUBTITLES_TYPES[type_] + lang = textstream.get('systemLanguage') or textstream.get('systemLanguageName') or textstream.get('lang') or subtitles_lang + subtitles.setdefault(lang, []).append({ + 'url': src, + 'ext': ext, + }) + return subtitles + + def _extract_xspf_playlist(self, playlist_url, playlist_id, fatal=True): + xspf = self._download_xml( + playlist_url, playlist_id, 'Downloading xpsf playlist', + 'Unable to download xspf manifest', fatal=fatal) + if xspf is False: + return [] + return self._parse_xspf(xspf, playlist_id) + + def _parse_xspf(self, playlist, playlist_id): + NS_MAP = { + 'xspf': 'http://xspf.org/ns/0/', + 's1': 'http://static.streamone.nl/player/ns/0', + } + + entries = [] + for track in playlist.findall(xpath_with_ns('./xspf:trackList/xspf:track', NS_MAP)): + title = xpath_text( + track, xpath_with_ns('./xspf:title', NS_MAP), 'title', default=playlist_id) + description = xpath_text( + track, xpath_with_ns('./xspf:annotation', NS_MAP), 'description') + thumbnail = xpath_text( + track, xpath_with_ns('./xspf:image', NS_MAP), 'thumbnail') + duration = float_or_none( + xpath_text(track, xpath_with_ns('./xspf:duration', NS_MAP), 'duration'), 1000) + + formats = [{ + 'url': location.text, + 'format_id': location.get(xpath_with_ns('s1:label', NS_MAP)), + 'width': int_or_none(location.get(xpath_with_ns('s1:width', NS_MAP))), + 'height': int_or_none(location.get(xpath_with_ns('s1:height', NS_MAP))), + } for location in track.findall(xpath_with_ns('./xspf:location', NS_MAP))] + self._sort_formats(formats) + + entries.append({ + 'id': playlist_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'formats': formats, + }) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -768,11 +1272,87 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res + def _set_cookie(self, domain, name, value, expire_time=None): + cookie = compat_cookiejar.Cookie( + 0, name, value, None, None, domain, None, + None, '/', True, False, expire_time, '', None, None, None) + self._downloader.cookiejar.set_cookie(cookie) + + def _get_cookies(self, url): + """ Return a compat_cookies.SimpleCookie with the cookies for the url """ + req = compat_urllib_request.Request(url) + self._downloader.cookiejar.add_cookie_header(req) + return compat_cookies.SimpleCookie(req.get_header('Cookie')) + + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + + def extract_subtitles(self, *args, **kwargs): + if (self._downloader.params.get('writesubtitles', False) or + self._downloader.params.get('listsubtitles')): + return self._get_subtitles(*args, **kwargs) + return {} + + def _get_subtitles(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + + @staticmethod + def _merge_subtitle_items(subtitle_list1, subtitle_list2): + """ Merge subtitle items for one language. Items with duplicated URLs + will be dropped. """ + list1_urls = set([item['url'] for item in subtitle_list1]) + ret = list(subtitle_list1) + ret.extend([item for item in subtitle_list2 if item['url'] not in list1_urls]) + return ret + + @classmethod + def _merge_subtitles(cls, subtitle_dict1, subtitle_dict2): + """ Merge two subtitle dictionaries, language by language. """ + ret = dict(subtitle_dict1) + for lang in subtitle_dict2: + ret[lang] = cls._merge_subtitle_items(subtitle_dict1.get(lang, []), subtitle_dict2[lang]) + return ret + + def extract_automatic_captions(self, *args, **kwargs): + if (self._downloader.params.get('writeautomaticsub', False) or + self._downloader.params.get('listsubtitles')): + return self._get_automatic_captions(*args, **kwargs) + return {} + + def _get_automatic_captions(self, *args, **kwargs): + raise NotImplementedError("This method must be implemented by subclasses") + class SearchInfoExtractor(InfoExtractor): """ Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} + They accept URLs in the format _SEARCH_KEY(|all|[0-9]):{query} Instances should define _SEARCH_KEY and _MAX_RESULTS. """