X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/46113edab215c2211a604c06245c16d5d4e57dcf..620b803afb94df95e3761acf1e764a9eb3a4b00c:/youtube_dl/extractor/common.py diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 7e41132..df1a441 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -12,17 +12,22 @@ import sys import time import xml.etree.ElementTree -from ..utils import ( +from ..compat import ( + compat_cookiejar, + compat_HTTPError, compat_http_client, compat_urllib_error, compat_urllib_parse_urlparse, compat_urlparse, compat_str, - +) +from ..utils import ( + age_restricted, clean_html, compiled_regex_type, ExtractorError, float_or_none, + HEADRequest, int_or_none, RegexNotFoundError, sanitize_filename, @@ -38,11 +43,15 @@ class InfoExtractor(object): information about the video (or videos) the URL refers to. This information includes the real video URL, the video title, author and others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this + passed to the YoutubeDL. The YoutubeDL processes this information possibly downloading the video to the file system, among other possible outcomes. - The dictionaries must include the following fields: + The type field determines the the type of the result. + By far the most common value (and the default if _type is missing) is + "video", which indicates a single video. + + For a video, the dictionaries must include the following fields: id: Video identifier. title: Video title, unescaped. @@ -80,12 +89,20 @@ class InfoExtractor(object): * player_url SWF Player URL (used for rtmpdump). * protocol The protocol that will be used for the actual download, lower-case. - "http", "https", "rtsp", "rtmp", "m3u8" or so. + "http", "https", "rtsp", "rtmp", "rtmpe", + "m3u8", or "m3u8_native". * preference Order number of this format. If this field is present and not None, the formats get sorted by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. + < -1000 to hide the format (if there is + another one which is strictly better) + * language_preference Is this in the correct requested + language? + 10 if it's what the URL is about, + -1 for default (don't know), + -10 otherwise, other values reserved for now. * quality Order number of the video quality of this format, irrespective of the file format. -1 for default (order by other properties), @@ -94,12 +111,17 @@ class InfoExtractor(object): (quality takes higher priority) -1 for default (order by other properties), -2 or smaller for less than default. - * http_referer HTTP Referer header value to set. * http_method HTTP method to use for the download. * http_headers A dictionary of additional HTTP headers to add to the request. * http_post_data Additional data to send with a POST request. + * stretched_ratio If given and not 1, indicates that the + video's pixels are not square. + width : height ratio as float. + * no_resume The server does not support resuming the + (HTTP or RTMP) download. Boolean. + url: Final video URL. ext: Video filename extension. format: The video format, defaults to ext (used for --get-format) @@ -107,19 +129,23 @@ class InfoExtractor(object): The following fields are optional: + alt_title: A secondary title of the video. display_id An alternative identifier for the video, not necessarily unique, but available before title. Typically, id is something like "4234987", title "Dancing naked mole rats", and display_id "dancing-naked-mole-rats" thumbnails: A list of dictionaries, with the following entries: + * "id" (optional, string) - Thumbnail format ID * "url" + * "preference" (optional, int) - quality of the image * "width" (optional, int) * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) thumbnail: Full URL to a video thumbnail image. - description: One-line video description. + description: Full video description. uploader: Full name of the video uploader. + creator: The main artist who created the video. timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). If not explicitly set, calculated from timestamp. @@ -132,6 +158,17 @@ class InfoExtractor(object): like_count: Number of positive ratings of the video dislike_count: Number of negative ratings of the video comment_count: Number of comments on the video + comments: A list of comments, each with one or more of the following + properties (all but one of text or html optional): + * "author" - human-readable name of the comment author + * "author_id" - user ID of the comment author + * "id" - Comment ID + * "html" - Comment as HTML + * "text" - Plain text of the comment + * "timestamp" - UNIX timestamp of comment + * "parent" - ID of the comment this one is replying to. + Set to "root" to indicate that this is a + comment to the original video. age_limit: Age restriction for the video, as an integer (years) webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set @@ -145,6 +182,39 @@ class InfoExtractor(object): Unless mentioned otherwise, None is equivalent to absence of information. + + _type "playlist" indicates multiple videos. + There must be a key "entries", which is a list, an iterable, or a PagedList + object, each element of which is a valid dictionary by this specification. + + Additionally, playlists can have "title" and "id" attributes with the same + semantics as videos (see above). + + + _type "multi_video" indicates that there are multiple videos that + form a single show, for examples multiple acts of an opera or TV episode. + It must have an entries key like a playlist and contain all the keys + required for a video at the same time. + + + _type "url" indicates that the video must be extracted from another + location, possibly by a different extractor. Its only required key is: + "url" - the next URL to extract. + The key "ie_key" can be set to the class name (minus the trailing "IE", + e.g. "Youtube") if the extractor class is known in advance. + Additionally, the dictionary may have any properties of the resolved entity + known in advance, for example "title" if the title of the referred video is + known ahead of time. + + + _type "url_transparent" entities have the same specification as "url", but + indicate that the given additional information is more precise than the one + associated with the resolved URL. + This is useful when a site employs a video service that hosts the video and + its technical metadata, but that video service does not embed a useful + title, description etc. + + Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. @@ -254,9 +324,11 @@ class InfoExtractor(object): content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) return (content, urlh) - def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() + if prefix is not None: + webpage_bytes = prefix + webpage_bytes m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) @@ -315,9 +387,19 @@ class InfoExtractor(object): return content - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): + def _download_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, tries=1, timeout=5): """ Returns the data of the page as a string """ - res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = False + try_count = 0 + while success is False: + try: + res = self._download_webpage_handle(url_or_request, video_id, note, errnote, fatal) + success = True + except compat_http_client.IncompleteRead as e: + try_count += 1 + if try_count >= tries: + raise e + self._sleep(timeout, video_id) if res is False: return res else: @@ -345,6 +427,10 @@ class InfoExtractor(object): url_or_request, video_id, note, errnote, fatal=fatal) if (not fatal) and json_string is False: return None + return self._parse_json( + json_string, video_id, transform_source=transform_source, fatal=fatal) + + def _parse_json(self, json_string, video_id, transform_source=None, fatal=True): if transform_source: json_string = transform_source(json_string) try: @@ -381,19 +467,20 @@ class InfoExtractor(object): """Report attempt to log in.""" self.to_screen('Logging in') - #Methods for following #608 + # Methods for following #608 @staticmethod def url_result(url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" - #TODO: ie should be the class used for getting the info + # TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} if video_id is not None: video_info['id'] = video_id return video_info + @staticmethod - def playlist_result(entries, playlist_id=None, playlist_title=None): + def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None): """Returns a playlist""" video_info = {'_type': 'playlist', 'entries': entries} @@ -401,9 +488,11 @@ class InfoExtractor(object): video_info['id'] = playlist_id if playlist_title: video_info['title'] = playlist_title + if playlist_description: + video_info['description'] = playlist_description return video_info - def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Perform a regex search on the given string, using a single or a list of patterns returning the first matching group. @@ -424,22 +513,25 @@ class InfoExtractor(object): _name = name if mobj: - # return the first matching group - return next(g for g in mobj.groups() if g is not None) + if group is None: + # return the first matching group + return next(g for g in mobj.groups() if g is not None) + else: + return mobj.group(group) elif default is not _NO_DEFAULT: return default elif fatal: raise RegexNotFoundError('Unable to extract %s' % _name) else: self._downloader.report_warning('unable to extract %s; ' - 'please report this issue on http://yt-dl.org/bug' % _name) + 'please report this issue on http://yt-dl.org/bug' % _name) return None - def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0): + def _html_search_regex(self, pattern, string, name, default=_NO_DEFAULT, fatal=True, flags=0, group=None): """ Like _search_regex, but strips HTML tags and unescapes entities. """ - res = self._search_regex(pattern, string, name, default, fatal, flags) + res = self._search_regex(pattern, string, name, default, fatal, flags, group) if res: return clean_html(res).strip() else: @@ -472,7 +564,7 @@ class InfoExtractor(object): raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) except (IOError, netrc.NetrcParseError) as err: self._downloader.report_warning('parsing .netrc: %s' % compat_str(err)) - + return (username, password) def _get_tfa_info(self): @@ -532,10 +624,10 @@ class InfoExtractor(object): if display_name is None: display_name = name return self._html_search_regex( - r'''(?ix)]+(?:itemprop|name|property)=["\']?%s["\']?) - [^>]+content=["\']([^"\']+)["\']''' % re.escape(name), - html, display_name, fatal=fatal, **kwargs) + r'''(?isx)]+(?:itemprop|name|property)=(["\']?)%s\1) + [^>]+?content=(["\'])(?P.*?)\2''' % re.escape(name), + html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): return self._html_search_meta('dc.creator', html, 'uploader') @@ -566,7 +658,7 @@ class InfoExtractor(object): def _twitter_search_player(self, html): return self._html_search_meta('twitter:player', html, - 'twitter card player') + 'twitter card player') def _sort_formats(self, formats): if not formats: @@ -611,12 +703,13 @@ class InfoExtractor(object): return ( preference, + f.get('language_preference') if f.get('language_preference') is not None else -1, f.get('quality') if f.get('quality') is not None else -1, - f.get('height') if f.get('height') is not None else -1, - f.get('width') if f.get('width') is not None else -1, - ext_preference, f.get('tbr') if f.get('tbr') is not None else -1, f.get('vbr') if f.get('vbr') is not None else -1, + ext_preference, + f.get('height') if f.get('height') is not None else -1, + f.get('width') if f.get('width') is not None else -1, f.get('abr') if f.get('abr') is not None else -1, audio_ext_preference, f.get('fps') if f.get('fps') is not None else -1, @@ -627,6 +720,27 @@ class InfoExtractor(object): ) formats.sort(key=_formats_key) + def _check_formats(self, formats, video_id): + if formats: + formats[:] = filter( + lambda f: self._is_valid_url( + f['url'], video_id, + item='%s video format' % f.get('format_id') if f.get('format_id') else 'video'), + formats) + + def _is_valid_url(self, url, video_id, item='video'): + try: + self._request_webpage( + HEADRequest(url), video_id, + 'Checking %s URL' % item) + return True + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError): + self.report_warning( + '%s URL is invalid, skipping' % item, video_id) + return False + raise + def http_scheme(self): """ Either "http:" or "https:", depending on the user's preferences """ return ( @@ -651,33 +765,41 @@ class InfoExtractor(object): self.to_screen(msg) time.sleep(timeout) - def _extract_f4m_formats(self, manifest_url, video_id): + def _extract_f4m_formats(self, manifest_url, video_id, preference=None, f4m_id=None): manifest = self._download_xml( manifest_url, video_id, 'Downloading f4m manifest', 'Unable to download f4m manifest') formats = [] + manifest_version = '1.0' media_nodes = manifest.findall('{http://ns.adobe.com/f4m/1.0}media') + if not media_nodes: + manifest_version = '2.0' + media_nodes = manifest.findall('{http://ns.adobe.com/f4m/2.0}media') for i, media_el in enumerate(media_nodes): + if manifest_version == '2.0': + manifest_url = ('/'.join(manifest_url.split('/')[:-1]) + '/' + + (media_el.attrib.get('href') or media_el.attrib.get('url'))) tbr = int_or_none(media_el.attrib.get('bitrate')) - format_id = 'f4m-%d' % (i if tbr is None else tbr) formats.append({ - 'format_id': format_id, + 'format_id': '-'.join(filter(None, [f4m_id, 'f4m-%d' % (i if tbr is None else tbr)])), 'url': manifest_url, 'ext': 'flv', 'tbr': tbr, 'width': int_or_none(media_el.attrib.get('width')), 'height': int_or_none(media_el.attrib.get('height')), + 'preference': preference, }) self._sort_formats(formats) return formats def _extract_m3u8_formats(self, m3u8_url, video_id, ext=None, - entry_protocol='m3u8', preference=None): + entry_protocol='m3u8', preference=None, + m3u8_id=None): formats = [{ - 'format_id': 'm3u8-meta', + 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-meta'])), 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', @@ -713,9 +835,8 @@ class InfoExtractor(object): formats.append({'url': format_url(line)}) continue tbr = int_or_none(last_info.get('BANDWIDTH'), scale=1000) - f = { - 'format_id': 'm3u8-%d' % (tbr if tbr else len(formats)), + 'format_id': '-'.join(filter(None, [m3u8_id, 'm3u8-%d' % (tbr if tbr else len(formats))])), 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, @@ -740,6 +861,52 @@ class InfoExtractor(object): self._sort_formats(formats) return formats + # TODO: improve extraction + def _extract_smil_formats(self, smil_url, video_id, fatal=True): + smil = self._download_xml( + smil_url, video_id, 'Downloading SMIL file', + 'Unable to download SMIL file', fatal=fatal) + if smil is False: + assert not fatal + return [] + + base = smil.find('./head/meta').get('base') + + formats = [] + rtmp_count = 0 + for video in smil.findall('./body/switch/video'): + src = video.get('src') + if not src: + continue + bitrate = int_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) + width = int_or_none(video.get('width')) + height = int_or_none(video.get('height')) + proto = video.get('proto') + if not proto: + if base: + if base.startswith('rtmp'): + proto = 'rtmp' + elif base.startswith('http'): + proto = 'http' + ext = video.get('ext') + if proto == 'm3u8': + formats.extend(self._extract_m3u8_formats(src, video_id, ext)) + elif proto == 'rtmp': + rtmp_count += 1 + streamer = video.get('streamer') or base + formats.append({ + 'url': streamer, + 'play_path': src, + 'ext': 'flv', + 'format_id': 'rtmp-%d' % (rtmp_count if bitrate is None else bitrate), + 'tbr': bitrate, + 'width': width, + 'height': height, + }) + self._sort_formats(formats) + + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -768,6 +935,41 @@ class InfoExtractor(object): self._downloader.report_warning(msg) return res + def _set_cookie(self, domain, name, value, expire_time=None): + cookie = compat_cookiejar.Cookie( + 0, name, value, None, None, domain, None, + None, '/', True, False, expire_time, '', None, None, None) + self._downloader.cookiejar.set_cookie(cookie) + + def get_testcases(self, include_onlymatching=False): + t = getattr(self, '_TEST', None) + if t: + assert not hasattr(self, '_TESTS'), \ + '%s has _TEST and _TESTS' % type(self).__name__ + tests = [t] + else: + tests = getattr(self, '_TESTS', []) + for t in tests: + if not include_onlymatching and t.get('only_matching', False): + continue + t['name'] = type(self).__name__[:-len('IE')] + yield t + + def is_suitable(self, age_limit): + """ Test whether the extractor is generally suitable for the given + age limit (i.e. pornographic sites are not, all others usually are) """ + + any_restricted = False + for tc in self.get_testcases(include_onlymatching=False): + if 'playlist' in tc: + tc = tc['playlist'][0] + is_restricted = age_restricted( + tc.get('info_dict', {}).get('age_limit'), age_limit) + if not is_restricted: + return True + any_restricted = any_restricted or is_restricted + return not any_restricted + class SearchInfoExtractor(InfoExtractor): """