X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/179629569ec7a2eda04957287c17fab7e55b1a7f..6d835d4d6903cf5e18ad01844736929e06d16004:/youtube_dl/extractor/common.py?ds=sidebyside diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1d98222..7e41132 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,20 +1,35 @@ +from __future__ import unicode_literals + import base64 +import datetime +import hashlib +import json +import netrc import os import re import socket import sys -import netrc +import time +import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, - compat_urllib_request, + compat_urllib_parse_urlparse, + compat_urlparse, compat_str, clean_html, compiled_regex_type, ExtractorError, + float_or_none, + int_or_none, + RegexNotFoundError, + sanitize_filename, + unescapeHTML, ) +_NO_DEFAULT = object() + class InfoExtractor(object): """Information Extractor class. @@ -30,36 +45,110 @@ class InfoExtractor(object): The dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. + + Additionally, it must contain either a formats entry or a url one: + + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz + * vbr Average video bitrate in KBit/s + * fps Frame rate + * vcodec Name of the video codec in use + * container Name of the container format + * filesize The number of bytes, if known in advance + * filesize_approx An estimate for the number of bytes + * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "m3u8" or so. + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field, regardless of all other values. + -1 for default (order by other properties), + -2 or smaller for less than default. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. + * source_preference Order number for this video source + (quality takes higher priority) + -1 for default (order by other properties), + -2 or smaller for less than default. + * http_referer HTTP Referer header value to set. + * http_method HTTP method to use for the download. + * http_headers A dictionary of additional HTTP headers + to add to the request. + * http_post_data Additional data to send with a POST + request. + url: Final video URL. ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). The following fields are optional: - format: The video format, defaults to ext (used for --get-format) - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height"}, + deprecated) thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. - location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. + location: Physical location where the video was filmed. + subtitles: The subtitle file contents as a dictionary in the format + {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen - - The fields should all be Unicode strings. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + comment_count: Number of comments on the video + age_limit: Age restriction for the video, as an integer (years) + webpage_url: The url to the video webpage, if given to youtube-dl it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] + is_live: True, False, or None (=unknown). Whether this video is a + live stream that goes on instead of a fixed-length video. + + Unless mentioned otherwise, the fields should be Unicode strings. + + Unless mentioned otherwise, None is equivalent to absence of information. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -76,7 +165,21 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None + + @classmethod + def _match_id(cls, url): + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + m = cls._VALID_URL_RE.match(url) + assert m + return m.group('id') @classmethod def working(cls): @@ -106,82 +209,191 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) elif note is not False: - self.to_screen(u'%s: %s' % (video_id, note)) + if video_id is None: + self.to_screen('%s' % (note,)) + else: + self.to_screen('%s: %s' % (video_id, note)) try: - return compat_urllib_request.urlopen(url_or_request) + return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is False: + return False if errnote is None: - errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): + errnote = 'Unable to download webpage' + errmsg = '%s: %s' % (errnote, compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self._downloader.report_warning(errmsg) + return False + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ - urlh = self._request_webpage(url_or_request, video_id, note, errnote) + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + if urlh is False: + assert not fatal + return False + content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) + return (content, urlh) + + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br']+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() except AttributeError: url = url_or_request - self.to_screen(u'Dumping request to ' + url) + self.to_screen('Dumping request to ' + url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) - content = webpage_bytes.decode(encoding, 'replace') - return (content, urlh) + if self._downloader.params.get('write_pages', False): + try: + url = url_or_request.get_full_url() + except AttributeError: + url = url_or_request + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = '___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' + filename = sanitize_filename(raw_filename, restricted=True) + self.to_screen('Saving request to ' + filename) + # Working around MAX_PATH limitation on Windows (see + # http://msdn.microsoft.com/en-us/library/windows/desktop/aa365247(v=vs.85).aspx) + if os.name == 'nt': + absfilepath = os.path.abspath(filename) + if len(absfilepath) > 259: + filename = '\\\\?\\' + absfilepath + with open(filename, 'wb') as outf: + outf.write(webpage_bytes) - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if ('