X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/33cd347759d6d999325ebf3c69b7ed5692c343b2..415fdb62500dca2e22067a05008dfbf87c75b662:/youtube_dl/extractor/common.py?ds=inline diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index da50abf..49e7540 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,21 +1,28 @@ import base64 +import hashlib +import json import os import re import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, compat_urllib_error, - compat_urllib_request, + compat_urllib_parse_urlparse, compat_str, clean_html, compiled_regex_type, ExtractorError, + RegexNotFoundError, + sanitize_filename, unescapeHTML, ) +_NO_DEFAULT = object() + class InfoExtractor(object): """Information Extractor class. @@ -31,36 +38,94 @@ class InfoExtractor(object): The dictionaries must include the following fields: id: Video identifier. - url: Final video URL. title: Video title, unescaped. + + Additionally, it must contain either a formats entry or a url one: + + formats: A list of dictionaries for each format available, ordered + from worst to best quality. + + Potential fields: + * url Mandatory. The URL of the video file + * ext Will be calculated from url if missing + * format A human-readable description of the format + ("mp4 container with h264/opus"). + Calculated from the format_id, width, height. + and format_note fields if missing. + * format_id A short description of the format + ("mp4_h264_opus" or "19"). + Technically optional, but strongly recommended. + * format_note Additional info about the format + ("3D" or "DASH video") + * width Width of the video, if known + * height Height of the video, if known + * resolution Textual description of width and height + * tbr Average bitrate of audio and video in KBit/s + * abr Average audio bitrate in KBit/s + * acodec Name of the audio codec in use + * asr Audio sampling rate in Hertz + * vbr Average video bitrate in KBit/s + * vcodec Name of the video codec in use + * container Name of the container format + * filesize The number of bytes, if known in advance + * player_url SWF Player URL (used for rtmpdump). + * protocol The protocol that will be used for the actual + download, lower-case. + "http", "https", "rtsp", "rtmp", "m3u8" or so. + * preference Order number of this format. If this field is + present and not None, the formats get sorted + by this field, regardless of all other values. + -1 for default (order by other properties), + -2 or smaller for less than default. + * quality Order number of the video quality of this + format, irrespective of the file format. + -1 for default (order by other properties), + -2 or smaller for less than default. + url: Final video URL. ext: Video filename extension. + format: The video format, defaults to ext (used for --get-format) + player_url: SWF Player URL (used for rtmpdump). The following fields are optional: - format: The video format, defaults to ext (used for --get-format) - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height"}, + deprecated) thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. + subtitles: The subtitle file contents as a dictionary in the format + {language: subtitles}. + duration: Length of the video in seconds, as an integer. view_count: How many users have watched the video on the platform. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen - - The fields should all be Unicode strings. + like_count: Number of positive ratings of the video + dislike_count: Number of negative ratings of the video + comment_count: Number of comments on the video + age_limit: Age restriction for the video, as an integer (years) + webpage_url: The url to the video webpage, if given to youtube-dl it + should allow to get the same result again. (It will be set + by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] + + Unless mentioned otherwise, the fields should be Unicode strings. Subclasses of this one should re-define the _real_initialize() and _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -77,7 +142,13 @@ class InfoExtractor(object): @classmethod def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None + + # This does not use has/getattr intentionally - we want to know whether + # we have cached the regexp for *this* class, whereas getattr would also + # match the superclass + if '_VALID_URL_RE' not in cls.__dict__: + cls._VALID_URL_RE = re.compile(cls._VALID_URL) + return cls._VALID_URL_RE.match(url) is not None @classmethod def working(cls): @@ -107,38 +178,63 @@ class InfoExtractor(object): """Real extraction process. Redefine in subclasses.""" pass + @classmethod + def ie_key(cls): + """A string for getting the InfoExtractor with get_info_extractor""" + return cls.__name__[:-2] + @property def IE_NAME(self): return type(self).__name__[:-2] - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns the response handle """ if note is None: self.report_download_webpage(video_id) elif note is not False: - self.to_screen(u'%s: %s' % (video_id, note)) + if video_id is None: + self.to_screen(u'%s' % (note,)) + else: + self.to_screen(u'%s: %s' % (video_id, note)) try: - return compat_urllib_request.urlopen(url_or_request) + return self._downloader.urlopen(url_or_request) except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + if errnote is False: + return False if errnote is None: errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): + errmsg = u'%s: %s' % (errnote, compat_str(err)) + if fatal: + raise ExtractorError(errmsg, sys.exc_info()[2], cause=err) + else: + self._downloader.report_warning(errmsg) + return False + + def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True): """ Returns a tuple (page content as string, URL handle) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, (compat_str, str)): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal) + if urlh is False: + assert not fatal + return False content_type = urlh.headers.get('Content-Type', '') + webpage_bytes = urlh.read() m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) if m: encoding = m.group(1) else: - encoding = 'utf-8' - webpage_bytes = urlh.read() + m = re.search(br']+charset=[\'"]?([^\'")]+)[ /\'">]', + webpage_bytes[:1024]) + if m: + encoding = m.group(1).decode('ascii') + elif webpage_bytes.startswith(b'\xff\xfe'): + encoding = 'utf-16' + else: + encoding = 'utf-8' if self._downloader.params.get('dump_intermediate_pages', False): try: url = url_or_request.get_full_url() @@ -147,12 +243,75 @@ class InfoExtractor(object): self.to_screen(u'Dumping request to ' + url) dump = base64.b64encode(webpage_bytes).decode('ascii') self._downloader.to_screen(dump) - content = webpage_bytes.decode(encoding, 'replace') + if self._downloader.params.get('write_pages', False): + try: + url = url_or_request.get_full_url() + except AttributeError: + url = url_or_request + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' + filename = sanitize_filename(raw_filename, restricted=True) + self.to_screen(u'Saving request to ' + filename) + with open(filename, 'wb') as outf: + outf.write(webpage_bytes) + + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if (u'Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'