X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/af478477605bdf3f5d57562035885cfee905f379..482ffa3c82c37471f3abc23d0ce3d0ab730226a7:/youtube_dl/extractor/common.py diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 84fca8b..e686573 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,11 +1,12 @@ import base64 import hashlib import json +import netrc import os import re import socket import sys -import netrc +import time import xml.etree.ElementTree from ..utils import ( @@ -74,7 +75,7 @@ class InfoExtractor(object): "http", "https", "rtsp", "rtmp", "m3u8" or so. * preference Order number of this format. If this field is present and not None, the formats get sorted - by this field. + by this field, regardless of all other values. -1 for default (order by other properties), -2 or smaller for less than default. * quality Order number of the video quality of this @@ -88,12 +89,22 @@ class InfoExtractor(object): The following fields are optional: - thumbnails: A list of dictionaries (with the entries "resolution" and - "url") for the varying thumbnails + display_id An alternative identifier for the video, not necessarily + unique, but available before title. Typically, id is + something like "4234987", title "Dancing naked mole rats", + and display_id "dancing-naked-mole-rats" + thumbnails: A list of dictionaries, with the following entries: + * "url" + * "width" (optional, int) + * "height" (optional, int) + * "resolution" (optional, string "{width}x{height}", + deprecated) thumbnail: Full URL to a video thumbnail image. description: One-line video description. uploader: Full name of the video uploader. + timestamp: UNIX timestamp of the moment the video became available. upload_date: Video upload date (YYYYMMDD). + If not explicitly set, calculated from timestamp. uploader_id: Nickname or id of the video uploader. location: Physical location of the video. 
subtitles: The subtitle file contents as a dictionary in the format @@ -107,6 +118,8 @@ class InfoExtractor(object): webpage_url: The url to the video webpage, if given to youtube-dl it should allow to get the same result again. (It will be set by YoutubeDL if it's missing) + categories: A list of categories that the video falls in, for example + ["Sports", "Berlin"] Unless mentioned otherwise, the fields should be Unicode strings. @@ -114,9 +127,6 @@ class InfoExtractor(object): _real_extract() methods and define a _VALID_URL regexp. Probably, they should also be added to the list of extractors. - _real_extract() must return a *list* of information dictionaries as - described above. - Finally, the _WORKING attribute should be set to False for broken IEs in order to warn the users and skip the tests. """ @@ -239,16 +249,31 @@ class InfoExtractor(object): url = url_or_request.get_full_url() except AttributeError: url = url_or_request - if len(url) > 200: - h = u'___' + hashlib.md5(url.encode('utf-8')).hexdigest() - url = url[:200 - len(h)] + h - raw_filename = ('%s_%s.dump' % (video_id, url)) + basen = '%s_%s' % (video_id, url) + if len(basen) > 240: + h = u'___' + hashlib.md5(basen.encode('utf-8')).hexdigest() + basen = basen[:240 - len(h)] + h + raw_filename = basen + '.dump' filename = sanitize_filename(raw_filename, restricted=True) self.to_screen(u'Saving request to ' + filename) with open(filename, 'wb') as outf: outf.write(webpage_bytes) - content = webpage_bytes.decode(encoding, 'replace') + try: + content = webpage_bytes.decode(encoding, 'replace') + except LookupError: + content = webpage_bytes.decode('utf-8', 'replace') + + if (u'Access to this site is blocked' in content and + u'Websense' in content[:512]): + msg = u'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'