X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/c924a4596d685b1d61cf7f28e242e2e492678b1f..67792219d78d4576f8281748a441d056fa6fdb93:/youtube_dl/InfoExtractors.py diff --git a/youtube_dl/InfoExtractors.py b/youtube_dl/InfoExtractors.py index d318b4b..672ef9e 100755 --- a/youtube_dl/InfoExtractors.py +++ b/youtube_dl/InfoExtractors.py @@ -1,4456 +1,4 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +# Legacy file for backwards compatibility, use youtube_dl.extractor instead! -from __future__ import absolute_import - -import base64 -import datetime -import itertools -import netrc -import os -import re -import socket -import time -import email.utils -import xml.etree.ElementTree -import random -import math -import operator -import hashlib -import binascii -import urllib - -from .utils import * - - -class InfoExtractor(object): - """Information Extractor class. - - Information extractors are the classes that, given a URL, extract - information about the video (or videos) the URL refers to. This - information includes the real video URL, the video title, author and - others. The information is stored in a dictionary which is then - passed to the FileDownloader. The FileDownloader processes this - information possibly downloading the video to the file system, among - other possible outcomes. - - The dictionaries must include the following fields: - - id: Video identifier. - url: Final video URL. - title: Video title, unescaped. - ext: Video filename extension. - - The following fields are optional: - - format: The video format, defaults to ext (used for --get-format) - thumbnail: Full URL to a video thumbnail image. - description: One-line video description. - uploader: Full name of the video uploader. - upload_date: Video upload date (YYYYMMDD). - uploader_id: Nickname or id of the video uploader. - location: Physical location of the video. - player_url: SWF Player URL (used for rtmpdump). - subtitles: The subtitle file contents. - urlhandle: [internal] The urlHandle to be used to download the file, - like returned by urllib.request.urlopen - - The fields should all be Unicode strings. - - Subclasses of this one should re-define the _real_initialize() and - _real_extract() methods and define a _VALID_URL regexp. - Probably, they should also be added to the list of extractors. - - _real_extract() must return a *list* of information dictionaries as - described above. - - Finally, the _WORKING attribute should be set to False for broken IEs - in order to warn the users and skip the tests. - """ - - _ready = False - _downloader = None - _WORKING = True - - def __init__(self, downloader=None): - """Constructor. Receives an optional downloader.""" - self._ready = False - self.set_downloader(downloader) - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url) is not None - - @classmethod - def working(cls): - """Getter method for _WORKING.""" - return cls._WORKING - - def initialize(self): - """Initializes an instance (authentication, etc).""" - if not self._ready: - self._real_initialize() - self._ready = True - - def extract(self, url): - """Extracts URL information and returns it in list of dicts.""" - self.initialize() - return self._real_extract(url) - - def set_downloader(self, downloader): - """Sets the downloader for this IE.""" - self._downloader = downloader - - def _real_initialize(self): - """Real initialization process. Redefine in subclasses.""" - pass - - def _real_extract(self, url): - """Real extraction process. Redefine in subclasses.""" - pass - - @property - def IE_NAME(self): - return type(self).__name__[:-2] - - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None): - """ Returns the response handle """ - if note is None: - self.report_download_webpage(video_id) - elif note is not False: - self.to_screen(u'%s: %s' % (video_id, note)) - try: - return compat_urllib_request.urlopen(url_or_request) - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - if errnote is None: - errnote = u'Unable to download webpage' - raise ExtractorError(u'%s: %s' % (errnote, compat_str(err)), sys.exc_info()[2]) - - def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): - """ Returns a tuple (page content as string, URL handle) """ - urlh = self._request_webpage(url_or_request, video_id, note, errnote) - content_type = urlh.headers.get('Content-Type', '') - m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) - if m: - encoding = m.group(1) - else: - encoding = 'utf-8' - webpage_bytes = urlh.read() - if self._downloader.params.get('dump_intermediate_pages', False): - try: - url = url_or_request.get_full_url() - except AttributeError: - url = url_or_request - self.to_screen(u'Dumping request to ' + url) - dump = base64.b64encode(webpage_bytes).decode('ascii') - self._downloader.to_screen(dump) - content = webpage_bytes.decode(encoding, 'replace') - return (content, urlh) - - def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): - """ Returns the data of the page as a string """ - return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] - - def to_screen(self, msg): - """Print msg to screen, prefixing it with '[ie_name]'""" - self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg)) - - def report_extraction(self, id_or_name): - """Report information extraction.""" - self.to_screen(u'%s: Extracting information' % id_or_name) - - def report_download_webpage(self, video_id): - """Report webpage download.""" - self.to_screen(u'%s: Downloading webpage' % video_id) - - def report_age_confirmation(self): - """Report attempt to confirm age.""" - self.to_screen(u'Confirming age') - - #Methods for following #608 - #They set the correct value of the '_type' key - def video_result(self, video_info): - """Returns a video""" - video_info['_type'] = 'video' - return video_info - def url_result(self, url, ie=None): - """Returns a url that points to a page that should be processed""" - #TODO: ie should be the class used for getting the info - video_info = {'_type': 'url', - 'url': url, - 'ie_key': ie} - return video_info - def playlist_result(self, entries, playlist_id=None, playlist_title=None): - """Returns a playlist""" - video_info = {'_type': 'playlist', - 'entries': entries} - if playlist_id: - video_info['id'] = playlist_id - if playlist_title: - video_info['title'] = playlist_title - return video_info - -class SearchInfoExtractor(InfoExtractor): - """ - Base class for paged search queries extractors. - They accept urls in the format _SEARCH_KEY(|all|[0-9]):{query} - Instances should define _SEARCH_KEY and _MAX_RESULTS. - """ - - @classmethod - def _make_valid_url(cls): - return r'%s(?P|[1-9][0-9]*|all):(?P[\s\S]+)' % cls._SEARCH_KEY - - @classmethod - def suitable(cls, url): - return re.match(cls._make_valid_url(), url) is not None - - def _real_extract(self, query): - mobj = re.match(self._make_valid_url(), query) - if mobj is None: - raise ExtractorError(u'Invalid search query "%s"' % query) - - prefix = mobj.group('prefix') - query = mobj.group('query') - if prefix == '': - return self._get_n_results(query, 1) - elif prefix == 'all': - return self._get_n_results(query, self._MAX_RESULTS) - else: - n = int(prefix) - if n <= 0: - raise ExtractorError(u'invalid download number %s for query "%s"' % (n, query)) - elif n > self._MAX_RESULTS: - self._downloader.report_warning(u'%s returns max %i results (you requested %i)' % (self._SEARCH_KEY, self._MAX_RESULTS, n)) - n = self._MAX_RESULTS - return self._get_n_results(query, n) - - def _get_n_results(self, query, n): - """Get a specified number of results for a query""" - raise NotImplementedError("This method must be implemented by sublclasses") - - -class YoutubeIE(InfoExtractor): - """Information extractor for youtube.com.""" - - _VALID_URL = r"""^ - ( - (?:https?://)? # http(s):// (optional) - (?:youtu\.be/|(?:\w+\.)?youtube(?:-nocookie)?\.com/| - tube\.majestyc\.net/) # the various hostnames, with wildcard subdomains - (?:.*?\#/)? # handle anchor (#/) redirect urls - (?: # the various things that can precede the ID: - (?:(?:v|embed|e)/) # v/ or embed/ or e/ - |(?: # or the v= param in all its forms - (?:watch(?:_popup)?(?:\.php)?)? # preceding watch(_popup|.php) or nothing (like /?v=xxxx) - (?:\?|\#!?) # the params delimiter ? or # or #! - (?:.*?&)? # any other preceding param (like /?s=tuff&v=xxxx) - v= - ) - )? # optional -> youtube.com/xxxx is OK - )? # all until now is optional -> you can pass the naked ID - ([0-9A-Za-z_-]+) # here is it! the YouTube video ID - (?(1).+)? # if we found the ID, everything can follow - $""" - _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1' - _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' - _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en' - _NEXT_URL_RE = r'[\?&]next_url=([^&]+)' - _NETRC_MACHINE = 'youtube' - # Listed in order of quality - _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13'] - _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13'] - _video_extensions = { - '13': '3gp', - '17': 'mp4', - '18': 'mp4', - '22': 'mp4', - '37': 'mp4', - '38': 'video', # You actually don't know if this will be MOV, AVI or whatever - '43': 'webm', - '44': 'webm', - '45': 'webm', - '46': 'webm', - } - _video_dimensions = { - '5': '240x400', - '6': '???', - '13': '???', - '17': '144x176', - '18': '360x640', - '22': '720x1280', - '34': '360x640', - '35': '480x854', - '37': '1080x1920', - '38': '3072x4096', - '43': '360x640', - '44': '480x854', - '45': '720x1280', - '46': '1080x1920', - } - IE_NAME = u'youtube' - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - if YoutubePlaylistIE.suitable(url): return False - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - - def report_lang(self): - """Report attempt to set language.""" - self.to_screen(u'Setting language') - - def report_login(self): - """Report attempt to log in.""" - self.to_screen(u'Logging in') - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) - - def report_video_subtitles_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Checking available subtitles' % video_id) - - def report_video_subtitles_request(self, video_id, sub_lang, format): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video subtitles for %s.%s' % (video_id, sub_lang, format)) - - def report_video_subtitles_available(self, video_id, sub_lang_list): - """Report available subtitles.""" - sub_lang = ",".join(list(sub_lang_list.keys())) - self.to_screen(u'%s: Available subtitles for video: %s' % (video_id, sub_lang)) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) - - def report_unavailable_format(self, video_id, format): - """Report extracted video URL.""" - self.to_screen(u'%s: Format %s not available' % (video_id, format)) - - def report_rtmp_download(self): - """Indicate the download will use the RTMP protocol.""" - self.to_screen(u'RTMP download detected') - - def _get_available_subtitles(self, video_id): - self.report_video_subtitles_download(video_id) - request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id) - try: - sub_list = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None) - sub_lang_list = re.findall(r'name="([^"]*)"[^>]+lang_code="([\w\-]+)"', sub_list) - sub_lang_list = dict((l[1], l[0]) for l in sub_lang_list) - if not sub_lang_list: - return (u'video doesn\'t have subtitles', None) - return sub_lang_list - - def _list_available_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - self.report_video_subtitles_available(video_id, sub_lang_list) - - def _request_subtitle(self, sub_lang, sub_name, video_id, format): - """ - Return tuple: - (error_message, sub_lang, sub) - """ - self.report_video_subtitles_request(video_id, sub_lang, format) - params = compat_urllib_parse.urlencode({ - 'lang': sub_lang, - 'name': sub_name, - 'v': video_id, - 'fmt': format, - }) - url = 'http://www.youtube.com/api/timedtext?' + params - try: - sub = compat_urllib_request.urlopen(url).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - return (u'unable to download video subtitles: %s' % compat_str(err), None, None) - if not sub: - return (u'Did not fetch video subtitles', None, None) - return (None, sub_lang, sub) - - def _extract_subtitle(self, video_id): - """ - Return a list with a tuple: - [(error_message, sub_lang, sub)] - """ - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - if self._downloader.params.get('subtitleslang', False): - sub_lang = self._downloader.params.get('subtitleslang') - elif 'en' in sub_lang_list: - sub_lang = 'en' - else: - sub_lang = list(sub_lang_list.keys())[0] - if not sub_lang in sub_lang_list: - return [(u'no closed captions found in the specified language "%s"' % sub_lang, None, None)] - - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - return [subtitle] - - def _extract_all_subtitles(self, video_id): - sub_lang_list = self._get_available_subtitles(video_id) - sub_format = self._downloader.params.get('subtitlesformat') - if isinstance(sub_lang_list,tuple): #There was some error, it didn't get the available subtitles - return [(sub_lang_list[0], None, None)] - subtitles = [] - for sub_lang in sub_lang_list: - subtitle = self._request_subtitle(sub_lang, sub_lang_list[sub_lang].encode('utf-8'), video_id, sub_format) - subtitles.append(subtitle) - return subtitles - - def _print_formats(self, formats): - print('Available formats:') - for x in formats: - print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???'))) - - def _real_initialize(self): - if self._downloader is None: - return - - username = None - password = None - downloader_params = self._downloader.params - - # Attempt to use provided username and password or .netrc data - if downloader_params.get('username', None) is not None: - username = downloader_params['username'] - password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning(u'parsing .netrc: %s' % compat_str(err)) - return - - # Set language - request = compat_urllib_request.Request(self._LANG_URL) - try: - self.report_lang() - compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to set language: %s' % compat_str(err)) - return - - # No authentication to be performed - if username is None: - return - - request = compat_urllib_request.Request(self._LOGIN_URL) - try: - login_page = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err)) - return - - galx = None - dsh = None - match = re.search(re.compile(r']* id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) - return - - # Confirm age - age_form = { - 'next_url': '/', - 'action_confirm': 'Confirm', - } - request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form)) - try: - self.report_age_confirmation() - age_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(2) - return video_id - - def _real_extract(self, url): - # Extract original video URL from URL with redirection, like age verification, using next_url parameter - mobj = re.search(self._NEXT_URL_RE, url) - if mobj: - url = 'https://www.youtube.com/' + compat_urllib_parse.unquote(mobj.group(1)).lstrip('/') - video_id = self._extract_id(url) - - # Get video webpage - self.report_video_webpage_download(video_id) - url = 'https://www.youtube.com/watch?v=%s&gl=US&hl=en&has_verified=1' % video_id - request = compat_urllib_request.Request(url) - try: - video_webpage_bytes = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) - - video_webpage = video_webpage_bytes.decode('utf-8', 'ignore') - - # Attempt to extract SWF player URL - mobj = re.search(r'swfConfig.*?"(http:\\/\\/.*?watch.*?-.*?\.swf)"', video_webpage) - if mobj is not None: - player_url = re.sub(r'\\(.)', r'\1', mobj.group(1)) - else: - player_url = None - - # Get video info - self.report_video_info_webpage_download(video_id) - for el_type in ['&el=embedded', '&el=detailpage', '&el=vevo', '']: - video_info_url = ('https://www.youtube.com/get_video_info?&video_id=%s%s&ps=default&eurl=&gl=US&hl=en' - % (video_id, el_type)) - video_info_webpage = self._download_webpage(video_info_url, video_id, - note=False, - errnote='unable to download video info webpage') - video_info = compat_parse_qs(video_info_webpage) - if 'token' in video_info: - break - if 'token' not in video_info: - if 'reason' in video_info: - raise ExtractorError(u'YouTube said: %s' % video_info['reason'][0]) - else: - raise ExtractorError(u'"token" parameter not in video info for unknown reason') - - # Check for "rental" videos - if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError(u'"rental" videos not supported') - - # Start extracting information - self.report_information_extraction(video_id) - - # uploader - if 'author' not in video_info: - raise ExtractorError(u'Unable to extract uploader name') - video_uploader = compat_urllib_parse.unquote_plus(video_info['author'][0]) - - # uploader_id - video_uploader_id = None - mobj = re.search(r'', video_webpage) - if mobj is not None: - video_uploader_id = mobj.group(1) - else: - self._downloader.report_warning(u'unable to extract uploader nickname') - - # title - if 'title' not in video_info: - raise ExtractorError(u'Unable to extract video title') - video_title = compat_urllib_parse.unquote_plus(video_info['title'][0]) - - # thumbnail image - if 'thumbnail_url' not in video_info: - self._downloader.report_warning(u'unable to extract video thumbnail') - video_thumbnail = '' - else: # don't panic if we can't find it - video_thumbnail = compat_urllib_parse.unquote_plus(video_info['thumbnail_url'][0]) - - # upload date - upload_date = None - mobj = re.search(r'id="eow-date.*?>(.*?)', video_webpage, re.DOTALL) - if mobj is not None: - upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split()) - upload_date = unified_strdate(upload_date) - - # description - video_description = get_element_by_id("eow-description", video_webpage) - if video_description: - video_description = clean_html(video_description) - else: - fd_mobj = re.search(r'= 1: - url_map = {} - for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','): - url_data = compat_parse_qs(url_data_str) - if 'itag' in url_data and 'url' in url_data: - url = url_data['url'][0] + '&signature=' + url_data['sig'][0] - if not 'ratebypass' in url: url += '&ratebypass=yes' - url_map[url_data['itag'][0]] = url - - format_limit = self._downloader.params.get('format_limit', None) - available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats - if format_limit is not None and format_limit in available_formats: - format_list = available_formats[available_formats.index(format_limit):] - else: - format_list = available_formats - existing_formats = [x for x in format_list if x in url_map] - if len(existing_formats) == 0: - raise ExtractorError(u'no known formats available for video') - if self._downloader.params.get('listformats', None): - self._print_formats(existing_formats) - return - if req_format is None or req_format == 'best': - video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality - elif req_format == 'worst': - video_url_list = [(existing_formats[len(existing_formats)-1], url_map[existing_formats[len(existing_formats)-1]])] # worst quality - elif req_format in ('-1', 'all'): - video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats - else: - # Specific formats. We pick the first in a slash-delimeted sequence. - # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'. - req_formats = req_format.split('/') - video_url_list = None - for rf in req_formats: - if rf in url_map: - video_url_list = [(rf, url_map[rf])] - break - if video_url_list is None: - raise ExtractorError(u'requested format not available') - else: - raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info') - - results = [] - for format_param, video_real_url in video_url_list: - # Extension - video_extension = self._video_extensions.get(format_param, 'flv') - - video_format = '{0} - {1}'.format(format_param if format_param else video_extension, - self._video_dimensions.get(format_param, '???')) - - results.append({ - 'id': video_id, - 'url': video_real_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': upload_date, - 'title': video_title, - 'ext': video_extension, - 'format': video_format, - 'thumbnail': video_thumbnail, - 'description': video_description, - 'player_url': player_url, - 'subtitles': video_subtitles, - 'duration': video_duration - }) - return results - - -class MetacafeIE(InfoExtractor): - """Information Extractor for metacafe.com.""" - - _VALID_URL = r'(?:http://)?(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' - _DISCLAIMER = 'http://www.metacafe.com/family_filter/' - _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' - IE_NAME = u'metacafe' - - def report_disclaimer(self): - """Report disclaimer retrieval.""" - self.to_screen(u'Retrieving disclaimer') - - def _real_initialize(self): - # Retrieve disclaimer - request = compat_urllib_request.Request(self._DISCLAIMER) - try: - self.report_disclaimer() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve disclaimer: %s' % compat_str(err)) - - # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = compat_urllib_request.Request(self._FILTER_POST, compat_urllib_parse.urlencode(disclaimer_form)) - try: - self.report_age_confirmation() - disclaimer = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err)) - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1) - - # Check if video comes from YouTube - mobj2 = re.match(r'^yt-(.*)$', video_id) - if mobj2 is not None: - return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] - - # Retrieve video webpage to extract further information - webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) - if mobj is not None: - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] - - # Extract gdaKey if available - mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) - if mobj is None: - video_url = mediaURL - else: - gdaKey = mobj.group(1) - video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) - else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader nickname') - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - -class DailymotionIE(InfoExtractor): - """Information Extractor for Dailymotion""" - - _VALID_URL = r'(?i)(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/video/([^/]+)' - IE_NAME = u'dailymotion' - - def _real_extract(self, url): - # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group(1).split('_')[0].split('?')[0] - - video_extension = 'mp4' - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url) - request.add_header('Cookie', 'family_filter=off') - webpage = self._download_webpage(request, video_id) - - # Extract URL, uploader and title from webpage - self.report_extraction(video_id) - mobj = re.search(r'\s*var flashvars = (.*)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - flashvars = compat_urllib_parse.unquote(mobj.group(1)) - - for key in ['hd1080URL', 'hd720URL', 'hqURL', 'sdURL', 'ldURL', 'video_url']: - if key in flashvars: - max_quality = key - self.to_screen(u'Using %s' % key) - break - else: - raise ExtractorError(u'Unable to extract video URL') - - mobj = re.search(r'"' + max_quality + r'":"(.+?)"', flashvars) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - - video_url = compat_urllib_parse.unquote(mobj.group(1)).replace('\\/', '/') - - # TODO: support choosing qualities - - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = unescapeHTML(mobj.group('title')) - - video_uploader = None - mobj = re.search(r'(?im)[^<]+?]+?>([^<]+?)', webpage) - if mobj is None: - # lookin for official user - mobj_official = re.search(r'', webpage) - if mobj_official is None: - self._downloader.report_warning(u'unable to extract uploader nickname') - else: - video_uploader = mobj_official.group(1) - else: - video_uploader = mobj.group(1) - - video_upload_date = None - mobj = re.search(r'
([0-9]{2})-([0-9]{2})-([0-9]{4})
', webpage) - if mobj is not None: - video_upload_date = mobj.group(3) + mobj.group(2) + mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - }] - - -class PhotobucketIE(InfoExtractor): - """Information extractor for photobucket.com.""" - - # TODO: the original _VALID_URL was: - # r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*[\?\&]current=(.*\.flv)' - # Check if it's necessary to keep the old extracion process - _VALID_URL = r'(?:http://)?(?:[a-z0-9]+\.)?photobucket\.com/.*(([\?\&]current=)|_)(?P.*)\.(?P(flv)|(mp4))' - IE_NAME = u'photobucket' - - def _real_extract(self, url): - # Extract id from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - - video_extension = mobj.group('ext') - - # Retrieve video webpage to extract further information - webpage = self._download_webpage(url, video_id) - - # Extract URL, uploader, and title from webpage - self.report_extraction(video_id) - # We try first by looking the javascript code: - mobj = re.search(r'Pb\.Data\.Shared\.put\(Pb\.Data\.Shared\.MEDIA, (?P.*?)\);', webpage) - if mobj is not None: - info = json.loads(mobj.group('json')) - return [{ - 'id': video_id, - 'url': info[u'downloadUrl'], - 'uploader': info[u'username'], - 'upload_date': datetime.date.fromtimestamp(info[u'creationDate']).strftime('%Y%m%d'), - 'title': info[u'title'], - 'ext': video_extension, - 'thumbnail': info[u'thumbUrl'], - }] - - # We try looking in other parts of the webpage - mobj = re.search(r'', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - - video_url = mediaURL - - mobj = re.search(r'(.*) video by (.*) - Photobucket', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - - video_uploader = mobj.group(2).decode('utf-8') - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader, - 'upload_date': None, - 'title': video_title, - 'ext': video_extension.decode('utf-8'), - }] - - -class YahooIE(InfoExtractor): - """Information extractor for screen.yahoo.com.""" - _VALID_URL = r'http://screen\.yahoo\.com/.*?-(?P\d*?)\.html' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - m_id = re.search(r'YUI\.namespace\("Media"\)\.CONTENT_ID = "(?P.+?)";', webpage) - - if m_id is None: - # TODO: Check which url parameters are required - info_url = 'http://cosmos.bcst.yahoo.com/rest/v2/pops;lmsoverride=1;outputformat=mrss;cb=974419660;id=%s;rd=news.yahoo.com;datacontext=mdb;lg=KCa2IihxG3qE60vQ7HtyUy' % video_id - webpage = self._download_webpage(info_url, video_id, u'Downloading info webpage') - info_re = r'''<!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .*?)\ .*\]\]>.* - https?://)?(?:(?:www|player)\.)?vimeo\.com/(?:(?:groups|album)/[^/]+/)?(?Pplay_redirect_hls\?clip_id=)?(?:videos?/)?(?P[0-9]+)' - IE_NAME = u'vimeo' - - def _real_extract(self, url, new_video=True): - # Extract ID from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - if not mobj.group('proto'): - url = 'https://' + url - if mobj.group('direct_link'): - url = 'https://vimeo.com/' + video_id - - # Retrieve video webpage to extract further information - request = compat_urllib_request.Request(url, None, std_headers) - webpage = self._download_webpage(request, video_id) - - # Now we begin extracting as much information as we can from what we - # retrieved. First we extract the information common to all extractors, - # and latter we extract those that are Vimeo specific. - self.report_extraction(video_id) - - # Extract the config JSON - try: - config = webpage.split(' = {config:')[1].split(',assets:')[0] - config = json.loads(config) - except: - if re.search('The creator of this video has not given you permission to embed it on this domain.', webpage): - raise ExtractorError(u'The author has restricted the access to this video, try with the "--referer" option') - else: - raise ExtractorError(u'Unable to extract info section') - - # Extract title - video_title = config["video"]["title"] - - # Extract uploader and uploader_id - video_uploader = config["video"]["owner"]["name"] - video_uploader_id = config["video"]["owner"]["url"].split('/')[-1] - - # Extract video thumbnail - video_thumbnail = config["video"]["thumbnail"] - - # Extract video description - video_description = get_element_by_attribute("itemprop", "description", webpage) - if video_description: video_description = clean_html(video_description) - else: video_description = u'' - - # Extract upload date - video_upload_date = None - mobj = re.search(r' 0: - video_quality = files[quality][0][2] - video_codec = files[quality][0][0] - video_extension = files[quality][0][1] - self.to_screen(u'%s: Downloading %s file at %s quality' % (video_id, video_codec.upper(), video_quality)) - break - else: - raise ExtractorError(u'No known codec found') - - video_url = "http://player.vimeo.com/play_redirect?clip_id=%s&sig=%s&time=%s&quality=%s&codecs=%s&type=moogaloop_local&embed_location=" \ - %(video_id, sig, timestamp, video_quality, video_codec.upper()) - - return [{ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'uploader_id': video_uploader_id, - 'upload_date': video_upload_date, - 'title': video_title, - 'ext': video_extension, - 'thumbnail': video_thumbnail, - 'description': video_description, - }] - - -class ArteTvIE(InfoExtractor): - """arte.tv information extractor.""" - - _VALID_URL = r'(?:http://)?videos\.arte\.tv/(?:fr|de)/videos/.*' - _LIVE_URL = r'index-[0-9]+\.html$' - - IE_NAME = u'arte.tv' - - def fetch_webpage(self, url): - request = compat_urllib_request.Request(url) - try: - self.report_download_webpage(url) - webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to retrieve video webpage: %s' % compat_str(err)) - except ValueError as err: - raise ExtractorError(u'Invalid URL: %s' % url) - return webpage - - def grep_webpage(self, url, regex, regexFlags, matchTuples): - page = self.fetch_webpage(url) - mobj = re.search(regex, page, regexFlags) - info = {} - - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - for (i, key, err) in matchTuples: - if mobj.group(i) is None: - raise ExtractorError(err) - else: - info[key] = mobj.group(i) - - return info - - def extractLiveStream(self, url): - video_lang = url.split('/')[-4] - info = self.grep_webpage( - url, - r'src="(.*?/videothek_js.*?\.js)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - http_host = url.split('/')[2] - next_url = 'http://%s%s' % (http_host, compat_urllib_parse.unquote(info.get('url'))) - info = self.grep_webpage( - next_url, - r'(s_artestras_scst_geoFRDE_' + video_lang + '.*?)\'.*?' + - '(http://.*?\.swf).*?' + - '(rtmp://.*?)\'', - re.DOTALL, - [ - (1, 'path', u'could not extract video path: %s' % url), - (2, 'player', u'could not extract video player: %s' % url), - (3, 'url', u'could not extract video url: %s' % url) - ] - ) - video_url = u'%s/%s' % (info.get('url'), info.get('path')) - - def extractPlus7Stream(self, url): - video_lang = url.split('/')[-3] - info = self.grep_webpage( - url, - r'param name="movie".*?videorefFileUrl=(http[^\'"&]*)', - 0, - [ - (1, 'url', u'Invalid URL: %s' % url) - ] - ) - next_url = compat_urllib_parse.unquote(info.get('url')) - info = self.grep_webpage( - next_url, - r'
""" - mobj = re.search(_title, webpage_src) - if mobj is not None: - title = mobj.group(1) - thumbnail = None - - results = [{ - 'id': video_id, - 'url' : video_url, - 'title' : title, - 'thumbnail' : thumbnail, - 'ext' : ext, - }] - return results - -class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P[^/]+)$' - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - - webpage = self._download_webpage(url, video_id) - m = re.search(r'', webpage) - if not m: - raise ExtractorError(u'Cannot find metadata') - json_data = m.group(1) - - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError(u'Invalid JSON: ' + str(e)) - - video_url = data['akamai_url'] + '&cbr=256' - url_parts = compat_urllib_parse_urlparse(video_url) - video_ext = url_parts.path.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), - } - return [info] - - -class YouPornIE(InfoExtractor): - """Information extractor for youporn.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youporn\.com/watch/(?P[0-9]+)/(?P[^/]+)' - - def _print_formats(self, formats): - """Print all available formats""" - print(u'Available formats:') - print(u'ext\t\tformat') - print(u'---------------------------------') - for format in formats: - print(u'%s\t\t%s' % (format['ext'], format['format'])) - - def _specific(self, req_format, formats): - for x in formats: - if(x["format"]==req_format): - return x - return None - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('videoid') - - req = compat_urllib_request.Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - # Get the video title - result = re.search(r'<h1.*?>(?P<title>.*)</h1>', webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - video_title = result.group('title').strip() - - # Get the video date - result = re.search(r'Date:</label>(?P<date>.*) </li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract video date') - upload_date = None - else: - upload_date = unified_strdate(result.group('date').strip()) - - # Get the video uploader - result = re.search(r'Submitted:</label>(?P<uploader>.*)</li>', webpage) - if result is None: - self._downloader.report_warning(u'unable to extract uploader') - video_uploader = None - else: - video_uploader = result.group('uploader').strip() - video_uploader = clean_html( video_uploader ) - - # Get all of the formats available - DOWNLOAD_LIST_RE = r'(?s)<ul class="downloadList">(?P<download_list>.*?)</ul>' - result = re.search(DOWNLOAD_LIST_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract download list') - download_list_html = result.group('download_list').strip() - - # Get all of the links from the page - LINK_RE = r'(?s)<a href="(?P<url>[^"]+)">' - links = re.findall(LINK_RE, download_list_html) - if(len(links) == 0): - raise ExtractorError(u'ERROR: no known formats available for video') - - self.to_screen(u'Links found: %d' % len(links)) - - formats = [] - for link in links: - - # A link looks like this: - # http://cdn1.download.youporn.phncdn.com/201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4?nvb=20121113051249&nva=20121114051249&ir=1200&sr=1200&hash=014b882080310e95fb6a0 - # A path looks like this: - # /201210/31/8004515/480p_370k_8004515/YouPorn%20-%20Nubile%20Films%20The%20Pillow%20Fight.mp4 - video_url = unescapeHTML( link ) - path = compat_urllib_parse_urlparse( video_url ).path - extension = os.path.splitext( path )[1][1:] - format = path.split('/')[4].split('_')[:2] - size = format[0] - bitrate = format[1] - format = "-".join( format ) - title = u'%s-%s-%s' % (video_title, size, bitrate) - - formats.append({ - 'id': video_id, - 'url': video_url, - 'uploader': video_uploader, - 'upload_date': upload_date, - 'title': title, - 'ext': extension, - 'format': format, - 'thumbnail': None, - 'description': None, - 'player_url': None - }) - - if self._downloader.params.get('listformats', None): - self._print_formats(formats) - return - - req_format = self._downloader.params.get('format', None) - self.to_screen(u'Format: %s' % req_format) - - if req_format is None or req_format == 'best': - return [formats[0]] - elif req_format == 'worst': - return [formats[-1]] - elif req_format in ('-1', 'all'): - return formats - else: - format = self._specific( req_format, formats ) - if result is None: - raise ExtractorError(u'Requested format not available') - return [format] - - - -class PornotubeIE(InfoExtractor): - """Information extractor for pornotube.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?pornotube\.com(/c/(?P<channel>[0-9]+))?(/m/(?P<videoid>[0-9]+))(/(?P<title>.+))$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('videoid') - video_title = mobj.group('title') - - # Get webpage content - webpage = self._download_webpage(url, video_id) - - # Get the video URL - VIDEO_URL_RE = r'url: "(?P<url>http://video[0-9].pornotube.com/.+\.flv)",' - result = re.search(VIDEO_URL_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video url') - video_url = compat_urllib_parse.unquote(result.group('url')) - - #Get the uploaded date - VIDEO_UPLOADED_RE = r'<div class="video_added_by">Added (?P<date>[0-9\/]+) by' - result = re.search(VIDEO_UPLOADED_RE, webpage) - if result is None: - raise ExtractorError(u'Unable to extract video title') - upload_date = unified_strdate(result.group('date')) - - info = {'id': video_id, - 'url': video_url, - 'uploader': None, - 'upload_date': upload_date, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv'} - - return [info] - -class YouJizzIE(InfoExtractor): - """Information extractor for youjizz.com.""" - _VALID_URL = r'^(?:https?://)?(?:\w+\.)?youjizz\.com/videos/(?P<videoid>[^.]+).html$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('videoid') - - # Get webpage content - webpage = self._download_webpage(url, video_id) - - # Get the video title - result = re.search(r'<title>(?P<title>.*)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video title') - video_title = result.group('title').strip() - - # Get the embed page - result = re.search(r'https?://www.youjizz.com/videos/embed/(?P[0-9]+)', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract embed page') - - embed_page_url = result.group(0).strip() - video_id = result.group('videoid') - - webpage = self._download_webpage(embed_page_url, video_id) - - # Get the video URL - result = re.search(r'so.addVariable\("file",encodeURIComponent\("(?P[^"]+)"\)\);', webpage) - if result is None: - raise ExtractorError(u'ERROR: unable to extract video url') - video_url = result.group('source') - - info = {'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'format': 'flv', - 'player_url': embed_page_url} - - return [info] - -class EightTracksIE(InfoExtractor): - IE_NAME = '8tracks' - _VALID_URL = r'https?://8tracks.com/(?P[^/]+)/(?P[^/#]+)(?:#.*)?$' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - playlist_id = mobj.group('id') - - webpage = self._download_webpage(url, playlist_id) - - m = re.search(r"PAGE.mix = (.*?);\n", webpage, flags=re.DOTALL) - if not m: - raise ExtractorError(u'Cannot find trax information') - json_like = m.group(1) - data = json.loads(json_like) - - session = str(random.randint(0, 1000000000)) - mix_id = data['id'] - track_count = data['tracks_count'] - first_url = 'http://8tracks.com/sets/%s/play?player=sm&mix_id=%s&format=jsonh' % (session, mix_id) - next_url = first_url - res = [] - for i in itertools.count(): - api_json = self._download_webpage(next_url, playlist_id, - note=u'Downloading song information %s/%s' % (str(i+1), track_count), - errnote=u'Failed to download song information') - api_data = json.loads(api_json) - track_data = api_data[u'set']['track'] - info = { - 'id': track_data['id'], - 'url': track_data['track_file_stream_url'], - 'title': track_data['performer'] + u' - ' + track_data['name'], - 'raw_title': track_data['name'], - 'uploader_id': data['user']['login'], - 'ext': 'm4a', - } - res.append(info) - if api_data['set']['at_last_track']: - break - next_url = 'http://8tracks.com/sets/%s/next?player=sm&mix_id=%s&format=jsonh&track_id=%s' % (session, mix_id, track_data['id']) - return res - -class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' - IE_NAME = u'keek' - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - video_url = u'http://cdn.keek.com/keek/video/%s' % video_id - thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id - webpage = self._download_webpage(url, video_id) - m = re.search(r'[\S\s]+?

(?P.+?)

', webpage) - uploader = clean_html(m.group('uploader')) - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'thumbnail': thumbnail, - 'uploader': uploader - } - return [info] - -class TEDIE(InfoExtractor): - _VALID_URL=r'''http://www\.ted\.com/ - ( - ((?Pplaylists)/(?P\d+)) # We have a playlist - | - ((?Ptalks)) # We have a simple talk - ) - (/lang/(.*?))? # The url may contain the language - /(?P\w+) # Here goes the name and then ".html" - ''' - - @classmethod - def suitable(cls, url): - """Receives a URL and returns True if suitable for this IE.""" - return re.match(cls._VALID_URL, url, re.VERBOSE) is not None - - def _real_extract(self, url): - m=re.match(self._VALID_URL, url, re.VERBOSE) - if m.group('type_talk'): - return [self._talk_info(url)] - else : - playlist_id=m.group('playlist_id') - name=m.group('name') - self.to_screen(u'Getting info of playlist %s: "%s"' % (playlist_id,name)) - return [self._playlist_videos_info(url,name,playlist_id)] - - def _talk_video_link(self,mediaSlug): - '''Returns the video link for that mediaSlug''' - return 'http://download.ted.com/talks/%s.mp4' % mediaSlug - - def _playlist_videos_info(self,url,name,playlist_id=0): - '''Returns the videos of the playlist''' - video_RE=r''' -
(?P.+?)

' - webpage=self._download_webpage(url, playlist_id, 'Downloading playlist webpage') - m_videos=re.finditer(video_RE,webpage,re.VERBOSE) - m_names=re.finditer(video_name_RE,webpage) - - playlist_RE = r'div class="headline">(\s*?)

(\s*?)(?P.*?)' - m_playlist = re.search(playlist_RE, webpage) - playlist_title = m_playlist.group('playlist_title') - - playlist_entries = [] - for m_video, m_name in zip(m_videos,m_names): - video_id=m_video.group('video_id') - talk_url='http://www.ted.com%s' % m_name.group('talk_url') - playlist_entries.append(self.url_result(talk_url, 'TED')) - return self.playlist_result(playlist_entries, playlist_id = playlist_id, playlist_title = playlist_title) - - def _talk_info(self, url, video_id=0): - """Return the video for the talk in the url""" - m=re.match(self._VALID_URL, url,re.VERBOSE) - videoName=m.group('name') - webpage=self._download_webpage(url, video_id, 'Downloading \"%s\" page' % videoName) - # If the url includes the language we get the title translated - title_RE=r'(?P.*)</span>' - title=re.search(title_RE, webpage).group('title') - info_RE=r'''<script\ type="text/javascript">var\ talkDetails\ =(.*?) - "id":(?P<videoID>[\d]+).*? - "mediaSlug":"(?P<mediaSlug>[\w\d]+?)"''' - thumb_RE=r'</span>[\s.]*</div>[\s.]*<img src="(?P<thumbnail>.*?)"' - thumb_match=re.search(thumb_RE,webpage) - info_match=re.search(info_RE,webpage,re.VERBOSE) - video_id=info_match.group('videoID') - mediaSlug=info_match.group('mediaSlug') - video_url=self._talk_video_link(mediaSlug) - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'thumbnail': thumb_match.group('thumbnail') - } - return info - -class MySpassIE(InfoExtractor): - _VALID_URL = r'http://www.myspass.de/.*' - - def _real_extract(self, url): - META_DATA_URL_TEMPLATE = 'http://www.myspass.de/myspass/includes/apps/video/getvideometadataxml.php?id=%s' - - # video id is the last path element of the URL - # usually there is a trailing slash, so also try the second but last - url_path = compat_urllib_parse_urlparse(url).path - url_parent_path, video_id = os.path.split(url_path) - if not video_id: - _, video_id = os.path.split(url_parent_path) - - # get metadata - metadata_url = META_DATA_URL_TEMPLATE % video_id - metadata_text = self._download_webpage(metadata_url, video_id) - metadata = xml.etree.ElementTree.fromstring(metadata_text.encode('utf-8')) - - # extract values from metadata - url_flv_el = metadata.find('url_flv') - if url_flv_el is None: - raise ExtractorError(u'Unable to extract download url') - video_url = url_flv_el.text - extension = os.path.splitext(video_url)[1][1:] - title_el = metadata.find('title') - if title_el is None: - raise ExtractorError(u'Unable to extract title') - title = title_el.text - format_id_el = metadata.find('format_id') - if format_id_el is None: - format = ext - else: - format = format_id_el.text - description_el = metadata.find('description') - if description_el is not None: - description = description_el.text - else: - description = None - imagePreview_el = metadata.find('imagePreview') - if imagePreview_el is not None: - thumbnail = imagePreview_el.text - else: - thumbnail = None - info = { - 'id': video_id, - 'url': video_url, - 'title': title, - 'ext': extension, - 'format': format, - 'thumbnail': thumbnail, - 'description': description - } - return [info] - -class SpiegelIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?spiegel\.de/video/[^/]*-(?P<videoID>[0-9]+)(?:\.html)?(?:#.*)?$' - - def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') - - webpage = self._download_webpage(url, video_id) - m = re.search(r'<div class="spVideoTitle">(.*?)</div>', webpage) - if not m: - raise ExtractorError(u'Cannot find title') - video_title = unescapeHTML(m.group(1)) - - xml_url = u'http://video2.spiegel.de/flash/' + video_id + u'.xml' - xml_code = self._download_webpage(xml_url, video_id, - note=u'Downloading XML', errnote=u'Failed to download XML') - - idoc = xml.etree.ElementTree.fromstring(xml_code) - last_type = idoc[-1] - filename = last_type.findall('./filename')[0].text - duration = float(last_type.findall('./duration')[0].text) - - video_url = 'http://video2.spiegel.de/flash/' + filename - video_ext = filename.rpartition('.')[2] - info = { - 'id': video_id, - 'url': video_url, - 'ext': video_ext, - 'title': video_title, - 'duration': duration, - } - return [info] - -class LiveLeakIE(InfoExtractor): - - _VALID_URL = r'^(?:http?://)?(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<video_id>[\w_]+)(?:.*)' - IE_NAME = u'liveleak' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('video_id') - - webpage = self._download_webpage(url, video_id) - - m = re.search(r'file: "(.*?)",', webpage) - if not m: - raise ExtractorError(u'Unable to find video url') - video_url = m.group(1) - - m = re.search(r'<meta property="og:title" content="(?P<title>.*?)"', webpage) - if not m: - raise ExtractorError(u'Cannot find video title') - title = unescapeHTML(m.group('title')).replace('LiveLeak.com -', '').strip() - - m = re.search(r'<meta property="og:description" content="(?P<desc>.*?)"', webpage) - if m: - desc = unescapeHTML(m.group('desc')) - else: - desc = None - - m = re.search(r'By:.*?(\w+)</a>', webpage) - if m: - uploader = clean_html(m.group(1)) - else: - uploader = None - - info = { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': desc, - 'uploader': uploader - } - - return [info] - -class ARDIE(InfoExtractor): - _VALID_URL = r'^(?:https?://)?(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[^/\?]+)(?:\?.*)?' - _TITLE = r'<h1(?: class="boxTopHeadline")?>(?P<title>.*)</h1>' - _MEDIA_STREAM = r'mediaCollection\.addMediaStream\((?P<media_type>\d+), (?P<quality>\d+), "(?P<rtmp_url>[^"]*)", "(?P<video_url>[^"]*)", "[^"]*"\)' - - def _real_extract(self, url): - # determine video id from url - m = re.match(self._VALID_URL, url) - - numid = re.search(r'documentId=([0-9]+)', url) - if numid: - video_id = numid.group(1) - else: - video_id = m.group('video_id') - - # determine title and media streams from webpage - html = self._download_webpage(url, video_id) - title = re.search(self._TITLE, html).group('title') - streams = [m.groupdict() for m in re.finditer(self._MEDIA_STREAM, html)] - if not streams: - assert '"fsk"' in html - raise ExtractorError(u'This video is only available after 8:00 pm') - - # choose default media type and highest quality for now - stream = max([s for s in streams if int(s["media_type"]) == 0], - key=lambda s: int(s["quality"])) - - # there's two possibilities: RTMP stream or HTTP download - info = {'id': video_id, 'title': title, 'ext': 'mp4'} - if stream['rtmp_url']: - self.to_screen(u'RTMP download detected') - assert stream['video_url'].startswith('mp4:') - info["url"] = stream["rtmp_url"] - info["play_path"] = stream['video_url'] - else: - assert stream["video_url"].endswith('.mp4') - info["url"] = stream["video_url"] - return [info] - -class TumblrIE(InfoExtractor): - _VALID_URL = r'http://(?P<blog_name>.*?)\.tumblr\.com/((post)|(video))/(?P<id>\d*)/(.*?)' - - def _real_extract(self, url): - m_url = re.match(self._VALID_URL, url) - video_id = m_url.group('id') - blog = m_url.group('blog_name') - - url = 'http://%s.tumblr.com/post/%s/' % (blog, video_id) - webpage = self._download_webpage(url, video_id) - - re_video = r'src=\\x22(?P<video_url>http://%s\.tumblr\.com/video_file/%s/(.*?))\\x22 type=\\x22video/(?P<ext>.*?)\\x22' % (blog, video_id) - video = re.search(re_video, webpage) - if video is None: - self.to_screen("No video found") - return [] - video_url = video.group('video_url') - ext = video.group('ext') - - re_thumb = r'posters(.*?)\[\\x22(?P<thumb>.*?)\\x22' # We pick the first poster - thumb = re.search(re_thumb, webpage).group('thumb').replace('\\', '') - - # The only place where you can get a title, it's not complete, - # but searching in other places doesn't work for all videos - re_title = r'<title>(?P<title>.*?)' - title = unescapeHTML(re.search(re_title, webpage, re.DOTALL).group('title')) - - return [{'id': video_id, - 'url': video_url, - 'title': title, - 'thumbnail': thumb, - 'ext': ext - }] - -class BandcampIE(InfoExtractor): - _VALID_URL = r'http://.*?\.bandcamp\.com/track/(?P.*)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - title = mobj.group('title') - webpage = self._download_webpage(url, title) - # We get the link to the free download page - m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) - if m_download is None: - raise ExtractorError(u'No free songs founded') - - download_link = m_download.group(1) - id = re.search(r'var TralbumData = {(.*?)id: (?P<id>\d*?)$', - webpage, re.MULTILINE|re.DOTALL).group('id') - - download_webpage = self._download_webpage(download_link, id, - 'Downloading free downloads page') - # We get the dictionary of the track from some javascrip code - info = re.search(r'items: (.*?),$', - download_webpage, re.MULTILINE).group(1) - info = json.loads(info)[0] - # We pick mp3-320 for now, until format selection can be easily implemented. - mp3_info = info[u'downloads'][u'mp3-320'] - # If we try to use this url it says the link has expired - initial_url = mp3_info[u'url'] - re_url = r'(?P<server>http://(.*?)\.bandcamp\.com)/download/track\?enc=mp3-320&fsig=(?P<fsig>.*?)&id=(?P<id>.*?)&ts=(?P<ts>.*)$' - m_url = re.match(re_url, initial_url) - #We build the url we will use to get the final track url - # This url is build in Bandcamp in the script download_bunde_*.js - request_url = '%s/statdownload/track?enc=mp3-320&fsig=%s&id=%s&ts=%s&.rand=665028774616&.vrs=1' % (m_url.group('server'), m_url.group('fsig'), id, m_url.group('ts')) - final_url_webpage = self._download_webpage(request_url, id, 'Requesting download url') - # If we could correctly generate the .rand field the url would be - #in the "download_url" key - final_url = re.search(r'"retry_url":"(.*?)"', final_url_webpage).group(1) - - track_info = {'id':id, - 'title' : info[u'title'], - 'ext' : 'mp3', - 'url' : final_url, - 'thumbnail' : info[u'thumb_url'], - 'uploader' : info[u'artist'] - } - - return [track_info] - -class RedTubeIE(InfoExtractor): - """Information Extractor for redtube""" - _VALID_URL = r'(?:http://)?(?:www\.)?redtube\.com/(?P<id>[0-9]+)' - - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - - video_id = mobj.group('id') - video_extension = 'mp4' - webpage = self._download_webpage(url, video_id) - self.report_extraction(video_id) - mobj = re.search(r'<source src="'+'(.+)'+'" type="video/mp4">',webpage) - - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - - video_url = mobj.group(1) - mobj = re.search('<h1 class="videoTitle slidePanelMovable">(.+)</h1>',webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, - }] - -class InaIE(InfoExtractor): - """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*' - - def _real_extract(self,url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - mrss_url='http://player.ina.fr/notices/%s.mrss' % video_id - video_extension = 'mp4' - webpage = self._download_webpage(mrss_url, video_id) - - mobj = re.search(r'<media:player url="(?P<mp4url>http://mp4.ina.fr/[^"]+\.mp4)', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - video_url = mobj.group(1) - - mobj = re.search(r'<title><!\[CDATA\[(?P<titre>.*?)]]>', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': video_extension, - 'title': video_title, - }] - -class HowcastIE(InfoExtractor): - """Information Extractor for Howcast.com""" - _VALID_URL = r'(?:https?://)?(?:www\.)?howcast\.com/videos/(?P\d+)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage_url = 'http://www.howcast.com/videos/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - - self.report_extraction(video_id) - - mobj = re.search(r'\'?file\'?: "(http://mobile-media\.howcast\.com/[0-9]+\.mp4)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video URL') - video_url = mobj.group(1) - - mobj = re.search(r'\w+)' - - def _real_extract(self, url): - - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - webpage_url = 'https://vine.co/v/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - - self.report_extraction(video_id) - - mobj = re.search(r'.*?

(.+?)

', webpage, re.DOTALL) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader') - uploader = mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'uploader': uploader, - }] - -class FlickrIE(InfoExtractor): - """Information Extractor for Flickr videos""" - _VALID_URL = r'(?:https?://)?(?:www\.)?flickr\.com/photos/(?P[\w\-_@]+)/(?P\d+).*' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - video_id = mobj.group('id') - video_uploader_id = mobj.group('uploader_id') - webpage_url = 'http://www.flickr.com/photos/' + video_uploader_id + '/' + video_id - webpage = self._download_webpage(webpage_url, video_id) - - mobj = re.search(r"photo_secret: '(\w+)'", webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract video secret') - secret = mobj.group(1) - - first_url = 'https://secure.flickr.com/apps/video/video_mtl_xml.gne?v=x&photo_id=' + video_id + '&secret=' + secret + '&bitrate=700&target=_self' - first_xml = self._download_webpage(first_url, video_id, 'Downloading first data webpage') - - mobj = re.search(r'(\d+-\d+)', first_xml) - if mobj is None: - raise ExtractorError(u'Unable to extract node_id') - node_id = mobj.group(1) - - second_url = 'https://secure.flickr.com/video_playlist.gne?node_id=' + node_id + '&tech=flash&mode=playlist&bitrate=700&secret=' + secret + '&rd=video.yahoo.com&noad=1' - second_xml = self._download_webpage(second_url, video_id, 'Downloading second data webpage') - - self.report_extraction(video_id) - - mobj = re.search(r'.*)' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - url_title = mobj.group('url_title') - webpage = self._download_webpage(url, url_title) - - mobj = re.search(r'
(.*?)', data) - if mobj is None: - raise ExtractorError(u'Unable to extract video url') - video_url = mobj.group(1) - - return [{ - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': video_title, - 'thumbnail': thumbnail, - 'description': description, - }] - -def gen_extractors(): - """ Return a list of an instance of every supported extractor. - The order does matter; the first extractor matched is the one handling the URL. - """ - return [ - YoutubePlaylistIE(), - YoutubeChannelIE(), - YoutubeUserIE(), - YoutubeSearchIE(), - YoutubeIE(), - MetacafeIE(), - DailymotionIE(), - GoogleSearchIE(), - PhotobucketIE(), - YahooIE(), - YahooSearchIE(), - DepositFilesIE(), - FacebookIE(), - BlipTVUserIE(), - BlipTVIE(), - VimeoIE(), - MyVideoIE(), - ComedyCentralIE(), - EscapistIE(), - CollegeHumorIE(), - XVideosIE(), - SoundcloudSetIE(), - SoundcloudIE(), - InfoQIE(), - MixcloudIE(), - StanfordOpenClassroomIE(), - MTVIE(), - YoukuIE(), - XNXXIE(), - YouJizzIE(), - PornotubeIE(), - YouPornIE(), - GooglePlusIE(), - ArteTvIE(), - NBAIE(), - WorldStarHipHopIE(), - JustinTVIE(), - FunnyOrDieIE(), - SteamIE(), - UstreamIE(), - RBMARadioIE(), - EightTracksIE(), - KeekIE(), - TEDIE(), - MySpassIE(), - SpiegelIE(), - LiveLeakIE(), - ARDIE(), - TumblrIE(), - BandcampIE(), - RedTubeIE(), - InaIE(), - HowcastIE(), - VineIE(), - FlickrIE(), - TeamcocoIE(), - GenericIE() - ] - -def get_info_extractor(ie_name): - """Returns the info extractor class with the given ie_name""" - return globals()[ie_name+'IE'] +from .extractor.common import InfoExtractor, SearchInfoExtractor +from .extractor import gen_extractors, get_info_extractor