X-Git-Url: https://git.rapsys.eu/youtubedl/blobdiff_plain/509eeaaa67b2f901752c50fc6edf41954dbe3085..1f17a37b9b95db09a420a1f52cf18723ce4eb8b5:/youtube_dl/extractor/wrzuta.py diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py index 34dd6d9..0f53f1b 100644 --- a/youtube_dl/extractor/wrzuta.py +++ b/youtube_dl/extractor/wrzuta.py @@ -1,12 +1,14 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals import re from .common import InfoExtractor from ..utils import ( + ExtractorError, int_or_none, qualities, + remove_start, ) @@ -26,16 +28,17 @@ class WrzutaIE(InfoExtractor): 'uploader_id': 'laboratoriumdextera', 'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd', }, + 'skip': 'Redirected to wrzuta.pl', }, { - 'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad', - 'md5': '1e546a18e1c22ac6e9adce17b8961ff5', + 'url': 'http://vexling.wrzuta.pl/audio/01xBFabGXu6/james_horner_-_into_the_na_39_vi_world_bonus', + 'md5': 'f80564fb5a2ec6ec59705ae2bf2ba56d', 'info_dict': { - 'id': '9oXJqdcndqv', - 'ext': 'ogg', - 'title': 'David Guetta & Showtek ft. 
class WrzutaPlaylistIE(InfoExtractor):
    """Extract all entries of a wrzuta.pl playlist.

    The playlist is split in two parts: the first is present directly in
    the playlist webpage, the second is loaded on demand by an AJAX call
    and rendered from its response.  Extraction therefore:

    * reads the total playlist size from the webpage,
    * collects every entry link present in the webpage,
    * if fewer entries than the announced total were found, queries the
      AJAX endpoint to collect the remaining entries.
    """

    IE_NAME = 'wrzuta.pl:playlist'
    # Both groups must be named: _real_extract reads them via
    # mobj.group('uploader') and mobj.group('id').
    _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/playlista/(?P<id>[0-9a-zA-Z]+)'
    _TESTS = [{
        'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR/moja_muza',
        'playlist_mincount': 14,
        'info_dict': {
            'id': '7XfO4vE84iR',
            'title': 'Moja muza',
        },
    }, {
        'url': 'http://heroesf70.wrzuta.pl/playlista/6Nj3wQHx756/lipiec_-_lato_2015_muzyka_swiata',
        'playlist_mincount': 144,
        'info_dict': {
            'id': '6Nj3wQHx756',
            'title': 'Lipiec - Lato 2015 Muzyka Świata',
        },
    }, {
        'url': 'http://miromak71.wrzuta.pl/playlista/7XfO4vE84iR',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Build the playlist result for the playlist at ``url``.

        If no playlist size can be read from the webpage, the returned
        playlist has no entries.
        """
        mobj = re.match(self._VALID_URL, url)
        playlist_id = mobj.group('id')
        uploader = mobj.group('uploader')

        webpage = self._download_webpage(url, playlist_id)

        # NOTE(review): the element names in the HTML patterns below were
        # lost when this source was extracted, so the patterns match on the
        # class attribute of any tag -- confirm against the site markup.
        # Two layouts are tried: a "current/total" counter and a plain
        # total counter.
        playlist_size = int_or_none(self._html_search_regex(
            (r'<[^>]+class=["\']playlist-counter["\'][^>]*>\d+/(\d+)',
             r'<[^>]+class=["\']all-counter["\'][^>]*>(.+?)</'),
            webpage, 'playlist size', default=None))

        playlist_title = remove_start(
            self._og_search_title(webpage), 'Playlista: ')

        entries = []
        if playlist_size:
            # Group 1 only captures the quote character so that \1 can
            # require a matching closing quote; group 2 is the entry URL.
            entries = [
                self.url_result(entry_url)
                for _, entry_url in re.findall(
                    r'<[^>]+href=(["\'])(http.+?)\1[^>]+class=["\']playlist-file-page',
                    webpage)]
            if playlist_size > len(entries):
                # The webpage carries only part of the playlist; the rest
                # is served by the AJAX endpoint.
                playlist_content = self._download_json(
                    'http://%s.wrzuta.pl/xhr/get_playlist_offset/%s' % (uploader, playlist_id),
                    playlist_id,
                    'Downloading playlist JSON',
                    'Unable to download playlist JSON')
                entries.extend([
                    self.url_result(entry['filelink'])
                    for entry in playlist_content.get('files', [])
                    if entry.get('filelink')])

        return self.playlist_result(entries, playlist_id, playlist_title)