From: Rogério Brito Date: Thu, 9 Mar 2017 01:53:09 +0000 (-0300) Subject: New upstream version 2017.03.07 X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/1d3fd83f473663fce3e0a10303473a38d80cc3d0?hp=4e090bc3ceacc4e3cd464d12ea97700e3acad37d New upstream version 2017.03.07 --- diff --git a/ChangeLog b/ChangeLog index add8a67..601aad9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,98 @@ +version 2017.03.07 + +Core +* Metadata are now added after conversion (#5594) + +Extractors +* [soundcloud] Update client id (#12376) +* [openload] Fix extraction (#10408, #12357) + + +version 2017.03.06 + +Core ++ [utils] Process bytestrings in urljoin (#12369) +* [extractor/common] Improve height extraction and extract bitrate +* [extractor/common] Move jwplayer formats extraction in separate method ++ [external:ffmpeg] Limit test download size to 10KiB (#12362) + +Extractors ++ [drtv] Add geo countries to GeoRestrictedError ++ [drtv:live] Bypass geo restriction ++ [tunepk] Add extractor (#12197, #12243) + + +version 2017.03.05 + +Extractors ++ [twitch] Add basic support for two-factor authentication (#11974) ++ [vier] Add support for vijf.be (#12304) ++ [redbulltv] Add support for redbull.tv (#3919, #11948) +* [douyutv] Switch to the PC API to escape the 5-min limitation (#12316) ++ [generic] Add support for rutube embeds ++ [rutube] Relax URL regular expression ++ [vrak] Add support for vrak.tv (#11452) ++ [brightcove:new] Add ability to smuggle geo_countries into URL ++ [brightcove:new] Raise GeoRestrictedError +* [go] Relax URL regular expression (#12341) +* [24video] Use original host for requests (#12339) +* [ruutu] Disable DASH formats (#12322) + + +version 2017.03.02 + +Core ++ [adobepass] Add support for Charter Spectrum (#11465) +* [YoutubeDL] Don't sanitize identifiers in output template (#12317) + +Extractors +* [facebook] Fix extraction (#12323, #12330) +* [youtube] Mark errors about rental videos as expected (#12324) ++ [npo] Add support for audio +* 
[npo] Adapt to app.php API (#12311, #12320) + + +version 2017.02.28 + +Core ++ [utils] Add bytes_to_long and long_to_bytes ++ [utils] Add pkcs1pad ++ [aes] Add aes_cbc_encrypt + +Extractors ++ [azmedien:showplaylist] Add support for show playlists (#12160) ++ [youtube:playlist] Recognize another playlist pattern (#11928, #12286) ++ [daisuki] Add support for daisuki.net (#2486, #3186, #4738, #6175, #7776, + #10060) +* [douyu] Fix extraction (#12301) + + +version 2017.02.27 + +Core +* [downloader/common] Limit displaying 2 digits after decimal point in sleep + interval message (#12183) ++ [extractor/common] Add preference to _parse_html5_media_entries + +Extractors ++ [npo] Add support for zapp.nl ++ [npo] Add support for hetklokhuis.nl (#12293) +- [scivee] Remove extractor (#9315) ++ [cda] Decode download URL (#12255) ++ [crunchyroll] Improve uploader extraction (#12267) ++ [youtube] Raise GeoRestrictedError ++ [dailymotion] Raise GeoRestrictedError ++ [mdr] Recognize more URL patterns (#12169) ++ [tvigle] Raise GeoRestrictedError +* [vevo] Fix extraction for videos with the new streams/streamsV3 format + (#11719) ++ [freshlive] Add support for freshlive.tv (#12175) ++ [xhamster] Capture and output videoClosed error (#12263) ++ [etonline] Add support for etonline.com (#12236) ++ [njpwworld] Add support for njpwworld.com (#11561) +* [amcnetworks] Relax URL regular expression (#12127) + + version 2017.02.24.1 Extractors diff --git a/docs/supportedsites.md b/docs/supportedsites.md index f973973..85c59ca 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -78,6 +78,7 @@ - **awaan:video** - **AZMedien**: AZ Medien videos - **AZMedienPlaylist**: AZ Medien playlists + - **AZMedienShowPlaylist**: AZ Medien show playlists - **Azubu** - **AzubuLive** - **BaiduVideo**: 百度视频 @@ -191,6 +192,8 @@ - **dailymotion:playlist** - **dailymotion:user** - **DailymotionCloud** + - **Daisuki** + - **DaisukiPlaylist** - **daum.net** - **daum.net:clip** - 
**daum.net:playlist** @@ -239,6 +242,7 @@ - **ESPN** - **ESPNArticle** - **EsriVideo** + - **ETOnline** - **Europa** - **EveryonesMixtape** - **ExpoTV** @@ -274,6 +278,7 @@ - **francetvinfo.fr** - **Freesound** - **freespeech.org** + - **FreshLive** - **Funimation** - **FunnyOrDie** - **Fusion** @@ -310,6 +315,7 @@ - **HellPorno** - **Helsinki**: helsinki.fi - **HentaiStigma** + - **hetklokhuis** - **hgtv.com:show** - **HistoricFilms** - **history:topic**: History.com Topic @@ -511,6 +517,7 @@ - **Nintendo** - **njoy**: N-JOY - **njoy:embed** + - **NJPWWorld**: 新日本プロレスワールド - **NobelPrize** - **Noco** - **Normalboots** @@ -619,6 +626,7 @@ - **RaiTV** - **RBMARadio** - **RDS**: RDS.ca + - **RedBullTV** - **RedTube** - **RegioTV** - **RENTV** @@ -666,7 +674,6 @@ - **savefrom.net** - **SBS**: sbs.com.au - **schooltv** - - **SciVee** - **screen.yahoo:search**: Yahoo screen search - **Screencast** - **ScreencastOMatic** @@ -791,6 +798,7 @@ - **tunein:program** - **tunein:station** - **tunein:topic** + - **TunePk** - **Turbo** - **Tutv** - **tv.dfb.de** @@ -910,6 +918,7 @@ - **VoxMedia** - **Vporn** - **vpro**: npo.nl and ntr.nl + - **Vrak** - **VRT** - **vube**: Vube.com - **VuClip** diff --git a/test/test_aes.py b/test/test_aes.py index 54078a6..78a2875 100644 --- a/test/test_aes.py +++ b/test/test_aes.py @@ -8,7 +8,7 @@ import sys import unittest sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) -from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_decrypt_text +from youtube_dl.aes import aes_decrypt, aes_encrypt, aes_cbc_decrypt, aes_cbc_encrypt, aes_decrypt_text from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes import base64 @@ -34,6 +34,13 @@ class TestAES(unittest.TestCase): decrypted = intlist_to_bytes(aes_cbc_decrypt(data, self.key, self.iv)) self.assertEqual(decrypted.rstrip(b'\x08'), self.secret_msg) + def test_cbc_encrypt(self): + data = bytes_to_intlist(self.secret_msg) + encrypted = 
intlist_to_bytes(aes_cbc_encrypt(data, self.key, self.iv)) + self.assertEqual( + encrypted, + b"\x97\x92+\xe5\x0b\xc3\x18\x91ky9m&\xb3\xb5@\xe6'\xc2\x96.\xc8u\x88\xab9-[\x9e|\xf1\xcd") + def test_decrypt_text(self): password = intlist_to_bytes(self.key).decode('utf-8') encrypted = base64.b64encode( diff --git a/test/test_utils.py b/test/test_utils.py index 3cdb21d..173c495 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -52,6 +52,7 @@ from youtube_dl.utils import ( parse_filesize, parse_count, parse_iso8601, + pkcs1pad, read_batch_urls, sanitize_filename, sanitize_path, @@ -454,6 +455,9 @@ class TestUtil(unittest.TestCase): def test_urljoin(self): self.assertEqual(urljoin('http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(b'http://foo.de/', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin('http://foo.de/', b'/a/b/c.txt'), 'http://foo.de/a/b/c.txt') + self.assertEqual(urljoin(b'http://foo.de/', b'/a/b/c.txt'), 'http://foo.de/a/b/c.txt') self.assertEqual(urljoin('//foo.de/', '/a/b/c.txt'), '//foo.de/a/b/c.txt') self.assertEqual(urljoin('http://foo.de/', 'a/b/c.txt'), 'http://foo.de/a/b/c.txt') self.assertEqual(urljoin('http://foo.de', '/a/b/c.txt'), 'http://foo.de/a/b/c.txt') @@ -1104,6 +1108,14 @@ The first line ohdave_rsa_encrypt(b'aa111222', e, N), '726664bd9a23fd0c70f9f1b84aab5e3905ce1e45a584e9cbcf9bcc7510338fc1986d6c599ff990d923aa43c51c0d9013cd572e13bc58f4ae48f2ed8c0b0ba881') + def test_pkcs1pad(self): + data = [1, 2, 3] + padded_data = pkcs1pad(data, 32) + self.assertEqual(padded_data[:2], [0, 2]) + self.assertEqual(padded_data[28:], [0, 1, 2, 3]) + + self.assertRaises(ValueError, pkcs1pad, data, 8) + def test_encode_base_n(self): self.assertEqual(encode_base_n(0, 30), '0') self.assertEqual(encode_base_n(80, 30), '2k') diff --git a/youtube-dl b/youtube-dl index bc236d2..e815236 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube_dl/YoutubeDL.py 
b/youtube_dl/YoutubeDL.py index f725456..13a3a90 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -616,7 +616,7 @@ class YoutubeDL(object): sanitize = lambda k, v: sanitize_filename( compat_str(v), restricted=self.params.get('restrictfilenames'), - is_id=(k == 'id')) + is_id=(k == 'id' or k.endswith('_id'))) template_dict = dict((k, v if isinstance(v, compat_numeric_types) else sanitize(k, v)) for k, v in template_dict.items() if v is not None and not isinstance(v, (list, tuple, dict))) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 0c401ba..ad5f13d 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -242,14 +242,11 @@ def _real_main(argv=None): # PostProcessors postprocessors = [] - # Add the metadata pp first, the other pps will copy it if opts.metafromtitle: postprocessors.append({ 'key': 'MetadataFromTitle', 'titleformat': opts.metafromtitle }) - if opts.addmetadata: - postprocessors.append({'key': 'FFmpegMetadata'}) if opts.extractaudio: postprocessors.append({ 'key': 'FFmpegExtractAudio', @@ -279,6 +276,11 @@ def _real_main(argv=None): }) if not already_have_thumbnail: opts.writethumbnail = True + # FFmpegMetadataPP should be run after FFmpegVideoConvertorPP and + # FFmpegExtractAudioPP as containers before conversion may not support + # metadata (3gp, webm, etc.) + if opts.addmetadata: + postprocessors.append({'key': 'FFmpegMetadata'}) # XAttrMetadataPP should be run after post-processors that may change file # contents if opts.xattrs: diff --git a/youtube_dl/aes.py b/youtube_dl/aes.py index b8ff454..c5bb3c4 100644 --- a/youtube_dl/aes.py +++ b/youtube_dl/aes.py @@ -60,6 +60,34 @@ def aes_cbc_decrypt(data, key, iv): return decrypted_data +def aes_cbc_encrypt(data, key, iv): + """ + Encrypt with aes in CBC mode. 
Using PKCS#7 padding + + @param {int[]} data cleartext + @param {int[]} key 16/24/32-Byte cipher key + @param {int[]} iv 16-Byte IV + @returns {int[]} encrypted data + """ + expanded_key = key_expansion(key) + block_count = int(ceil(float(len(data)) / BLOCK_SIZE_BYTES)) + + encrypted_data = [] + previous_cipher_block = iv + for i in range(block_count): + block = data[i * BLOCK_SIZE_BYTES: (i + 1) * BLOCK_SIZE_BYTES] + remaining_length = BLOCK_SIZE_BYTES - len(block) + block += [remaining_length] * remaining_length + mixed_block = xor(block, previous_cipher_block) + + encrypted_block = aes_encrypt(mixed_block, expanded_key) + encrypted_data += encrypted_block + + previous_cipher_block = encrypted_block + + return encrypted_data + + def key_expansion(data): """ Generate key schedule diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index b257e2e..0c119e4 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2760,8 +2760,10 @@ else: compat_kwargs = lambda kwargs: kwargs -compat_numeric_types = ((int, float, long, complex) if sys.version_info[0] < 3 - else (int, float, complex)) +try: + compat_numeric_types = (int, float, long, complex) +except NameError: # Python 3 + compat_numeric_types = (int, float, complex) if sys.version_info < (2, 7): diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 3dc144b..2c4470a 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -347,7 +347,10 @@ class FileDownloader(object): if min_sleep_interval: max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) - self.to_screen('[download] Sleeping %s seconds...' % sleep_interval) + self.to_screen( + '[download] Sleeping %s seconds...' 
% ( + int(sleep_interval) if sleep_interval.is_integer() + else '%.2f' % sleep_interval)) time.sleep(sleep_interval) return self.real_download(filename, info_dict) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index bdd3545..e13cf54 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -6,7 +6,10 @@ import sys import re from .common import FileDownloader -from ..compat import compat_setenv +from ..compat import ( + compat_setenv, + compat_str, +) from ..postprocessor.ffmpeg import FFmpegPostProcessor, EXT_TO_OUT_FORMATS from ..utils import ( cli_option, @@ -270,6 +273,10 @@ class FFmpegFD(ExternalFD): args += ['-rtmp_live', 'live'] args += ['-i', url, '-c', 'copy'] + + if self.params.get('test', False): + args += ['-fs', compat_str(self._TEST_FILE_SIZE)] + if protocol in ('m3u8', 'm3u8_native'): if self.params.get('hls_use_mpegts', False) or tmpfilename == '-': args += ['-f', 'mpegts'] diff --git a/youtube_dl/extractor/addanime.py b/youtube_dl/extractor/addanime.py index 55a9322..9f8a712 100644 --- a/youtube_dl/extractor/addanime.py +++ b/youtube_dl/extractor/addanime.py @@ -25,7 +25,8 @@ class AddAnimeIE(InfoExtractor): 'ext': 'mp4', 'description': 'One Piece 606', 'title': 'One Piece 606', - } + }, + 'skip': 'Video is gone', }, { 'url': 'http://add-anime.net/video/MDUGWYKNGBD8/One-Piece-687', 'only_matching': True, diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 4d655bd..d4816ab 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -36,6 +36,11 @@ MSO_INFO = { 'username_field': 'Ecom_User_ID', 'password_field': 'Ecom_Password', }, + 'Charter_Direct': { + 'name': 'Charter Spectrum', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, 'thr030': { 'name': '3 Rivers Communications' }, diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index b71d1a0..3a0ec67 100644 --- 
a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -10,7 +10,7 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?[^/]+/episode-\d+(?:-(?:[^/]+/)?|/))(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', 'md5': '', @@ -44,6 +44,12 @@ class AMCNetworksIE(ThePlatformIE): }, { 'url': 'http://www.bbcamerica.com/shows/doctor-who/full-episodes/the-power-of-the-daleks/episode-01-episode-1-color-version', 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/mama-june-from-not-to-hot/full-episode/season-01/thin-tervention', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index cbc3ed5..f4e07d9 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -5,6 +6,7 @@ import re from .common import InfoExtractor from .kaltura import KalturaIE from ..utils import ( + get_element_by_class, get_element_by_id, strip_or_none, urljoin, @@ -170,3 +172,42 @@ class AZMedienPlaylistIE(AZMedienBaseIE): 'video-title', webpage)), group='title') return self.playlist_result(entries, show_id, title) + + +class AZMedienShowPlaylistIE(AZMedienBaseIE): + IE_DESC = 'AZ Medien show playlists' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? 
+ (?: + telezueri\.ch| + telebaern\.tv| + telem1\.ch + )/ + (?: + all-episodes| + alle-episoden + )/ + (?P<id>[^/?#&]+) + ''' + + _TEST = { + 'url': 'http://www.telezueri.ch/all-episodes/astrotalk', + 'info_dict': { + 'id': 'astrotalk', + 'title': 'TeleZüri: AstroTalk - alle episoden', + 'description': 'md5:4c0f7e7d741d906004266e295ceb4a26', + }, + 'playlist_mincount': 13, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + webpage = self._download_webpage(url, playlist_id) + episodes = get_element_by_class('search-mobile-box', webpage) + entries = [self.url_result( + urljoin(url, m.group('url'))) for m in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', episodes)] + title = self._og_search_title(webpage, fatal=False) + description = self._og_search_description(webpage) + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 27685ee..66c8cb2 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -515,6 +515,9 @@ class BrightcoveNewIE(InfoExtractor): return entries def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + self._initialize_geo_bypass(smuggled_data.get('geo_countries')) + account_id, player_id, embed, video_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage( @@ -544,8 +547,10 @@ class BrightcoveNewIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id)[0] - raise ExtractorError( - json_data.get('message') or json_data['error_code'], expected=True) + message = json_data.get('message') or json_data['error_code'] + if json_data.get('error_subcode') == 'CLIENT_GEO': + self.raise_geo_restricted(msg=message) + raise ExtractorError(message, expected=True) raise title = json_data['name'].strip() diff --git 
a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index ae7af2f..1ee35b5 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import codecs import re from .common import InfoExtractor @@ -96,6 +97,10 @@ class CDAIE(InfoExtractor): if not video or 'file' not in video: self.report_warning('Unable to extract %s version information' % version) return + if video['file'].startswith('uggc'): + video['file'] = codecs.decode(video['file'], 'rot_13') + if video['file'].endswith('adc.mp4'): + video['file'] = video['file'].replace('adc.mp4', '.mp4') f = { 'url': video['file'], } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 4252d68..78dc5be 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2010,7 +2010,7 @@ class InfoExtractor(object): }) return formats - def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None): + def _parse_html5_media_entries(self, base_url, webpage, video_id, m3u8_id=None, m3u8_entry_protocol='m3u8', mpd_id=None, preference=None): def absolute_url(video_url): return compat_urlparse.urljoin(base_url, video_url) @@ -2032,7 +2032,8 @@ class InfoExtractor(object): is_plain_url = False formats = self._extract_m3u8_formats( full_url, video_id, ext='mp4', - entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id) + entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id, + preference=preference) elif ext == 'mpd': is_plain_url = False formats = self._extract_mpd_formats( @@ -2197,56 +2198,9 @@ class InfoExtractor(object): this_video_id = video_id or video_data['mediaid'] - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - if base_url: - source_url = compat_urlparse.urljoin(base_url, source_url) - source_type = source.get('type') or '' - ext = mimetype2ext(source_type) or 
determine_ext(source_url) - if source_type == 'hls' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif ext == 'mpd': - formats.extend(self._extract_mpd_formats( - source_url, this_video_id, mpd_id=mpd_id, fatal=False)) - # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 - elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - 'ext': ext, - }) - else: - height = int_or_none(source.get('height')) - if height is None: - # Often no height is provided but there is a label in - # format like 1080p. - height = int_or_none(self._search_regex( - r'^(\d{3,})[pP]$', source.get('label') or '', - 'height', default=None)) - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': height, - 'ext': ext, - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv' - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) + formats = self._parse_jwplayer_formats( + video_data['sources'], video_id=this_video_id, m3u8_id=m3u8_id, + mpd_id=mpd_id, rtmp_params=rtmp_params, base_url=base_url) self._sort_formats(formats) subtitles = {} @@ -2277,6 +2231,62 @@ class InfoExtractor(object): else: return self.playlist_result(entries) + def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, + m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + formats = [] + for source in jwplayer_sources_data: + source_url = self._proto_relative_url(source['file']) + if base_url: + source_url 
= compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=m3u8_id, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + source_url, video_id, mpd_id=mpd_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ( + 'oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, + }) + else: + height = int_or_none(source.get('height')) + if height is None: + # Often no height is provided but there is a label in + # format like "1080p", "720p SD", or 1080. + height = int_or_none(self._search_regex( + r'^(\d{3,4})[pP]?(?:\b|$)', compat_str(source.get('label') or ''), + 'height', default=None)) + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': height, + 'tbr': int_or_none(source.get('bitrate')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + return formats + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index a1fc6a7..9c6cf00 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -207,6 +207,21 @@ class 
CrunchyrollIE(CrunchyrollBaseIE): # Just test metadata extraction 'skip_download': True, }, + }, { + # make sure we can extract an uploader name that's not a link + 'url': 'http://www.crunchyroll.com/hakuoki-reimeiroku/episode-1-dawn-of-the-divine-warriors-606899', + 'info_dict': { + 'id': '606899', + 'ext': 'mp4', + 'title': 'Hakuoki Reimeiroku Episode 1 – Dawn of the Divine Warriors', + 'description': 'Ryunosuke was left to die, but Serizawa-san asked him a simple question "Do you want to live?"', + 'uploader': 'Geneon Entertainment', + 'upload_date': '20120717', + }, + 'params': { + # just test metadata extraction + 'skip_download': True, + }, }] _FORMAT_IDS = { @@ -388,8 +403,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_upload_date: video_upload_date = unified_strdate(video_upload_date) video_uploader = self._html_search_regex( - r']+href="/publisher/[^"]+"[^>]*>([^<]+)', webpage, - 'video_uploader', fatal=False) + # try looking for both an uploader that's a link and one that's not + [r']+href="/publisher/[^"]+"[^>]*>([^<]+)', r'
<div>\s*Publisher:\s*<span>\s*(.+?)\s*</span>\s*</div>
'], + webpage, 'video_uploader', fatal=False) available_fmts = [] for a, fmt in re.findall(r'(]+token=["\']showmedia\.([0-9]{3,4})p["\'][^>]+>)', webpage): diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index b312401..246efde 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -282,9 +282,14 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } def _check_error(self, info): + error = info.get('error') if info.get('error') is not None: + title = error['title'] + # See https://developer.dailymotion.com/api#access-error + if error.get('code') == 'DM007': + self.raise_geo_restricted(msg=title) raise ExtractorError( - '%s said: %s' % (self.IE_NAME, info['error']['title']), expected=True) + '%s said: %s' % (self.IE_NAME, title), expected=True) def _get_subtitles(self, video_id, webpage): try: diff --git a/youtube_dl/extractor/daisuki.py b/youtube_dl/extractor/daisuki.py new file mode 100644 index 0000000..58cc986 --- /dev/null +++ b/youtube_dl/extractor/daisuki.py @@ -0,0 +1,159 @@ +from __future__ import unicode_literals + +import base64 +import json +import random +import re + +from .common import InfoExtractor +from ..aes import ( + aes_cbc_decrypt, + aes_cbc_encrypt, +) +from ..utils import ( + bytes_to_intlist, + bytes_to_long, + clean_html, + ExtractorError, + intlist_to_bytes, + get_element_by_id, + js_to_json, + int_or_none, + long_to_bytes, + pkcs1pad, + remove_end, +) + + +class DaisukiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?daisuki\.net/[^/]+/[^/]+/[^/]+/watch\.[^.]+\.(?P\d+)\.html' + + _TEST = { + 'url': 'http://www.daisuki.net/tw/en/anime/watch.TheIdolMasterCG.11213.html', + 'info_dict': { + 'id': '11213', + 'ext': 'mp4', + 'title': '#01 Who is in the pumpkin carriage? 
- THE IDOLM@STER CINDERELLA GIRLS', + 'subtitles': { + 'mul': [{ + 'ext': 'ttml', + }], + }, + 'creator': 'BANDAI NAMCO Entertainment', + }, + 'params': { + 'skip_download': True, # AES-encrypted HLS stream + }, + } + + # The public key in PEM format can be found in clientlibs_anime_watch.min.js + _RSA_KEY = (0xc5524c25e8e14b366b3754940beeb6f96cb7e2feef0b932c7659a0c5c3bf173d602464c2df73d693b513ae06ff1be8f367529ab30bf969c5640522181f2a0c51ea546ae120d3d8d908595e4eff765b389cde080a1ef7f1bbfb07411cc568db73b7f521cedf270cbfbe0ddbc29b1ac9d0f2d8f4359098caffee6d07915020077d, 65537) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + flashvars = self._parse_json(self._search_regex( + r'(?s)var\s+flashvars\s*=\s*({.+?});', webpage, 'flashvars'), + video_id, transform_source=js_to_json) + + iv = [0] * 16 + + data = {} + for key in ('device_cd', 'mv_id', 'ss1_prm', 'ss2_prm', 'ss3_prm', 'ss_id'): + data[key] = flashvars.get(key, '') + + encrypted_rtn = None + + # Some AES keys are rejected. 
Try it with different AES keys + for idx in range(5): + aes_key = [random.randint(0, 254) for _ in range(32)] + padded_aeskey = intlist_to_bytes(pkcs1pad(aes_key, 128)) + + n, e = self._RSA_KEY + encrypted_aeskey = long_to_bytes(pow(bytes_to_long(padded_aeskey), e, n)) + init_data = self._download_json('http://www.daisuki.net/bin/bgn/init', video_id, query={ + 's': flashvars.get('s', ''), + 'c': flashvars.get('ss3_prm', ''), + 'e': url, + 'd': base64.b64encode(intlist_to_bytes(aes_cbc_encrypt( + bytes_to_intlist(json.dumps(data)), + aes_key, iv))).decode('ascii'), + 'a': base64.b64encode(encrypted_aeskey).decode('ascii'), + }, note='Downloading JSON metadata' + (' (try #%d)' % (idx + 1) if idx > 0 else '')) + + if 'rtn' in init_data: + encrypted_rtn = init_data['rtn'] + break + + self._sleep(5, video_id) + + if encrypted_rtn is None: + raise ExtractorError('Failed to fetch init data') + + rtn = self._parse_json( + intlist_to_bytes(aes_cbc_decrypt(bytes_to_intlist( + base64.b64decode(encrypted_rtn)), + aes_key, iv)).decode('utf-8').rstrip('\0'), + video_id) + + formats = self._extract_m3u8_formats( + rtn['play_url'], video_id, ext='mp4', entry_protocol='m3u8_native') + + title = remove_end(self._og_search_title(webpage), ' - DAISUKI') + + creator = self._html_search_regex( + r'Creator\s*:\s*([^<]+)', webpage, 'creator', fatal=False) + + subtitles = {} + caption_url = rtn.get('caption_url') + if caption_url: + # mul: multiple languages + subtitles['mul'] = [{ + 'url': caption_url, + 'ext': 'ttml', + }] + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'creator': creator, + } + + +class DaisukiPlaylistIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)daisuki\.net/[^/]+/[^/]+/[^/]+/detail\.(?P[a-zA-Z0-9]+)\.html' + + _TEST = { + 'url': 'http://www.daisuki.net/tw/en/anime/detail.TheIdolMasterCG.html', + 'info_dict': { + 'id': 'TheIdolMasterCG', + 'title': 'THE IDOLM@STER CINDERELLA GIRLS', + 'description': 
'md5:0f2c028a9339f7a2c7fbf839edc5c5d8', + }, + 'playlist_count': 26, + } + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + episode_pattern = r'''(?sx) + ]+delay="[^"]+/(\d+)/movie\.jpg".+? + ]+class=".*?\bepisodeNumber\b.*?">(?:]+>)?([^<]+)''' + entries = [{ + '_type': 'url_transparent', + 'url': url.replace('detail', 'watch').replace('.html', '.' + movie_id + '.html'), + 'episode_id': episode_id, + 'episode_number': int_or_none(episode_id), + } for movie_id, episode_id in re.findall(episode_pattern, webpage)] + + playlist_title = remove_end( + self._og_search_title(webpage, fatal=False), ' - Anime - DAISUKI') + playlist_description = clean_html(get_element_by_id('synopsisTxt', webpage)) + + return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 9115944..82d8a04 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -1,15 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import hashlib import time -import uuid +import hashlib from .common import InfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) from ..utils import ( ExtractorError, unescapeHTML, @@ -25,7 +20,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': 'iseven', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -56,7 +51,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': '17732', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ara根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!T-ARA根本停不下来! 
[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -74,10 +69,6 @@ class DouyuTVIE(InfoExtractor): 'only_matching': True, }] - # Decompile core.swf in webpage by ffdec "Search SWFs in memory". core.swf - # is encrypted originally, but ffdec can dump memory to get the decrypted one. - _API_KEY = 'A12Svb&%1UUmf@hC' - def _real_extract(self, url): video_id = self._match_id(url) @@ -88,6 +79,7 @@ class DouyuTVIE(InfoExtractor): room_id = self._html_search_regex( r'"room_id\\?"\s*:\s*(\d+),', page, 'room id') + # Grab metadata from mobile API room = self._download_json( 'http://m.douyu.com/html5/live?roomId=%s' % room_id, video_id, note='Downloading room info')['data'] @@ -96,38 +88,22 @@ class DouyuTVIE(InfoExtractor): if room.get('show_status') == '2': raise ExtractorError('Live stream is offline', expected=True) - tt = compat_str(int(time.time() / 60)) - did = uuid.uuid4().hex.upper() - - sign_content = ''.join((room_id, did, self._API_KEY, tt)) - sign = hashlib.md5((sign_content).encode('utf-8')).hexdigest() - - flv_data = compat_urllib_parse_urlencode({ - 'cdn': 'ws', - 'rate': '0', - 'tt': tt, - 'did': did, - 'sign': sign, - }) - - video_info = self._download_json( - 'http://www.douyu.com/lapi/live/getPlay/%s' % room_id, video_id, - data=flv_data, note='Downloading video info', - headers={'Content-Type': 'application/x-www-form-urlencoded'}) - - error_code = video_info.get('error', 0) - if error_code is not 0: - raise ExtractorError( - '%s reported error %i' % (self.IE_NAME, error_code), - expected=True) - - base_url = video_info['data']['rtmp_url'] - live_path = video_info['data']['rtmp_live'] - - video_url = '%s/%s' % (base_url, live_path) + # Grab the URL from PC client API + # The m3u8 url from mobile API requires re-authentication every 5 minutes + tt = int(time.time()) + signContent = 
'lapi/live/thirdPart/getPlay/%s?aid=pcclient&rate=0&time=%d9TUk5fjjUjg9qIMH3sdnh' % (room_id, tt) + sign = hashlib.md5(signContent.encode('ascii')).hexdigest() + video_url = self._download_json( + 'http://coapi.douyucdn.cn/lapi/live/thirdPart/getPlay/' + room_id, + video_id, note='Downloading video URL info', + query={'rate': 0}, headers={ + 'auth': sign, + 'time': str(tt), + 'aid': 'pcclient' + })['data']['live_url'] title = self._live_title(unescapeHTML(room['room_name'])) - description = room.get('notice') + description = room.get('show_details') thumbnail = room.get('room_src') uploader = room.get('nickname') diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e966d74..e491701 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -15,6 +15,8 @@ from ..utils import ( class DRTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv/se|nyheder|radio/ondemand)/(?:[^/]+/)*(?P[\da-z-]+)(?:[/#?]|$)' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['DK'] IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', @@ -137,7 +139,7 @@ class DRTVIE(InfoExtractor): if not formats and restricted_to_denmark: self.raise_geo_restricted( 'Unfortunately, DR is not allowed to show this program outside Denmark.', - expected=True) + countries=self._GEO_COUNTRIES) self._sort_formats(formats) @@ -156,6 +158,7 @@ class DRTVIE(InfoExtractor): class DRTVLiveIE(InfoExtractor): IE_NAME = 'drtv:live' _VALID_URL = r'https?://(?:www\.)?dr\.dk/(?:tv|TV)/live/(?P[\da-z-]+)' + _GEO_COUNTRIES = ['DK'] _TEST = { 'url': 'https://www.dr.dk/tv/live/dr1', 'info_dict': { diff --git a/youtube_dl/extractor/etonline.py b/youtube_dl/extractor/etonline.py new file mode 100644 index 0000000..17d7cfe --- /dev/null +++ b/youtube_dl/extractor/etonline.py @@ -0,0 +1,39 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor + + +class 
ETOnlineIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?etonline\.com/(?:[^/]+/)*(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.etonline.com/tv/211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale/', + 'info_dict': { + 'id': '211130_dove_cameron_liv_and_maddie_emotional_episode_series_finale', + 'title': 'md5:a21ec7d3872ed98335cbd2a046f34ee6', + 'description': 'md5:8b94484063f463cca709617c79618ccd', + }, + 'playlist_count': 2, + }, { + 'url': 'http://www.etonline.com/media/video/here_are_the_stars_who_love_bringing_their_moms_as_dates_to_the_oscars-211359/', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1242911076001/default_default/index.html?videoId=ref:%s' + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + entries = [ + self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % video_id, 'BrightcoveNew', video_id) + for video_id in re.findall( + r'site\.brightcove\s*\([^,]+,\s*["\'](title_\d+)', webpage)] + + return self.playlist_result( + entries, playlist_id, + self._og_search_title(webpage, fatal=False), + self._og_search_description(webpage)) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 83a170f..b056dff 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -83,6 +83,7 @@ from .awaan import ( from .azmedien import ( AZMedienIE, AZMedienPlaylistIE, + AZMedienShowPlaylistIE, ) from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE @@ -227,6 +228,10 @@ from .dailymotion import ( DailymotionUserIE, DailymotionCloudIE, ) +from .daisuki import ( + DaisukiIE, + DaisukiPlaylistIE, +) from .daum import ( DaumIE, DaumClipIE, @@ -288,6 +293,7 @@ from .espn import ( ESPNArticleIE, ) from .esri import EsriVideoIE +from .etonline import ETOnlineIE from .europa import EuropaIE from .everyonesmixtape import EveryonesMixtapeIE from .expotv 
import ExpoTVIE @@ -338,6 +344,7 @@ from .francetv import ( ) from .freesound import FreesoundIE from .freespeech import FreespeechIE +from .freshlive import FreshLiveIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE from .fusion import FusionIE @@ -637,6 +644,7 @@ from .ninecninemedia import ( from .ninegag import NineGagIE from .ninenow import NineNowIE from .nintendo import NintendoIE +from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE from .normalboots import NormalbootsIE @@ -666,6 +674,7 @@ from .npo import ( NPORadioIE, NPORadioFragmentIE, SchoolTVIE, + HetKlokhuisIE, VPROIE, WNLIE, ) @@ -784,6 +793,7 @@ from .rai import ( ) from .rbmaradio import RBMARadioIE from .rds import RDSIE +from .redbulltv import RedBullTVIE from .redtube import RedTubeIE from .regiotv import RegioTVIE from .rentv import ( @@ -835,7 +845,6 @@ from .safari import ( from .sapo import SapoIE from .savefrom import SaveFromIE from .sbs import SBSIE -from .scivee import SciVeeIE from .screencast import ScreencastIE from .screencastomatic import ScreencastOMaticIE from .scrippsnetworks import ScrippsNetworksWatchIE @@ -991,6 +1000,7 @@ from .tunein import ( TuneInTopicIE, TuneInShortenerIE, ) +from .tunepk import TunePkIE from .turbo import TurboIE from .tutv import TutvIE from .tv2 import ( @@ -1157,6 +1167,7 @@ from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE +from .vrak import VrakIE from .vube import VubeIE from .vuclip import VuClipIE from .vvvvid import VVVVIDIE diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 70b8c95..6315d40 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -303,7 +303,7 @@ class FacebookIE(InfoExtractor): if not video_data: server_js_data = self._parse_json( self._search_regex( - 
r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall)', + r'bigPipe\.onPageletArrive\(({.+?})\)\s*;\s*}\s*\)\s*,\s*["\']onPageletArrive\s+(?:stream_pagelet|pagelet_group_mall|permalink_video_pagelet)', webpage, 'js data', default='{}'), video_id, transform_source=js_to_json, fatal=False) if server_js_data: diff --git a/youtube_dl/extractor/freshlive.py b/youtube_dl/extractor/freshlive.py new file mode 100644 index 0000000..a90f915 --- /dev/null +++ b/youtube_dl/extractor/freshlive.py @@ -0,0 +1,84 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, + unified_timestamp, +) + + +class FreshLiveIE(InfoExtractor): + _VALID_URL = r'https?://freshlive\.tv/[^/]+/(?P\d+)' + _TEST = { + 'url': 'https://freshlive.tv/satotv/74712', + 'md5': '9f0cf5516979c4454ce982df3d97f352', + 'info_dict': { + 'id': '74712', + 'ext': 'mp4', + 'title': 'テスト', + 'description': 'テスト', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1511, + 'timestamp': 1483619655, + 'upload_date': '20170105', + 'uploader': 'サトTV', + 'uploader_id': 'satotv', + 'view_count': int, + 'comment_count': int, + 'is_live': False, + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + options = self._parse_json( + self._search_regex( + r'window\.__CONTEXT__\s*=\s*({.+?});\s*', + webpage, 'initial context'), + video_id) + + info = options['context']['dispatcher']['stores']['ProgramStore']['programs'][video_id] + + title = info['title'] + + if info.get('status') == 'upcoming': + raise ExtractorError('Stream %s is upcoming' % video_id, expected=True) + + stream_url = info.get('liveStreamUrl') or info['archiveStreamUrl'] + + is_live = info.get('liveStreamUrl') is not None + + formats = self._extract_m3u8_formats( + stream_url, video_id, 
ext='mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id='hls') + + if is_live: + title = self._live_title(title) + + return { + 'id': video_id, + 'formats': formats, + 'title': title, + 'description': info.get('description'), + 'thumbnail': info.get('thumbnailUrl'), + 'duration': int_or_none(info.get('airTime')), + 'timestamp': unified_timestamp(info.get('createdAt')), + 'uploader': try_get( + info, lambda x: x['channel']['title'], compat_str), + 'uploader_id': try_get( + info, lambda x: x['channel']['code'], compat_str), + 'uploader_url': try_get( + info, lambda x: x['channel']['permalink'], compat_str), + 'view_count': int_or_none(info.get('viewCount')), + 'comment_count': int_or_none(info.get('commentCount')), + 'tags': info.get('tags', []), + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 9868ca6..ebab950 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -84,6 +84,7 @@ from .twentymin import TwentyMinutenIE from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE +from .rutube import RutubeIE class GenericIE(InfoExtractor): @@ -1502,6 +1503,23 @@ class GenericIE(InfoExtractor): }, 'add_ie': [VideoPressIE.ie_key()], }, + { + # Rutube embed + 'url': 'http://magazzino.friday.ru/videos/vipuski/kazan-2', + 'info_dict': { + 'id': '9b3d5bee0a8740bf70dfd29d3ea43541', + 'ext': 'flv', + 'title': 'Магаззино: Казань 2', + 'description': 'md5:99bccdfac2269f0e8fdbc4bbc9db184a', + 'uploader': 'Магаззино', + 'upload_date': '20170228', + 'uploader_id': '996642', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [RutubeIE.ie_key()], + }, { # ThePlatform embedded with whitespaces in URLs 'url': 'http://www.golfchannel.com/topics/shows/golftalkcentral.htm', @@ -2480,6 +2498,12 @@ class GenericIE(InfoExtractor): return _playlist_from_matches( videopress_urls, ie=VideoPressIE.ie_key()) + # Look for Rutube embeds 
+ rutube_urls = RutubeIE._extract_urls(webpage) + if rutube_urls: + return _playlist_from_matches( + rutube_urls, ie=RutubeIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 21ed846..4c9be47 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,7 +36,7 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P\w+)|season-\d+/\d+-(?P[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P\w+)|(?:[^/]+/)*(?P[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _TESTS = [{ 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', 'info_dict': { @@ -52,6 +52,12 @@ class GoIE(AdobePassIE): }, { 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', 'only_matching': True, + }, { + 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', + 'only_matching': True, + }, { + 'url': 'http://abc.go.com/shows/world-news-tonight/episode-guide/2017-02/17-021717-intense-stand-off-between-man-with-rifle-and-police-in-oakland', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/mdr.py b/youtube_dl/extractor/mdr.py index 6e4290a..322e5b4 100644 --- a/youtube_dl/extractor/mdr.py +++ b/youtube_dl/extractor/mdr.py @@ -14,7 +14,7 @@ from ..utils import ( class MDRIE(InfoExtractor): IE_DESC = 'MDR.DE and KiKA' - _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z]+-?(?P\d+)(?:_.+?)?\.html' + _VALID_URL = r'https?://(?:www\.)?(?:mdr|kika)\.de/(?:.*)/[a-z-]+-?(?P\d+)(?:_.+?)?\.html' _TESTS = [{ # MDR regularly deletes its videos @@ -31,6 +31,7 @@ class MDRIE(InfoExtractor): 'duration': 250, 'uploader': 'MITTELDEUTSCHER RUNDFUNK', }, + 'skip': '404 not found', }, { 'url': 
'http://www.kika.de/baumhaus/videos/video19636.html', 'md5': '4930515e36b06c111213e80d1e4aad0e', @@ -41,6 +42,7 @@ class MDRIE(InfoExtractor): 'duration': 134, 'uploader': 'KIKA', }, + 'skip': '404 not found', }, { 'url': 'http://www.kika.de/sendungen/einzelsendungen/weihnachtsprogramm/videos/video8182.html', 'md5': '5fe9c4dd7d71e3b238f04b8fdd588357', @@ -49,11 +51,21 @@ class MDRIE(InfoExtractor): 'ext': 'mp4', 'title': 'Beutolomäus und der geheime Weihnachtswunsch', 'description': 'md5:b69d32d7b2c55cbe86945ab309d39bbd', - 'timestamp': 1450950000, - 'upload_date': '20151224', + 'timestamp': 1482541200, + 'upload_date': '20161224', 'duration': 4628, 'uploader': 'KIKA', }, + }, { + # audio with alternative playerURL pattern + 'url': 'http://www.mdr.de/kultur/videos-und-audios/audio-radio/operation-mindfuck-robert-wilson100.html', + 'info_dict': { + 'id': '100', + 'ext': 'mp4', + 'title': 'Feature: Operation Mindfuck - Robert Anton Wilson', + 'duration': 3239, + 'uploader': 'MITTELDEUTSCHER RUNDFUNK', + }, }, { 'url': 'http://www.kika.de/baumhaus/sendungen/video19636_zc-fea7f8a0_zs-4bf89c60.html', 'only_matching': True, @@ -71,7 +83,7 @@ class MDRIE(InfoExtractor): webpage = self._download_webpage(url, video_id) data_url = self._search_regex( - r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P.+/(?:video|audio)-?[0-9]+-avCustom\.xml)\1', + r'(?:dataURL|playerXml(?:["\'])?)\s*:\s*(["\'])(?P.+?-avCustom\.xml)\1', webpage, 'data url', group='url').replace(r'\/', '/') doc = self._download_xml( diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py new file mode 100644 index 0000000..f5e3f68 --- /dev/null +++ b/youtube_dl/extractor/njpwworld.py @@ -0,0 +1,83 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + get_element_by_class, + urlencode_postdata, +) + + +class NJPWWorldIE(InfoExtractor): + _VALID_URL = 
r'https?://njpwworld\.com/p/(?P[a-z0-9_]+)' + IE_DESC = '新日本プロレスワールド' + _NETRC_MACHINE = 'njpwworld' + + _TEST = { + 'url': 'http://njpwworld.com/p/s_series_00155_1_9/', + 'info_dict': { + 'id': 's_series_00155_1_9', + 'ext': 'mp4', + 'title': '第9試合 ランディ・サベージ vs リック・スタイナー', + 'tags': list, + }, + 'params': { + 'skip_download': True, # AES-encrypted m3u8 + }, + 'skip': 'Requires login', + } + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + # No authentication to be performed + if not username: + return True + + webpage, urlh = self._download_webpage_handle( + 'https://njpwworld.com/auth/login', None, + note='Logging in', errnote='Unable to login', + data=urlencode_postdata({'login_id': username, 'pw': password})) + # /auth/login will return 302 for successful logins + if urlh.geturl() == 'https://njpwworld.com/auth/login': + self.report_warning('unable to login') + return False + + return True + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + formats = [] + for player_url, kind in re.findall(r']+href="(/player[^"]+)".+?]+src="[^"]+qf_btn_([^".]+)', webpage): + player_url = compat_urlparse.urljoin(url, player_url) + + player_page = self._download_webpage( + player_url, video_id, note='Downloading player page') + + entries = self._parse_html5_media_entries( + player_url, player_page, video_id, m3u8_id='hls-%s' % kind, + m3u8_entry_protocol='m3u8_native', + preference=2 if 'hq' in kind else 1) + formats.extend(entries[0]['formats']) + + self._sort_formats(formats) + + post_content = get_element_by_class('post-content', webpage) + tags = re.findall( + r']+class="tag-[^"]+">]*>([^<]+)', post_content + ) if post_content else None + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'formats': formats, + 'tags': tags, + } diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 
9624371..38fefe4 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -3,41 +3,27 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + determine_ext, + ExtractorError, fix_xml_ampersands, orderedSet, parse_duration, qualities, strip_jsonp, unified_strdate, - ExtractorError, ) class NPOBaseIE(InfoExtractor): def _get_token(self, video_id): - token_page = self._download_webpage( - 'http://ida.omroep.nl/npoplayer/i.js', - video_id, note='Downloading token') - token = self._search_regex( - r'npoplayer\.token = "(.+?)"', token_page, 'token') - # Decryption algorithm extracted from http://npoplayer.omroep.nl/csjs/npoplayer-min.js - token_l = list(token) - first = second = None - for i in range(5, len(token_l) - 4): - if token_l[i].isdigit(): - if first is None: - first = i - elif second is None: - second = i - if first is None or second is None: - first = 12 - second = 13 - - token_l[first], token_l[second] = token_l[second], token_l[first] - - return ''.join(token_l) + return self._download_json( + 'http://ida.omroep.nl/app.php/auth', video_id, + note='Downloading token')['token'] class NPOIE(NPOBaseIE): @@ -51,97 +37,120 @@ class NPOIE(NPOBaseIE): (?: npo\.nl/(?!live|radio)(?:[^/]+/){2}| ntr\.nl/(?:[^/]+/){2,}| - omroepwnl\.nl/video/fragment/[^/]+__ + omroepwnl\.nl/video/fragment/[^/]+__| + zapp\.nl/[^/]+/[^/]+/ ) ) (?P[^/?#]+) ''' - _TESTS = [ - { - 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', - 'md5': '4b3f9c429157ec4775f2c9cb7b911016', - 'info_dict': { - 'id': 'VPWON_1220719', - 'ext': 'm4v', - 'title': 'Nieuwsuur', - 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', - 'upload_date': '20140622', - }, + _TESTS = [{ + 'url': 'http://www.npo.nl/nieuwsuur/22-06-2014/VPWON_1220719', + 'md5': '4b3f9c429157ec4775f2c9cb7b911016', + 'info_dict': { 
+ 'id': 'VPWON_1220719', + 'ext': 'm4v', + 'title': 'Nieuwsuur', + 'description': 'Dagelijks tussen tien en elf: nieuws, sport en achtergronden.', + 'upload_date': '20140622', }, - { - 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', - 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', - 'info_dict': { - 'id': 'VARA_101191800', - 'ext': 'm4v', - 'title': 'De Mega Mike & Mega Thomas show: The best of.', - 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', - 'upload_date': '20090227', - 'duration': 2400, - }, + }, { + 'url': 'http://www.npo.nl/de-mega-mike-mega-thomas-show/27-02-2009/VARA_101191800', + 'md5': 'da50a5787dbfc1603c4ad80f31c5120b', + 'info_dict': { + 'id': 'VARA_101191800', + 'ext': 'm4v', + 'title': 'De Mega Mike & Mega Thomas show: The best of.', + 'description': 'md5:3b74c97fc9d6901d5a665aac0e5400f4', + 'upload_date': '20090227', + 'duration': 2400, }, - { - 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', - 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', - 'info_dict': { - 'id': 'VPWON_1169289', - 'ext': 'm4v', - 'title': 'Tegenlicht: De toekomst komt uit Afrika', - 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', - 'upload_date': '20130225', - 'duration': 3000, - }, + }, { + 'url': 'http://www.npo.nl/tegenlicht/25-02-2013/VPWON_1169289', + 'md5': 'f8065e4e5a7824068ed3c7e783178f2c', + 'info_dict': { + 'id': 'VPWON_1169289', + 'ext': 'm4v', + 'title': 'Tegenlicht: Zwart geld. 
De toekomst komt uit Afrika', + 'description': 'md5:52cf4eefbc96fffcbdc06d024147abea', + 'upload_date': '20130225', + 'duration': 3000, }, - { - 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', - 'info_dict': { - 'id': 'WO_VPRO_043706', - 'ext': 'wmv', - 'title': 'De nieuwe mens - Deel 1', - 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', - 'duration': 4680, - }, - 'params': { - # mplayer mms download - 'skip_download': True, - } + }, { + 'url': 'http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706', + 'info_dict': { + 'id': 'WO_VPRO_043706', + 'ext': 'm4v', + 'title': 'De nieuwe mens - Deel 1', + 'description': 'md5:518ae51ba1293ffb80d8d8ce90b74e4b', + 'duration': 4680, }, + 'params': { + 'skip_download': True, + } + }, { # non asf in streams - { - 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', - 'md5': 'b3da13de374cbe2d5332a7e910bef97f', - 'info_dict': { - 'id': 'WO_NOS_762771', - 'ext': 'mp4', - 'title': 'Hoe gaat Europa verder na Parijs?', - }, + 'url': 'http://www.npo.nl/hoe-gaat-europa-verder-na-parijs/10-01-2015/WO_NOS_762771', + 'info_dict': { + 'id': 'WO_NOS_762771', + 'ext': 'mp4', + 'title': 'Hoe gaat Europa verder na Parijs?', }, - { - 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', - 'md5': '01c6a2841675995da1f0cf776f03a9c3', - 'info_dict': { - 'id': 'VPWON_1233944', - 'ext': 'm4v', - 'title': 'Aap, poot, pies', - 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', - 'upload_date': '20150508', - 'duration': 599, - }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.ntr.nl/Aap-Poot-Pies/27/detail/Aap-poot-pies/VPWON_1233944#content', + 'info_dict': { + 'id': 'VPWON_1233944', + 'ext': 'm4v', + 'title': 'Aap, poot, pies', + 'description': 'md5:c9c8005d1869ae65b858e82c01a91fde', + 'upload_date': '20150508', + 'duration': 599, }, - { - 'url': 
'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', - 'md5': 'd30cd8417b8b9bca1fdff27428860d08', - 'info_dict': { - 'id': 'POW_00996502', - 'ext': 'm4v', - 'title': '''"Dit is wel een 'landslide'..."''', - 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', - 'upload_date': '20150508', - 'duration': 462, - }, + 'params': { + 'skip_download': True, } - ] + }, { + 'url': 'http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698', + 'info_dict': { + 'id': 'POW_00996502', + 'ext': 'm4v', + 'title': '''"Dit is wel een 'landslide'..."''', + 'description': 'md5:f8d66d537dfb641380226e31ca57b8e8', + 'upload_date': '20150508', + 'duration': 462, + }, + 'params': { + 'skip_download': True, + } + }, { + # audio + 'url': 'http://www.npo.nl/jouw-stad-rotterdam/29-01-2017/RBX_FUNX_6683215/RBX_FUNX_7601437', + 'info_dict': { + 'id': 'RBX_FUNX_6683215', + 'ext': 'mp3', + 'title': 'Jouw Stad Rotterdam', + 'description': 'md5:db251505244f097717ec59fabc372d9f', + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/gemist/KN_1687547', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/de-bzt-show/filmpjes/POMS_KN_7315118', + 'only_matching': True, + }, { + 'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990', + 'only_matching': True, + }, { + # live stream + 'url': 'npo:LI_NL1_4188102', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -170,70 +179,115 @@ class NPOIE(NPOBaseIE): token = self._get_token(video_id) formats = [] + urls = set() + + quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) + items = self._download_json( + 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, + 'Downloading formats JSON', query={ + 'adaptive': 'yes', + 'token': token, + })['items'][0] + for num, item in enumerate(items): + item_url = item.get('url') + if not item_url 
or item_url in urls: + continue + urls.add(item_url) + format_id = self._search_regex( + r'video/ida/([^/]+)', item_url, 'format id', + default=None) + + def add_format_url(format_url): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality(format_id), + }) + + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + if item.get('contentType') in ('url', 'audio'): + add_format_url(item_url) + continue - pubopties = metadata.get('pubopties') - if pubopties: - quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) - for format_id in pubopties: - format_info = self._download_json( - 'http://ida.omroep.nl/odi/?prid=%s&puboptions=%s&adaptive=yes&token=%s' - % (video_id, format_id, token), - video_id, 'Downloading %s JSON' % format_id) - if format_info.get('error_code', 0) or format_info.get('errorcode', 0): + try: + stream_info = self._download_json( + item_url + '&type=json', video_id, + 'Downloading %s stream JSON' + % item.get('label') or item.get('format') or format_id or num) + except ExtractorError as ee: + if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: + error = (self._parse_json( + ee.cause.read().decode(), video_id, + fatal=False) or {}).get('errorstring') + if error: + raise ExtractorError(error, expected=True) + raise + # Stream URL instead of JSON, example: npo:LI_NL1_4188102 + if isinstance(stream_info, compat_str): + if not stream_info.startswith('http'): continue - streams = format_info.get('streams') - if streams: - try: - video_info = self._download_json( - streams[0] + '&type=json', - video_id, 'Downloading %s stream JSON' % format_id) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: - error = (self._parse_json(ee.cause.read().decode(), video_id, fatal=False) or {}).get('errorstring') - if error: - raise ExtractorError(error, expected=True) - raise - else: - video_info = format_info 
- video_url = video_info.get('url') - if not video_url: + video_url = stream_info + # JSON + else: + video_url = stream_info.get('url') + if not video_url or video_url in urls: + continue + urls.add(item_url) + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + else: + add_format_url(video_url) + + is_live = metadata.get('medium') == 'live' + + if not is_live: + for num, stream in enumerate(metadata.get('streams', [])): + stream_url = stream.get('url') + if not stream_url or stream_url in urls: continue - if format_id == 'adaptive': - formats.extend(self._extract_m3u8_formats(video_url, video_id, 'mp4')) - else: + urls.add(stream_url) + # smooth streaming is not supported + stream_type = stream.get('type', '').lower() + if stream_type in ['ss', 'ms']: + continue + if stream_type == 'hds': + f4m_formats = self._extract_f4m_formats( + stream_url, video_id, fatal=False) + # f4m downloader downloads only piece of live stream + for f4m_format in f4m_formats: + f4m_format['preference'] = -1 + formats.extend(f4m_formats) + elif stream_type == 'hls': + formats.extend(self._extract_m3u8_formats( + stream_url, video_id, ext='mp4', fatal=False)) + # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 + elif '.asf' in stream_url: + asx = self._download_xml( + stream_url, video_id, + 'Downloading stream %d ASX playlist' % num, + transform_source=fix_xml_ampersands, fatal=False) + if not asx: + continue + ref = asx.find('./ENTRY/Ref') + if ref is None: + continue + video_url = ref.get('href') + if not video_url or video_url in urls: + continue + urls.add(video_url) formats.append({ 'url': video_url, - 'format_id': format_id, - 'quality': quality(format_id), + 'ext': stream.get('formaat', 'asf'), + 'quality': stream.get('kwaliteit'), + 'preference': -10, }) - - streams = metadata.get('streams') - if streams: - for i, stream in 
enumerate(streams): - stream_url = stream.get('url') - if not stream_url: - continue - if '.asf' not in stream_url: + else: formats.append({ 'url': stream_url, 'quality': stream.get('kwaliteit'), }) - continue - asx = self._download_xml( - stream_url, video_id, - 'Downloading stream %d ASX playlist' % i, - transform_source=fix_xml_ampersands) - ref = asx.find('./ENTRY/Ref') - if ref is None: - continue - video_url = ref.get('href') - if not video_url: - continue - formats.append({ - 'url': video_url, - 'ext': stream.get('formaat', 'asf'), - 'quality': stream.get('kwaliteit'), - }) self._sort_formats(formats) @@ -246,28 +300,28 @@ class NPOIE(NPOBaseIE): return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': metadata.get('info'), 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], 'upload_date': unified_strdate(metadata.get('gidsdatum')), 'duration': parse_duration(metadata.get('tijdsduur')), 'formats': formats, 'subtitles': subtitles, + 'is_live': is_live, } class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P.+)' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P[^/?#&]+)' _TEST = { 'url': 'http://www.npo.nl/live/npo-1', 'info_dict': { - 'id': 'LI_NEDERLAND1_136692', + 'id': 'LI_NL1_4188102', 'display_id': 'npo-1', 'ext': 'mp4', - 'title': 're:^Nederland 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', - 'description': 'Livestream', + 'title': 're:^NPO 1 [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'is_live': True, }, 'params': { @@ -283,58 +337,12 @@ class NPOLiveIE(NPOBaseIE): live_id = self._search_regex( r'data-prid="([^"]+)"', webpage, 'live id') - metadata = self._download_json( - 'http://e.omroep.nl/metadata/%s' % live_id, - display_id, transform_source=strip_jsonp) - - token = self._get_token(display_id) - - formats = [] - - streams = metadata.get('streams') - if streams: - for stream in streams: - stream_type = 
stream.get('type').lower() - # smooth streaming is not supported - if stream_type in ['ss', 'ms']: - continue - stream_info = self._download_json( - 'http://ida.omroep.nl/aapi/?stream=%s&token=%s&type=jsonp' - % (stream.get('url'), token), - display_id, 'Downloading %s JSON' % stream_type) - if stream_info.get('error_code', 0) or stream_info.get('errorcode', 0): - continue - stream_url = self._download_json( - stream_info['stream'], display_id, - 'Downloading %s URL' % stream_type, - 'Unable to download %s URL' % stream_type, - transform_source=strip_jsonp, fatal=False) - if not stream_url: - continue - if stream_type == 'hds': - f4m_formats = self._extract_f4m_formats(stream_url, display_id) - # f4m downloader downloads only piece of live stream - for f4m_format in f4m_formats: - f4m_format['preference'] = -1 - formats.extend(f4m_formats) - elif stream_type == 'hls': - formats.extend(self._extract_m3u8_formats(stream_url, display_id, 'mp4')) - else: - formats.append({ - 'url': stream_url, - 'preference': -10, - }) - - self._sort_formats(formats) - return { + '_type': 'url_transparent', + 'url': 'npo:%s' % live_id, + 'ie_key': NPOIE.ie_key(), 'id': live_id, 'display_id': display_id, - 'title': self._live_title(metadata['titel']), - 'description': metadata['info'], - 'thumbnail': metadata.get('images', [{'url': None}])[-1]['url'], - 'formats': formats, - 'is_live': True, } @@ -416,7 +424,21 @@ class NPORadioFragmentIE(InfoExtractor): } -class SchoolTVIE(InfoExtractor): +class NPODataMidEmbedIE(InfoExtractor): + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._search_regex( + r'data-mid=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video_id', group='id') + return { + '_type': 'url_transparent', + 'ie_key': 'NPO', + 'url': 'npo:%s' % video_id, + 'display_id': display_id + } + + +class SchoolTVIE(NPODataMidEmbedIE): IE_NAME = 'schooltv' _VALID_URL = 
r'https?://(?:www\.)?schooltv\.nl/video/(?P[^/?#&]+)' @@ -435,17 +457,25 @@ class SchoolTVIE(InfoExtractor): } } - def _real_extract(self, url): - display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'data-mid=(["\'])(?P(?:(?!\1).)+)\1', webpage, 'video_id', group='id') - return { - '_type': 'url_transparent', - 'ie_key': 'NPO', - 'url': 'npo:%s' % video_id, - 'display_id': display_id + +class HetKlokhuisIE(NPODataMidEmbedIE): + IE_NAME = 'hetklokhuis' + _VALID_URL = r'https?://(?:www\.)?hetklokhuis.nl/[^/]+/\d+/(?P[^/?#&]+)' + + _TEST = { + 'url': 'http://hetklokhuis.nl/tv-uitzending/3471/Zwaartekrachtsgolven', + 'info_dict': { + 'id': 'VPWON_1260528', + 'display_id': 'Zwaartekrachtsgolven', + 'ext': 'm4v', + 'title': 'Het Klokhuis: Zwaartekrachtsgolven', + 'description': 'md5:c94f31fb930d76c2efa4a4a71651dd48', + 'upload_date': '20170223', + }, + 'params': { + 'skip_download': True } + } class NPOPlaylistBaseIE(NPOIE): diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index fc7ff43..25f6a9a 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,22 +75,37 @@ class OpenloadIE(InfoExtractor): ']+id="[^"]+"[^>]*>([0-9A-Za-z]+)', webpage, 'openload ID') - first_char = int(ol_id[0]) - urlcode = [] - num = 1 - - while num < len(ol_id): - i = ord(ol_id[num]) - key = 0 - if i <= 90: - key = i - 65 - elif i >= 97: - key = 25 + i - 97 - urlcode.append((key, compat_chr(int(ol_id[num + 2:num + 5]) // int(ol_id[num + 1]) - first_char))) - num += 5 - - video_url = 'https://openload.co/stream/' + ''.join( - [value for _, value in sorted(urlcode, key=lambda x: x[0])]) + video_url_chars = [] + + first_char = ord(ol_id[0]) + key = first_char - 55 + maxKey = max(2, key) + key = min(maxKey, len(ol_id) - 14) + t = ol_id[key:key + 12] + + hashMap = {} + v = ol_id.replace(t, "") + h = 0 + + while h < len(t): + f = t[h:h + 2] + i = int(f, 16) + hashMap[h 
/ 2] = i + h += 2 + + h = 0 + + while h < len(v): + B = v[h:h + 2] + i = int(B, 16) + index = (h / 2) % 6 + A = hashMap[index] + i = i ^ A + video_url_chars.append(compat_chr(i)) + h += 2 + + video_url = 'https://openload.co/stream/%s?mime=true' + video_url = video_url % (''.join(video_url_chars)) title = self._og_search_title(webpage, default=None) or self._search_regex( r']+class=["\']title["\'][^>]*>([^<]+)', webpage, diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py new file mode 100644 index 0000000..5c73d5b --- /dev/null +++ b/youtube_dl/extractor/redbulltv.py @@ -0,0 +1,106 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + int_or_none, + try_get, + unified_timestamp, +) + + +class RedBullTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film)/(?PAP-\w+)' + _TESTS = [{ + # film + 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', + 'md5': '78e860f631d7a846e712fab8c5fe2c38', + 'info_dict': { + 'id': 'AP-1Q756YYX51W11', + 'ext': 'mp4', + 'title': 'ABC of...WRC', + 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', + 'duration': 1582.04, + 'timestamp': 1488405786, + 'upload_date': '20170301', + }, + }, { + # episode + 'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', + 'info_dict': { + 'id': 'AP-1PMT5JCWH1W11', + 'ext': 'mp4', + 'title': 'Grime - Hashtags S2 E4', + 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', + 'duration': 904.6, + 'timestamp': 1487290093, + 'upload_date': '20170217', + 'series': 'Hashtags', + 'season_number': 2, + 'episode_number': 4, + }, + }, { + 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + access_token = self._download_json( + 'https://api-v2.redbull.tv/start', video_id, + note='Downloading access 
token', query={ + 'build': '4.0.9', + 'category': 'smartphone', + 'os_version': 23, + 'os_family': 'android', + })['auth']['access_token'] + + info = self._download_json( + 'https://api-v2.redbull.tv/views/%s' % video_id, + video_id, note='Downloading video information', + headers={'Authorization': 'Bearer ' + access_token} + )['blocks'][0]['top'][0] + + video = info['video_product'] + + title = info['title'].strip() + m3u8_url = video['url'] + + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + + subtitles = {} + for _, captions in (try_get( + video, lambda x: x['attachments']['captions'], + dict) or {}).items(): + if not captions or not isinstance(captions, list): + continue + for caption in captions: + caption_url = caption.get('url') + if not caption_url: + continue + subtitles.setdefault(caption.get('lang') or 'en', []).append({ + 'url': caption_url, + 'ext': caption.get('format'), + }) + + subheading = info.get('subheading') + if subheading: + title += ' - %s' % subheading + + return { + 'id': video_id, + 'title': title, + 'description': info.get('long_description') or info.get( + 'short_description'), + 'duration': float_or_none(video.get('duration'), scale=1000), + 'timestamp': unified_timestamp(info.get('published')), + 'series': info.get('show_title'), + 'season_number': int_or_none(info.get('season_number')), + 'episode_number': int_or_none(info.get('episode_number')), + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py index fd1df92..889fa76 100644 --- a/youtube_dl/extractor/rutube.py +++ b/youtube_dl/extractor/rutube.py @@ -17,7 +17,7 @@ from ..utils import ( class RutubeIE(InfoExtractor): IE_NAME = 'rutube' IE_DESC = 'Rutube videos' - _VALID_URL = r'https?://rutube\.ru/(?:video|play/embed)/(?P[\da-z]{32})' + _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P[\da-z]{32})' _TESTS = [{ 'url': 
'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/', @@ -39,8 +39,17 @@ class RutubeIE(InfoExtractor): }, { 'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661', 'only_matching': True, + }, { + 'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661', + 'only_matching': True, }] + @staticmethod + def _extract_urls(webpage): + return [mobj.group('url') for mobj in re.finditer( + r']+?src=(["\'])(?P(?:https?:)?//rutube\.ru/embed/[\da-z]{32}.*?)\1', + webpage)] + def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 20d0175..6c09df2 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -82,6 +82,9 @@ class RuutuIE(InfoExtractor): formats.extend(self._extract_f4m_formats( video_url, video_id, f4m_id='hds', fatal=False)) elif ext == 'mpd': + # video-only and audio-only streams are of different + # duration resulting in out of sync issue + continue formats.extend(self._extract_mpd_formats( video_url, video_id, mpd_id='dash', fatal=False)) else: diff --git a/youtube_dl/extractor/scivee.py b/youtube_dl/extractor/scivee.py deleted file mode 100644 index b1ca12f..0000000 --- a/youtube_dl/extractor/scivee.py +++ /dev/null @@ -1,57 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - - -class SciVeeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?scivee\.tv/node/(?P\d+)' - - _TEST = { - 'url': 'http://www.scivee.tv/node/62352', - 'md5': 'b16699b74c9e6a120f6772a44960304f', - 'info_dict': { - 'id': '62352', - 'ext': 'mp4', - 'title': 'Adam Arkin at the 2014 DOE JGI Genomics of Energy & Environment Meeting', - 'description': 'md5:81f1710638e11a481358fab1b11059d7', - }, - 'skip': 'Not accessible from Travis CI server', - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = 
mobj.group('id') - - # annotations XML is malformed - annotations = self._download_webpage( - 'http://www.scivee.tv/assets/annotations/%s' % video_id, video_id, 'Downloading annotations') - - title = self._html_search_regex(r'([^<]+)', annotations, 'title') - description = self._html_search_regex(r'([^<]+)', annotations, 'abstract', fatal=False) - filesize = int_or_none(self._html_search_regex( - r'([^<]+)', annotations, 'filesize', fatal=False)) - - formats = [ - { - 'url': 'http://www.scivee.tv/assets/audio/%s' % video_id, - 'ext': 'mp3', - 'format_id': 'audio', - }, - { - 'url': 'http://www.scivee.tv/assets/video/%s' % video_id, - 'ext': 'mp4', - 'format_id': 'video', - 'filesize': filesize, - }, - ] - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': 'http://www.scivee.tv/assets/videothumb/%s' % video_id, - 'formats': formats, - } diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index b3aa4ce..0ee4a8f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -121,7 +121,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'fDoItMDbsbZz8dY16ZzARCZmzgHBPotA' + _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod diff --git a/youtube_dl/extractor/tunepk.py b/youtube_dl/extractor/tunepk.py new file mode 100644 index 0000000..9d42651 --- /dev/null +++ b/youtube_dl/extractor/tunepk.py @@ -0,0 +1,90 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + try_get, + unified_timestamp, +) + + +class TunePkIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?: + (?:www\.)?tune\.pk/(?:video/|player/embed_player.php?.*?\bvid=)| + embed\.tune\.pk/play/ + ) + (?P\d+) + ''' + _TESTS = [{ + 'url': 'https://tune.pk/video/6919541/maudie-2017-international-trailer-1-ft-ethan-hawke-sally-hawkins', + 
'md5': '0c537163b7f6f97da3c5dd1e3ef6dd55', + 'info_dict': { + 'id': '6919541', + 'ext': 'mp4', + 'title': 'Maudie (2017) | International Trailer # 1 ft Ethan Hawke, Sally Hawkins', + 'description': 'md5:eb5a04114fafef5cec90799a93a2d09c', + 'thumbnail': r're:^https?://.*\.jpg$', + 'timestamp': 1487327564, + 'upload_date': '20170217', + 'uploader': 'Movie Trailers', + 'duration': 107, + 'view_count': int, + } + }, { + 'url': 'https://tune.pk/player/embed_player.php?vid=6919541&folder=2017/02/17/&width=600&height=350&autoplay=no', + 'only_matching': True, + }, { + 'url': 'https://embed.tune.pk/play/6919541?autoplay=no&ssl=yes&inline=true', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://tune.pk/video/%s' % video_id, video_id) + + details = self._parse_json( + self._search_regex( + r'new\s+TunePlayer\(({.+?})\)\s*;\s*\n', webpage, 'tune player'), + video_id)['details'] + + video = details['video'] + title = video.get('title') or self._og_search_title( + webpage, default=None) or self._html_search_meta( + 'title', webpage, 'title', fatal=True) + + formats = self._parse_jwplayer_formats( + details['player']['sources'], video_id) + self._sort_formats(formats) + + description = self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, 'description') + + thumbnail = video.get('thumb') or self._og_search_thumbnail( + webpage, default=None) or self._html_search_meta( + 'thumbnail', webpage, 'thumbnail') + + timestamp = unified_timestamp(video.get('date_added')) + uploader = try_get( + video, lambda x: x['uploader']['name'], + compat_str) or self._html_search_meta('author', webpage, 'author') + + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('views')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'uploader': 
uploader, + 'duration': duration, + 'view_count': view_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/tvigle.py b/youtube_dl/extractor/tvigle.py index f3817ab..3475ef4 100644 --- a/youtube_dl/extractor/tvigle.py +++ b/youtube_dl/extractor/tvigle.py @@ -17,6 +17,9 @@ class TvigleIE(InfoExtractor): IE_DESC = 'Интернет-телевидение Tvigle.ru' _VALID_URL = r'https?://(?:www\.)?(?:tvigle\.ru/(?:[^/]+/)+(?P[^/]+)/$|cloud\.tvigle\.ru/video/(?P\d+))' + _GEO_BYPASS = False + _GEO_COUNTRIES = ['RU'] + _TESTS = [ { 'url': 'http://www.tvigle.ru/video/sokrat/', @@ -72,8 +75,13 @@ class TvigleIE(InfoExtractor): error_message = item.get('errorMessage') if not videos and error_message: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_message), expected=True) + if item.get('isGeoBlocked') is True: + self.raise_geo_restricted( + msg=error_message, countries=self._GEO_COUNTRIES) + else: + raise ExtractorError( + '%s returned error: %s' % (self.IE_NAME, error_message), + expected=True) title = item['title'] description = item.get('description') diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index f3541b6..7af1165 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( parse_iso8601, @@ -12,7 +14,7 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx|sex|tube)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' + _VALID_URL = r'https?://(?P(?:www\.)?24video\.(?:net|me|xxx|sex|tube))/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P\d+)' _TESTS = [{ 'url': 'http://www.24video.net/video/view/1044982', @@ -43,10 +45,12 @@ class TwentyFourVideoIE(InfoExtractor): }] def _real_extract(self, url): - video_id = 
self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + host = mobj.group('host') webpage = self._download_webpage( - 'http://www.24video.sex/video/view/%s' % video_id, video_id) + 'http://%s/video/view/%s' % (host, video_id), video_id) title = self._og_search_title(webpage) description = self._html_search_regex( @@ -72,11 +76,11 @@ class TwentyFourVideoIE(InfoExtractor): # Sets some cookies self._download_xml( - r'http://www.24video.sex/video/xml/%s?mode=init' % video_id, + r'http://%s/video/xml/%s?mode=init' % (host, video_id), video_id, 'Downloading init XML') video_xml = self._download_xml( - 'http://www.24video.sex/video/xml/%s?mode=play' % video_id, + 'http://%s/video/xml/%s?mode=play' % (host, video_id), video_id, 'Downloading video XML') video = xpath_element(video_xml, './/video', 'video', fatal=True) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index bbba394..ed36336 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -12,7 +12,6 @@ from ..compat import ( compat_str, compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, - compat_urlparse, ) from ..utils import ( clean_html, @@ -24,6 +23,7 @@ from ..utils import ( parse_iso8601, update_url_query, urlencode_postdata, + urljoin, ) @@ -32,7 +32,7 @@ class TwitchBaseIE(InfoExtractor): _API_BASE = 'https://api.twitch.tv' _USHER_BASE = 'https://usher.ttvnw.net' - _LOGIN_URL = 'http://www.twitch.tv/login' + _LOGIN_URL = 'https://www.twitch.tv/login' _CLIENT_ID = 'jzkbprff40iqj646a697cyrvl0zt2m6' _NETRC_MACHINE = 'twitch' @@ -64,6 +64,35 @@ class TwitchBaseIE(InfoExtractor): raise ExtractorError( 'Unable to login. 
Twitch said: %s' % message, expected=True) + def login_step(page, urlh, note, data): + form = self._hidden_inputs(page) + form.update(data) + + page_url = urlh.geturl() + post_url = self._search_regex( + r']+action=(["\'])(?P.+?)\1', page, + 'post url', default=page_url, group='url') + post_url = urljoin(page_url, post_url) + + headers = {'Referer': page_url} + + try: + response = self._download_json( + post_url, None, note, + data=urlencode_postdata(form), + headers=headers) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + response = self._parse_json( + e.cause.read().decode('utf-8'), None) + fail(response['message']) + raise + + redirect_url = urljoin(post_url, response['redirect']) + return self._download_webpage_handle( + redirect_url, None, 'Downloading login redirect page', + headers=headers) + login_page, handle = self._download_webpage_handle( self._LOGIN_URL, None, 'Downloading login page') @@ -71,40 +100,19 @@ class TwitchBaseIE(InfoExtractor): if 'blacklist_message' in login_page: fail(clean_html(login_page)) - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'username': username, - 'password': password, + redirect_page, handle = login_step( + login_page, handle, 'Logging in as %s' % username, { + 'username': username, + 'password': password, }) - redirect_url = handle.geturl() - - post_url = self._search_regex( - r']+action=(["\'])(?P.+?)\1', login_page, - 'post url', default=redirect_url, group='url') - - if not post_url.startswith('http'): - post_url = compat_urlparse.urljoin(redirect_url, post_url) - - headers = {'Referer': redirect_url} - - try: - response = self._download_json( - post_url, None, 'Logging in as %s' % username, - data=urlencode_postdata(login_form), - headers=headers) - except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: - response = self._parse_json( - e.cause.read().decode('utf-8'), None) - fail(response['message']) 
- raise - - if response.get('redirect'): - self._download_webpage( - response['redirect'], None, 'Downloading login redirect page', - headers=headers) + if re.search(r'(?i)]+id="two-factor-submit"', redirect_page) is not None: + # TODO: Add mechanism to request an SMS or phone call + tfa_token = self._get_tfa_info('two-factor authentication token') + login_step(redirect_page, handle, 'Submitting TFA token', { + 'authy_token': tfa_token, + 'remember_2fa': 'true', + }) def _prefer_source(self, formats): try: diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index c4e37f6..9aa38bc 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -17,12 +17,12 @@ from ..utils import ( class VevoBaseIE(InfoExtractor): - def _extract_json(self, webpage, video_id, item): + def _extract_json(self, webpage, video_id): return self._parse_json( self._search_regex( r'window\.__INITIAL_STORE__\s*=\s*({.+?});\s*', webpage, 'initial store'), - video_id)['default'][item] + video_id) class VevoIE(VevoBaseIE): @@ -139,6 +139,11 @@ class VevoIE(VevoBaseIE): # no genres available 'url': 'http://www.vevo.com/watch/INS171400764', 'only_matching': True, + }, { + # Another case available only via the webpage; using streams/streamsV3 formats + # Geo-restricted to Netherlands/Germany + 'url': 'http://www.vevo.com/watch/boostee/pop-corn-clip-officiel/FR1A91600909', + 'only_matching': True, }] _VERSIONS = { 0: 'youtube', # only in AuthenticateVideo videoVersions @@ -193,7 +198,14 @@ class VevoIE(VevoBaseIE): # https://github.com/rg3/youtube-dl/issues/9366) if not video_versions: webpage = self._download_webpage(url, video_id) - video_versions = self._extract_json(webpage, video_id, 'streams')[video_id][0] + json_data = self._extract_json(webpage, video_id) + if 'streams' in json_data.get('default', {}): + video_versions = json_data['default']['streams'][video_id][0] + else: + video_versions = [ + value + for key, value in 
json_data['apollo']['data'].items() + if key.startswith('%s.streams' % video_id)] uploader = None artist = None @@ -207,7 +219,7 @@ class VevoIE(VevoBaseIE): formats = [] for video_version in video_versions: - version = self._VERSIONS.get(video_version['version']) + version = self._VERSIONS.get(video_version.get('version'), 'generic') version_url = video_version.get('url') if not version_url: continue @@ -339,7 +351,7 @@ class VevoPlaylistIE(VevoBaseIE): if video_id: return self.url_result('vevo:%s' % video_id, VevoIE.ie_key()) - playlists = self._extract_json(webpage, playlist_id, '%ss' % playlist_kind) + playlists = self._extract_json(webpage, playlist_id)['default']['%ss' % playlist_kind] playlist = (list(playlists.values())[0] if playlist_kind == 'playlist' else playlists[playlist_id]) diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index d26fb49..5086f59 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -9,7 +9,7 @@ from .common import InfoExtractor class VierIE(InfoExtractor): IE_NAME = 'vier' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?:[^/]+/videos/(?P[^/]+)(?:/(?P\d+))?|video/v3/embed/(?P\d+))' + _VALID_URL = r'https?://(?:www\.)?(?Pvier|vijf)\.be/(?:[^/]+/videos/(?P[^/]+)(?:/(?P\d+))?|video/v3/embed/(?P\d+))' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', 'info_dict': { @@ -23,6 +23,19 @@ class VierIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + }, { + 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', + 'info_dict': { + 'id': '2561614', + 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', + 'ext': 'mp4', + 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', + 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld 
en ze MOETEN een keuze maken.', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', 'only_matching': True, @@ -35,6 +48,7 @@ class VierIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + site = mobj.group('site') webpage = self._download_webpage(url, display_id) @@ -43,7 +57,7 @@ class VierIE(InfoExtractor): webpage, 'video id') application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], - webpage, 'application', default='vier_vod') + webpage, 'application', default=site + '_vod') filename = self._search_regex( [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'], webpage, 'filename') @@ -68,13 +82,19 @@ class VierIE(InfoExtractor): class VierVideosIE(InfoExtractor): IE_NAME = 'vier:videos' - _VALID_URL = r'https?://(?:www\.)?vier\.be/(?P[^/]+)/videos(?:\?.*\bpage=(?P\d+)|$)' + _VALID_URL = r'https?://(?:www\.)?(?Pvier|vijf)\.be/(?P[^/]+)/videos(?:\?.*\bpage=(?P\d+)|$)' _TESTS = [{ 'url': 'http://www.vier.be/demoestuin/videos', 'info_dict': { 'id': 'demoestuin', }, 'playlist_mincount': 153, + }, { + 'url': 'http://www.vijf.be/temptationisland/videos', + 'info_dict': { + 'id': 'temptationisland', + }, + 'playlist_mincount': 159, }, { 'url': 'http://www.vier.be/demoestuin/videos?page=6', 'info_dict': { @@ -92,6 +112,7 @@ class VierVideosIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) program = mobj.group('program') + site = mobj.group('site') page_id = mobj.group('page') if page_id: @@ -105,13 +126,13 @@ class VierVideosIE(InfoExtractor): entries = [] for current_page_id in itertools.count(start_page): current_page = self._download_webpage( - 'http://www.vier.be/%s/videos?page=%d' % (program, current_page_id), + 'http://www.%s.be/%s/videos?page=%d' % (site, program, current_page_id), program, 
'Downloading page %d' % (current_page_id + 1)) page_entries = [ - self.url_result('http://www.vier.be' + video_url, 'Vier') + self.url_result('http://www.' + site + '.be' + video_url, 'Vier') for video_url in re.findall( - r'

', current_page)] + r'', current_page)] entries.extend(page_entries) if page_id or '>Meer<' not in current_page: break diff --git a/youtube_dl/extractor/vrak.py b/youtube_dl/extractor/vrak.py new file mode 100644 index 0000000..daa247c --- /dev/null +++ b/youtube_dl/extractor/vrak.py @@ -0,0 +1,80 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .brightcove import BrightcoveNewIE +from ..utils import ( + int_or_none, + parse_age_limit, + smuggle_url, + unescapeHTML, +) + + +class VrakIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vrak\.tv/videos\?.*?\btarget=(?P[\d.]+)' + _TEST = { + 'url': 'http://www.vrak.tv/videos?target=1.2306782&filtre=emission&id=1.1806721', + 'info_dict': { + 'id': '5345661243001', + 'ext': 'mp4', + 'title': 'Obésité, film de hockey et Roseline Filion', + 'timestamp': 1488492126, + 'upload_date': '20170302', + 'uploader_id': '2890187628001', + 'creator': 'VRAK.TV', + 'age_limit': 8, + 'series': 'ALT (Actualité Légèrement Tordue)', + 'episode': 'Obésité, film de hockey et Roseline Filion', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + } + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/2890187628001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._html_search_regex( + r']+\bclass=["\']videoTitle["\'][^>]*>([^<]+)', + webpage, 'title', default=None) or self._og_search_title(webpage) + + content = self._parse_json( + self._search_regex( + r'data-player-options-content=(["\'])(?P{.+?})\1', + webpage, 'content', default='{}', group='content'), + video_id, transform_source=unescapeHTML) + + ref_id = content.get('refId') or self._search_regex( + r'refId":"([^&]+)"', webpage, 'ref id') + + brightcove_id = self._search_regex( + r'''(?x) + java\.lang\.String\s+value\s*=\s*["']brightcove\.article\.\d+\.%s + [^>]* + 
java\.lang\.String\s+value\s*=\s*["'](\d+) + ''' % re.escape(ref_id), webpage, 'brightcove id') + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'description': content.get('description'), + 'creator': content.get('brand'), + 'age_limit': parse_age_limit(content.get('rating')), + 'series': content.get('showName') or content.get( + 'episodeName'), # this is intentional + 'season_number': int_or_none(content.get('seasonNumber')), + 'episode': title, + 'episode_number': int_or_none(content.get('episodeNumber')), + 'tags': content.get('tags', []), + } diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 36a8c98..7b67037 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..utils import ( dict_get, + ExtractorError, int_or_none, parse_duration, unified_strdate, @@ -57,6 +58,10 @@ class XHamsterIE(InfoExtractor): }, { 'url': 'https://xhamster.com/movies/2272726/amber_slayed_by_the_knight.html', 'only_matching': True, + }, { + # This video is visible for marcoalfa123456's friends only + 'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -78,6 +83,12 @@ class XHamsterIE(InfoExtractor): mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo) webpage = self._download_webpage(mrss_url, video_id) + error = self._html_search_regex( + r']+id=["\']videoClosed["\'][^>]*>(.+?)', + webpage, 'error', default=None) + if error: + raise ExtractorError(error, expected=True) + title = self._html_search_regex( [r']*>([^<]+)

', r']+itemprop=".*?caption.*?"[^>]+content="(.+?)"', diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index dec0280..caa0482 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -47,7 +47,6 @@ from ..utils import ( unsmuggle_url, uppercase_escape, urlencode_postdata, - ISO3166Utils, ) @@ -371,6 +370,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } _SUBTITLE_FORMATS = ('ttml', 'vtt') + _GEO_BYPASS = False + IE_NAME = 'youtube' _TESTS = [ { @@ -917,7 +918,12 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # itag 212 'url': '1t24XAntNCY', 'only_matching': True, - } + }, + { + # geo restricted to JP + 'url': 'sJL6WA-aGkQ', + 'only_matching': True, + }, ] def __init__(self, *args, **kwargs): @@ -1376,11 +1382,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if 'token' not in video_info: if 'reason' in video_info: if 'The uploader has not made this video available in your country.' in video_info['reason']: - regions_allowed = self._html_search_meta('regionsAllowed', video_webpage, default=None) - if regions_allowed: - raise ExtractorError('YouTube said: This video is available in %s only' % ( - ', '.join(map(ISO3166Utils.short2full, regions_allowed.split(',')))), - expected=True) + regions_allowed = self._html_search_meta( + 'regionsAllowed', video_webpage, default=None) + countries = regions_allowed.split(',') if regions_allowed else None + self.raise_geo_restricted( + msg=video_info['reason'][0], countries=countries) raise ExtractorError( 'YouTube said: %s' % video_info['reason'][0], expected=True, video_id=video_id) @@ -1448,7 +1454,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # Check for "rental" videos if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info: - raise ExtractorError('"rental" videos not supported') + raise ExtractorError('"rental" videos not supported. 
See https://github.com/rg3/youtube-dl/issues/359 for more information.', expected=True) # Start extracting information self.report_information_extraction(video_id) @@ -1845,7 +1851,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): (?: youtube\.com/ (?: - (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/videoseries) + (?:course|view_play_list|my_playlists|artist|playlist|watch|embed/(?:videoseries|[0-9A-Za-z_-]{11})) \? (?:.*?[&;])*? (?:p|a|list)= | p/ )| @@ -1918,6 +1924,13 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): 'title': 'JODA15', 'id': 'PL6IaIsEjSbf96XFRuNccS_RuEXwNdsoEu', } + }, { + 'url': 'http://www.youtube.com/embed/_xDOZElKyNU?list=PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + 'playlist_mincount': 485, + 'info_dict': { + 'title': '2017 華語最新單曲 (2/24更新)', + 'id': 'PLsyOSbh5bs16vubvKePAQ1x3PhKavfBIl', + } }, { 'note': 'Embedded SWF player', 'url': 'https://www.youtube.com/p/YN5VISEtHet5D4NEvfTd0zcgFk84NqFZ?hl=en_US&fs=1&rel=0', @@ -2066,7 +2079,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): # Check if it's a video-specific URL query_dict = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) video_id = query_dict.get('v', [None])[0] or self._search_regex( - r'(?:^|//)youtu\.be/([0-9A-Za-z_-]{11})', url, + r'(?:(?:^|//)youtu\.be/|youtube\.com/embed/(?!videoseries))([0-9A-Za-z_-]{11})', url, 'video id', default=None) if video_id: if self._downloader.params.get('noplaylist'): @@ -2226,7 +2239,7 @@ class YoutubeUserIE(YoutubeChannelIE): 'url': 'https://www.youtube.com/gametrailers', 'only_matching': True, }, { - # This channel is not available. 
+ # This channel is not available, geo restricted to JP 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', 'only_matching': True, }] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 17b8379..d293c74 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -473,7 +473,8 @@ def timeconvert(timestr): def sanitize_filename(s, restricted=False, is_id=False): """Sanitizes a string so it could be used as part of a filename. If restricted is set, use a stricter subset of allowed characters. - Set is_id if this is not an arbitrary string, but an ID that should be kept if possible + Set is_id if this is not an arbitrary string, but an ID that should be kept + if possible. """ def replace_insane(char): if restricted and char in ACCENT_CHARS: @@ -1747,11 +1748,16 @@ def base_url(url): def urljoin(base, path): + if isinstance(path, bytes): + path = path.decode('utf-8') if not isinstance(path, compat_str) or not path: return None if re.match(r'^(?:https?:)?//', path): return path - if not isinstance(base, compat_str) or not re.match(r'^(?:https?:)?//', base): + if isinstance(base, bytes): + base = base.decode('utf-8') + if not isinstance(base, compat_str) or not re.match( + r'^(?:https?:)?//', base): return None return compat_urlparse.urljoin(base, path) @@ -3319,6 +3325,57 @@ class PerRequestProxyHandler(compat_urllib_request.ProxyHandler): self, req, proxy, type) +# Both long_to_bytes and bytes_to_long are adapted from PyCrypto, which is +# released into Public Domain +# https://github.com/dlitz/pycrypto/blob/master/lib/Crypto/Util/number.py#L387 + +def long_to_bytes(n, blocksize=0): + """long_to_bytes(n:long, blocksize:int) : string + Convert a long integer to a byte string. + + If optional blocksize is given and greater than zero, pad the front of the + byte string with binary zeros so that the length is a multiple of + blocksize. 
+ """ + # after much testing, this algorithm was deemed to be the fastest + s = b'' + n = int(n) + while n > 0: + s = compat_struct_pack('>I', n & 0xffffffff) + s + n = n >> 32 + # strip off leading zeros + for i in range(len(s)): + if s[i] != b'\000'[0]: + break + else: + # only happens when n == 0 + s = b'\000' + i = 0 + s = s[i:] + # add back some pad bytes. this could be done more efficiently w.r.t. the + # de-padding being done above, but sigh... + if blocksize > 0 and len(s) % blocksize: + s = (blocksize - len(s) % blocksize) * b'\000' + s + return s + + +def bytes_to_long(s): + """bytes_to_long(string) : long + Convert a byte string to a long integer. + + This is (essentially) the inverse of long_to_bytes(). + """ + acc = 0 + length = len(s) + if length % 4: + extra = (4 - length % 4) + s = b'\000' * extra + s + length = length + extra + for i in range(0, length, 4): + acc = (acc << 32) + compat_struct_unpack('>I', s[i:i + 4])[0] + return acc + + def ohdave_rsa_encrypt(data, exponent, modulus): ''' Implement OHDave's RSA algorithm. See http://www.ohdave.com/rsa/ @@ -3336,6 +3393,21 @@ def ohdave_rsa_encrypt(data, exponent, modulus): return '%x' % encrypted +def pkcs1pad(data, length): + """ + Padding input data with PKCS#1 scheme + + @param {int[]} data input data + @param {int} length target length + @returns {int[]} padded data + """ + if len(data) > length - 11: + raise ValueError('Input data too long for PKCS#1 padding') + + pseudo_random = [random.randint(0, 254) for _ in range(length - len(data) - 3)] + return [0, 2] + pseudo_random + [0] + data + + def encode_base_n(num, n, table=None): FULL_TABLE = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ' if not table: diff --git a/youtube_dl/version.py b/youtube_dl/version.py index fe7462e..bd451bf 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.02.24.1' +__version__ = '2017.03.07'