From: Rogério Brito Date: Fri, 16 Mar 2018 16:49:03 +0000 (-0300) Subject: Merge pull request #2 from nbraud/autoupdate X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/1270059fa7cf720bc0533c2cdfe6370923ae4e20?hp=1f17a37b9b95db09a420a1f52cf18723ce4eb8b5 Merge pull request #2 from nbraud/autoupdate Disable upstream's autoupdate mechanism. In a distribution, we use our own update mechanism to avoid multiple copies of a given program. Therefore, we disable the updates performed via unsupported means. Knowledgeable users will know what to do in any case. --- diff --git a/AUTHORS b/AUTHORS index 40215a5..6223212 100644 --- a/AUTHORS +++ b/AUTHORS @@ -233,3 +233,6 @@ Daniel Weber Kay Bouché Yang Hongbo Lei Wang +Petr Novák +Leonardo Taccari +Martin Weinelt diff --git a/ChangeLog b/ChangeLog index 00c5c9c..47736e0 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,169 @@ +version 2018.03.14 + +Extractors +* [soundcloud] Update client id (#15866) ++ [tennistv] Add support for tennistv.com ++ [line] Add support for tv.line.me (#9427) +* [xnxx] Fix extraction (#15817) +* [njpwworld] Fix authentication (#15815) + + +version 2018.03.10 + +Core +* [downloader/hls] Skip uplynk ad fragments (#15748) + +Extractors +* [pornhub] Don't override session cookies (#15697) ++ [raywenderlich] Add support for videos.raywenderlich.com (#15251) +* [funk] Fix extraction and rework extractors (#15792) +* [nexx] Restore reverse engineered approach ++ [heise] Add support for kaltura embeds (#14961, #15728) ++ [tvnow] Extract series metadata (#15774) +* [ruutu] Continue formats extraction on NOT-USED URLs (#15775) +* [vrtnu] Use redirect URL for building video JSON URL (#15767, #15769) +* [vimeo] Modernize login code and improve error messaging +* [archiveorg] Fix extraction (#15770, #15772) ++ [hidive] Add support for hidive.com (#15494) +* [afreecatv] Detect deleted videos +* [afreecatv] Fix extraction (#15755) +* [vice] Fix extraction and rework extractors (#11101, #13019, #13622, 
#13778) ++ [vidzi] Add support for vidzi.si (#15751) +* [npo] Fix typo + + +version 2018.03.03 + +Core ++ [utils] Add parse_resolution +Revert respect --prefer-insecure while updating + +Extractors ++ [yapfiles] Add support for yapfiles.ru (#15726, #11085) +* [spankbang] Fix formats extraction (#15727) +* [adn] Fix extraction (#15716) ++ [toggle] Extract DASH and ISM formats (#15721) ++ [nickelodeon] Add support for nickelodeon.com.tr (#15706) +* [npo] Validate and filter format URLs (#15709) + + +version 2018.02.26 + +Extractors +* [udemy] Use custom User-Agent (#15571) + + +version 2018.02.25 + +Core +* [postprocessor/embedthumbnail] Skip embedding when there aren't any + thumbnails (#12573) +* [extractor/common] Improve jwplayer subtitles extraction (#15695) + +Extractors ++ [vidlii] Add support for vidlii.com (#14472, #14512, #14779) ++ [streamango] Capture and output error messages +* [streamango] Fix extraction (#14160, #14256) ++ [telequebec] Add support for emissions (#14649, #14655) ++ [telequebec:live] Add support for live streams (#15688) ++ [mailru:music] Add support for mail.ru/music (#15618) +* [aenetworks] Switch to akamai HLS formats (#15612) +* [ytsearch] Fix flat title extraction (#11260, #15681) + + +version 2018.02.22 + +Core ++ [utils] Fixup some common URL typos in sanitize_url (#15649) +* Respect --prefer-insecure while updating (#15497) + +Extractors +* [vidio] Fix HLS URL extraction (#15675) ++ [nexx] Add support for arc.nexx.cloud URLs +* [nexx] Switch to arc API (#15652) +* [redtube] Fix duration extraction (#15659) ++ [sonyliv] Respect referrer (#15648) ++ [brightcove:new] Use referrer for formats' HTTP headers ++ [cbc] Add support for olympics.cbc.ca (#15535) ++ [fusion] Add support for fusion.tv (#15628) +* [npo] Improve quality metadata extraction +* [npo] Relax URL regular expression (#14987, #14994) ++ [npo] Capture and output error message ++ [pornhub] Add support for channels (#15613) +* [youtube] Handle shared URLs with generic 
extractor (#14303) + + +version 2018.02.11 + +Core ++ [YoutubeDL] Add support for filesize_approx in format selector (#15550) + +Extractors ++ [francetv] Add support for live streams (#13689) ++ [francetv] Add support for zouzous.fr and ludo.fr (#10454, #13087, #13103, + #15012) +* [francetv] Separate main extractor and rework others to delegate to it +* [francetv] Improve manifest URL signing (#15536) ++ [francetv] Sign m3u8 manifest URLs (#15565) ++ [veoh] Add support for embed URLs (#15561) +* [afreecatv] Fix extraction (#15556) +* [periscope] Use accessVideoPublic endpoint (#15554) +* [discovery] Fix auth request (#15542) ++ [6play] Extract subtitles (#15541) +* [newgrounds] Fix metadata extraction (#15531) ++ [nbc] Add support for stream.nbcolympics.com (#10295) +* [dvtv] Fix live streams extraction (#15442) + + +version 2018.02.08 + +Extractors ++ [myvi] Extend URL regular expression ++ [myvi:embed] Add support for myvi.tv embeds (#15521) ++ [prosiebensat1] Extend URL regular expression (#15520) +* [pokemon] Relax URL regular expression and extend title extraction (#15518) ++ [gameinformer] Use geo verification headers +* [la7] Fix extraction (#15501, #15502) +* [gameinformer] Fix brightcove id extraction (#15416) ++ [afreecatv] Pass referrer to video info request (#15507) ++ [telebruxelles] Add support for live streams +* [telebruxelles] Relax URL regular expression +* [telebruxelles] Fix extraction (#15504) +* [extractor/common] Respect secure schemes in _extract_wowza_formats + + +version 2018.02.04 + +Core +* [downloader/http] Randomize HTTP chunk size ++ [downloader/http] Add ability to pass downloader options via info dict +* [downloader/http] Fix 302 infinite loops by not reusing requests ++ Document http_chunk_size + +Extractors ++ [brightcove] Pass embed page URL as referrer (#15486) ++ [youtube] Enforce using chunked HTTP downloading for DASH formats + + +version 2018.02.03 + +Core ++ Introduce --http-chunk-size for chunk-based HTTP downloading ++ 
Add support for IronPython +* [downloader/ism] Fix Python 3.2 support + +Extractors +* [redbulltv] Fix extraction (#15481) +* [redtube] Fix metadata extraction (#15472) +* [pladform] Respect platform id and extract HLS formats (#15468) +- [rtlnl] Remove progressive formats (#15459) +* [6play] Do not modify asset URLs with a token (#15248) +* [nationalgeographic] Relax URL regular expression +* [dplay] Relax URL regular expression (#15458) +* [cbsinteractive] Fix data extraction (#15451) ++ [amcnetworks] Add support for sundancetv.com (#9260) + + version 2018.01.27 Core diff --git a/README.md b/README.md index eb05f84..7dba577 100644 --- a/README.md +++ b/README.md @@ -198,6 +198,11 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo size. By default, the buffer size is automatically resized from an initial value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) (default + is disabled). May be useful for bypassing + bandwidth throttling imposed by a webserver + (experimental) --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with diff --git a/README.txt b/README.txt index 54b6137..24959f0 100644 --- a/README.txt +++ b/README.txt @@ -227,6 +227,11 @@ Download Options: size. By default, the buffer size is automatically resized from an initial value of SIZE. + --http-chunk-size SIZE Size of a chunk for chunk-based HTTP + downloading (e.g. 10485760 or 10M) (default + is disabled). 
May be useful for bypassing + bandwidth throttling imposed by a webserver + (experimental) --playlist-reverse Download playlist videos in reverse order --playlist-random Download playlist videos in random order --xattr-set-filesize Set file xattribute ytdl.filesize with diff --git a/debian/changelog b/debian/changelog index 2b918d8..af41971 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,10 +1,24 @@ -youtube-dl (2018.01.27-1~1.gbpecaee9) UNRELEASED; urgency=medium +youtube-dl (2018.03.14-1~2.gbp941b2c) UNRELEASED; urgency=medium - ** SNAPSHOT build @ecaee9d66905db1d5836d396c705d18d6e5f1f72 ** + ** SNAPSHOT build @941b2c19f4769eabb3004accc12cf96965d4bca6 ** + [ Rogério Brito ] * New upstream version 2018.01.27 - -- Rogério Brito Wed, 31 Jan 2018 02:42:21 -0200 + [ Andreas Tille ] + * cme fix dpkg-control + * Moved packaging to salsa.debian.org + + [ Rogério Brito ] + * New upstream version 2018.03.14 + * debian/copyright: + + Update my copyright years. + * debian/{compat,control}: + + Reluctantly update compat to 11. + * debian/control: Add Recommends: python3-pyxattr. + Thanks to Mathieu Malaterre for the report. 
(Closes: #891446) + + -- Rogério Brito Fri, 16 Mar 2018 13:38:11 -0300 youtube-dl (2017.12.31-1) unstable; urgency=medium diff --git a/debian/compat b/debian/compat index f599e28..b4de394 100644 --- a/debian/compat +++ b/debian/compat @@ -1 +1 @@ -10 +11 diff --git a/debian/control b/debian/control index 0b7e153..06b3286 100644 --- a/debian/control +++ b/debian/control @@ -1,38 +1,33 @@ Source: youtube-dl +Maintainer: Rogério Brito Section: web Priority: optional -Maintainer: Rogério Brito -Build-Depends: - bash-completion, - debhelper (>= 10), - dh-exec, - dh-python, - pandoc, - python3-pkg-resources, - zip -Build-Depends-Indep: - python3 -X-Python3-Version: >= 3.2 +Build-Depends: bash-completion, + debhelper (>= 11), + dh-exec, + dh-python, + pandoc, + python3-pkg-resources, + zip +Build-Depends-Indep: python3 Standards-Version: 4.1.3 +Vcs-Browser: https://salsa.debian.org/debian/youtube-dl +Vcs-Git: https://salsa.debian.org/debian/youtube-dl.git Homepage: https://rg3.github.com/youtube-dl/ -Vcs-Git: https://github.com/rbrito/pkg-youtube-dl -Vcs-Browser: https://github.com/rbrito/pkg-youtube-dl +X-Python3-Version: >= 3.2 Package: youtube-dl Architecture: all -Pre-Depends: - dpkg (>= 1.15.7.2) -Depends: - python3-pkg-resources, - ${misc:Depends}, - ${python3:Depends} -Recommends: - aria2 | wget | curl, - ca-certificates, - ffmpeg | libav-tools, - mpv | mplayer, - phantomjs, - rtmpdump +Depends: python3-pkg-resources, + ${misc:Depends}, + ${python3:Depends} +Recommends: aria2 | wget | curl, + ca-certificates, + ffmpeg | libav-tools, + mpv | mplayer, + phantomjs, + python3-pyxattr, + rtmpdump Description: downloader of videos from YouTube and other sites youtube-dl is a small command-line program to download videos from YouTube.com and other sites that don't provide direct links to the diff --git a/debian/copyright b/debian/copyright index 0de8c2b..7362d3b 100644 --- a/debian/copyright +++ b/debian/copyright @@ -35,7 +35,7 @@ License: public-domain Files: debian/* 
Copyright: © 2006, Robert S. Edmonds . - © 2009-2015, Rogério Brito . + © 2009-2018, Rogério Brito . License: GPL-2+ This package is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/docs/supportedsites.md b/docs/supportedsites.md index c15b5ee..80358bb 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -135,6 +135,7 @@ - **CarambaTVPage** - **CartoonNetwork** - **cbc.ca** + - **cbc.ca:olympics** - **cbc.ca:player** - **cbc.ca:watch** - **cbc.ca:watch:video** @@ -189,7 +190,7 @@ - **CSpan**: C-SPAN - **CtsNews**: 華視新聞 - **CTVNews** - - **culturebox.francetvinfo.fr** + - **Culturebox** - **CultureUnplugged** - **curiositystream** - **curiositystream:collection** @@ -291,11 +292,14 @@ - **FranceTV** - **FranceTVEmbed** - **francetvinfo.fr** + - **FranceTVJeunesse** + - **FranceTVSite** - **Freesound** - **freespeech.org** - **FreshLive** - **Funimation** - - **Funk** + - **FunkChannel** + - **FunkMix** - **FunnyOrDie** - **Fusion** - **Fux** @@ -333,6 +337,7 @@ - **HentaiStigma** - **hetklokhuis** - **hgtv.com:show** + - **HiDive** - **HistoricFilms** - **history:topic**: History.com Topic - **hitbox** @@ -422,6 +427,7 @@ - **limelight** - **limelight:channel** - **limelight:channel_list** + - **LineTV** - **LiTV** - **LiveLeak** - **LiveLeakEmbed** @@ -437,6 +443,8 @@ - **m6** - **macgamestore**: MacGameStore trailers - **mailru**: Видео@Mail.Ru + - **mailru:music**: Музыка@Mail.Ru + - **mailru:music:search**: Музыка@Mail.Ru - **MakersChannel** - **MakerTV** - **mangomolo:live** @@ -502,6 +510,7 @@ - **MySpass** - **Myvi** - **MyVidster** + - **MyviEmbed** - **n-tv.de** - **natgeo** - **natgeo:episodeguide** @@ -510,7 +519,8 @@ - **NBA** - **NBC** - **NBCNews** - - **NBCOlympics** + - **nbcolympics** + - **nbcolympics:stream** - **NBCSports** - **NBCSportsVPlayer** - **ndr**: NDR.de - Norddeutscher Rundfunk @@ -667,6 +677,7 @@ - **RaiPlay** - **RaiPlayLive** - 
**RaiPlayPlaylist** + - **RayWenderlich** - **RBMARadio** - **RDS**: RDS.ca - **RedBullTV** @@ -815,8 +826,11 @@ - **Telegraaf** - **TeleMB** - **TeleQuebec** + - **TeleQuebecEmission** + - **TeleQuebecLive** - **TeleTask** - **Telewebion** + - **TennisTV** - **TF1** - **TFO** - **TheIntercept** @@ -925,7 +939,6 @@ - **vice** - **vice:article** - **vice:show** - - **Viceland** - **Vidbit** - **Viddler** - **Videa** @@ -941,6 +954,7 @@ - **VideoPress** - **videoweed**: VideoWeed - **Vidio** + - **VidLii** - **vidme** - **vidme:user** - **vidme:user:likes** @@ -1045,6 +1059,7 @@ - **yandexmusic:album**: Яндекс.Музыка - Альбом - **yandexmusic:playlist**: Яндекс.Музыка - Плейлист - **yandexmusic:track**: Яндекс.Музыка - Трек + - **YapFiles** - **YesJapan** - **yinyuetai:video**: 音悦Tai - **Ynet** diff --git a/setup.cfg b/setup.cfg index 2dc06ff..5208f7a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -3,4 +3,4 @@ universal = True [flake8] exclude = youtube_dl/extractor/__init__.py,devscripts/buildserver.py,devscripts/lazy_load_template.py,devscripts/make_issue_template.py,setup.py,build,.git -ignore = E402,E501,E731 +ignore = E402,E501,E731,E741 diff --git a/test/test_downloader_http.py b/test/test_downloader_http.py new file mode 100644 index 0000000..5cf2bf1 --- /dev/null +++ b/test/test_downloader_http.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python +# coding: utf-8 +from __future__ import unicode_literals + +# Allow direct execution +import os +import re +import sys +import unittest +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from test.helper import try_rm +from youtube_dl import YoutubeDL +from youtube_dl.compat import compat_http_server +from youtube_dl.downloader.http import HttpFD +from youtube_dl.utils import encodeFilename +import ssl +import threading + +TEST_DIR = os.path.dirname(os.path.abspath(__file__)) + + +def http_server_port(httpd): + if os.name == 'java' and isinstance(httpd.socket, ssl.SSLSocket): + # In Jython 
SSLSocket is not a subclass of socket.socket + sock = httpd.socket.sock + else: + sock = httpd.socket + return sock.getsockname()[1] + + +TEST_SIZE = 10 * 1024 + + +class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def send_content_range(self, total=None): + range_header = self.headers.get('Range') + start = end = None + if range_header: + mobj = re.search(r'^bytes=(\d+)-(\d+)', range_header) + if mobj: + start = int(mobj.group(1)) + end = int(mobj.group(2)) + valid_range = start is not None and end is not None + if valid_range: + content_range = 'bytes %d-%d' % (start, end) + if total: + content_range += '/%d' % total + self.send_header('Content-Range', content_range) + return (end - start + 1) if valid_range else total + + def serve(self, range=True, content_length=True): + self.send_response(200) + self.send_header('Content-Type', 'video/mp4') + size = TEST_SIZE + if range: + size = self.send_content_range(TEST_SIZE) + if content_length: + self.send_header('Content-Length', size) + self.end_headers() + self.wfile.write(b'#' * size) + + def do_GET(self): + if self.path == '/regular': + self.serve() + elif self.path == '/no-content-length': + self.serve(content_length=False) + elif self.path == '/no-range': + self.serve(range=False) + elif self.path == '/no-range-no-content-length': + self.serve(range=False, content_length=False) + else: + assert False + + +class FakeLogger(object): + def debug(self, msg): + pass + + def warning(self, msg): + pass + + def error(self, msg): + pass + + +class TestHttpFD(unittest.TestCase): + def setUp(self): + self.httpd = compat_http_server.HTTPServer( + ('127.0.0.1', 0), HTTPTestRequestHandler) + self.port = http_server_port(self.httpd) + self.server_thread = threading.Thread(target=self.httpd.serve_forever) + self.server_thread.daemon = True + self.server_thread.start() + + def download(self, params, ep): + params['logger'] = FakeLogger() + ydl = 
YoutubeDL(params) + downloader = HttpFD(ydl, params) + filename = 'testfile.mp4' + try_rm(encodeFilename(filename)) + self.assertTrue(downloader.real_download(filename, { + 'url': 'http://127.0.0.1:%d/%s' % (self.port, ep), + })) + self.assertEqual(os.path.getsize(encodeFilename(filename)), TEST_SIZE) + try_rm(encodeFilename(filename)) + + def download_all(self, params): + for ep in ('regular', 'no-content-length', 'no-range', 'no-range-no-content-length'): + self.download(params, ep) + + def test_regular(self): + self.download_all({}) + + def test_chunked(self): + self.download_all({ + 'http_chunk_size': 1000, + }) + + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_http.py b/test/test_http.py index 7a7a351..409fec9 100644 --- a/test/test_http.py +++ b/test/test_http.py @@ -47,7 +47,7 @@ class HTTPTestRequestHandler(compat_http_server.BaseHTTPRequestHandler): self.end_headers() return - new_url = 'http://localhost:%d/中文.html' % http_server_port(self.server) + new_url = 'http://127.0.0.1:%d/中文.html' % http_server_port(self.server) self.send_response(302) self.send_header(b'Location', new_url.encode('utf-8')) self.end_headers() @@ -74,7 +74,7 @@ class FakeLogger(object): class TestHTTP(unittest.TestCase): def setUp(self): self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.port = http_server_port(self.httpd) self.server_thread = threading.Thread(target=self.httpd.serve_forever) self.server_thread.daemon = True @@ -86,15 +86,15 @@ class TestHTTP(unittest.TestCase): return ydl = YoutubeDL({'logger': FakeLogger()}) - r = ydl.extract_info('http://localhost:%d/302' % self.port) - self.assertEqual(r['entries'][0]['url'], 'http://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('http://127.0.0.1:%d/302' % self.port) + self.assertEqual(r['entries'][0]['url'], 'http://127.0.0.1:%d/vid.mp4' % self.port) class TestHTTPS(unittest.TestCase): def setUp(self): certfn 
= os.path.join(TEST_DIR, 'testcert.pem') self.httpd = compat_http_server.HTTPServer( - ('localhost', 0), HTTPTestRequestHandler) + ('127.0.0.1', 0), HTTPTestRequestHandler) self.httpd.socket = ssl.wrap_socket( self.httpd.socket, certfile=certfn, server_side=True) self.port = http_server_port(self.httpd) @@ -107,11 +107,11 @@ class TestHTTPS(unittest.TestCase): ydl = YoutubeDL({'logger': FakeLogger()}) self.assertRaises( Exception, - ydl.extract_info, 'https://localhost:%d/video.html' % self.port) + ydl.extract_info, 'https://127.0.0.1:%d/video.html' % self.port) ydl = YoutubeDL({'logger': FakeLogger(), 'nocheckcertificate': True}) - r = ydl.extract_info('https://localhost:%d/video.html' % self.port) - self.assertEqual(r['entries'][0]['url'], 'https://localhost:%d/vid.mp4' % self.port) + r = ydl.extract_info('https://127.0.0.1:%d/video.html' % self.port) + self.assertEqual(r['entries'][0]['url'], 'https://127.0.0.1:%d/vid.mp4' % self.port) def _build_proxy_handler(name): @@ -132,23 +132,23 @@ def _build_proxy_handler(name): class TestProxy(unittest.TestCase): def setUp(self): self.proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('normal')) + ('127.0.0.1', 0), _build_proxy_handler('normal')) self.port = http_server_port(self.proxy) self.proxy_thread = threading.Thread(target=self.proxy.serve_forever) self.proxy_thread.daemon = True self.proxy_thread.start() self.geo_proxy = compat_http_server.HTTPServer( - ('localhost', 0), _build_proxy_handler('geo')) + ('127.0.0.1', 0), _build_proxy_handler('geo')) self.geo_port = http_server_port(self.geo_proxy) self.geo_proxy_thread = threading.Thread(target=self.geo_proxy.serve_forever) self.geo_proxy_thread.daemon = True self.geo_proxy_thread.start() def test_proxy(self): - geo_proxy = 'localhost:{0}'.format(self.geo_port) + geo_proxy = '127.0.0.1:{0}'.format(self.geo_port) ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), 
'geo_verification_proxy': geo_proxy, }) url = 'http://foo.com/bar' @@ -162,7 +162,7 @@ class TestProxy(unittest.TestCase): def test_proxy_with_idn(self): ydl = YoutubeDL({ - 'proxy': 'localhost:{0}'.format(self.port), + 'proxy': '127.0.0.1:{0}'.format(self.port), }) url = 'http://中文.tw/' response = ydl.urlopen(url).read().decode('utf-8') diff --git a/test/test_utils.py b/test/test_utils.py index fdf6031..a1fe6fd 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -53,10 +53,12 @@ from youtube_dl.utils import ( parse_filesize, parse_count, parse_iso8601, + parse_resolution, pkcs1pad, read_batch_urls, sanitize_filename, sanitize_path, + sanitize_url, expand_path, prepend_extension, replace_extension, @@ -219,6 +221,12 @@ class TestUtil(unittest.TestCase): self.assertEqual(sanitize_path('./abc'), 'abc') self.assertEqual(sanitize_path('./../abc'), '..\\abc') + def test_sanitize_url(self): + self.assertEqual(sanitize_url('//foo.bar'), 'http://foo.bar') + self.assertEqual(sanitize_url('httpss://foo.bar'), 'https://foo.bar') + self.assertEqual(sanitize_url('rmtps://foo.bar'), 'rtmps://foo.bar') + self.assertEqual(sanitize_url('https://foo.bar'), 'https://foo.bar') + def test_expand_path(self): def env(var): return '%{0}%'.format(var) if sys.platform == 'win32' else '${0}'.format(var) @@ -344,6 +352,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361) self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) + self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') @@ -975,6 +984,16 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_count('1.1kk '), 1100000) self.assertEqual(parse_count('1.1kk views'), 1100000) + def test_parse_resolution(self): + 
self.assertEqual(parse_resolution(None), {}) + self.assertEqual(parse_resolution(''), {}) + self.assertEqual(parse_resolution('1920x1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920×1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('1920 x 1080'), {'width': 1920, 'height': 1080}) + self.assertEqual(parse_resolution('720p'), {'height': 720}) + self.assertEqual(parse_resolution('4k'), {'height': 2160}) + self.assertEqual(parse_resolution('8K'), {'height': 4320}) + def test_version_tuple(self): self.assertEqual(version_tuple('1'), (1,)) self.assertEqual(version_tuple('10.23.344'), (10, 23, 344)) diff --git a/youtube-dl b/youtube-dl index fb4e30d..56daa4b 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube-dl.1 b/youtube-dl.1 index 3800a96..b859b1d 100644 --- a/youtube-dl.1 +++ b/youtube-dl.1 @@ -331,6 +331,14 @@ value of SIZE. .RS .RE .TP +.B \-\-http\-chunk\-size \f[I]SIZE\f[] +Size of a chunk for chunk\-based HTTP downloading (e.g. +10485760 or 10M) (default is disabled). 
+May be useful for bypassing bandwidth throttling imposed by a webserver +(experimental) +.RS +.RE +.TP .B \-\-playlist\-reverse Download playlist videos in reverse order .RS diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion index 2cf75de..c1b86f2 100644 --- a/youtube-dl.bash-completion +++ b/youtube-dl.bash-completion @@ -4,7 +4,7 @@ __youtube_dl() COMPREPLY=() cur="${COMP_WORDS[COMP_CWORD]}" prev="${COMP_WORDS[COMP_CWORD-1]}" - opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size --no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format 
--dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs" + opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size --no-resize-buffer --http-chunk-size --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id 
--output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs" keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory" fileopts="-a|--batch-file|--download-archive|--cookies|--load-info" diropts="--cache-dir" diff --git a/youtube-dl.fish b/youtube-dl.fish index 00d1845..aa7c59b 100644 --- a/youtube-dl.fish +++ b/youtube-dl.fish @@ -52,6 +52,7 @@ complete --command youtube-dl --long-option abort-on-unavailable-fragment --desc complete --command youtube-dl --long-option keep-fragments --description 'Keep downloaded fragments on disk after downloading is finished; fragments are erased by 
default' complete --command youtube-dl --long-option buffer-size --description 'Size of download buffer (e.g. 1024 or 16K) (default is %default)' complete --command youtube-dl --long-option no-resize-buffer --description 'Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.' +complete --command youtube-dl --long-option http-chunk-size --description 'Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)' complete --command youtube-dl --long-option test complete --command youtube-dl --long-option playlist-reverse --description 'Download playlist videos in reverse order' complete --command youtube-dl --long-option playlist-random --description 'Download playlist videos in random order' diff --git a/youtube-dl.zsh b/youtube-dl.zsh index 2d670ee..1f573a5 100644 --- a/youtube-dl.zsh +++ b/youtube-dl.zsh @@ -19,7 +19,7 @@ __youtube_dl() { elif [[ ${prev} == "--recode-video" ]]; then _arguments '*: :(mp4 flv ogg webm mkv)' else - _arguments '*: :(--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size 
--no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs)' + _arguments '*: :(--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 
--geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size --no-resize-buffer --http-chunk-size --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video 
--no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs)' fi ;; esac diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 97bd9c5..523dd1f 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -298,7 +298,8 @@ class YoutubeDL(object): the downloader (see youtube_dl/downloader/common.py): nopart, updatetime, buffersize, ratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, continuedl, noprogress, consoletitle, - xattr_set_filesize, external_downloader_args, hls_use_mpegts. + xattr_set_filesize, external_downloader_args, hls_use_mpegts, + http_chunk_size. The following options are used by the post processors: prefer_ffmpeg: If True, use ffmpeg instead of avconv if both are available, @@ -1032,7 +1033,7 @@ class YoutubeDL(object): '!=': operator.ne, } operator_rex = re.compile(r'''(?x)\s* - (?Pwidth|height|tbr|abr|vbr|asr|filesize|fps) + (?Pwidth|height|tbr|abr|vbr|asr|filesize|filesize_approx|fps) \s*(?P%s)(?P\s*\?)?\s* (?P[0-9.]+(?:[kKmMgGtTpPeEzZyY]i?[Bb]?)?) 
$ diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index ba684a0..9bb9524 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -191,6 +191,11 @@ def _real_main(argv=None): if numeric_buffersize is None: parser.error('invalid buffer size specified') opts.buffersize = numeric_buffersize + if opts.http_chunk_size is not None: + numeric_chunksize = FileDownloader.parse_bytes(opts.http_chunk_size) + if not numeric_chunksize: + parser.error('invalid http chunk size specified') + opts.http_chunk_size = numeric_chunksize if opts.playliststart <= 0: raise ValueError('Playlist start must be positive') if opts.playlistend not in (-1, None) and opts.playlistend < opts.playliststart: @@ -346,6 +351,7 @@ def _real_main(argv=None): 'keep_fragments': opts.keep_fragments, 'buffersize': opts.buffersize, 'noresizebuffer': opts.noresizebuffer, + 'http_chunk_size': opts.http_chunk_size, 'continuedl': opts.continue_dl, 'noprogress': opts.noprogress, 'progress_with_newline': opts.progress_with_newline, diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 646c9d7..4a611f1 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -2897,9 +2897,24 @@ except TypeError: if isinstance(spec, compat_str): spec = spec.encode('ascii') return struct.unpack(spec, *args) + + class compat_Struct(struct.Struct): + def __init__(self, fmt): + if isinstance(fmt, compat_str): + fmt = fmt.encode('ascii') + super(compat_Struct, self).__init__(fmt) else: compat_struct_pack = struct.pack compat_struct_unpack = struct.unpack + if platform.python_implementation() == 'IronPython' and sys.version_info < (2, 7, 8): + class compat_Struct(struct.Struct): + def unpack(self, string): + if not isinstance(string, buffer): # noqa: F821 + string = buffer(string) # noqa: F821 + return super(compat_Struct, self).unpack(string) + else: + compat_Struct = struct.Struct + try: from future_builtins import zip as compat_zip @@ -2941,6 +2956,7 @@ __all__ = [ 'compat_HTMLParseError', 
'compat_HTMLParser', 'compat_HTTPError', + 'compat_Struct', 'compat_b64decode', 'compat_basestring', 'compat_chr', diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 75b8166..cc16bbb 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -49,6 +49,9 @@ class FileDownloader(object): external_downloader_args: A list of additional command-line arguments for the external downloader. hls_use_mpegts: Use the mpegts container for HLS videos. + http_chunk_size: Size of a chunk for chunk-based HTTP downloading. May be + useful for bypassing bandwidth throttling imposed by + a webserver (experimental) Subclasses of this one must re-define the real_download method. """ diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 4dc3ab4..fd30452 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -75,8 +75,9 @@ class HlsFD(FragmentFD): fd.add_progress_hook(ph) return fd.real_download(filename, info_dict) - def anvato_ad(s): - return s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s + def is_ad_fragment(s): + return (s.startswith('#ANVATO-SEGMENT-INFO') and 'type=ad' in s or + s.startswith('#UPLYNK-SEGMENT') and s.endswith(',ad')) media_frags = 0 ad_frags = 0 @@ -86,7 +87,7 @@ class HlsFD(FragmentFD): if not line: continue if line.startswith('#'): - if anvato_ad(line): + if is_ad_fragment(line): ad_frags += 1 ad_frag_next = True continue @@ -195,7 +196,7 @@ class HlsFD(FragmentFD): 'start': sub_range_start, 'end': sub_range_start + int(splitted_byte_range[0]), } - elif anvato_ad(line): + elif is_ad_fragment(line): ad_frag_next = True self._finish_frag_download(ctx) diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py index 3ff26ff..a22875f 100644 --- a/youtube_dl/downloader/http.py +++ b/youtube_dl/downloader/http.py @@ -4,13 +4,18 @@ import errno import os import socket import time +import random import re from .common import 
FileDownloader -from ..compat import compat_urllib_error +from ..compat import ( + compat_str, + compat_urllib_error, +) from ..utils import ( ContentTooShortError, encodeFilename, + int_or_none, sanitize_open, sanitized_Request, write_xattr, @@ -38,21 +43,26 @@ class HttpFD(FileDownloader): add_headers = info_dict.get('http_headers') if add_headers: headers.update(add_headers) - basic_request = sanitized_Request(url, None, headers) - request = sanitized_Request(url, None, headers) is_test = self.params.get('test', False) - - if is_test: - request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1)) + chunk_size = self._TEST_FILE_SIZE if is_test else ( + info_dict.get('downloader_options', {}).get('http_chunk_size') or + self.params.get('http_chunk_size') or 0) ctx.open_mode = 'wb' ctx.resume_len = 0 + ctx.data_len = None + ctx.block_size = self.params.get('buffersize', 1024) + ctx.start_time = time.time() + ctx.chunk_size = None if self.params.get('continuedl', True): # Establish possible resume length if os.path.isfile(encodeFilename(ctx.tmpfilename)): - ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename)) + ctx.resume_len = os.path.getsize( + encodeFilename(ctx.tmpfilename)) + + ctx.is_resume = ctx.resume_len > 0 count = 0 retries = self.params.get('retries', 0) @@ -64,11 +74,36 @@ class HttpFD(FileDownloader): def __init__(self, source_error): self.source_error = source_error + class NextFragment(Exception): + pass + + def set_range(req, start, end): + range_header = 'bytes=%d-' % start + if end: + range_header += compat_str(end) + req.add_header('Range', range_header) + def establish_connection(): - if ctx.resume_len != 0: - self.report_resuming_byte(ctx.resume_len) - request.add_header('Range', 'bytes=%d-' % ctx.resume_len) + ctx.chunk_size = (random.randint(int(chunk_size * 0.95), chunk_size) + if not is_test and chunk_size else chunk_size) + if ctx.resume_len > 0: + range_start = ctx.resume_len + if ctx.is_resume: + 
self.report_resuming_byte(ctx.resume_len) ctx.open_mode = 'ab' + elif ctx.chunk_size > 0: + range_start = 0 + else: + range_start = None + ctx.is_resume = False + range_end = range_start + ctx.chunk_size - 1 if ctx.chunk_size else None + if range_end and ctx.data_len is not None and range_end >= ctx.data_len: + range_end = ctx.data_len - 1 + has_range = range_start is not None + ctx.has_range = has_range + request = sanitized_Request(url, None, headers) + if has_range: + set_range(request, range_start, range_end) # Establish connection try: ctx.data = self.ydl.urlopen(request) @@ -77,29 +112,40 @@ class HttpFD(FileDownloader): # that don't support resuming and serve a whole file with no Content-Range # set in response despite of requested Range (see # https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799) - if ctx.resume_len > 0: + if has_range: content_range = ctx.data.headers.get('Content-Range') if content_range: - content_range_m = re.search(r'bytes (\d+)-', content_range) + content_range_m = re.search(r'bytes (\d+)-(\d+)?(?:/(\d+))?', content_range) # Content-Range is present and matches requested Range, resume is possible - if content_range_m and ctx.resume_len == int(content_range_m.group(1)): - return + if content_range_m: + if range_start == int(content_range_m.group(1)): + content_range_end = int_or_none(content_range_m.group(2)) + content_len = int_or_none(content_range_m.group(3)) + accept_content_len = ( + # Non-chunked download + not ctx.chunk_size or + # Chunked download and requested piece or + # its part is promised to be served + content_range_end == range_end or + content_len < range_end) + if accept_content_len: + ctx.data_len = content_len + return # Content-Range is either not present or invalid. 
Assuming remote webserver is # trying to send the whole file, resume is not possible, so wiping the local file # and performing entire redownload self.report_unable_to_resume() ctx.resume_len = 0 ctx.open_mode = 'wb' + ctx.data_len = int_or_none(ctx.data.info().get('Content-length', None)) return except (compat_urllib_error.HTTPError, ) as err: - if (err.code < 500 or err.code >= 600) and err.code != 416: - # Unexpected HTTP error - raise - elif err.code == 416: + if err.code == 416: # Unable to resume (requested range not satisfiable) try: # Open the connection again without the range header - ctx.data = self.ydl.urlopen(basic_request) + ctx.data = self.ydl.urlopen( + sanitized_Request(url, None, headers)) content_length = ctx.data.info()['Content-Length'] except (compat_urllib_error.HTTPError, ) as err: if err.code < 500 or err.code >= 600: @@ -130,6 +176,9 @@ class HttpFD(FileDownloader): ctx.resume_len = 0 ctx.open_mode = 'wb' return + elif err.code < 500 or err.code >= 600: + # Unexpected HTTP error + raise raise RetryDownload(err) except socket.error as err: if err.errno != errno.ECONNRESET: @@ -160,7 +209,7 @@ class HttpFD(FileDownloader): return False byte_counter = 0 + ctx.resume_len - block_size = self.params.get('buffersize', 1024) + block_size = ctx.block_size start = time.time() # measure time over whole while-loop, so slow_down() and best_block_size() work together properly @@ -233,25 +282,30 @@ class HttpFD(FileDownloader): # Progress message speed = self.calc_speed(start, now, byte_counter - ctx.resume_len) - if data_len is None: + if ctx.data_len is None: eta = None else: - eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len) + eta = self.calc_eta(start, time.time(), ctx.data_len - ctx.resume_len, byte_counter - ctx.resume_len) self._hook_progress({ 'status': 'downloading', 'downloaded_bytes': byte_counter, - 'total_bytes': data_len, + 'total_bytes': ctx.data_len, 'tmpfilename': ctx.tmpfilename, 
'filename': ctx.filename, 'eta': eta, 'speed': speed, - 'elapsed': now - start, + 'elapsed': now - ctx.start_time, }) if is_test and byte_counter == data_len: break + if not is_test and ctx.chunk_size and ctx.data_len is not None and byte_counter < ctx.data_len: + ctx.resume_len = byte_counter + # ctx.block_size = block_size + raise NextFragment() + if ctx.stream is None: self.to_stderr('\n') self.report_error('Did not get any data blocks') @@ -276,7 +330,7 @@ class HttpFD(FileDownloader): 'total_bytes': byte_counter, 'filename': ctx.filename, 'status': 'finished', - 'elapsed': time.time() - start, + 'elapsed': time.time() - ctx.start_time, }) return True @@ -290,6 +344,8 @@ class HttpFD(FileDownloader): if count <= retries: self.report_retry(e.source_error, count, retries) continue + except NextFragment: + continue except SucceedDownload: return True diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py index 9b001ec..063fcf4 100644 --- a/youtube_dl/downloader/ism.py +++ b/youtube_dl/downloader/ism.py @@ -1,25 +1,27 @@ from __future__ import unicode_literals import time -import struct import binascii import io from .fragment import FragmentFD -from ..compat import compat_urllib_error +from ..compat import ( + compat_Struct, + compat_urllib_error, +) -u8 = struct.Struct(b'>B') -u88 = struct.Struct(b'>Bx') -u16 = struct.Struct(b'>H') -u1616 = struct.Struct(b'>Hxx') -u32 = struct.Struct(b'>I') -u64 = struct.Struct(b'>Q') +u8 = compat_Struct('>B') +u88 = compat_Struct('>Bx') +u16 = compat_Struct('>H') +u1616 = compat_Struct('>Hxx') +u32 = compat_Struct('>I') +u64 = compat_Struct('>Q') -s88 = struct.Struct(b'>bx') -s16 = struct.Struct(b'>h') -s1616 = struct.Struct(b'>hxx') -s32 = struct.Struct(b'>i') +s88 = compat_Struct('>bx') +s16 = compat_Struct('>h') +s1616 = compat_Struct('>hxx') +s32 = compat_Struct('>i') unity_matrix = (s32.pack(0x10000) + s32.pack(0) * 3) * 2 + s32.pack(0x40000000) @@ -139,7 +141,7 @@ def write_piff_header(stream, params): 
sample_entry_payload += u16.pack(0x18) # depth sample_entry_payload += s16.pack(-1) # pre defined - codec_private_data = binascii.unhexlify(params['codec_private_data']) + codec_private_data = binascii.unhexlify(params['codec_private_data'].encode('utf-8')) if fourcc in ('H264', 'AVC1'): sps, pps = codec_private_data.split(u32.pack(1))[1:] avcc_payload = u8.pack(1) # configuration version diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py index f770fe9..cd29aca 100644 --- a/youtube_dl/extractor/abcnews.py +++ b/youtube_dl/extractor/abcnews.py @@ -66,7 +66,7 @@ class AbcNewsIE(InfoExtractor): _TESTS = [{ 'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY', 'info_dict': { - 'id': '10498713', + 'id': '10505354', 'ext': 'flv', 'display_id': 'dramatic-video-rare-death-job-america', 'title': 'Occupational Hazards', @@ -79,7 +79,7 @@ class AbcNewsIE(InfoExtractor): }, { 'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818', 'info_dict': { - 'id': '39125818', + 'id': '38897857', 'ext': 'mp4', 'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016', 'title': 'Justin Timberlake Drops Hints For Secret Single', diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py index 64fb755..041c61a 100644 --- a/youtube_dl/extractor/adn.py +++ b/youtube_dl/extractor/adn.py @@ -51,7 +51,7 @@ class ADNIE(InfoExtractor): # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( bytes_to_intlist(compat_b64decode(enc_subtitles[24:])), - bytes_to_intlist(b'\x1b\xe0\x29\x61\x38\x94\x24\x00\x12\xbd\xc5\x80\xac\xce\xbe\xb0'), + bytes_to_intlist(b'\xc8\x6e\x06\xbc\xbe\xc6\x49\xf5\x88\x0d\xc8\x47\xc4\x27\x0c\x60'), bytes_to_intlist(compat_b64decode(enc_subtitles[:24])) )) subtitles_json = self._parse_json( @@ -107,15 +107,18 @@ class 
ADNIE(InfoExtractor): options = player_config.get('options') or {} metas = options.get('metas') or {} - title = metas.get('title') or video_info['title'] links = player_config.get('links') or {} + sub_path = player_config.get('subtitles') error = None if not links: - links_url = player_config['linksurl'] + links_url = player_config.get('linksurl') or options['videoUrl'] links_data = self._download_json(urljoin( self._BASE_URL, links_url), video_id) links = links_data.get('links') or {} + metas = metas or links_data.get('meta') or {} + sub_path = sub_path or links_data.get('subtitles') error = links_data.get('error') + title = metas.get('title') or video_info['title'] formats = [] for format_id, qualities in links.items(): @@ -146,7 +149,7 @@ class ADNIE(InfoExtractor): 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), 'thumbnail': video_info.get('image'), 'formats': formats, - 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id), + 'subtitles': self.extract_subtitles(sub_path, video_id), 'episode': metas.get('subtitle') or video_info.get('videoTitle'), 'series': video_info.get('playlistTitle'), } diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index da1b566..398e56e 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -122,7 +122,8 @@ class AENetworksIE(AENetworksBaseIE): query = { 'mbr': 'true', - 'assetTypes': 'high_video_s3' + 'assetTypes': 'high_video_ak', + 'switch': 'hls_high_ak', } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index 513dd81..df2a3fc 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -175,10 +175,27 @@ class AfreecaTVIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + 
if re.search(r'alert\(["\']This video has been deleted', webpage): + raise ExtractorError( + 'Video %s has been deleted' % video_id, expected=True) + + station_id = self._search_regex( + r'nStationNo\s*=\s*(\d+)', webpage, 'station') + bbs_id = self._search_regex( + r'nBbsNo\s*=\s*(\d+)', webpage, 'bbs') + video_id = self._search_regex( + r'nTitleNo\s*=\s*(\d+)', webpage, 'title', default=video_id) + video_xml = self._download_xml( 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, query={ + video_id, headers={ + 'Referer': 'http://vod.afreecatv.com/embed.php', + }, query={ 'nTitleNo': video_id, + 'nStationNo': station_id, + 'nBbsNo': bbs_id, 'partialView': 'SKIP_ADULT', }) @@ -187,10 +204,10 @@ class AfreecaTVIE(InfoExtractor): raise ExtractorError( '%s said: %s' % (self.IE_NAME, flag), expected=True) - video_element = video_xml.findall(compat_xpath('./track/video'))[1] + video_element = video_xml.findall(compat_xpath('./track/video'))[-1] if video_element is None or video_element.text is None: - raise ExtractorError('Specified AfreecaTV video does not exist', - expected=True) + raise ExtractorError( + 'Video %s video does not exist' % video_id, expected=True) video_url = video_element.text.strip() diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py index dd3b18d..6fb3d6c 100644 --- a/youtube_dl/extractor/amcnetworks.py +++ b/youtube_dl/extractor/amcnetworks.py @@ -11,7 +11,7 @@ from ..utils import ( class AMCNetworksIE(ThePlatformIE): - _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|(?:we|sundance)tv)\.com/(?:movies|shows(?:/[^/]+)+)/(?P[^/?#]+)' _TESTS = [{ 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', 'md5': '', @@ -51,6 +51,9 @@ class AMCNetworksIE(ThePlatformIE): }, { 'url': 
'http://www.wetv.com/shows/la-hair/videos/season-05/episode-09-episode-9-2/episode-9-sneak-peek-3', 'only_matching': True, + }, { + 'url': 'https://www.sundancetv.com/shows/riviera/full-episodes/season-1/episode-01-episode-1', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 3c7d725..c79c58e 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -41,7 +41,7 @@ class ArchiveOrgIE(InfoExtractor): webpage = self._download_webpage( 'http://archive.org/embed/' + video_id, video_id) jwplayer_playlist = self._parse_json(self._search_regex( - r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);", + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\)", webpage, 'jwplayer playlist'), video_id) info = self._parse_jwplayer_data( {'playlist': jwplayer_playlist}, video_id, base_url=url) diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index ef73d5a..86951d9 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -24,57 +24,30 @@ class ARDMediathekIE(InfoExtractor): _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
_TESTS = [{ - 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', + # available till 26.07.2022 + 'url': 'http://www.ardmediathek.de/tv/S%C3%9CDLICHT/Was-ist-die-Kunst-der-Zukunft-liebe-Ann/BR-Fernsehen/Video?bcastId=34633636&documentId=44726822', 'info_dict': { - 'id': '29582122', + 'id': '44726822', 'ext': 'mp4', - 'title': 'Ich liebe das Leben trotzdem', - 'description': 'md5:45e4c225c72b27993314b31a84a5261c', - 'duration': 4557, + 'title': 'Was ist die Kunst der Zukunft, liebe Anna McCarthy?', + 'description': 'md5:4ada28b3e3b5df01647310e41f3a62f5', + 'duration': 1740, }, 'params': { # m3u8 download 'skip_download': True, - }, - 'skip': 'HTTP Error 404: Not Found', - }, { - 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', - 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', - 'info_dict': { - 'id': '29522730', - 'ext': 'mp4', - 'title': 'Tatort: Scheinwelten - Hörfassung (Video tgl. 
ab 20 Uhr)', - 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', - 'duration': 5252, - }, - 'skip': 'HTTP Error 404: Not Found', + } }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', - 'md5': '219d94d8980b4f538c7fcb0865eb7f2c', - 'info_dict': { - 'id': '28488308', - 'ext': 'mp3', - 'title': 'Tod eines Fußballers', - 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', - 'duration': 3240, - }, - 'skip': 'HTTP Error 404: Not Found', + 'only_matching': True, }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, }, { # audio 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', - 'md5': '4e8f00631aac0395fee17368ac0e9867', - 'info_dict': { - 'id': '30796318', - 'ext': 'mp3', - 'title': 'Vor dem Fest', - 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', - 'duration': 3287, - }, - 'skip': 'Video is no longer available', + 'only_matching': True, }] def _extract_media_info(self, media_info_url, webpage, video_id): @@ -252,20 +225,23 @@ class ARDMediathekIE(InfoExtractor): class ARDIE(InfoExtractor): _VALID_URL = r'(?Phttps?://(www\.)?daserste\.de/[^?#]+/videos/(?P[^/?#]+)-(?P[0-9]+))\.html' - _TEST = { - 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', - 'md5': 'd216c3a86493f9322545e045ddc3eb35', + _TESTS = [{ + # available till 14.02.2019 + 'url': 'http://www.daserste.de/information/talk/maischberger/videos/das-groko-drama-zerlegen-sich-die-volksparteien-video-102.html', + 'md5': '8e4ec85f31be7c7fc08a26cdbc5a1f49', 'info_dict': { - 'display_id': 'die-story-im-ersten-mission-unter-falscher-flagge', - 'id': '100', + 'display_id': 
'das-groko-drama-zerlegen-sich-die-volksparteien-video', + 'id': '102', 'ext': 'mp4', - 'duration': 2600, - 'title': 'Die Story im Ersten: Mission unter falscher Flagge', - 'upload_date': '20140804', + 'duration': 4435.0, + 'title': 'Das GroKo-Drama: Zerlegen sich die Volksparteien?', + 'upload_date': '20180214', 'thumbnail': r're:^https?://.*\.jpg$', }, - 'skip': 'HTTP Error 404: Not Found', - } + }, { + 'url': 'http://www.daserste.de/information/reportage-dokumentation/dokus/videos/die-story-im-ersten-mission-unter-falscher-flagge-100.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f045050..0e4eaef 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -564,7 +564,7 @@ class BrightcoveNewIE(AdobePassIE): return entries - def _parse_brightcove_metadata(self, json_data, video_id): + def _parse_brightcove_metadata(self, json_data, video_id, headers={}): title = json_data['name'].strip() formats = [] @@ -638,6 +638,9 @@ class BrightcoveNewIE(AdobePassIE): self._sort_formats(formats) + for f in formats: + f.setdefault('http_headers', {}).update(headers) + subtitles = {} for text_track in json_data.get('text_tracks', []): if text_track.get('src'): @@ -690,10 +693,17 @@ class BrightcoveNewIE(AdobePassIE): webpage, 'policy key', group='pk') api_url = 'https://edge.api.brightcove.com/playback/v1/accounts/%s/videos/%s' % (account_id, video_id) - try: - json_data = self._download_json(api_url, video_id, headers={ - 'Accept': 'application/json;pk=%s' % policy_key + headers = { + 'Accept': 'application/json;pk=%s' % policy_key, + } + referrer = smuggled_data.get('referrer') + if referrer: + headers.update({ + 'Referer': referrer, + 'Origin': re.search(r'https?://[^/]+', referrer).group(0), }) + try: + json_data = self._download_json(api_url, video_id, headers=headers) except ExtractorError as 
e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: json_data = self._parse_json(e.cause.read().decode(), video_id)[0] @@ -717,4 +727,5 @@ class BrightcoveNewIE(AdobePassIE): 'tveToken': tve_token, }) - return self._parse_brightcove_metadata(json_data, video_id) + return self._parse_brightcove_metadata( + json_data, video_id, headers=headers) diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 3faa760..8ac62c1 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -246,7 +246,7 @@ class VrtNUIE(GigyaBaseIE): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) + webpage, urlh = self._download_webpage_handle(url, display_id) title = self._html_search_regex( r'(?ms)

(.+?)

', @@ -276,7 +276,7 @@ class VrtNUIE(GigyaBaseIE): webpage, 'release_date', default=None)) # If there's a ? or a # in the URL, remove them and everything after - clean_url = url.split('?')[0].split('#')[0].strip('/') + clean_url = urlh.geturl().split('?')[0].split('#')[0].strip('/') securevideo_url = clean_url + '.mssecurevideo.json' try: diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index 9faf402..3be0c64 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -1,6 +1,7 @@ # coding: utf-8 from __future__ import unicode_literals +import json import re from .common import InfoExtractor @@ -13,6 +14,7 @@ from ..utils import ( xpath_element, xpath_with_ns, find_xpath_attr, + parse_duration, parse_iso8601, parse_age_limit, int_or_none, @@ -359,3 +361,63 @@ class CBCWatchIE(CBCWatchBaseIE): video_id = self._match_id(url) rss = self._call_api('web/browse/' + video_id, video_id) return self._parse_rss_feed(rss) + + +class CBCOlympicsIE(InfoExtractor): + IE_NAME = 'cbc.ca:olympics' + _VALID_URL = r'https?://olympics\.cbc\.ca/video/[^/]+/(?P[^/?#]+)' + _TESTS = [{ + 'url': 'https://olympics.cbc.ca/video/whats-on-tv/olympic-morning-featuring-the-opening-ceremony/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + video_id = self._hidden_inputs(webpage)['videoId'] + video_doc = self._download_xml( + 'https://olympics.cbc.ca/videodata/%s.xml' % video_id, video_id) + title = xpath_text(video_doc, 'title', fatal=True) + is_live = xpath_text(video_doc, 'kind') == 'Live' + if is_live: + title = self._live_title(title) + + formats = [] + for video_source in video_doc.findall('videoSources/videoSource'): + uri = xpath_text(video_source, 'uri') + if not uri: + continue + tokenize = self._download_json( + 'https://olympics.cbc.ca/api/api-akamai/tokenize', + video_id, data=json.dumps({ + 'VideoSource': uri, + }).encode(), headers={ + 
'Content-Type': 'application/json', + 'Referer': url, + # d3.VideoPlayer._init in https://olympics.cbc.ca/components/script/base.js + 'Cookie': '_dvp=TK:C0ObxjerU', # AKAMAI CDN cookie + }, fatal=False) + if not tokenize: + continue + content_url = tokenize['ContentUrl'] + video_source_format = video_source.get('format') + if video_source_format == 'IIS': + formats.extend(self._extract_ism_formats( + content_url, video_id, ism_id=video_source_format, fatal=False)) + else: + formats.extend(self._extract_m3u8_formats( + content_url, video_id, 'mp4', + 'm3u8' if is_live else 'm3u8_native', + m3u8_id=video_source_format, fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': xpath_text(video_doc, 'description'), + 'thumbnail': xpath_text(video_doc, 'thumbnailUrl'), + 'duration': parse_duration(xpath_text(video_doc, 'duration')), + 'formats': formats, + 'is_live': is_live, + } diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 681d63e..6596e98 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -75,10 +75,10 @@ class CBSInteractiveIE(CBSIE): webpage = self._download_webpage(url, display_id) data_json = self._html_search_regex( - r"data-(?:cnet|zdnet)-video(?:-uvp(?:js)?)?-options='([^']+)'", + r"data(?:-(?:cnet|zdnet))?-video(?:-(?:uvp(?:js)?|player))?-options='([^']+)'", webpage, 'data json') data = self._parse_json(data_json, display_id) - vdata = data.get('video') or data['videos'][0] + vdata = data.get('video') or (data.get('videos') or data.get('playlist'))[0] video_id = vdata['mpxRefId'] diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index deafb48..fcdd0fd 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -174,6 +174,8 @@ class InfoExtractor(object): width : height ratio as float. 
* no_resume The server does not support resuming the (HTTP or RTMP) download. Boolean. + * downloader_options A dictionary of downloader options as + described in FileDownloader url: Final video URL. ext: Video filename extension. @@ -2248,9 +2250,10 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex( - r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') - http_base_url = '%s:%s' % ('http', url_base) + mobj = re.search( + r'(?:(?:http|rtmp|rtsp)(?Ps)?:)?(?P//[^?]+)', url) + url_base = mobj.group('url') + http_base_url = '%s%s:%s' % ('http', mobj.group('s') or '', url_base) formats = [] def manifest_url(manifest): @@ -2350,7 +2353,10 @@ class InfoExtractor(object): for track in tracks: if not isinstance(track, dict): continue - if track.get('kind') != 'captions': + track_kind = track.get('kind') + if not track_kind or not isinstance(track_kind, compat_str): + continue + if track_kind.lower() not in ('captions', 'subtitles'): continue track_url = urljoin(base_url, track.get('file')) if not track_url: diff --git a/youtube_dl/extractor/discovery.py b/youtube_dl/extractor/discovery.py index f9cec1d..91449dc 100644 --- a/youtube_dl/extractor/discovery.py +++ b/youtube_dl/extractor/discovery.py @@ -5,15 +5,16 @@ import re import string from .discoverygo import DiscoveryGoBaseIE +from ..compat import compat_str from ..utils import ( ExtractorError, - update_url_query, + try_get, ) from ..compat import compat_HTTPError class DiscoveryIE(DiscoveryGoBaseIE): - _VALID_URL = r'''(?x)https?://(?:www\.)?(?: + _VALID_URL = r'''(?x)https?://(?:www\.)?(?P discovery| investigationdiscovery| discoverylife| @@ -44,7 +45,7 @@ class DiscoveryIE(DiscoveryGoBaseIE): _GEO_BYPASS = False def _real_extract(self, url): - path, display_id = 
re.match(self._VALID_URL, url).groups() + site, path, display_id = re.match(self._VALID_URL, url).groups() webpage = self._download_webpage(url, display_id) react_data = self._parse_json(self._search_regex( @@ -55,14 +56,13 @@ class DiscoveryIE(DiscoveryGoBaseIE): video_id = video['id'] access_token = self._download_json( - 'https://www.discovery.com/anonymous', display_id, query={ - 'authLink': update_url_query( - 'https://login.discovery.com/v1/oauth2/authorize', { - 'client_id': react_data['application']['apiClientId'], - 'redirect_uri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html', - 'response_type': 'anonymous', - 'state': 'nonce,' + ''.join([random.choice(string.ascii_letters) for _ in range(32)]), - }) + 'https://www.%s.com/anonymous' % site, display_id, query={ + 'authRel': 'authorization', + 'client_id': try_get( + react_data, lambda x: x['application']['apiClientId'], + compat_str) or '3020a40c2356a645b4b4', + 'nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'redirectUri': 'https://fusion.ddmcdn.com/app/mercury-sdk/180/redirectHandler.html?https://www.%s.com' % site, })['access_token'] try: diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py index a08dace..b734467 100644 --- a/youtube_dl/extractor/dplay.py +++ b/youtube_dl/extractor/dplay.py @@ -26,7 +26,7 @@ from ..utils import ( class DPlayIE(InfoExtractor): - _VALID_URL = r'https?://(?P<domain>www\.(?P<host>dplay\.(?P<country>dk|se|no)))/(?:videoer/)?(?P<id>[^/]+/[^/?#]+)' + _VALID_URL = r'https?://(?P<domain>www\.(?P<host>dplay\.(?P<country>dk|se|no)))/(?:video(?:er|s)/)?(?P<id>[^/]+/[^/?#]+)' _TESTS = [{ # non geo restricted, via secure api, unsigned download hls URL @@ -89,9 +89,12 @@ class DPlayIE(InfoExtractor): 'skip_download': True, }, }, { - # geo restricted, bypassable via X-Forwarded-For + 'url': 'https://www.dplay.dk/videoer/singleliv/season-5-episode-3', 'only_matching': True, + }, { + 'url': 'https://www.dplay.se/videos/sofias-anglar/sofias-anglar-1001', + 
'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/dvtv.py b/youtube_dl/extractor/dvtv.py index e85c58b..3f76088 100644 --- a/youtube_dl/extractor/dvtv.py +++ b/youtube_dl/extractor/dvtv.py @@ -32,7 +32,7 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/dvtv/dvtv-16-12-2014-utok-talibanu-boj-o-kliniku-uprchlici/r~973eb3bc854e11e498be002590604f2e/', 'info_dict': { - 'title': 'DVTV 16. 12. 2014: útok Talibanu, boj o kliniku, uprchlíci', + 'title': r're:^DVTV 16\. 12\. 2014: útok Talibanu, boj o kliniku, uprchlíci', 'id': '973eb3bc854e11e498be002590604f2e', }, 'playlist': [{ @@ -91,10 +91,24 @@ class DVTVIE(InfoExtractor): }, { 'url': 'http://video.aktualne.cz/v-cechach-poprve-zazni-zelenkova-zrestaurovana-mse/r~45b4b00483ec11e4883b002590604f2e/', 'only_matching': True, + }, { + 'url': 'https://video.aktualne.cz/dvtv/babis-a-zeman-nesou-vinu-za-to-ze-nemame-jasno-v-tom-kdo-bud/r~026afb54fad711e79704ac1f6b220ee8/', + 'md5': '87defe16681b1429c91f7a74809823c6', + 'info_dict': { + 'id': 'f5ae72f6fad611e794dbac1f6b220ee8', + 'ext': 'mp4', + 'title': 'Babiš a Zeman nesou vinu za to, že nemáme jasno v tom, kdo bude vládnout, říká Pekarová Adamová', + }, + 'params': { + 'skip_download': True, + }, }] - def _parse_video_metadata(self, js, video_id): + def _parse_video_metadata(self, js, video_id, live_js=None): data = self._parse_json(js, video_id, transform_source=js_to_json) + if live_js: + data.update(self._parse_json( + live_js, video_id, transform_source=js_to_json)) title = unescapeHTML(data['title']) @@ -142,13 +156,18 @@ class DVTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) + # live content + live_item = self._search_regex( + r'(?s)embedData[0-9a-f]{32}\.asset\.liveStarter\s*=\s*(\{.+?\});', + webpage, 'video', default=None) + # single video item = self._search_regex( r'(?s)embedData[0-9a-f]{32}\[["\']asset["\']\]\s*=\s*(\{.+?\});', - webpage, 'video', default=None, fatal=False) + 
webpage, 'video', default=None) if item: - return self._parse_video_metadata(item, video_id) + return self._parse_video_metadata(item, video_id, live_item) # playlist items = re.findall( diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index b442256..3bde40e 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -162,6 +162,7 @@ from .cbc import ( CBCPlayerIE, CBCWatchVideoIE, CBCWatchIE, + CBCOlympicsIE, ) from .cbs import CBSIE from .cbslocal import CBSLocalIE @@ -373,8 +374,10 @@ from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( FranceTVIE, + FranceTVSiteIE, FranceTVEmbedIE, FranceTVInfoIE, + FranceTVJeunesseIE, GenerationWhatIE, CultureboxIE, ) @@ -382,7 +385,10 @@ from .freesound import FreesoundIE from .freespeech import FreespeechIE from .freshlive import FreshLiveIE from .funimation import FunimationIE -from .funk import FunkIE +from .funk import ( + FunkMixIE, + FunkChannelIE, +) from .funnyordie import FunnyOrDieIE from .fusion import FusionIE from .fxnetworks import FXNetworksIE @@ -426,6 +432,7 @@ from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE from .hgtv import HGTVComShowIE +from .hidive import HiDiveIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE @@ -543,6 +550,7 @@ from .limelight import ( LimelightChannelIE, LimelightChannelListIE, ) +from .line import LineTVIE from .litv import LiTVIE from .liveleak import ( LiveLeakIE, @@ -563,7 +571,11 @@ from .lynda import ( ) from .m6 import M6IE from .macgamestore import MacGameStoreIE -from .mailru import MailRuIE +from .mailru import ( + MailRuIE, + MailRuMusicIE, + MailRuMusicSearchIE, +) from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .mangomolo import ( @@ -630,7 +642,10 @@ from .musicplayon import MusicPlayOnIE 
from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE -from .myvi import MyviIE +from .myvi import ( + MyviIE, + MyviEmbedIE, +) from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, @@ -644,6 +659,7 @@ from .nbc import ( NBCIE, NBCNewsIE, NBCOlympicsIE, + NBCOlympicsStreamIE, NBCSportsIE, NBCSportsVPlayerIE, ) @@ -860,6 +876,7 @@ from .rai import ( RaiPlayPlaylistIE, RaiIE, ) +from .raywenderlich import RayWenderlichIE from .rbmaradio import RBMARadioIE from .rds import RDSIE from .redbulltv import RedBullTVIE @@ -1038,9 +1055,14 @@ from .telebruxelles import TeleBruxellesIE from .telecinco import TelecincoIE from .telegraaf import TelegraafIE from .telemb import TeleMBIE -from .telequebec import TeleQuebecIE +from .telequebec import ( + TeleQuebecIE, + TeleQuebecEmissionIE, + TeleQuebecLiveIE, +) from .teletask import TeleTaskIE from .telewebion import TelewebionIE +from .tennistv import TennisTVIE from .testurl import TestURLIE from .tf1 import TF1IE from .tfo import TFOIE @@ -1195,7 +1217,6 @@ from .vice import ( ViceArticleIE, ViceShowIE, ) -from .viceland import VicelandIE from .vidbit import VidbitIE from .viddler import ViddlerIE from .videa import VideaIE @@ -1210,6 +1231,7 @@ from .videomore import ( from .videopremium import VideoPremiumIE from .videopress import VideoPressIE from .vidio import VidioIE +from .vidlii import VidLiiIE from .vidme import ( VidmeIE, VidmeUserIE, @@ -1353,6 +1375,7 @@ from .yandexmusic import ( YandexMusicPlaylistIE, ) from .yandexdisk import YandexDiskIE +from .yapfiles import YapFilesIE from .yesjapan import YesJapanIE from .yinyuetai import YinYueTaiIE from .ynet import YnetIE diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 095bb39..c02cd03 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -5,19 +5,89 @@ from __future__ import 
unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( clean_html, + determine_ext, ExtractorError, int_or_none, parse_duration, - determine_ext, + try_get, ) from .dailymotion import DailymotionIE class FranceTVBaseInfoExtractor(InfoExtractor): + def _make_url_result(self, video_or_full_id, catalog=None): + full_id = 'francetv:%s' % video_or_full_id + if '@' not in video_or_full_id and catalog: + full_id += '@%s' % catalog + return self.url_result( + full_id, ie=FranceTVIE.ie_key(), + video_id=video_or_full_id.split('@')[0]) + + +class FranceTVIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + https?:// + sivideo\.webservices\.francetelevisions\.fr/tools/getInfosOeuvre/v2/\? + .*?\bidDiffusion=[^&]+| + (?: + https?://videos\.francetv\.fr/video/| + francetv: + ) + (?P<id>[^@]+)(?:@(?P<catalog>.+))? + ) + ''' + + _TESTS = [{ + # without catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=162311093&callback=_jsonp_loader_callback_request_0', + 'md5': 'c2248a8de38c4e65ea8fae7b5df2d84f', + 'info_dict': { + 'id': '162311093', + 'ext': 'mp4', + 'title': '13h15, le dimanche... 
- Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1502623500, + 'upload_date': '20170813', + }, + }, { + # with catalog + 'url': 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=NI_1004933&catalogue=Zouzous&callback=_jsonp_loader_callback_request_4', + 'only_matching': True, + }, { + 'url': 'http://videos.francetv.fr/video/NI_657393@Regions', + 'only_matching': True, + }, { + 'url': 'francetv:162311093', + 'only_matching': True, + }, { + 'url': 'francetv:NI_1004933@Zouzous', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319@Info-web', + 'only_matching': True, + }, { + 'url': 'francetv:NI_983319', + 'only_matching': True, + }, { + 'url': 'francetv:NI_657393@Regions', + 'only_matching': True, + }, { + # france-3 live + 'url': 'francetv:SIM_France3', + 'only_matching': True, + }] + def _extract_video(self, video_id, catalogue=None): + # Videos are identified by idDiffusion so catalogue part is optional. + # However when provided, some extra formats may be returned so we pass + # it if available. 
info = self._download_json( 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', video_id, 'Downloading video JSON', query={ @@ -27,7 +97,8 @@ class FranceTVBaseInfoExtractor(InfoExtractor): if info.get('status') == 'NOK': raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, info['message']), expected=True) + '%s returned error: %s' % (self.IE_NAME, info['message']), + expected=True) allowed_countries = info['videos'][0].get('geoblocage') if allowed_countries: georestricted = True @@ -42,6 +113,21 @@ class FranceTVBaseInfoExtractor(InfoExtractor): else: georestricted = False + def sign(manifest_url, manifest_id): + for host in ('hdfauthftv-a.akamaihd.net', 'hdfauth.francetv.fr'): + signed_url = self._download_webpage( + 'https://%s/esi/TA' % host, video_id, + 'Downloading signed %s manifest URL' % manifest_id, + fatal=False, query={ + 'url': manifest_url, + }) + if (signed_url and isinstance(signed_url, compat_str) and + re.search(r'^(?:https?:)?//', signed_url)): + return signed_url + return manifest_url + + is_live = None + formats = [] for video in info['videos']: if video['statut'] != 'ONLINE': @@ -49,6 +135,10 @@ class FranceTVBaseInfoExtractor(InfoExtractor): video_url = video['url'] if not video_url: continue + if is_live is None: + is_live = (try_get( + video, lambda x: x['plages_ouverture'][0]['direct'], + bool) is True) or '/live.francetv.fr/' in video_url format_id = video['format'] ext = determine_ext(video_url) if ext == 'f4m': @@ -56,17 +146,14 @@ class FranceTVBaseInfoExtractor(InfoExtractor): # See https://github.com/rg3/youtube-dl/issues/3963 # m3u8 urls work fine continue - f4m_url = self._download_webpage( - 'http://hdfauth.francetv.fr/esi/TA?url=%s' % video_url, - video_id, 'Downloading f4m manifest token', fatal=False) - if f4m_url: - formats.extend(self._extract_f4m_formats( - f4m_url + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id=format_id, fatal=False)) + 
formats.extend(self._extract_f4m_formats( + sign(video_url, format_id) + '&hdcore=3.7.0&plugin=aasp-3.7.0.39.44', + video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id=format_id, fatal=False)) + sign(video_url, format_id), video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id=format_id, + fatal=False)) elif video_url.startswith('rtmp'): formats.append({ 'url': video_url, @@ -97,33 +184,48 @@ class FranceTVBaseInfoExtractor(InfoExtractor): return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': clean_html(info['synopsis']), 'thumbnail': compat_urlparse.urljoin('http://pluzz.francetv.fr', info['image']), 'duration': int_or_none(info.get('real_duration')) or parse_duration(info['duree']), 'timestamp': int_or_none(info['diffusion']['timestamp']), + 'is_live': is_live, 'formats': formats, 'subtitles': subtitles, } + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + catalog = mobj.group('catalog') -class FranceTVIE(FranceTVBaseInfoExtractor): + if not video_id: + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + video_id = qs.get('idDiffusion', [None])[0] + catalog = qs.get('catalogue', [None])[0] + if not video_id: + raise ExtractorError('Invalid URL', expected=True) + + return self._extract_video(video_id, catalog) + + +class FranceTVSiteIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)*(?P<id>[^/]+)\.html' _TESTS = [{ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', 'info_dict': { - 'id': '157550144', + 'id': '162311093', 'ext': 'mp4', 'title': '13h15, le dimanche... 
- Les mystères de Jésus', 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', - 'timestamp': 1494156300, - 'upload_date': '20170507', + 'timestamp': 1502623500, + 'upload_date': '20170813', }, 'params': { - # m3u8 downloads 'skip_download': True, }, + 'add_ie': [FranceTVIE.ie_key()], }, { # france3 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', @@ -156,6 +258,10 @@ class FranceTVIE(FranceTVBaseInfoExtractor): }, { 'url': 'https://www.france.tv/142749-rouge-sang.html', 'only_matching': True, + }, { + # france-3 live + 'url': 'https://www.france.tv/france-3/direct.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -172,13 +278,14 @@ class FranceTVIE(FranceTVBaseInfoExtractor): video_id, catalogue = self._html_search_regex( r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) + + return self._make_url_result(video_id, catalogue) class FranceTVEmbedIE(FranceTVBaseInfoExtractor): _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' - _TEST = { + _TESTS = [{ 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', 'info_dict': { 'id': 'NI_983319', @@ -188,7 +295,11 @@ class FranceTVEmbedIE(FranceTVBaseInfoExtractor): 'timestamp': 1493981780, 'duration': 16, }, - } + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -197,12 +308,12 @@ class FranceTVEmbedIE(FranceTVBaseInfoExtractor): 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, video_id) - return self._extract_video(video['video_id'], video.get('catalog')) + return self._make_url_result(video['video_id'], video.get('catalog')) class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = 
r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -217,51 +328,18 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }, }, 'params': { - # m3u8 downloads 'skip_download': True, }, + 'add_ie': [FranceTVIE.ie_key()], }, { 'url': 'http://www.francetvinfo.fr/elections/europeennes/direct-europeennes-regardez-le-debat-entre-les-candidats-a-la-presidence-de-la-commission_600639.html', - 'info_dict': { - 'id': 'EV_20019', - 'ext': 'mp4', - 'title': 'Débat des candidats à la Commission européenne', - 'description': 'Débat des candidats à la Commission européenne', - }, - 'params': { - 'skip_download': 'HLS (reqires ffmpeg)' - }, - 'skip': 'Ce direct est terminé et sera disponible en rattrapage dans quelques minutes.', + 'only_matching': True, }, { 'url': 'http://www.francetvinfo.fr/economie/entreprises/les-entreprises-familiales-le-secret-de-la-reussite_933271.html', - 'md5': 'f485bda6e185e7d15dbc69b72bae993e', - 'info_dict': { - 'id': 'NI_173343', - 'ext': 'mp4', - 'title': 'Les entreprises familiales : le secret de la réussite', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1433273139, - 'upload_date': '20150602', - }, - 'params': { - # m3u8 downloads - 'skip_download': True, - }, + 'only_matching': True, }, { 'url': 'http://france3-regions.francetvinfo.fr/bretagne/cotes-d-armor/thalassa-echappee-breizh-ce-venredi-dans-les-cotes-d-armor-954961.html', - 'md5': 'f485bda6e185e7d15dbc69b72bae993e', - 'info_dict': { - 'id': 'NI_657393', - 'ext': 'mp4', - 'title': 'Olivier Monthus, réalisateur de "Bretagne, le choix de l’Armor"', - 'description': 'md5:a3264114c9d29aeca11ced113c37b16c', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1458300695, - 'upload_date': '20160318', - }, - 'params': { - 
'skip_download': True, - }, + 'only_matching': True, }, { # Dailymotion embed 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', @@ -283,9 +361,9 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - page_title = mobj.group('title') - webpage = self._download_webpage(url, page_title) + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) dailymotion_urls = DailymotionIE._extract_urls(webpage) if dailymotion_urls: @@ -297,12 +375,13 @@ class FranceTVInfoIE(FranceTVBaseInfoExtractor): (r'id-video=([^@]+@[^"]+)', r'<a[^>]+href="(?:https?:)?//videos\.francetv\.fr/video/([^@]+@[^"]+)"'), webpage, 'video id').split('@') - return self._extract_video(video_id, catalogue) + + return self._make_url_result(video_id, catalogue) class GenerationWhatIE(InfoExtractor): IE_NAME = 'france2.fr:generation-what' - _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://generation-what\.francetv\.fr/[^/]+/video/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://generation-what.francetv.fr/portrait/video/present-arms', @@ -314,6 +393,10 @@ class GenerationWhatIE(InfoExtractor): 'uploader_id': 'UCHH9p1eetWCgt4kXBYCb3_w', 'upload_date': '20160411', }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://generation-what.francetv.fr/europe/video/present-arms', 'only_matching': True, @@ -321,42 +404,87 @@ class GenerationWhatIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + youtube_id = self._search_regex( r"window\.videoURL\s*=\s*'([0-9A-Za-z_-]{11})';", webpage, 'youtube id') - return self.url_result(youtube_id, 'Youtube', youtube_id) + + return self.url_result(youtube_id, ie='Youtube', video_id=youtube_id) 
class CultureboxIE(FranceTVBaseInfoExtractor): - IE_NAME = 'culturebox.francetvinfo.fr' - _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?P<name>.*?)(\?|$)' + _VALID_URL = r'https?://(?:m\.)?culturebox\.francetvinfo\.fr/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { - 'url': 'http://culturebox.francetvinfo.fr/live/musique/musique-classique/le-livre-vermeil-de-montserrat-a-la-cathedrale-delne-214511', - 'md5': '9b88dc156781c4dbebd4c3e066e0b1d6', + _TESTS = [{ + 'url': 'https://culturebox.francetvinfo.fr/opera-classique/musique-classique/c-est-baroque/concerts/cantates-bwv-4-106-et-131-de-bach-par-raphael-pichon-57-268689', 'info_dict': { - 'id': 'EV_50111', - 'ext': 'flv', - 'title': "Le Livre Vermeil de Montserrat à la Cathédrale d'Elne", - 'description': 'md5:f8a4ad202e8fe533e2c493cc12e739d9', - 'upload_date': '20150320', - 'timestamp': 1426892400, - 'duration': 2760.9, + 'id': 'EV_134885', + 'ext': 'mp4', + 'title': 'Cantates BWV 4, 106 et 131 de Bach par Raphaël Pichon 5/7', + 'description': 'md5:19c44af004b88219f4daa50fa9a351d4', + 'upload_date': '20180206', + 'timestamp': 1517945220, + 'duration': 5981, }, - } + 'params': { + 'skip_download': True, + }, + 'add_ie': [FranceTVIE.ie_key()], + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - name = mobj.group('name') + display_id = self._match_id(url) - webpage = self._download_webpage(url, name) + webpage = self._download_webpage(url, display_id) if ">Ce live n'est plus disponible en replay<" in webpage: - raise ExtractorError('Video %s is not available' % name, expected=True) + raise ExtractorError( + 'Video %s is not available' % display_id, expected=True) video_id, catalogue = self._search_regex( r'["\'>]https?://videos\.francetv\.fr/video/([^@]+@.+?)["\'<]', webpage, 'video id').split('@') - return self._extract_video(video_id, catalogue) + return self._make_url_result(video_id, catalogue) + + +class FranceTVJeunesseIE(FranceTVBaseInfoExtractor): + _VALID_URL = 
r'(?P<url>https?://(?:www\.)?(?:zouzous|ludo)\.fr/heros/(?P<id>[^/?#&]+))' + + _TESTS = [{ + 'url': 'https://www.zouzous.fr/heros/simon', + 'info_dict': { + 'id': 'simon', + }, + 'playlist_count': 9, + }, { + 'url': 'https://www.ludo.fr/heros/ninjago', + 'info_dict': { + 'id': 'ninjago', + }, + 'playlist_count': 10, + }, { + 'url': 'https://www.zouzous.fr/heros/simon?abc', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + + playlist = self._download_json( + '%s/%s' % (mobj.group('url'), 'playlist'), playlist_id) + + if not playlist.get('count'): + raise ExtractorError( + '%s is not available' % playlist_id, expected=True) + + entries = [] + for item in playlist['items']: + identity = item.get('identity') + if identity and isinstance(identity, compat_str): + entries.append(self._make_url_result(identity)) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/funk.py b/youtube_dl/extractor/funk.py index ce5c67f..faea657 100644 --- a/youtube_dl/extractor/funk.py +++ b/youtube_dl/extractor/funk.py @@ -1,43 +1,102 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from .nexx import NexxIE -from ..utils import extract_attributes +from ..utils import int_or_none + + +class FunkBaseIE(InfoExtractor): + def _make_url_result(self, video): + return { + '_type': 'url_transparent', + 'url': 'nexx:741:%s' % video['sourceId'], + 'ie_key': NexxIE.ie_key(), + 'id': video['sourceId'], + 'title': video.get('title'), + 'description': video.get('description'), + 'duration': int_or_none(video.get('duration')), + 'season_number': int_or_none(video.get('seasonNr')), + 'episode_number': int_or_none(video.get('episodeNr')), + } + + +class FunkMixIE(FunkBaseIE): + _VALID_URL = r'https?://(?:www\.)?funk\.net/mix/(?P<id>[^/]+)/(?P<alias>[^/?#&]+)' + _TESTS = [{ + 'url': 
'https://www.funk.net/mix/59d65d935f8b160001828b5b/die-realste-kifferdoku-aller-zeiten', + 'md5': '8edf617c2f2b7c9847dfda313f199009', + 'info_dict': { + 'id': '123748', + 'ext': 'mp4', + 'title': '"Die realste Kifferdoku aller Zeiten"', + 'description': 'md5:c97160f5bafa8d47ec8e2e461012aa9d', + 'timestamp': 1490274721, + 'upload_date': '20170323', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + mix_id = mobj.group('id') + alias = mobj.group('alias') + + lists = self._download_json( + 'https://www.funk.net/api/v3.1/curation/curatedLists/', + mix_id, headers={ + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbC12Mi4wIiwic2NvcGUiOiJzdGF0aWMtY29udGVudC1hcGksY3VyYXRpb24tc2VydmljZSxzZWFyY2gtYXBpIn0.SGCC1IXHLtZYoo8PvRKlU2gXH1su8YSu47sB3S4iXBI', + 'Referer': url, + }, query={ + 'size': 100, + })['result']['lists'] + + metas = next( + l for l in lists + if mix_id in (l.get('entityId'), l.get('alias')))['videoMetas'] + video = next( + meta['videoDataDelegate'] + for meta in metas if meta.get('alias') == alias) + + return self._make_url_result(video) -class FunkIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funk\.net/(?:mix|channel)/(?:[^/]+/)*(?P<id>[^?/#]+)' +class FunkChannelIE(FunkBaseIE): + _VALID_URL = r'https?://(?:www\.)?funk\.net/channel/(?P<id>[^/]+)/(?P<alias>[^/?#&]+)' _TESTS = [{ - 'url': 'https://www.funk.net/mix/59d65d935f8b160001828b5b/0/59d517e741dca10001252574/', - 'md5': '4d40974481fa3475f8bccfd20c5361f8', + 'url': 'https://www.funk.net/channel/ba/die-lustigsten-instrumente-aus-dem-internet-teil-2', 'info_dict': { - 'id': '716599', + 'id': '1155821', 'ext': 'mp4', - 'title': 'Neue Rechte Welle', - 'description': 'md5:a30a53f740ffb6bfd535314c2cc5fb69', - 'timestamp': 1501337639, - 'upload_date': '20170729', + 'title': 'Die LUSTIGSTEN INSTRUMENTE aus dem Internet - Teil 2', + 'description': 'md5:a691d0413ef4835588c5b03ded670c1f', + 'timestamp': 1514507395, + 
'upload_date': '20171229', }, 'params': { - 'format': 'bestvideo', 'skip_download': True, }, }, { - 'url': 'https://www.funk.net/channel/59d5149841dca100012511e3/0/59d52049999264000182e79d/', + 'url': 'https://www.funk.net/channel/59d5149841dca100012511e3/mein-erster-job-lovemilla-folge-1/lovemilla/', 'only_matching': True, }] def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + channel_id = mobj.group('id') + alias = mobj.group('alias') - webpage = self._download_webpage(url, video_id) + results = self._download_json( + 'https://www.funk.net/api/v3.0/content/videos/filter', channel_id, + headers={ + 'authorization': 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJjbGllbnROYW1lIjoiY3VyYXRpb24tdG9vbCIsInNjb3BlIjoic3RhdGljLWNvbnRlbnQtYXBpLGN1cmF0aW9uLWFwaSxzZWFyY2gtYXBpIn0.q4Y2xZG8PFHai24-4Pjx2gym9RmJejtmK6lMXP5wAgc', + 'Referer': url, + }, query={ + 'channelId': channel_id, + 'size': 100, + })['result'] - domain_id = NexxIE._extract_domain_id(webpage) or '741' - nexx_id = extract_attributes(self._search_regex( - r'(<div[^>]id=["\']mediaplayer-funk[^>]+>)', - webpage, 'media player'))['data-id'] + video = next(r for r in results if r.get('alias') == alias) - return self.url_result( - 'nexx:%s:%s' % (domain_id, nexx_id), ie=NexxIE.ie_key(), - video_id=nexx_id) + return self._make_url_result(video) diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py index ede729b..25e284d 100644 --- a/youtube_dl/extractor/fusion.py +++ b/youtube_dl/extractor/fusion.py @@ -5,9 +5,9 @@ from .ooyala import OoyalaIE class FusionIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?fusion\.(?:net|tv)/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'url': 
'http://fusion.tv/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', 'info_dict': { 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', 'ext': 'mp4', @@ -20,7 +20,7 @@ class FusionIE(InfoExtractor): }, 'add_ie': ['Ooyala'], }, { - 'url': 'http://fusion.net/video/201781', + 'url': 'http://fusion.tv/video/201781', 'only_matching': True, }] diff --git a/youtube_dl/extractor/gameinformer.py b/youtube_dl/extractor/gameinformer.py index a66e309..a2920a7 100644 --- a/youtube_dl/extractor/gameinformer.py +++ b/youtube_dl/extractor/gameinformer.py @@ -23,6 +23,11 @@ class GameInformerIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) - webpage = self._download_webpage(url, display_id) - brightcove_id = self._search_regex(r"getVideo\('[^']+video_id=(\d+)", webpage, 'brightcove id') - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + webpage = self._download_webpage( + url, display_id, headers=self.geo_verification_headers()) + brightcove_id = self._search_regex( + [r'<[^>]+\bid=["\']bc_(\d+)', r"getVideo\('[^']+video_id=(\d+)"], + webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 1d9da81..a98f363 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -102,6 +102,8 @@ from .channel9 import Channel9IE from .vshare import VShareIE from .mediasite import MediasiteIE from .springboardplatform import SpringboardPlatformIE +from .yapfiles import YapFilesIE +from .vice import ViceIE class GenericIE(InfoExtractor): @@ -1954,6 +1956,34 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, 'add_ie': [SpringboardPlatformIE.ie_key()], + }, + { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 
'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': [YoutubeIE.ie_key()], + 'params': { + 'skip_download': True, + }, + }, + { + 'url': 'https://www.yapfiles.ru/show/1872528/690b05d3054d2dbe1e69523aa21bb3b1.mp4.html', + 'info_dict': { + 'id': 'vMDE4NzI1Mjgt690b', + 'ext': 'mp4', + 'title': 'Котята', + }, + 'add_ie': [YapFilesIE.ie_key()], + 'params': { + 'skip_download': True, + }, } # { # # TODO: find another test @@ -2280,7 +2310,10 @@ class GenericIE(InfoExtractor): # Look for Brightcove New Studio embeds bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: - return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') + return self.playlist_from_matches( + bc_urls, video_id, video_title, + getter=lambda x: smuggle_url(x, {'referrer': url}), + ie='BrightcoveNew') # Look for Nexx embeds nexx_urls = NexxIE._extract_urls(webpage) @@ -2928,6 +2961,16 @@ class GenericIE(InfoExtractor): springboardplatform_urls, video_id, video_title, ie=SpringboardPlatformIE.ie_key()) + yapfiles_urls = YapFilesIE._extract_urls(webpage) + if yapfiles_urls: + return self.playlist_from_matches( + yapfiles_urls, video_id, video_title, ie=YapFilesIE.ie_key()) + + vice_urls = ViceIE._extract_urls(webpage) + if vice_urls: + return self.playlist_from_matches( + vice_urls, video_id, video_title, ie=ViceIE.ie_key()) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): diff --git a/youtube_dl/extractor/heise.py b/youtube_dl/extractor/heise.py index 82e11a7..8f49f52 100644 --- a/youtube_dl/extractor/heise.py +++ b/youtube_dl/extractor/heise.py @@ -2,11 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from .kaltura import KalturaIE from .youtube import YoutubeIE from ..utils import ( determine_ext, 
int_or_none, parse_iso8601, + smuggle_url, xpath_text, ) @@ -42,6 +44,19 @@ class HeiseIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + 'url': 'https://www.heise.de/video/artikel/nachgehakt-Wie-sichert-das-c-t-Tool-Restric-tor-Windows-10-ab-3700244.html', + 'md5': '4b58058b46625bdbd841fc2804df95fc', + 'info_dict': { + 'id': '1_ntrmio2s', + 'timestamp': 1512470717, + 'upload_date': '20171205', + 'ext': 'mp4', + 'title': 'ct10 nachgehakt hos restrictor', + }, + 'params': { + 'skip_download': True, + }, }, { 'url': 'http://www.heise.de/ct/artikel/c-t-uplink-3-3-Owncloud-Tastaturen-Peilsender-Smartphone-2403911.html', 'only_matching': True, @@ -67,9 +82,14 @@ class HeiseIE(InfoExtractor): if yt_urls: return self.playlist_from_matches(yt_urls, video_id, title, ie=YoutubeIE.ie_key()) + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) + container_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-container="([0-9]+)"', webpage, 'container ID') + sequenz_id = self._search_regex( r'<div class="videoplayerjw"[^>]+data-sequenz="([0-9]+)"', webpage, 'sequenz ID') diff --git a/youtube_dl/extractor/hidive.py b/youtube_dl/extractor/hidive.py new file mode 100644 index 0000000..eee5170 --- /dev/null +++ b/youtube_dl/extractor/hidive.py @@ -0,0 +1,96 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + urlencode_postdata, +) + + +class HiDiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hidive\.com/stream/(?P<title>[^/]+)/(?P<key>[^/?#&]+)' + # Using X-Forwarded-For results in 403 HTTP error for HLS fragments, + # so disabling geo bypass completely + _GEO_BYPASS = False + + _TESTS = [{ + 'url': 'https://www.hidive.com/stream/the-comic-artist-and-his-assistants/s01e001', + 'info_dict': { + 
'id': 'the-comic-artist-and-his-assistants/s01e001', + 'ext': 'mp4', + 'title': 'the-comic-artist-and-his-assistants/s01e001', + 'series': 'the-comic-artist-and-his-assistants', + 'season_number': 1, + 'episode_number': 1, + }, + 'params': { + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + title, key = mobj.group('title', 'key') + video_id = '%s/%s' % (title, key) + + settings = self._download_json( + 'https://www.hidive.com/play/settings', video_id, + data=urlencode_postdata({ + 'Title': title, + 'Key': key, + })) + + restriction = settings.get('restrictionReason') + if restriction == 'RegionRestricted': + self.raise_geo_restricted() + + if restriction and restriction != 'None': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, restriction), expected=True) + + formats = [] + subtitles = {} + for rendition_id, rendition in settings['renditions'].items(): + bitrates = rendition.get('bitrates') + if not isinstance(bitrates, dict): + continue + m3u8_url = bitrates.get('hls') + if not isinstance(m3u8_url, compat_str): + continue + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='%s-hls' % rendition_id, fatal=False)) + cc_files = rendition.get('ccFiles') + if not isinstance(cc_files, list): + continue + for cc_file in cc_files: + if not isinstance(cc_file, list) or len(cc_file) < 3: + continue + cc_lang = cc_file[0] + cc_url = cc_file[2] + if not isinstance(cc_lang, compat_str) or not isinstance( + cc_url, compat_str): + continue + subtitles.setdefault(cc_lang, []).append({ + 'url': cc_url, + }) + + season_number = int_or_none(self._search_regex( + r's(\d+)', key, 'season number', default=None)) + episode_number = int_or_none(self._search_regex( + r'e(\d+)', key, 'episode number', default=None)) + + return { + 'id': video_id, + 'title': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'series': title, + 'season_number': 
season_number, + 'episode_number': episode_number, + } diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index da5a5de..6373268 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -49,7 +49,9 @@ class LA7IE(InfoExtractor): webpage = self._download_webpage(url, video_id) player_data = self._parse_json( - self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), + self._search_regex( + [r'(?s)videoParams\s*=\s*({.+?});', r'videoLa7\(({[^;]+})\);'], + webpage, 'player data'), video_id, transform_source=js_to_json) return { diff --git a/youtube_dl/extractor/line.py b/youtube_dl/extractor/line.py new file mode 100644 index 0000000..7f5fa44 --- /dev/null +++ b/youtube_dl/extractor/line.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import js_to_json + + +class LineTVIE(InfoExtractor): + _VALID_URL = r'https?://tv\.line\.me/v/(?P<id>\d+)_[^/]+-(?P<segment>ep\d+-\d+)' + + _TESTS = [{ + 'url': 'https://tv.line.me/v/793123_goodbye-mrblack-ep1-1/list/69246', + 'info_dict': { + 'id': '793123_ep1-1', + 'ext': 'mp4', + 'title': 'Goodbye Mr.Black | EP.1-1', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 998.509, + 'view_count': int, + }, + }, { + 'url': 'https://tv.line.me/v/2587507_%E6%B4%BE%E9%81%A3%E5%A5%B3%E9%86%ABx-ep1-02/list/185245', + 'only_matching': True, + }] + + def _real_extract(self, url): + series_id, segment = re.match(self._VALID_URL, url).groups() + video_id = '%s_%s' % (series_id, segment) + + webpage = self._download_webpage(url, video_id) + + player_params = self._parse_json(self._search_regex( + r'naver\.WebPlayer\(({[^}]+})\)', webpage, 'player parameters'), + video_id, transform_source=js_to_json) + + video_info = self._download_json( + 'https://global-nvapis.line.me/linetv/rmcnmv/vod_play_videoInfo.json', + video_id, query={ + 'videoId': player_params['videoId'], + 'key': player_params['key'], + 
}) + + stream = video_info['streams'][0] + extra_query = '?__gda__=' + stream['key']['value'] + formats = self._extract_m3u8_formats( + stream['source'] + extra_query, video_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + + for a_format in formats: + a_format['url'] += extra_query + + duration = None + for video in video_info.get('videos', {}).get('list', []): + encoding_option = video.get('encodingOption', {}) + abr = video['bitrate']['audio'] + vbr = video['bitrate']['video'] + tbr = abr + vbr + formats.append({ + 'url': video['source'], + 'format_id': 'http-%d' % int(tbr), + 'height': encoding_option.get('height'), + 'width': encoding_option.get('width'), + 'abr': abr, + 'vbr': vbr, + 'filesize': video.get('size'), + }) + if video.get('duration') and duration is None: + duration = video['duration'] + + self._sort_formats(formats) + + if not formats[0].get('width'): + formats[0]['vcodec'] = 'none' + + title = self._og_search_title(webpage) + + # like_count requires an additional API request https://tv.line.me/api/likeit/getCount + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'extra_param_to_segment_url': extra_query[1:], + 'duration': duration, + 'thumbnails': [{'url': thumbnail['source']} + for thumbnail in video_info.get('thumbnails', {}).get('list', [])], + 'view_count': video_info.get('meta', {}).get('count'), + } diff --git a/youtube_dl/extractor/mailru.py b/youtube_dl/extractor/mailru.py index 6b7c5e3..6b0e64b 100644 --- a/youtube_dl/extractor/mailru.py +++ b/youtube_dl/extractor/mailru.py @@ -1,12 +1,17 @@ # coding: utf-8 from __future__ import unicode_literals +import itertools +import json import re from .common import InfoExtractor +from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, + parse_duration, remove_end, + try_get, ) @@ -157,3 +162,153 @@ class MailRuIE(InfoExtractor): 'view_count': view_count, 'formats': formats, } + + +class MailRuMusicSearchBaseIE(InfoExtractor): + def 
_search(self, query, url, audio_id, limit=100, offset=0): + search = self._download_json( + 'https://my.mail.ru/cgi-bin/my/ajax', audio_id, + 'Downloading songs JSON page %d' % (offset // limit + 1), + headers={ + 'Referer': url, + 'X-Requested-With': 'XMLHttpRequest', + }, query={ + 'xemail': '', + 'ajax_call': '1', + 'func_name': 'music.search', + 'mna': '', + 'mnb': '', + 'arg_query': query, + 'arg_extended': '1', + 'arg_search_params': json.dumps({ + 'music': { + 'limit': limit, + 'offset': offset, + }, + }), + 'arg_limit': limit, + 'arg_offset': offset, + }) + return next(e for e in search if isinstance(e, dict)) + + @staticmethod + def _extract_track(t, fatal=True): + audio_url = t['URL'] if fatal else t.get('URL') + if not audio_url: + return + + audio_id = t['File'] if fatal else t.get('File') + if not audio_id: + return + + thumbnail = t.get('AlbumCoverURL') or t.get('FiledAlbumCover') + uploader = t.get('OwnerName') or t.get('OwnerName_Text_HTML') + uploader_id = t.get('UploaderID') + duration = int_or_none(t.get('DurationInSeconds')) or parse_duration( + t.get('Duration') or t.get('DurationStr')) + view_count = int_or_none(t.get('PlayCount') or t.get('PlayCount_hr')) + + track = t.get('Name') or t.get('Name_Text_HTML') + artist = t.get('Author') or t.get('Author_Text_HTML') + + if track: + title = '%s - %s' % (artist, track) if artist else track + else: + title = audio_id + + return { + 'extractor_key': MailRuMusicIE.ie_key(), + 'id': audio_id, + 'title': title, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'duration': duration, + 'view_count': view_count, + 'vcodec': 'none', + 'abr': int_or_none(t.get('BitRate')), + 'track': track, + 'artist': artist, + 'album': t.get('Album'), + 'url': audio_url, + } + + +class MailRuMusicIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/music/songs/[^/?#&]+-(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 
'https://my.mail.ru/music/songs/%D0%BC8%D0%BB8%D1%82%D1%85-l-a-h-luciferian-aesthetics-of-herrschaft-single-2017-4e31f7125d0dfaef505d947642366893', + 'md5': '0f8c22ef8c5d665b13ac709e63025610', + 'info_dict': { + 'id': '4e31f7125d0dfaef505d947642366893', + 'ext': 'mp3', + 'title': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017 - М8Л8ТХ', + 'uploader': 'Игорь Мудрый', + 'uploader_id': '1459196328', + 'duration': 280, + 'view_count': int, + 'vcodec': 'none', + 'abr': 320, + 'track': 'L.A.H. (Luciferian Aesthetics of Herrschaft) single, 2017', + 'artist': 'М8Л8ТХ', + }, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage(url, audio_id) + + title = self._og_search_title(webpage) + music_data = self._search(title, url, audio_id)['MusicData'] + t = next(t for t in music_data if t.get('File') == audio_id) + + info = self._extract_track(t) + info['title'] = title + return info + + +class MailRuMusicSearchIE(MailRuMusicSearchBaseIE): + IE_NAME = 'mailru:music:search' + IE_DESC = 'Музыка@Mail.Ru' + _VALID_URL = r'https?://my\.mail\.ru/music/search/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://my.mail.ru/music/search/black%20shadow', + 'info_dict': { + 'id': 'black shadow', + }, + 'playlist_mincount': 532, + }] + + def _real_extract(self, url): + query = compat_urllib_parse_unquote(self._match_id(url)) + + entries = [] + + LIMIT = 100 + offset = 0 + + for _ in itertools.count(1): + search = self._search(query, url, query, LIMIT, offset) + + music_data = search.get('MusicData') + if not music_data or not isinstance(music_data, list): + break + + for t in music_data: + track = self._extract_track(t, fatal=False) + if track: + entries.append(track) + + total = try_get( + search, lambda x: x['Results']['music']['Total'], int) + + if total is not None: + if offset > total: + break + + offset += LIMIT + + return self.playlist_result(entries, query) diff --git a/youtube_dl/extractor/myvi.py 
b/youtube_dl/extractor/myvi.py index 621ae74..75d2863 100644 --- a/youtube_dl/extractor/myvi.py +++ b/youtube_dl/extractor/myvi.py @@ -3,22 +3,31 @@ from __future__ import unicode_literals import re +from .common import InfoExtractor from .vimple import SprutoBaseIE class MyviIE(SprutoBaseIE): _VALID_URL = r'''(?x) - https?:// - myvi\.(?:ru/player|tv)/ - (?: + (?: + https?:// + (?:www\.)? + myvi\. (?: - embed/html| - flash| - api/Video/Get - )/| - content/preloader\.swf\?.*\bid= - ) - (?P<id>[\da-zA-Z_-]+) + (?:ru/player|tv)/ + (?: + (?: + embed/html| + flash| + api/Video/Get + )/| + content/preloader\.swf\?.*\bid= + )| + ru/watch/ + )| + myvi: + ) + (?P<id>[\da-zA-Z_-]+) ''' _TESTS = [{ 'url': 'http://myvi.ru/player/embed/html/oOy4euHA6LVwNNAjhD9_Jq5Ha2Qf0rtVMVFMAZav8wObeRTZaCATzucDQIDph8hQU0', @@ -42,6 +51,12 @@ class MyviIE(SprutoBaseIE): }, { 'url': 'http://myvi.ru/player/flash/ocp2qZrHI-eZnHKQBK4cZV60hslH8LALnk0uBfKsB-Q4WnY26SeGoYPi8HWHxu0O30', 'only_matching': True, + }, { + 'url': 'https://www.myvi.ru/watch/YwbqszQynUaHPn_s82sx0Q2', + 'only_matching': True, + }, { + 'url': 'myvi:YwbqszQynUaHPn_s82sx0Q2', + 'only_matching': True, }] @classmethod @@ -58,3 +73,39 @@ class MyviIE(SprutoBaseIE): 'http://myvi.ru/player/api/Video/Get/%s?sig' % video_id, video_id)['sprutoData'] return self._extract_spruto(spruto, video_id) + + +class MyviEmbedIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?myvi\.tv/(?:[^?]+\?.*?\bv=|embed/)(?P<id>[\da-z]+)' + _TESTS = [{ + 'url': 'https://www.myvi.tv/embed/ccdqic3wgkqwpb36x9sxg43t4r', + 'info_dict': { + 'id': 'b3ea0663-3234-469d-873e-7fecf36b31d1', + 'ext': 'mp4', + 'title': 'Твоя (original song).mp4', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 277, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'https://www.myvi.tv/idmi6o?v=ccdqic3wgkqwpb36x9sxg43t4r#watch', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if MyviIE.suitable(url) else super(MyviEmbedIE, 
cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.myvi.tv/embed/%s' % video_id, video_id) + + myvi_id = self._search_regex( + r'CreatePlayer\s*\(\s*["\'].*?\bv=([\da-zA-Z_]+)', + webpage, 'video id') + + return self.url_result('myvi:%s' % myvi_id, ie=MyviIE.ie_key()) diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 9e8d28f..246f679 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -68,7 +68,7 @@ class NationalGeographicVideoIE(InfoExtractor): class NationalGeographicIE(ThePlatformIE, AdobePassIE): IE_NAME = 'natgeo' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P<id>[^/?]+)' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:(?:wild/)?[^/]+/)?(?:videos|episodes)/(?P<id>[^/?]+)' _TESTS = [ { @@ -102,6 +102,10 @@ class NationalGeographicIE(ThePlatformIE, AdobePassIE): { 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', 'only_matching': True, + }, + { + 'url': 'http://channel.nationalgeographic.com/videos/treasures-rediscovered/', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 554dec3..9dc8f9e 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import base64 from .common import InfoExtractor from .theplatform import ThePlatformIE @@ -358,6 +359,7 @@ class NBCNewsIE(ThePlatformIE): class NBCOlympicsIE(InfoExtractor): + IE_NAME = 'nbcolympics' _VALID_URL = r'https?://www\.nbcolympics\.com/video/(?P<id>[a-z-]+)' _TEST = { @@ -395,3 +397,54 @@ class NBCOlympicsIE(InfoExtractor): 'ie_key': ThePlatformIE.ie_key(), 'display_id': display_id, } + + +class NBCOlympicsStreamIE(AdobePassIE): + IE_NAME = 
'nbcolympics:stream' + _VALID_URL = r'https?://stream\.nbcolympics\.com/(?P<id>[0-9a-z-]+)' + _TEST = { + 'url': 'http://stream.nbcolympics.com/2018-winter-olympics-nbcsn-evening-feb-8', + 'info_dict': { + 'id': '203493', + 'ext': 'mp4', + 'title': 're:Curling, Alpine, Luge [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + _DATA_URL_TEMPLATE = 'http://stream.nbcolympics.com/data/%s_%s.json' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + pid = self._search_regex(r'pid\s*=\s*(\d+);', webpage, 'pid') + resource = self._search_regex( + r"resource\s*=\s*'(.+)';", webpage, + 'resource').replace("' + pid + '", pid) + event_config = self._download_json( + self._DATA_URL_TEMPLATE % ('event_config', pid), + pid)['eventConfig'] + title = self._live_title(event_config['eventTitle']) + source_url = self._download_json( + self._DATA_URL_TEMPLATE % ('live_sources', pid), + pid)['videoSources'][0]['sourceUrl'] + media_token = self._extract_mvpd_auth( + url, pid, event_config.get('requestorId', 'NBCOlympics'), resource) + formats = self._extract_m3u8_formats(self._download_webpage( + 'http://sp.auth.adobe.com/tvs/v1/sign', pid, query={ + 'cdn': 'akamai', + 'mediaToken': base64.b64encode(media_token.encode()), + 'resource': base64.b64encode(resource.encode()), + 'url': source_url, + }), pid, 'mp4') + self._sort_formats(formats) + + return { + 'id': pid, + 'display_id': display_id, + 'title': title, + 'formats': formats, + 'is_live': True, + } diff --git a/youtube_dl/extractor/newgrounds.py b/youtube_dl/extractor/newgrounds.py index 0e26f83..82e7cf5 100644 --- a/youtube_dl/extractor/newgrounds.py +++ b/youtube_dl/extractor/newgrounds.py @@ -87,19 +87,21 @@ class NewgroundsIE(InfoExtractor): self._check_formats(formats, media_id) self._sort_formats(formats) - uploader = self._search_regex( - r'(?:Author|Writer)\s*<a[^>]+>([^<]+)', webpage, 
'uploader', + uploader = self._html_search_regex( + (r'(?s)<h4[^>]*>(.+?)</h4>.*?<em>\s*Author\s*</em>', + r'(?:Author|Writer)\s*<a[^>]+>([^<]+)'), webpage, 'uploader', fatal=False) - timestamp = unified_timestamp(self._search_regex( - r'<dt>Uploaded</dt>\s*<dd>([^<]+)', webpage, 'timestamp', + timestamp = unified_timestamp(self._html_search_regex( + (r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+</dd>\s*<dd>[^<]+)', + r'<dt>\s*Uploaded\s*</dt>\s*<dd>([^<]+)'), webpage, 'timestamp', default=None)) duration = parse_duration(self._search_regex( - r'<dd>Song\s*</dd><dd>.+?</dd><dd>([^<]+)', webpage, 'duration', - default=None)) + r'(?s)<dd>\s*Song\s*</dd>\s*<dd>.+?</dd>\s*<dd>([^<]+)', webpage, + 'duration', default=None)) filesize_approx = parse_filesize(self._html_search_regex( - r'<dd>Song\s*</dd><dd>(.+?)</dd>', webpage, 'filesize', + r'(?s)<dd>\s*Song\s*</dd>\s*<dd>(.+?)</dd>', webpage, 'filesize', default=None)) if len(formats) == 1: formats[0]['filesize_approx'] = filesize_approx diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py index 9203c04..c7029d2 100644 --- a/youtube_dl/extractor/nexx.py +++ b/youtube_dl/extractor/nexx.py @@ -21,7 +21,8 @@ class NexxIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/| - nexx:(?P<domain_id_s>\d+): + nexx:(?:(?P<domain_id_s>\d+):)?| + https?://arc\.nexx\.cloud/api/video/ ) (?P<id>\d+) ''' @@ -61,12 +62,33 @@ class NexxIE(InfoExtractor): 'params': { 'skip_download': True, }, + }, { + # does not work via arc + 'url': 'nexx:741:1269984', + 'md5': 'c714b5b238b2958dc8d5642addba6886', + 'info_dict': { + 'id': '1269984', + 'ext': 'mp4', + 'title': '1 TAG ohne KLO... wortwörtlich! 😑', + 'alt_title': '1 TAG ohne KLO... wortwörtlich! 
😑', + 'description': 'md5:4604539793c49eda9443ab5c5b1d612f', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 607, + 'timestamp': 1518614955, + 'upload_date': '20180214', + }, }, { 'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907', 'only_matching': True, }, { 'url': 'nexx:748:128907', 'only_matching': True, + }, { + 'url': 'nexx:128907', + 'only_matching': True, + }, { + 'url': 'https://arc.nexx.cloud/api/video/128907.json', + 'only_matching': True, }] @staticmethod @@ -124,65 +146,77 @@ class NexxIE(InfoExtractor): domain_id = mobj.group('domain_id') or mobj.group('domain_id_s') video_id = mobj.group('id') - # Reverse engineered from JS code (see getDeviceID function) - device_id = '%d:%d:%d%d' % ( - random.randint(1, 4), int(time.time()), - random.randint(1e4, 99999), random.randint(1, 9)) - - result = self._call_api(domain_id, 'session/init', video_id, data={ - 'nxp_devh': device_id, - 'nxp_userh': '', - 'precid': '0', - 'playlicense': '0', - 'screenx': '1920', - 'screeny': '1080', - 'playerversion': '6.0.00', - 'gateway': 'html5', - 'adGateway': '', - 'explicitlanguage': 'en-US', - 'addTextTemplates': '1', - 'addDomainData': '1', - 'addAdModel': '1', - }, headers={ - 'X-Request-Enable-Auth-Fallback': '1', - }) - - cid = result['general']['cid'] - - # As described in [1] X-Request-Token generation algorithm is - # as follows: - # md5( operation + domain_id + domain_secret ) - # where domain_secret is a static value that will be given by nexx.tv - # as per [1]. Here is how this "secret" is generated (reversed - # from _play.api.init function, search for clienttoken). So it's - # actually not static and not that much of a secret. - # 1. 
https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf - secret = result['device']['clienttoken'][int(device_id[0]):] - secret = secret[0:len(secret) - int(device_id[-1])] - - op = 'byid' - - # Reversed from JS code for _play.api.call function (search for - # X-Request-Token) - request_token = hashlib.md5( - ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() - - video = self._call_api( - domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ - 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', - 'addInteractionOptions': '1', - 'addStatusDetails': '1', - 'addStreamDetails': '1', - 'addCaptions': '1', - 'addScenes': '1', - 'addHotSpots': '1', - 'addBumpers': '1', - 'captionFormat': 'data', + video = None + + response = self._download_json( + 'https://arc.nexx.cloud/api/video/%s.json' % video_id, + video_id, fatal=False) + if response and isinstance(response, dict): + result = response.get('result') + if result and isinstance(result, dict): + video = result + + # not all videos work via arc, e.g. 
nexx:741:1269984 + if not video: + # Reverse engineered from JS code (see getDeviceID function) + device_id = '%d:%d:%d%d' % ( + random.randint(1, 4), int(time.time()), + random.randint(1e4, 99999), random.randint(1, 9)) + + result = self._call_api(domain_id, 'session/init', video_id, data={ + 'nxp_devh': device_id, + 'nxp_userh': '', + 'precid': '0', + 'playlicense': '0', + 'screenx': '1920', + 'screeny': '1080', + 'playerversion': '6.0.00', + 'gateway': 'html5', + 'adGateway': '', + 'explicitlanguage': 'en-US', + 'addTextTemplates': '1', + 'addDomainData': '1', + 'addAdModel': '1', }, headers={ - 'X-Request-CID': cid, - 'X-Request-Token': request_token, + 'X-Request-Enable-Auth-Fallback': '1', }) + cid = result['general']['cid'] + + # As described in [1] X-Request-Token generation algorithm is + # as follows: + # md5( operation + domain_id + domain_secret ) + # where domain_secret is a static value that will be given by nexx.tv + # as per [1]. Here is how this "secret" is generated (reversed + # from _play.api.init function, search for clienttoken). So it's + # actually not static and not that much of a secret. + # 1. 
https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf + secret = result['device']['clienttoken'][int(device_id[0]):] + secret = secret[0:len(secret) - int(device_id[-1])] + + op = 'byid' + + # Reversed from JS code for _play.api.call function (search for + # X-Request-Token) + request_token = hashlib.md5( + ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest() + + video = self._call_api( + domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={ + 'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description', + 'addInteractionOptions': '1', + 'addStatusDetails': '1', + 'addStreamDetails': '1', + 'addCaptions': '1', + 'addScenes': '1', + 'addHotSpots': '1', + 'addBumpers': '1', + 'captionFormat': 'data', + }, headers={ + 'X-Request-CID': cid, + 'X-Request-Token': request_token, + }) + general = video['general'] title = general['title'] diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 7edd684..090f1ac 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -198,7 +198,7 @@ class NickNightIE(NickDeIE): class NickRuIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeonru' - _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu|com\.tr)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', 'only_matching': True, @@ -220,6 +220,9 @@ class NickRuIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.com.tr/programlar/sunger-bob/videolar/kayip-yatak/mgqbjy', + 'only_matching': True, }] def _real_extract(self, url): diff --git 
a/youtube_dl/extractor/ninegag.py b/youtube_dl/extractor/ninegag.py index a06d38a..dc6a27d 100644 --- a/youtube_dl/extractor/ninegag.py +++ b/youtube_dl/extractor/ninegag.py @@ -13,7 +13,7 @@ class NineGagIE(InfoExtractor): _TESTS = [{ 'url': 'http://9gag.com/tv/p/Kk2X5/people-are-awesome-2013-is-absolutely-awesome', 'info_dict': { - 'id': 'Kk2X5', + 'id': 'kXzwOKyGlSA', 'ext': 'mp4', 'description': 'This 3-minute video will make you smile and then make you feel untalented and insignificant. Anyway, you should share this awesomeness. (Thanks, Dino!)', 'title': '\"People Are Awesome 2013\" Is Absolutely Awesome', diff --git a/youtube_dl/extractor/njpwworld.py b/youtube_dl/extractor/njpwworld.py index 9b5ad5a..febef09 100644 --- a/youtube_dl/extractor/njpwworld.py +++ b/youtube_dl/extractor/njpwworld.py @@ -43,7 +43,8 @@ class NJPWWorldIE(InfoExtractor): webpage, urlh = self._download_webpage_handle( 'https://njpwworld.com/auth/login', None, note='Logging in', errnote='Unable to login', - data=urlencode_postdata({'login_id': username, 'pw': password})) + data=urlencode_postdata({'login_id': username, 'pw': password}), + headers={'Referer': 'https://njpwworld.com/auth'}) # /auth/login will return 302 for successful logins if urlh.geturl() == 'https://njpwworld.com/auth/login': self.report_warning('unable to login') diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index b8fe244..ff21533 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -11,6 +11,7 @@ from ..utils import ( determine_ext, ExtractorError, fix_xml_ampersands, + int_or_none, orderedSet, parse_duration, qualities, @@ -38,7 +39,7 @@ class NPOIE(NPOBaseIE): npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}| ntr\.nl/(?:[^/]+/){2,}| omroepwnl\.nl/video/fragment/[^/]+__| - (?:zapp|npo3)\.nl/(?:[^/]+/){2} + (?:zapp|npo3)\.nl/(?:[^/]+/){2,} ) ) (?P<id>[^/?#]+) @@ -156,6 +157,9 @@ class NPOIE(NPOBaseIE): }, { 'url': 'http://www.npo.nl/radio-gaga/13-06-2017/BNN_101383373', 
'only_matching': True, + }, { + 'url': 'https://www.zapp.nl/1803-skelterlab/instructie-video-s/740-instructievideo-s/POMS_AT_11736927', + 'only_matching': True, }] def _real_extract(self, url): @@ -170,6 +174,10 @@ class NPOIE(NPOBaseIE): transform_source=strip_jsonp, ) + error = metadata.get('error') + if error: + raise ExtractorError(error, expected=True) + # For some videos actual video id (prid) is different (e.g. for # http://www.omroepwnl.nl/video/fragment/vandaag-de-dag-verkiezingen__POMS_WNL_853698 # video id is POMS_WNL_853698 but prid is POW_00996502) @@ -187,7 +195,15 @@ class NPOIE(NPOBaseIE): formats = [] urls = set() - quality = qualities(['adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std']) + def is_legal_url(format_url): + return format_url and format_url not in urls and re.match( + r'^(?:https?:)?//', format_url) + + QUALITY_LABELS = ('Laag', 'Normaal', 'Hoog') + QUALITY_FORMATS = ('adaptive', 'wmv_sb', 'h264_sb', 'wmv_bb', 'h264_bb', 'wvc1_std', 'h264_std') + + quality_from_label = qualities(QUALITY_LABELS) + quality_from_format_id = qualities(QUALITY_FORMATS) items = self._download_json( 'http://ida.omroep.nl/app.php/%s' % video_id, video_id, 'Downloading formats JSON', query={ @@ -196,18 +212,34 @@ class NPOIE(NPOBaseIE): })['items'][0] for num, item in enumerate(items): item_url = item.get('url') - if not item_url or item_url in urls: + if not is_legal_url(item_url): continue urls.add(item_url) format_id = self._search_regex( r'video/ida/([^/]+)', item_url, 'format id', default=None) + item_label = item.get('label') + def add_format_url(format_url): + width = int_or_none(self._search_regex( + r'(\d+)[xX]\d+', format_url, 'width', default=None)) + height = int_or_none(self._search_regex( + r'\d+[xX](\d+)', format_url, 'height', default=None)) + if item_label in QUALITY_LABELS: + quality = quality_from_label(item_label) + f_id = item_label + elif item_label in QUALITY_FORMATS: + quality = 
quality_from_format_id(format_id) + f_id = format_id + else: + quality, f_id = [None] * 2 formats.append({ 'url': format_url, - 'format_id': format_id, - 'quality': quality(format_id), + 'format_id': f_id, + 'width': width, + 'height': height, + 'quality': quality, }) # Example: http://www.npo.nl/de-nieuwe-mens-deel-1/21-07-2010/WO_VPRO_043706 @@ -219,7 +251,7 @@ class NPOIE(NPOBaseIE): stream_info = self._download_json( item_url + '&type=json', video_id, 'Downloading %s stream JSON' - % item.get('label') or item.get('format') or format_id or num) + % item_label or item.get('format') or format_id or num) except ExtractorError as ee: if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 404: error = (self._parse_json( @@ -251,7 +283,7 @@ class NPOIE(NPOBaseIE): if not is_live: for num, stream in enumerate(metadata.get('streams', [])): stream_url = stream.get('url') - if not stream_url or stream_url in urls: + if not is_legal_url(stream_url): continue urls.add(stream_url) # smooth streaming is not supported diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index e5e0853..8afe541 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -56,18 +56,16 @@ class PeriscopeIE(PeriscopeBaseIE): def _real_extract(self, url): token = self._match_id(url) - broadcast_data = self._call_api( - 'getBroadcastPublic', {'broadcast_id': token}, token) - broadcast = broadcast_data['broadcast'] - status = broadcast['status'] + stream = self._call_api( + 'accessVideoPublic', {'broadcast_id': token}, token) - user = broadcast_data.get('user', {}) + broadcast = stream['broadcast'] + title = broadcast['status'] - uploader = broadcast.get('user_display_name') or user.get('display_name') - uploader_id = (broadcast.get('username') or user.get('username') or - broadcast.get('user_id') or user.get('id')) + uploader = broadcast.get('user_display_name') or broadcast.get('username') + uploader_id = (broadcast.get('user_id') 
or broadcast.get('username')) - title = '%s - %s' % (uploader, status) if uploader else status + title = '%s - %s' % (uploader, title) if uploader else title state = broadcast.get('state').lower() if state == 'running': title = self._live_title(title) @@ -77,9 +75,6 @@ class PeriscopeIE(PeriscopeBaseIE): 'url': broadcast[image], } for image in ('image_url', 'image_url_small') if broadcast.get(image)] - stream = self._call_api( - 'getAccessPublic', {'broadcast_id': token}, token) - video_urls = set() formats = [] for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'): diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index e38c761..e86c653 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -4,7 +4,9 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( + determine_ext, ExtractorError, int_or_none, xpath_text, @@ -26,17 +28,15 @@ class PladformIE(InfoExtractor): (?P<id>\d+) ''' _TESTS = [{ - # http://muz-tv.ru/kinozal/view/7400/ - 'url': 'http://out.pladform.ru/player?pl=24822&videoid=100183293', - 'md5': '61f37b575dd27f1bb2e1854777fe31f4', + 'url': 'https://out.pladform.ru/player?pl=64471&videoid=3777899&vk_puid15=0&vk_puid34=0', + 'md5': '53362fac3a27352da20fa2803cc5cd6f', 'info_dict': { - 'id': '100183293', + 'id': '3777899', 'ext': 'mp4', - 'title': 'Тайны перевала Дятлова • 1 серия 2 часть', - 'description': 'Документальный сериал-расследование одной из самых жутких тайн ХХ века', + 'title': 'СТУДИЯ СОЮЗ • Шоу Студия Союз, 24 выпуск (01.02.2018) Нурлан Сабуров и Слава Комиссаренко', + 'description': 'md5:05140e8bf1b7e2d46e7ba140be57fd95', 'thumbnail': r're:^https?://.*\.jpg$', - 'duration': 694, - 'age_limit': 0, + 'duration': 3190, }, }, { 'url': 'http://static.pladform.ru/player.swf?pl=21469&videoid=100183293&vkcid=0', @@ -56,22 +56,48 @@ class PladformIE(InfoExtractor): def 
_real_extract(self, url): video_id = self._match_id(url) + qs = compat_urlparse.parse_qs(compat_urlparse.urlparse(url).query) + pl = qs.get('pl', ['1'])[0] + video = self._download_xml( - 'http://out.pladform.ru/getVideo?pl=1&videoid=%s' % video_id, - video_id) + 'http://out.pladform.ru/getVideo', video_id, query={ + 'pl': pl, + 'videoid': video_id, + }) - if video.tag == 'error': + def fail(text): raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, video.text), + '%s returned error: %s' % (self.IE_NAME, text), expected=True) + if video.tag == 'error': + fail(video.text) + quality = qualities(('ld', 'sd', 'hd')) - formats = [{ - 'url': src.text, - 'format_id': src.get('quality'), - 'quality': quality(src.get('quality')), - } for src in video.findall('./src')] + formats = [] + for src in video.findall('./src'): + if src is None: + continue + format_url = src.text + if not format_url: + continue + if src.get('type') == 'hls' or determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src.text, + 'format_id': src.get('quality'), + 'quality': quality(src.get('quality')), + }) + + if not formats: + error = xpath_text(video, './cap', 'error', default=None) + if error: + fail(error) + self._sort_formats(formats) webpage = self._download_webpage( diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py index 2d87e7e..dd5f17f 100644 --- a/youtube_dl/extractor/pokemon.py +++ b/youtube_dl/extractor/pokemon.py @@ -11,19 +11,34 @@ from ..utils import ( class PokemonIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P<display_id>[^/?#]+))' + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/(?:[^/]+/)+(?P<display_id>[^/?#&]+))' _TESTS = [{ - 'url': 
'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', - 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', + 'url': 'https://www.pokemon.com/us/pokemon-episodes/20_30-the-ol-raise-and-switch/', + 'md5': '2fe8eaec69768b25ef898cda9c43062e', 'info_dict': { - 'id': 'd0436c00c3ce4071ac6cee8130ac54a1', + 'id': 'afe22e30f01c41f49d4f1d9eab5cd9a4', 'ext': 'mp4', - 'title': 'From A to Z!', - 'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!', - 'timestamp': 1460478136, - 'upload_date': '20160412', + 'title': 'The Ol’ Raise and Switch!', + 'description': 'md5:7db77f7107f98ba88401d3adc80ff7af', + 'timestamp': 1511824728, + 'upload_date': '20171127', + }, + 'add_id': ['LimelightMedia'], + }, { + # no data-video-title + 'url': 'https://www.pokemon.com/us/pokemon-episodes/pokemon-movies/pokemon-the-rise-of-darkrai-2008', + 'info_dict': { + 'id': '99f3bae270bf4e5097274817239ce9c8', + 'ext': 'mp4', + 'title': 'Pokémon: The Rise of Darkrai', + 'description': 'md5:ea8fbbf942e1e497d54b19025dd57d9d', + 'timestamp': 1417778347, + 'upload_date': '20141205', + }, + 'add_id': ['LimelightMedia'], + 'params': { + 'skip_download': True, }, - 'add_id': ['LimelightMedia'] }, { 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', 'only_matching': True, @@ -42,7 +57,9 @@ class PokemonIE(InfoExtractor): r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), webpage, 'video data element')) video_id = video_data['data-video-id'] - title = video_data['data-video-title'] + title = video_data.get('data-video-title') or self._html_search_meta( + 'pkm-title', webpage, ' title', default=None) or self._search_regex( + r'<h1[^>]+\bclass=["\']us-title[^>]+>([^<]+)', webpage, 'title') return { '_type': 'url_transparent', 'id': video_id, diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 3428458..9ce513a 100644 --- 
a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -115,12 +115,13 @@ class PornHubIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + self._set_cookie('pornhub.com', 'age_verified', '1') + def dl_webpage(platform): + self._set_cookie('pornhub.com', 'platform', platform) return self._download_webpage( 'http://www.pornhub.com/view_video.php?viewkey=%s' % video_id, - video_id, headers={ - 'Cookie': 'age_verified=1; platform=%s' % platform, - }) + video_id) webpage = dl_webpage('pc') @@ -275,7 +276,7 @@ class PornHubPlaylistIE(PornHubPlaylistBaseIE): class PornHubUserVideosIE(PornHubPlaylistBaseIE): - _VALID_URL = r'https?://(?:www\.)?pornhub\.com/users/(?P<id>[^/]+)/videos' + _VALID_URL = r'https?://(?:www\.)?pornhub\.com/(?:user|channel)s/(?P<id>[^/]+)/videos' _TESTS = [{ 'url': 'http://www.pornhub.com/users/zoe_ph/videos/public', 'info_dict': { @@ -285,6 +286,25 @@ class PornHubUserVideosIE(PornHubPlaylistBaseIE): }, { 'url': 'http://www.pornhub.com/users/rushandlia/videos', 'only_matching': True, + }, { + # default sorting as Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos', + 'info_dict': { + 'id': 'povd', + }, + 'playlist_mincount': 293, + }, { + # Top Rated Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=ra', + 'only_matching': True, + }, { + # Most Recent Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=da', + 'only_matching': True, + }, { + # Most Viewed Videos + 'url': 'https://www.pornhub.com/channels/povd/videos?o=vi', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 48757fd..7efff45 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -129,6 +129,7 @@ class ProSiebenSat1IE(ProSiebenSat1BaseIE): https?:// (?:www\.)? (?: + (?:beta\.)? 
(?: prosieben(?:maxx)?|sixx|sat1(?:gold)?|kabeleins(?:doku)?|the-voice-of-germany|7tv|advopedia )\.(?:de|at|ch)| diff --git a/youtube_dl/extractor/raywenderlich.py b/youtube_dl/extractor/raywenderlich.py new file mode 100644 index 0000000..640c3ee --- /dev/null +++ b/youtube_dl/extractor/raywenderlich.py @@ -0,0 +1,102 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .vimeo import VimeoIE +from ..utils import ( + extract_attributes, + ExtractorError, + smuggle_url, + unsmuggle_url, + urljoin, +) + + +class RayWenderlichIE(InfoExtractor): + _VALID_URL = r'https?://videos\.raywenderlich\.com/courses/(?P<course_id>[^/]+)/lessons/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'info_dict': { + 'id': '248377018', + 'ext': 'mp4', + 'title': 'Testing In iOS Episode 1: Introduction', + 'duration': 133, + 'uploader': 'Ray Wenderlich', + 'uploader_id': 'user3304672', + }, + 'params': { + 'noplaylist': True, + 'skip_download': True, + }, + 'add_ie': [VimeoIE.ie_key()], + 'expected_warnings': ['HTTP Error 403: Forbidden'], + }, { + 'url': 'https://videos.raywenderlich.com/courses/105-testing-in-ios/lessons/1', + 'info_dict': { + 'title': 'Testing in iOS', + 'id': '105-testing-in-ios', + }, + 'params': { + 'noplaylist': False, + }, + 'playlist_count': 29, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url, {}) + + mobj = re.match(self._VALID_URL, url) + course_id, lesson_id = mobj.group('course_id', 'id') + video_id = '%s/%s' % (course_id, lesson_id) + + webpage = self._download_webpage(url, video_id) + + no_playlist = self._downloader.params.get('noplaylist') + if no_playlist or smuggled_data.get('force_video', False): + if no_playlist: + self.to_screen( + 'Downloading just video %s because of --no-playlist' + % video_id) + if '>Subscribe to unlock' in webpage: + raise ExtractorError( + 'This content is only available for 
subscribers', + expected=True) + vimeo_id = self._search_regex( + r'data-vimeo-id=["\'](\d+)', webpage, 'video id') + return self.url_result( + VimeoIE._smuggle_referrer( + 'https://player.vimeo.com/video/%s' % vimeo_id, url), + ie=VimeoIE.ie_key(), video_id=vimeo_id) + + self.to_screen( + 'Downloading playlist %s - add --no-playlist to just download video' + % course_id) + + lesson_ids = set((lesson_id, )) + for lesson in re.findall( + r'(<a[^>]+\bclass=["\']lesson-link[^>]+>)', webpage): + attrs = extract_attributes(lesson) + if not attrs: + continue + lesson_url = attrs.get('href') + if not lesson_url: + continue + lesson_id = self._search_regex( + r'/lessons/(\d+)', lesson_url, 'lesson id', default=None) + if not lesson_id: + continue + lesson_ids.add(lesson_id) + + entries = [] + for lesson_id in sorted(lesson_ids): + entries.append(self.url_result( + smuggle_url(urljoin(url, lesson_id), {'force_video': True}), + ie=RayWenderlichIE.ie_key())) + + title = self._search_regex( + r'class=["\']course-title[^>]+>([^<]+)', webpage, 'course title', + default=None) + + return self.playlist_result(entries, course_id, title) diff --git a/youtube_dl/extractor/redbulltv.py b/youtube_dl/extractor/redbulltv.py index 5d6cc36..2436036 100644 --- a/youtube_dl/extractor/redbulltv.py +++ b/youtube_dl/extractor/redbulltv.py @@ -5,135 +5,93 @@ from .common import InfoExtractor from ..compat import compat_HTTPError from ..utils import ( float_or_none, - int_or_none, - try_get, - # unified_timestamp, ExtractorError, ) class RedBullTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?redbull\.tv/(?:video|film|live)/(?:AP-\w+/segment/)?(?P<id>AP-\w+)' + _VALID_URL = r'https?://(?:www\.)?redbull\.tv/video/(?P<id>AP-\w+)' _TESTS = [{ # film - 'url': 'https://www.redbull.tv/video/AP-1Q756YYX51W11/abc-of-wrc', + 'url': 'https://www.redbull.tv/video/AP-1Q6XCDTAN1W11', 'md5': 'fb0445b98aa4394e504b413d98031d1f', 'info_dict': { - 'id': 'AP-1Q756YYX51W11', + 'id': 'AP-1Q6XCDTAN1W11', 'ext': 
'mp4', - 'title': 'ABC of...WRC', + 'title': 'ABC of... WRC - ABC of... S1E6', 'description': 'md5:5c7ed8f4015c8492ecf64b6ab31e7d31', 'duration': 1582.04, - # 'timestamp': 1488405786, - # 'upload_date': '20170301', }, }, { # episode - 'url': 'https://www.redbull.tv/video/AP-1PMT5JCWH1W11/grime?playlist=shows:shows-playall:web', + 'url': 'https://www.redbull.tv/video/AP-1PMHKJFCW1W11', 'info_dict': { - 'id': 'AP-1PMT5JCWH1W11', + 'id': 'AP-1PMHKJFCW1W11', 'ext': 'mp4', - 'title': 'Grime - Hashtags S2 E4', - 'description': 'md5:334b741c8c1ce65be057eab6773c1cf5', + 'title': 'Grime - Hashtags S2E4', + 'description': 'md5:b5f522b89b72e1e23216e5018810bb25', 'duration': 904.6, - # 'timestamp': 1487290093, - # 'upload_date': '20170217', - 'series': 'Hashtags', - 'season_number': 2, - 'episode_number': 4, }, 'params': { 'skip_download': True, }, - }, { - # segment - 'url': 'https://www.redbull.tv/live/AP-1R5DX49XS1W11/segment/AP-1QSAQJ6V52111/semi-finals', - 'info_dict': { - 'id': 'AP-1QSAQJ6V52111', - 'ext': 'mp4', - 'title': 'Semi Finals - Vans Park Series Pro Tour', - 'description': 'md5:306a2783cdafa9e65e39aa62f514fd97', - 'duration': 11791.991, - }, - 'params': { - 'skip_download': True, - }, - }, { - 'url': 'https://www.redbull.tv/film/AP-1MSKKF5T92111/in-motion', - 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) session = self._download_json( - 'https://api-v2.redbull.tv/session', video_id, + 'https://api.redbull.tv/v3/session', video_id, note='Downloading access token', query={ - 'build': '4.370.0', 'category': 'personal_computer', - 'os_version': '1.0', 'os_family': 'http', }) if session.get('code') == 'error': raise ExtractorError('%s said: %s' % ( self.IE_NAME, session['message'])) - auth = '%s %s' % (session.get('token_type', 'Bearer'), session['access_token']) + token = session['token'] try: - info = self._download_json( - 'https://api-v2.redbull.tv/content/%s' % video_id, + video = self._download_json( + 
'https://api.redbull.tv/v3/products/' + video_id, video_id, note='Downloading video information', - headers={'Authorization': auth} + headers={'Authorization': token} ) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 404: error_message = self._parse_json( - e.cause.read().decode(), video_id)['message'] + e.cause.read().decode(), video_id)['error'] raise ExtractorError('%s said: %s' % ( self.IE_NAME, error_message), expected=True) raise - video = info['video_product'] - - title = info['title'].strip() + title = video['title'].strip() formats = self._extract_m3u8_formats( - video['url'], video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') + 'https://dms.redbull.tv/v3/%s/%s/playlist.m3u8' % (video_id, token), + video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls') self._sort_formats(formats) subtitles = {} - for _, captions in (try_get( - video, lambda x: x['attachments']['captions'], - dict) or {}).items(): - if not captions or not isinstance(captions, list): - continue - for caption in captions: - caption_url = caption.get('url') - if not caption_url: - continue - ext = caption.get('format') - if ext == 'xml': - ext = 'ttml' - subtitles.setdefault(caption.get('lang') or 'en', []).append({ - 'url': caption_url, - 'ext': ext, - }) + for resource in video.get('resources', []): + if resource.startswith('closed_caption_'): + splitted_resource = resource.split('_') + if splitted_resource[2]: + subtitles.setdefault('en', []).append({ + 'url': 'https://resources.redbull.tv/%s/%s' % (video_id, resource), + 'ext': splitted_resource[2], + }) - subheading = info.get('subheading') + subheading = video.get('subheading') if subheading: title += ' - %s' % subheading return { 'id': video_id, 'title': title, - 'description': info.get('long_description') or info.get( + 'description': video.get('long_description') or video.get( 'short_description'), 'duration': float_or_none(video.get('duration'), scale=1000), - # 
'timestamp': unified_timestamp(info.get('published')), - 'series': info.get('show_title'), - 'season_number': int_or_none(info.get('season_number')), - 'episode_number': int_or_none(info.get('episode_number')), 'formats': formats, 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py index f36bc64..53b1c96 100644 --- a/youtube_dl/extractor/reddit.py +++ b/youtube_dl/extractor/reddit.py @@ -15,7 +15,7 @@ class RedditIE(InfoExtractor): _TEST = { # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/ 'url': 'https://v.redd.it/zv89llsvexdz', - 'md5': '655d06ace653ea3b87bccfb1b27ec99d', + 'md5': '0a070c53eba7ec4534d95a5a1259e253', 'info_dict': { 'id': 'zv89llsvexdz', 'ext': 'mp4', diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py index f70a752..879bcf8 100644 --- a/youtube_dl/extractor/redtube.py +++ b/youtube_dl/extractor/redtube.py @@ -16,12 +16,12 @@ class RedTubeIE(InfoExtractor): _VALID_URL = r'https?://(?:(?:www\.)?redtube\.com/|embed\.redtube\.com/\?.*?\bid=)(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.redtube.com/66418', - 'md5': '7b8c22b5e7098a3e1c09709df1126d2d', + 'md5': 'fc08071233725f26b8f014dba9590005', 'info_dict': { 'id': '66418', 'ext': 'mp4', 'title': 'Sucked on a toilet', - 'upload_date': '20120831', + 'upload_date': '20110811', 'duration': 596, 'view_count': int, 'age_limit': 18, @@ -46,9 +46,10 @@ class RedTubeIE(InfoExtractor): raise ExtractorError('Video %s has been removed' % video_id, expected=True) title = self._html_search_regex( - (r'<h1 class="videoTitle[^"]*">(?P<title>.+?)</h1>', - r'videoTitle\s*:\s*(["\'])(?P<title>)\1'), - webpage, 'title', group='title') + (r'<h(\d)[^>]+class="(?:video_title_text|videoTitle)[^"]*">(?P<title>(?:(?!\1).)+)</h\1>', + r'(?:videoTitle|title)\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',), + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) formats = [] sources = 
self._parse_json( @@ -87,12 +88,14 @@ class RedTubeIE(InfoExtractor): thumbnail = self._og_search_thumbnail(webpage) upload_date = unified_strdate(self._search_regex( - r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<', + r'<span[^>]+>ADDED ([^<]+)<', webpage, 'upload date', fatal=False)) - duration = int_or_none(self._search_regex( - r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) + duration = int_or_none(self._og_search_property( + 'video:duration', webpage, default=None) or self._search_regex( + r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None)) view_count = str_to_int(self._search_regex( - r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)', + (r'<div[^>]*>Views</div>\s*<div[^>]*>\s*([\d,.]+)', + r'<span[^>]*>VIEWS</span>\s*</td>\s*<td>\s*([\d,.]+)'), webpage, 'view count', fatal=False)) # No self-labeling, but they describe themselves as diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index bba25a2..be36acc 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -93,58 +93,11 @@ class RtlNlIE(InfoExtractor): meta = info.get('meta', {}) - # m3u8 streams are encrypted and may not be handled properly by older ffmpeg/avconv. - # To workaround this previously adaptive -> flash trick was used to obtain - # unencrypted m3u8 streams (see https://github.com/rg3/youtube-dl/issues/4118) - # and bypass georestrictions as well. - # Currently, unencrypted m3u8 playlists are (intentionally?) invalid and therefore - # unusable albeit can be fixed by simple string replacement (see - # https://github.com/rg3/youtube-dl/pull/6337) - # Since recent ffmpeg and avconv handle encrypted streams just fine encrypted - # streams are used now. 
videopath = material['videopath'] m3u8_url = meta.get('videohost', 'http://manifest.us.rtl.nl') + videopath formats = self._extract_m3u8_formats( m3u8_url, uuid, 'mp4', m3u8_id='hls', fatal=False) - - video_urlpart = videopath.split('/adaptive/')[1][:-5] - PG_URL_TEMPLATE = 'http://pg.us.rtl.nl/rtlxl/network/%s/progressive/%s.mp4' - - PG_FORMATS = ( - ('a2t', 512, 288), - ('a3t', 704, 400), - ('nettv', 1280, 720), - ) - - def pg_format(format_id, width, height): - return { - 'url': PG_URL_TEMPLATE % (format_id, video_urlpart), - 'format_id': 'pg-%s' % format_id, - 'protocol': 'http', - 'width': width, - 'height': height, - } - - if not formats: - formats = [pg_format(*pg_tuple) for pg_tuple in PG_FORMATS] - else: - pg_formats = [] - for format_id, width, height in PG_FORMATS: - try: - # Find hls format with the same width and height corresponding - # to progressive format and copy metadata from it. - f = next(f for f in formats if f.get('height') == height) - # hls formats may have invalid width - f['width'] = width - f_copy = f.copy() - f_copy.update(pg_format(format_id, width, height)) - pg_formats.append(f_copy) - except StopIteration: - # Missing hls format does mean that no progressive format with - # such width and height exists either. - pass - formats.extend(pg_formats) self._sort_formats(formats) thumbnails = [] diff --git a/youtube_dl/extractor/ruutu.py b/youtube_dl/extractor/ruutu.py index 6c09df2..9fa8688 100644 --- a/youtube_dl/extractor/ruutu.py +++ b/youtube_dl/extractor/ruutu.py @@ -53,6 +53,12 @@ class RuutuIE(InfoExtractor): 'age_limit': 0, }, }, + # Episode where <SourceFile> is "NOT-USED", but has other + # downloadable sources available. 
+ { + 'url': 'http://www.ruutu.fi/video/3193728', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -72,7 +78,7 @@ class RuutuIE(InfoExtractor): video_url = child.text if (not video_url or video_url in processed_urls or any(p in video_url for p in ('NOT_USED', 'NOT-USED'))): - return + continue processed_urls.append(video_url) ext = determine_ext(video_url) if ext == 'm3u8': diff --git a/youtube_dl/extractor/seznamzpravy.py b/youtube_dl/extractor/seznamzpravy.py index cf32d1e..6d4e3b7 100644 --- a/youtube_dl/extractor/seznamzpravy.py +++ b/youtube_dl/extractor/seznamzpravy.py @@ -159,7 +159,6 @@ class SeznamZpravyArticleIE(InfoExtractor): webpage = self._download_webpage(url, article_id) info = self._search_json_ld(webpage, article_id, default={}) - print(info) title = info.get('title') or self._og_search_title(webpage, fatal=False) description = info.get('description') or self._og_search_description(webpage) diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py index 547be8f..69951e3 100644 --- a/youtube_dl/extractor/sixplay.py +++ b/youtube_dl/extractor/sixplay.py @@ -4,7 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_parse_qs, + compat_str, + compat_urllib_parse_urlparse, +) from ..utils import ( determine_ext, int_or_none, @@ -48,6 +52,7 @@ class SixPlayIE(InfoExtractor): urls = [] quality_key = qualities(['lq', 'sd', 'hq', 'hd']) formats = [] + subtitles = {} for asset in clip_data['assets']: asset_url = asset.get('full_physical_path') protocol = asset.get('protocol') @@ -56,8 +61,11 @@ class SixPlayIE(InfoExtractor): urls.append(asset_url) container = asset.get('video_container') ext = determine_ext(asset_url) + if protocol == 'http_subtitle' or ext == 'vtt': + subtitles.setdefault('fr', []).append({'url': asset_url}) + continue if container == 'm3u8' or ext == 'm3u8': - if protocol == 'usp': + if protocol == 
'usp' and not compat_parse_qs(compat_urllib_parse_urlparse(asset_url).query).get('token', [None])[0]: asset_url = re.sub(r'/([^/]+)\.ism/[^/]*\.m3u8', r'/\1.ism/\1.m3u8', asset_url) formats.extend(self._extract_m3u8_formats( asset_url, video_id, 'mp4', 'm3u8_native', @@ -98,4 +106,5 @@ class SixPlayIE(InfoExtractor): 'duration': int_or_none(clip_data.get('duration')), 'series': get(lambda x: x['program']['title']), 'formats': formats, + 'subtitles': subtitles, } diff --git a/youtube_dl/extractor/sonyliv.py b/youtube_dl/extractor/sonyliv.py index c3078e2..58a8c0d 100644 --- a/youtube_dl/extractor/sonyliv.py +++ b/youtube_dl/extractor/sonyliv.py @@ -33,5 +33,8 @@ class SonyLIVIE(InfoExtractor): def _real_extract(self, url): brightcove_id = self._match_id(url) return self.url_result( - smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, {'geo_countries': ['IN']}), + smuggle_url(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, { + 'geo_countries': ['IN'], + 'referrer': url, + }), 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 97ff422..46332e5 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -157,8 +157,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'DQskPX1pntALRzMp4HSxya3Mc0AO66Ro' - _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + _CLIENT_ID = 'LvWovRaJZlWCHql0bISuum8Bd2KX79mb' @staticmethod def _extract_urls(webpage): diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index e6c2dcf..67500b6 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -3,7 +3,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ExtractorError +from ..utils import ( + ExtractorError, + parse_duration, + parse_resolution, + str_to_int, +) class SpankBangIE(InfoExtractor): @@ -15,7 +20,7 @@ class SpankBangIE(InfoExtractor): 'id': 
'3vvn', 'ext': 'mp4', 'title': 'fantasy solo', - 'description': 'Watch fantasy solo free HD porn video - 05 minutes - Babe,Masturbation,Solo,Toy - dillion harper masturbates on a bed free adult movies sexy clips.', + 'description': 'dillion harper masturbates on a bed', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'silly2587', 'age_limit': 18, @@ -32,36 +37,49 @@ class SpankBangIE(InfoExtractor): # mobile page 'url': 'http://m.spankbang.com/1o2de/video/can+t+remember+her+name', 'only_matching': True, + }, { + # 4k + 'url': 'https://spankbang.com/1vwqx/video/jade+kush+solo+4k', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, video_id, headers={ + 'Cookie': 'country=US' + }) if re.search(r'<[^>]+\bid=["\']video_removed', webpage): raise ExtractorError( 'Video %s is not available' % video_id, expected=True) - stream_key = self._html_search_regex( - r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', - webpage, 'stream key') - - formats = [{ - 'url': 'http://spankbang.com/_%s/%s/title/%sp__mp4' % (video_id, stream_key, height), - 'ext': 'mp4', - 'format_id': '%sp' % height, - 'height': int(height), - } for height in re.findall(r'<(?:span|li|p)[^>]+[qb]_(\d+)p', webpage)] - self._check_formats(formats, video_id) + formats = [] + for mobj in re.finditer( + r'stream_url_(?P<id>[^\s=]+)\s*=\s*(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage): + format_id, format_url = mobj.group('id', 'url') + f = parse_resolution(format_id) + f.update({ + 'url': format_url, + 'format_id': format_id, + }) + formats.append(f) self._sort_formats(formats) title = self._html_search_regex( r'(?s)<h1[^>]*>(.+?)</h1>', webpage, 'title') - description = self._og_search_description(webpage) + description = self._search_regex( + r'<div[^>]+\bclass=["\']bottom[^>]+>\s*<p>[^<]*</p>\s*<p>([^<]+)', + webpage, 'description', fatal=False) thumbnail = 
self._og_search_thumbnail(webpage) uploader = self._search_regex( r'class="user"[^>]*><img[^>]+>([^<]+)', webpage, 'uploader', default=None) + duration = parse_duration(self._search_regex( + r'<div[^>]+\bclass=["\']right_side[^>]+>\s*<span>([^<]+)', + webpage, 'duration', fatal=False)) + view_count = str_to_int(self._search_regex( + r'([\d,.]+)\s+plays', webpage, 'view count', fatal=False)) age_limit = self._rta_search(webpage) @@ -71,6 +89,8 @@ class SpankBangIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'uploader': uploader, + 'duration': duration, + 'view_count': view_count, 'formats': formats, 'age_limit': age_limit, } diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py index a9e34c0..fcaa5ac 100644 --- a/youtube_dl/extractor/streamango.py +++ b/youtube_dl/extractor/streamango.py @@ -4,8 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_chr from ..utils import ( determine_ext, + ExtractorError, int_or_none, js_to_json, ) @@ -32,12 +34,34 @@ class StreamangoIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'gone', }, { 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', 'only_matching': True, }] def _real_extract(self, url): + def decrypt_src(encoded, val): + ALPHABET = '=/+9876543210zyxwvutsrqponmlkjihgfedcbaZYXWVUTSRQPONMLKJIHGFEDCBA' + encoded = re.sub(r'[^A-Za-z0-9+/=]', '', encoded) + decoded = '' + sm = [None] * 4 + i = 0 + str_len = len(encoded) + while i < str_len: + for j in range(4): + sm[j % 4] = ALPHABET.index(encoded[i]) + i += 1 + char_code = ((sm[0] << 0x2) | (sm[1] >> 0x4)) ^ val + decoded += compat_chr(char_code) + if sm[2] != 0x40: + char_code = ((sm[1] & 0xf) << 0x4) | (sm[2] >> 0x2) + decoded += compat_chr(char_code) + if sm[3] != 0x40: + char_code = ((sm[2] & 0x3) << 0x6) | sm[3] + decoded += compat_chr(char_code) + return decoded + video_id = self._match_id(url) webpage = 
self._download_webpage(url, video_id) @@ -46,13 +70,26 @@ class StreamangoIE(InfoExtractor): formats = [] for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + mobj = re.search(r'(src\s*:\s*[^(]+\(([^)]*)\)[\s,]*)', format_) + if mobj is None: + continue + + format_ = format_.replace(mobj.group(0), '') + video = self._parse_json( - format_, video_id, transform_source=js_to_json, fatal=False) - if not video: + format_, video_id, transform_source=js_to_json, + fatal=False) or {} + + mobj = re.search( + r'([\'"])(?P<src>(?:(?!\1).)+)\1\s*,\s*(?P<val>\d+)', + mobj.group(1)) + if mobj is None: continue - src = video.get('src') + + src = decrypt_src(mobj.group('src'), int_or_none(mobj.group('val'))) if not src: continue + ext = determine_ext(src, default_ext=None) if video.get('type') == 'application/dash+xml' or ext == 'mpd': formats.extend(self._extract_mpd_formats( @@ -65,6 +102,16 @@ class StreamangoIE(InfoExtractor): 'height': int_or_none(video.get('height')), 'tbr': int_or_none(video.get('bitrate')), }) + + if not formats: + error = self._search_regex( + r'<p[^>]+\bclass=["\']lead[^>]+>(.+?)</p>', webpage, + 'error', default=None) + if not error and '>Sorry' in webpage: + error = 'Video %s is not available' % video_id + if error: + raise ExtractorError(error, expected=True) + self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/telebruxelles.py b/youtube_dl/extractor/telebruxelles.py index 5886e9c..a0353fe 100644 --- a/youtube_dl/extractor/telebruxelles.py +++ b/youtube_dl/extractor/telebruxelles.py @@ -7,7 +7,7 @@ from .common import InfoExtractor class TeleBruxellesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(news|sport|dernier-jt|emission)/?(?P<id>[^/#?]+)' + _VALID_URL = r'https?://(?:www\.)?(?:telebruxelles|bx1)\.be/(?:[^/]+/)*(?P<id>[^/#?]+)' _TESTS = [{ 'url': 'http://bx1.be/news/que-risque-lauteur-dune-fausse-alerte-a-la-bombe/', 'md5': 'a2a67a5b1c3e8c9d33109b902f474fd9', @@ -31,6 +31,16 
@@ class TeleBruxellesIE(InfoExtractor): }, { 'url': 'http://bx1.be/emission/bxenf1-gastronomie/', 'only_matching': True, + }, { + 'url': 'https://bx1.be/berchem-sainte-agathe/personnel-carrefour-de-berchem-sainte-agathe-inquiet/', + 'only_matching': True, + }, { + 'url': 'https://bx1.be/dernier-jt/', + 'only_matching': True, + }, { + # live stream + 'url': 'https://bx1.be/lives/direct-tv/', + 'only_matching': True, }] def _real_extract(self, url): @@ -38,22 +48,29 @@ class TeleBruxellesIE(InfoExtractor): webpage = self._download_webpage(url, display_id) article_id = self._html_search_regex( - r"<article id=\"post-(\d+)\"", webpage, 'article ID', default=None) + r'<article[^>]+\bid=["\']post-(\d+)', webpage, 'article ID', default=None) title = self._html_search_regex( - r'<h1 class=\"entry-title\">(.*?)</h1>', webpage, 'title') + r'<h1[^>]*>(.+?)</h1>', webpage, 'title', + default=None) or self._og_search_title(webpage) description = self._og_search_description(webpage, default=None) rtmp_url = self._html_search_regex( - r'file\s*:\s*"(rtmp://[^/]+/vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*".mp4)"', + r'file["\']?\s*:\s*"(r(?:tm|mt)ps?://[^/]+/(?:vod/mp4:"\s*\+\s*"[^"]+"\s*\+\s*"\.mp4|stream/live))"', webpage, 'RTMP url') + # Yes, they have a typo in scheme name for live stream URLs (e.g. 
+ # https://bx1.be/lives/direct-tv/) + rtmp_url = re.sub(r'^rmtp', 'rtmp', rtmp_url) rtmp_url = re.sub(r'"\s*\+\s*"', '', rtmp_url) formats = self._extract_wowza_formats(rtmp_url, article_id or display_id) self._sort_formats(formats) + is_live = 'stream/live' in rtmp_url + return { 'id': article_id or display_id, 'display_id': display_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'description': description, 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/telequebec.py b/youtube_dl/extractor/telequebec.py index fafaa82..6965c12 100644 --- a/youtube_dl/extractor/telequebec.py +++ b/youtube_dl/extractor/telequebec.py @@ -10,19 +10,33 @@ from ..utils import ( ) -class TeleQuebecIE(InfoExtractor): +class TeleQuebecBaseIE(InfoExtractor): + @staticmethod + def _limelight_result(media_id): + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'limelight:media:' + media_id, {'geo_countries': ['CA']}), + 'ie_key': 'LimelightMedia', + } + + +class TeleQuebecIE(TeleQuebecBaseIE): _VALID_URL = r'https?://zonevideo\.telequebec\.tv/media/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://zonevideo.telequebec.tv/media/20984/le-couronnement-de-new-york/couronnement-de-new-york', - 'md5': 'fe95a0957e5707b1b01f5013e725c90f', + # available till 01.01.2023 + 'url': 'http://zonevideo.telequebec.tv/media/37578/un-petit-choc-et-puis-repart/un-chef-a-la-cabane', 'info_dict': { - 'id': '20984', + 'id': '577116881b4b439084e6b1cf4ef8b1b3', 'ext': 'mp4', - 'title': 'Le couronnement de New York', - 'description': 'md5:f5b3d27a689ec6c1486132b2d687d432', - 'upload_date': '20170201', - 'timestamp': 1485972222, - } + 'title': 'Un petit choc et puis repart!', + 'description': 'md5:b04a7e6b3f74e32d7b294cffe8658374', + 'upload_date': '20180222', + 'timestamp': 1519326631, + }, + 'params': { + 'skip_download': True, + }, }, { # no description 'url': 'http://zonevideo.telequebec.tv/media/30261', @@ -31,19 +45,107 @@ class 
TeleQuebecIE(InfoExtractor): def _real_extract(self, url): media_id = self._match_id(url) + media_data = self._download_json( 'https://mnmedias.api.telequebec.tv/api/v2/media/' + media_id, media_id)['media'] - return { - '_type': 'url_transparent', - 'id': media_id, - 'url': smuggle_url( - 'limelight:media:' + media_data['streamInfo']['sourceId'], - {'geo_countries': ['CA']}), - 'title': media_data['title'], + + info = self._limelight_result(media_data['streamInfo']['sourceId']) + info.update({ + 'title': media_data.get('title'), 'description': try_get( media_data, lambda x: x['descriptions'][0]['text'], compat_str), 'duration': int_or_none( media_data.get('durationInMilliseconds'), 1000), - 'ie_key': 'LimelightMedia', + }) + return info + + +class TeleQuebecEmissionIE(TeleQuebecBaseIE): + _VALID_URL = r'''(?x) + https?:// + (?: + [^/]+\.telequebec\.tv/emissions/| + (?:www\.)?telequebec\.tv/ + ) + (?P<id>[^?#&]+) + ''' + _TESTS = [{ + 'url': 'http://lindicemcsween.telequebec.tv/emissions/100430013/des-soins-esthetiques-a-377-d-interets-annuels-ca-vous-tente', + 'info_dict': { + 'id': '66648a6aef914fe3badda25e81a4d50a', + 'ext': 'mp4', + 'title': "Des soins esthétiques à 377 % d'intérêts annuels, ça vous tente?", + 'description': 'md5:369e0d55d0083f1fc9b71ffb640ea014', + 'upload_date': '20171024', + 'timestamp': 1508862118, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://bancpublic.telequebec.tv/emissions/emission-49/31986/jeunes-meres-sous-pression', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/masha-et-michka/epi059masha-et-michka-3-053-078', + 'only_matching': True, + }, { + 'url': 'http://www.telequebec.tv/documentaire/bebes-sur-mesure/', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + media_id = self._search_regex( + r'mediaUID\s*:\s*["\'][Ll]imelight_(?P<id>[a-z0-9]{32})', webpage, + 'limelight id') + + 
info = self._limelight_result(media_id) + info.update({ + 'title': self._og_search_title(webpage, default=None), + 'description': self._og_search_description(webpage, default=None), + }) + return info + + +class TeleQuebecLiveIE(InfoExtractor): + _VALID_URL = r'https?://zonevideo\.telequebec\.tv/(?P<id>endirect)' + _TEST = { + 'url': 'http://zonevideo.telequebec.tv/endirect/', + 'info_dict': { + 'id': 'endirect', + 'ext': 'mp4', + 'title': 're:^Télé-Québec - En direct [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + m3u8_url = None + webpage = self._download_webpage( + 'https://player.telequebec.tv/Tq_VideoPlayer.js', video_id, + fatal=False) + if webpage: + m3u8_url = self._search_regex( + r'm3U8Url\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'm3u8 url', default=None, group='url') + if not m3u8_url: + m3u8_url = 'https://teleqmmd.mmdlive.lldns.net/teleqmmd/f386e3b206814e1f8c8c1c71c0f8e748/manifest.m3u8' + formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', m3u8_id='hls') + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._live_title('Télé-Québec - En direct'), + 'is_live': True, + 'formats': formats, } diff --git a/youtube_dl/extractor/tennistv.py b/youtube_dl/extractor/tennistv.py new file mode 100644 index 0000000..0c6f707 --- /dev/null +++ b/youtube_dl/extractor/tennistv.py @@ -0,0 +1,112 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + unified_timestamp, +) + + +class TennisTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?tennistv\.com/videos/(?P<id>[-a-z0-9]+)' + _TEST = { + 'url': 'https://www.tennistv.com/videos/indian-wells-2018-verdasco-fritz', + 'info_dict': { + 'id': 'indian-wells-2018-verdasco-fritz', + 'ext': 'mp4', + 'title': 'Fernando Verdasco v 
Taylor Fritz', + 'description': 're:^After his stunning victory.{174}$', + 'thumbnail': 'https://atp-prod.akamaized.net/api/images/v1/images/112831/landscape/1242/0', + 'timestamp': 1521017381, + 'upload_date': '20180314', + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires email and password of a subscribed account', + } + _NETRC_MACHINE = 'tennistv' + + def _login(self): + (username, password) = self._get_login_info() + if not username or not password: + raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) + + login_form = { + 'Email': username, + 'Password': password, + } + login_json = json.dumps(login_form).encode('utf-8') + headers = { + 'content-type': 'application/json', + 'Referer': 'https://www.tennistv.com/login', + 'Origin': 'https://www.tennistv.com', + } + + login_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/login', None, + note='Logging in', + errnote='Login failed (wrong password?)', + headers=headers, + data=login_json) + + if login_result['error']['errorCode']: + raise ExtractorError('Login failed, %s said: %r' % (self.IE_NAME, login_result['error']['errorMessage'])) + + if login_result['entitlement'] != 'SUBSCRIBED': + self.report_warning('%s may not be subscribed to %s.' 
% (username, self.IE_NAME)) + + self._session_token = login_result['sessionToken'] + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + internal_id = self._search_regex(r'video=([0-9]+)', webpage, 'internal video id') + + headers = { + 'Origin': 'https://www.tennistv.com', + 'authorization': 'ATP %s' % self._session_token, + 'content-type': 'application/json', + 'Referer': url, + } + check_data = { + 'videoID': internal_id, + 'VideoUrlType': 'HLSV3', + } + check_json = json.dumps(check_data).encode('utf-8') + check_result = self._download_json( + 'https://www.tennistv.com/api/users/v1/entitlementchecknondiva', + video_id, note='Checking video authorization', headers=headers, data=check_json) + formats = self._extract_m3u8_formats(check_result['contentUrl'], video_id, ext='mp4') + + vdata_url = 'https://www.tennistv.com/api/channels/v1/de/none/video/%s' % video_id + vdata = self._download_json(vdata_url, video_id) + + timestamp = unified_timestamp(vdata['timestamp']) + thumbnail = vdata['video']['thumbnailUrl'] + description = vdata['displayText']['description'] + title = vdata['video']['title'] + + series = vdata['tour'] + venue = vdata['displayText']['venue'] + round_str = vdata['seo']['round'] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'formats': formats, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + 'series': series, + 'season': venue, + 'episode': round_str, + } diff --git a/youtube_dl/extractor/toggle.py b/youtube_dl/extractor/toggle.py index 348d6ec..5e5efda 100644 --- a/youtube_dl/extractor/toggle.py +++ b/youtube_dl/extractor/toggle.py @@ -132,7 +132,7 @@ class ToggleIE(InfoExtractor): formats = [] for video_file in info.get('Files', []): video_url, vid_format = video_file.get('URL'), video_file.get('Format') - if not video_url or not vid_format: + if not video_url or video_url == 'NA' or not 
vid_format: continue ext = determine_ext(video_url) vid_format = vid_format.replace(' ', '') @@ -143,6 +143,18 @@ class ToggleIE(InfoExtractor): note='Downloading %s m3u8 information' % vid_format, errnote='Failed to download %s m3u8 information' % vid_format, fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id=vid_format, + note='Downloading %s MPD manifest' % vid_format, + errnote='Failed to download %s MPD manifest' % vid_format, + fatal=False)) + elif ext == 'ism': + formats.extend(self._extract_ism_formats( + video_url, video_id, ism_id=vid_format, + note='Downloading %s ISM manifest' % vid_format, + errnote='Failed to download %s ISM manifest' % vid_format, + fatal=False)) elif ext in ('mp4', 'wvm'): # wvm are drm-protected files formats.append({ diff --git a/youtube_dl/extractor/tvnow.py b/youtube_dl/extractor/tvnow.py index e2169f2..1bf4724 100644 --- a/youtube_dl/extractor/tvnow.py +++ b/youtube_dl/extractor/tvnow.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..compat import compat_str from ..utils import ( ExtractorError, + int_or_none, parse_iso8601, parse_duration, update_url_query, @@ -16,8 +17,9 @@ from ..utils import ( class TVNowBaseIE(InfoExtractor): _VIDEO_FIELDS = ( 'id', 'title', 'free', 'geoblocked', 'articleLong', 'articleShort', - 'broadcastStartDate', 'isDrm', 'duration', 'manifest.dashclear', - 'format.defaultImage169Format', 'format.defaultImage169Logo') + 'broadcastStartDate', 'isDrm', 'duration', 'season', 'episode', + 'manifest.dashclear', 'format.title', 'format.defaultImage169Format', + 'format.defaultImage169Logo') def _call_api(self, path, video_id, query): return self._download_json( @@ -66,6 +68,10 @@ class TVNowBaseIE(InfoExtractor): 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, + 'series': f.get('title'), + 'season_number': int_or_none(info.get('season')), + 'episode_number': int_or_none(info.get('episode')), + 'episode': title, 
'formats': formats, } @@ -74,18 +80,21 @@ class TVNowIE(TVNowBaseIE): _VALID_URL = r'https?://(?:www\.)?tvnow\.(?:de|at|ch)/(?:rtl(?:2|plus)?|nitro|superrtl|ntv|vox)/(?P<show_id>[^/]+)/(?:(?:list/[^/]+|jahr/\d{4}/\d{1,2})/)?(?P<id>[^/]+)/(?:player|preview)' _TESTS = [{ - # rtl - 'url': 'https://www.tvnow.de/rtl/alarm-fuer-cobra-11/freier-fall/player?return=/rtl', + 'url': 'https://www.tvnow.de/rtl2/grip-das-motormagazin/der-neue-porsche-911-gt-3/player', 'info_dict': { - 'id': '385314', - 'display_id': 'alarm-fuer-cobra-11/freier-fall', + 'id': '331082', + 'display_id': 'grip-das-motormagazin/der-neue-porsche-911-gt-3', 'ext': 'mp4', - 'title': 'Freier Fall', - 'description': 'md5:8c2d8f727261adf7e0dc18366124ca02', + 'title': 'Der neue Porsche 911 GT 3', + 'description': 'md5:6143220c661f9b0aae73b245e5d898bb', 'thumbnail': r're:^https?://.*\.jpg$', - 'timestamp': 1512677700, - 'upload_date': '20171207', - 'duration': 2862.0, + 'timestamp': 1495994400, + 'upload_date': '20170528', + 'duration': 5283, + 'series': 'GRIP - Das Motormagazin', + 'season_number': 14, + 'episode_number': 405, + 'episode': 'Der neue Porsche 911 GT 3', }, }, { # rtl2 diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index 195f5ce..6d6c0a9 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -5,6 +5,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_HTTPError, + compat_kwargs, compat_str, compat_urllib_request, compat_urlparse, @@ -114,6 +115,11 @@ class UdemyIE(InfoExtractor): error_str += ' - %s' % error_data.get('formErrors') raise ExtractorError(error_str, expected=True) + def _download_webpage(self, *args, **kwargs): + kwargs.setdefault('headers', {})['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/603.2.4 (KHTML, like Gecko) Version/10.1.1 Safari/603.2.4' + return super(UdemyIE, self)._download_webpage( + *args, **compat_kwargs(kwargs)) + def _download_json(self, 
url_or_request, *args, **kwargs): headers = { 'X-Udemy-Snail-Case': 'true', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index b20dddc..071774a 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -12,7 +12,7 @@ from ..utils import ( class VeohIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' + _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|embed|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)' _TESTS = [{ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3', @@ -24,6 +24,9 @@ class VeohIE(InfoExtractor): 'uploader': 'LUMOback', 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ', }, + }, { + 'url': 'http://www.veoh.com/embed/v56314296nk7Zdmz3', + 'only_matching': True, }, { 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage', 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa', diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index bcc2869..5382586 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -5,56 +5,169 @@ import re import time import hashlib import json +import random from .adobepass import AdobePassIE from .youtube import YoutubeIE from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( + ExtractorError, int_or_none, parse_age_limit, str_or_none, - parse_duration, - ExtractorError, - extract_attributes, + try_get, ) -class ViceBaseIE(AdobePassIE): - def _extract_preplay_video(self, url, locale, webpage): - watch_hub_data = extract_attributes(self._search_regex( - r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) - video_id 
= watch_hub_data['vms-id'] - title = watch_hub_data['video-title'] +class ViceIE(AdobePassIE): + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:(?:video|vms)\.vice|(?:www\.)?viceland)\.com/(?P<locale>[^/]+)/(?:video/[^/]+|embed)/(?P<id>[\da-f]+)' + _TESTS = [{ + 'url': 'https://video.vice.com/en_us/video/pet-cremator/58c69e38a55424f1227dc3f7', + 'info_dict': { + 'id': '5e647f0125e145c9aef2069412c0cbde', + 'ext': 'mp4', + 'title': '10 Questions You Always Wanted To Ask: Pet Cremator', + 'description': 'md5:fe856caacf61fe0e74fab15ce2b07ca5', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1489664942, + 'upload_date': '20170316', + 'age_limit': 14, + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + # geo restricted to US + 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', + 'info_dict': { + 'id': '930c0ad1f47141cc955087eecaddb0e2', + 'ext': 'mp4', + 'uploader': 'waypoint', + 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', + 'uploader_id': '57f7d621e05ca860fa9ccaf9', + 'timestamp': 1477941983, + 'upload_date': '20161031', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + 'proxy': '127.0.0.1:8118', + }, + 'add_ie': ['UplynkPreplay'], + }, { + 
'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', + 'only_matching': True, + }, { + 'url': 'https://video.vice.com/en_us/embed/57f41d3556a0a80f54726060', + 'only_matching': True, + }, { + 'url': 'https://vms.vice.com/en_us/video/preplay/58c69e38a55424f1227dc3f7', + 'only_matching': True, + }, { + 'url': 'https://www.viceland.com/en_us/video/thursday-march-1-2018/5a8f2d7ff1cdb332dd446ec1', + 'only_matching': True, + }] + _PREPLAY_HOST = 'vms.vice' + + @staticmethod + def _extract_urls(webpage): + return re.findall( + r'<iframe\b[^>]+\bsrc=["\']((?:https?:)?//video\.vice\.com/[^/]+/embed/[\da-f]+)', + webpage) + + @staticmethod + def _extract_url(webpage): + urls = ViceIE._extract_urls(webpage) + return urls[0] if urls else None + + def _real_extract(self, url): + locale, video_id = re.match(self._VALID_URL, url).groups() + + webpage = self._download_webpage( + 'https://video.vice.com/%s/embed/%s' % (locale, video_id), + video_id) + + video = self._parse_json( + self._search_regex( + r'PREFETCH_DATA\s*=\s*({.+?})\s*;\s*\n', webpage, + 'app state'), video_id)['video'] + video_id = video.get('vms_id') or video.get('id') or video_id + title = video['title'] + is_locked = video.get('locked') + rating = video.get('rating') + thumbnail = video.get('thumbnail_url') + duration = int_or_none(video.get('duration')) + series = try_get( + video, lambda x: x['episode']['season']['show']['title'], + compat_str) + episode_number = try_get( + video, lambda x: x['episode']['episode_number']) + season_number = try_get( + video, lambda x: x['episode']['season']['season_number']) + uploader = None query = {} - is_locked = watch_hub_data.get('video-locked') == '1' if is_locked: resource = self._get_mvpd_resource( - 'VICELAND', title, video_id, - watch_hub_data.get('video-rating')) + 'VICELAND', title, video_id, rating) query['tvetoken'] = self._extract_mvpd_auth( url, video_id, 'VICELAND', resource) # signature generation algorithm is reverse 
engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in # https://www.viceland.com/assets/common/js/web.vendor.bundle.js - exp = int(time.time()) + 14400 + # new JS is located here https://vice-web-statics-cdn.vice.com/vice-player/player-embed.js + exp = int(time.time()) + 1440 + query.update({ 'exp': exp, 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + '_ad_blocked': None, + '_ad_unit': '', + '_debug': '', + 'platform': 'desktop', + 'rn': random.randint(10000, 100000), + 'fbprebidtoken': '', }) try: host = 'www.viceland' if is_locked else self._PREPLAY_HOST preplay = self._download_json( - 'https://%s.com/%s/preplay/%s' % (host, locale, video_id), + 'https://%s.com/%s/video/preplay/%s' % (host, locale, video_id), video_id, query=query) except ExtractorError as e: - if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 401): error = json.loads(e.cause.read().decode()) + error_message = error.get('error_description') or error['details'] raise ExtractorError('%s said: %s' % ( - self.IE_NAME, error['details']), expected=True) + self.IE_NAME, error_message), expected=True) raise video_data = preplay['video'] @@ -76,92 +189,22 @@ class ViceBaseIE(AdobePassIE): 'id': video_id, 'title': title, 'description': base.get('body') or base.get('display_body'), - 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), - 'duration': int_or_none(video_data.get('video_duration')) or parse_duration(watch_hub_data.get('video-duration')), + 'thumbnail': thumbnail, + 'duration': int_or_none(video_data.get('video_duration')) or duration, 'timestamp': int_or_none(video_data.get('created_at'), 1000), 'age_limit': parse_age_limit(video_data.get('video_rating')), - 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), - 'episode_number': int_or_none(episode.get('episode_number') or 
watch_hub_data.get('episode')), + 'series': video_data.get('show_title') or series, + 'episode_number': int_or_none(episode.get('episode_number') or episode_number), 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), - 'season_number': int_or_none(watch_hub_data.get('season')), + 'season_number': int_or_none(season_number), 'season_id': str_or_none(episode.get('season_id')), - 'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), + 'uploader': channel.get('base', {}).get('title') or channel.get('name') or uploader, 'uploader_id': str_or_none(channel.get('id')), 'subtitles': subtitles, 'ie_key': 'UplynkPreplay', } -class ViceIE(ViceBaseIE): - IE_NAME = 'vice' - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)' - - _TESTS = [{ - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2', - 'info_dict': { - 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', - 'ext': 'flv', - 'title': 'Monkey Labs of Holland', - 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', - }, - 'add_ie': ['Ooyala'], - }, { - 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'info_dict': { - 'id': '5816510690b70e6c5fd39a56', - 'ext': 'mp4', - 'uploader': 'Waypoint', - 'title': 'The Signal From Tölva', - 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', - 'uploader_id': '57f7d621e05ca860fa9ccaf9', - 'timestamp': 1477941983, - 'upload_date': '20161031', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['UplynkPreplay'], - }, { - 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', - 'info_dict': { - 'id': '581b12b60a0e1f4c0fb6ea2f', - 'ext': 'mp4', - 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', - 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns 
die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', - 'uploader': 'VICE', - 'uploader_id': '57a204088cb727dec794c67b', - 'timestamp': 1485368119, - 'upload_date': '20170125', - 'age_limit': 14, - }, - 'params': { - # AES-encrypted m3u8 - 'skip_download': True, - }, - 'add_ie': ['UplynkPreplay'], - }, { - 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', - 'only_matching': True, - }] - _PREPLAY_HOST = 'video.vice' - - def _real_extract(self, url): - locale, video_id = re.match(self._VALID_URL, url).groups() - webpage, urlh = self._download_webpage_handle(url, video_id) - embed_code = self._search_regex( - r'embedCode=([^&\'"]+)', webpage, - 'ooyala embed code', default=None) - if embed_code: - return self.url_result('ooyala:%s' % embed_code, 'Ooyala') - youtube_id = self._search_regex( - r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) - if youtube_id: - return self.url_result(youtube_id, 'Youtube') - return self._extract_preplay_video(urlh.geturl(), locale, webpage) - - class ViceShowIE(InfoExtractor): IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' @@ -203,14 +246,15 @@ class ViceArticleIE(InfoExtractor): _TESTS = [{ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', 'info_dict': { - 'id': '58dc0a3dee202d2a0ccfcbd8', + 'id': '41eae2a47b174a1398357cec55f1f6fc', 'ext': 'mp4', 'title': 'Mormon War on Porn ', - 'description': 'md5:ad396a2481e7f8afb5ed486878421090', - 'uploader': 'VICE', - 'uploader_id': '57a204088cb727dec794c693', - 'timestamp': 1489160690, - 'upload_date': '20170310', + 'description': 'md5:6394a8398506581d0346b9ab89093fef', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1491883129, + 'upload_date': '20170411', + 'age_limit': 17, }, 'params': { # AES-encrypted m3u8 @@ -219,17 +263,35 @@ class ViceArticleIE(InfoExtractor): 'add_ie': ['UplynkPreplay'], 
}, { 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'md5': '7fe8ebc4fa3323efafc127b82bd821d9', 'info_dict': { 'id': '3jstaBeXgAs', 'ext': 'mp4', 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader_id': 'MotherboardTV', 'uploader': 'Motherboard', + 'uploader_id': 'MotherboardTV', 'upload_date': '20140529', }, 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/znm9dx/karley-sciortino-slutever-reloaded', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': 'e2ed435eb67e43efb66e6ef9a6930a88', + 'ext': 'mp4', + 'title': "Making The World's First Male Sex Doll", + 'description': 'md5:916078ef0e032d76343116208b6cc2c4', + 'uploader': 'vice', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1476919911, + 'upload_date': '20161019', + 'age_limit': 17, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ViceIE.ie_key()], }, { 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', 'only_matching': True, @@ -244,8 +306,8 @@ class ViceArticleIE(InfoExtractor): webpage = self._download_webpage(url, display_id) prefetch_data = self._parse_json(self._search_regex( - r'window\.__PREFETCH_DATA\s*=\s*({.*});', - webpage, 'prefetch data'), display_id) + r'__APP_STATE\s*=\s*({.+?})(?:\s*\|\|\s*{}\s*)?;\s*\n', + webpage, 'app state'), display_id)['pageData'] body = prefetch_data['body'] def _url_res(video_url, ie_key): @@ -256,6 +318,10 @@ class ViceArticleIE(InfoExtractor): 'ie_key': ie_key, } + vice_url = ViceIE._extract_url(webpage) + if vice_url: + return _url_res(vice_url, ViceIE.ie_key()) + embed_code = self._search_regex( r'embedCode=([^&\'"]+)', body, 'ooyala embed code', default=None) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py deleted file mode 100644 index bd60235..0000000 --- a/youtube_dl/extractor/viceland.py +++ /dev/null @@ -1,38 
+0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .vice import ViceBaseIE - - -class VicelandIE(ViceBaseIE): - _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)' - _TEST = { - 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', - 'info_dict': { - 'id': '588a70d0dba8a16007de7316', - 'ext': 'mp4', - 'title': 'TRAPPED (Series Trailer)', - 'description': 'md5:7a8e95c2b6cd86461502a2845e581ccf', - 'age_limit': 14, - 'timestamp': 1485474122, - 'upload_date': '20170126', - 'uploader_id': '57a204098cb727dec794c6a3', - 'uploader': 'Viceland', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['UplynkPreplay'], - 'skip': '404', - } - _PREPLAY_HOST = 'www.viceland' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - locale = mobj.group('locale') - webpage = self._download_webpage(url, video_id) - return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 01da32f..b48baf0 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -49,8 +49,8 @@ class VidioIE(InfoExtractor): thumbnail = clip.get('image') m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1', - webpage, 'hls url') + r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'hls url', group='url') formats = self._extract_m3u8_formats( m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') self._sort_formats(formats) diff --git a/youtube_dl/extractor/vidlii.py b/youtube_dl/extractor/vidlii.py new file mode 100644 index 0000000..f477425 --- /dev/null +++ b/youtube_dl/extractor/vidlii.py @@ -0,0 +1,125 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + 
get_element_by_id, + int_or_none, + strip_or_none, + unified_strdate, + urljoin, +) + + +class VidLiiIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidlii\.com/(?:watch|embed)\?.*?\bv=(?P<id>[0-9A-Za-z_-]{11})' + _TESTS = [{ + 'url': 'https://www.vidlii.com/watch?v=tJluaH4BJ3v', + 'md5': '9bf7d1e005dfa909b6efb0a1ff5175e2', + 'info_dict': { + 'id': 'tJluaH4BJ3v', + 'ext': 'mp4', + 'title': 'Vidlii is against me', + 'description': 'md5:fa3f119287a2bfb922623b52b1856145', + 'thumbnail': 're:https://.*.jpg', + 'uploader': 'APPle5auc31995', + 'uploader_url': 'https://www.vidlii.com/user/APPle5auc31995', + 'upload_date': '20171107', + 'duration': 212, + 'view_count': int, + 'comment_count': int, + 'average_rating': float, + 'categories': ['News & Politics'], + 'tags': ['Vidlii', 'Jan', 'Videogames'], + } + }, { + 'url': 'https://www.vidlii.com/embed?v=tJluaH4BJ3v&a=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://www.vidlii.com/watch?v=%s' % video_id, video_id) + + video_url = self._search_regex( + r'src\s*:\s*(["\'])(?P<url>(?:https?://)?(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + title = self._search_regex( + (r'<h1>([^<]+)</h1>', r'<title>([^<]+) - VidLii<'), webpage, + 'title') + + description = self._html_search_meta( + ('description', 'twitter:description'), webpage, + default=None) or strip_or_none( + get_element_by_id('des_text', webpage)) + + thumbnail = self._html_search_meta( + 'twitter:image', webpage, default=None) + if not thumbnail: + thumbnail_path = self._search_regex( + r'img\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'thumbnail', fatal=False, group='url') + if thumbnail_path: + thumbnail = urljoin(url, thumbnail_path) + + uploader = self._search_regex( + r'<div[^>]+class=["\']wt_person[^>]+>\s*<a[^>]+\bhref=["\']/user/[^>]+>([^<]+)', + webpage, 'uploader', fatal=False) + uploader_url = 'https://www.vidlii.com/user/%s' % 
uploader if uploader else None + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, default=None) or self._search_regex( + r'<date>([^<]+)', webpage, 'upload date', fatal=False)) + + duration = int_or_none(self._html_search_meta( + 'video:duration', webpage, 'duration', + default=None) or self._search_regex( + r'duration\s*:\s*(\d+)', webpage, 'duration', fatal=False)) + + view_count = int_or_none(self._search_regex( + (r'<strong>(\d+)</strong> views', + r'Views\s*:\s*<strong>(\d+)</strong>'), + webpage, 'view count', fatal=False)) + + comment_count = int_or_none(self._search_regex( + (r'<span[^>]+id=["\']cmt_num[^>]+>(\d+)', + r'Comments\s*:\s*<strong>(\d+)'), + webpage, 'comment count', fatal=False)) + + average_rating = float_or_none(self._search_regex( + r'rating\s*:\s*([\d.]+)', webpage, 'average rating', fatal=False)) + + category = self._html_search_regex( + r'<div>Category\s*:\s*</div>\s*<div>\s*<a[^>]+>([^<]+)', webpage, + 'category', fatal=False) + categories = [category] if category else None + + tags = [ + strip_or_none(tag) + for tag in re.findall( + r'<a[^>]+\bhref=["\']/results\?.*?q=[^>]*>([^<]+)', + webpage) if strip_or_none(tag) + ] or None + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'uploader_url': uploader_url, + 'upload_date': upload_date, + 'duration': duration, + 'view_count': view_count, + 'comment_count': comment_count, + 'average_rating': average_rating, + 'categories': categories, + 'tags': tags, + } diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index ac35d55..9026e77 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -13,7 +13,7 @@ from ..utils import ( class VidziIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vidzi\.(?:tv|cc)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _VALID_URL = 
r'https?://(?:www\.)?vidzi\.(?:tv|cc|si)/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', @@ -32,6 +32,9 @@ class VidziIE(InfoExtractor): }, { 'url': 'http://vidzi.cc/cghql9yq6emu.html', 'only_matching': True, + }, { + 'url': 'https://vidzi.si/rph9gztxj1et.html', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index 6af7056..0825714 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -41,21 +41,30 @@ class VimeoBaseInfoExtractor(InfoExtractor): if self._LOGIN_REQUIRED: raise ExtractorError('No login info available, needed for using %s.' % self.IE_NAME, expected=True) return - self.report_login() - webpage = self._download_webpage(self._LOGIN_URL, None, False) + webpage = self._download_webpage( + self._LOGIN_URL, None, 'Downloading login page') token, vuid = self._extract_xsrft_and_vuid(webpage) - data = urlencode_postdata({ + data = { 'action': 'login', 'email': username, 'password': password, 'service': 'vimeo', 'token': token, - }) - login_request = sanitized_Request(self._LOGIN_URL, data) - login_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - login_request.add_header('Referer', self._LOGIN_URL) + } self._set_vimeo_cookie('vuid', vuid) - self._download_webpage(login_request, None, False, 'Wrong login info') + try: + self._download_webpage( + self._LOGIN_URL, None, 'Logging in', + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': self._LOGIN_URL, + }) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 418: + raise ExtractorError( + 'Unable to log in: bad username or password', + expected=True) + raise ExtractorError('Unable to log in') def _verify_video_password(self, url, video_id, webpage): password = 
self._downloader.params.get('videopassword') @@ -218,7 +227,7 @@ class VimeoIE(VimeoBaseInfoExtractor): 'id': '56015672', 'ext': 'mp4', 'title': "youtube-dl test video - \u2605 \" ' \u5e78 / \\ \u00e4 \u21ad \U0001d550", - 'description': 'md5:2d3305bad981a06ff79f027f19865021', + 'description': 'md5:509a9ad5c9bf97c60faee9203aca4479', 'timestamp': 1355990239, 'upload_date': '20121220', 'uploader_url': r're:https?://(?:www\.)?vimeo\.com/user7108434', diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index b8ea503..b50d4f1 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -99,10 +99,10 @@ class VKIE(VKBaseIE): _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', - 'md5': '0deae91935c54e00003c2a00646315f0', + 'md5': '7babad3b85ea2e91948005b1b8b0cb84', 'info_dict': { 'id': '162222515', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'ProtivoGunz - Хуёвая песня', 'uploader': 're:(?:Noize MC|Alexander Ilyashenko).*', 'duration': 195, diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py index 68652a2..d1bc992 100644 --- a/youtube_dl/extractor/xhamster.py +++ b/youtube_dl/extractor/xhamster.py @@ -39,7 +39,7 @@ class XHamsterIE(InfoExtractor): 'uploader': 'Ruseful2011', 'duration': 893, 'age_limit': 18, - 'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy'], + 'categories': ['Fake Hub', 'Amateur', 'MILFs', 'POV', 'Beauti', 'Beauties', 'Beautiful', 'Boss', 'Office', 'Oral', 'Reality', 'Sexy', 'Taking'], }, }, { 'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd', diff --git a/youtube_dl/extractor/xnxx.py b/youtube_dl/extractor/xnxx.py index e0a6255..ac1ccc4 100644 --- a/youtube_dl/extractor/xnxx.py +++ b/youtube_dl/extractor/xnxx.py @@ -1,19 +1,29 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote +from 
..utils import ( + determine_ext, + int_or_none, + NO_DEFAULT, + str_to_int, +) class XNXXIE(InfoExtractor): _VALID_URL = r'https?://(?:video|www)\.xnxx\.com/video-?(?P<id>[0-9a-z]+)/' _TESTS = [{ 'url': 'http://www.xnxx.com/video-55awb78/skyrim_test_video', - 'md5': 'ef7ecee5af78f8b03dca2cf31341d3a0', + 'md5': '7583e96c15c0f21e9da3453d9920fbba', 'info_dict': { 'id': '55awb78', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Skyrim Test Video', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 469, + 'view_count': int, 'age_limit': 18, }, }, { @@ -26,23 +36,49 @@ class XNXXIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) - video_url = self._search_regex(r'flv_url=(.*?)&', - webpage, 'video URL') - video_url = compat_urllib_parse_unquote(video_url) + def get(meta, default=NO_DEFAULT, fatal=True): + return self._search_regex( + r'set%s\s*\(\s*(["\'])(?P<value>(?:(?!\1).)+)\1' % meta, + webpage, meta, default=default, fatal=fatal, group='value') + + title = self._og_search_title( + webpage, default=None) or get('VideoTitle') - video_title = self._html_search_regex(r'<title>(.*?)\s+-\s+XNXX.COM', - webpage, 'title') + formats = [] + for mobj in re.finditer( + r'setVideo(?:Url(?P<id>Low|High)|HLS)\s*\(\s*(?P<q>["\'])(?P<url>(?:https?:)?//.+?)(?P=q)', webpage): + format_url = mobj.group('url') + if determine_ext(format_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', entry_protocol='m3u8_native', + preference=1, m3u8_id='hls', fatal=False)) + else: + format_id = mobj.group('id') + if format_id: + format_id = format_id.lower() + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': -1 if format_id == 'low' else 0, + }) + self._sort_formats(formats) - video_thumbnail = self._search_regex(r'url_bigthumb=(.*?)&', - webpage, 'thumbnail', fatal=False) + thumbnail = self._og_search_thumbnail(webpage, default=None) or get( + 'ThumbUrl', 
fatal=False) or get('ThumbUrl169', fatal=False) + duration = int_or_none(self._og_search_property('duration', webpage)) + view_count = str_to_int(self._search_regex( + r'id=["\']nb-views-number[^>]+>([\d,.]+)', webpage, 'view count', + default=None)) return { 'id': video_id, - 'url': video_url, - 'title': video_title, - 'ext': 'flv', - 'thumbnail': video_thumbnail, + 'title': title, + 'thumbnail': thumbnail, + 'duration': duration, + 'view_count': view_count, 'age_limit': 18, + 'formats': formats, } diff --git a/youtube_dl/extractor/yapfiles.py b/youtube_dl/extractor/yapfiles.py new file mode 100644 index 0000000..7fafbf5 --- /dev/null +++ b/youtube_dl/extractor/yapfiles.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + qualities, + unescapeHTML, +) + + +class YapFilesIE(InfoExtractor): + _YAPFILES_URL = r'//(?:(?:www|api)\.)?yapfiles\.ru/get_player/*\?.*?\bv=(?P<id>\w+)' + _VALID_URL = r'https?:%s' % _YAPFILES_URL + _TESTS = [{ + # with hd + 'url': 'http://www.yapfiles.ru/get_player/?v=vMDE1NjcyNDUt0413', + 'md5': '2db19e2bfa2450568868548a1aa1956c', + 'info_dict': { + 'id': 'vMDE1NjcyNDUt0413', + 'ext': 'mp4', + 'title': 'Самый худший пароль WIFI', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 72, + }, + }, { + # without hd + 'url': 'https://api.yapfiles.ru/get_player/?uid=video_player_1872528&plroll=1&adv=1&v=vMDE4NzI1Mjgt690b', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [unescapeHTML(mobj.group('url')) for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?%s.*?)\1' + % YapFilesIE._YAPFILES_URL, webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, fatal=False) + + player_url = None + query = {} + if webpage: + player_url = self._search_regex( + 
r'player\.init\s*\(\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'player url', default=None, group='url') + + if not player_url: + player_url = 'http://api.yapfiles.ru/load/%s/' % video_id + query = { + 'md5': 'ded5f369be61b8ae5f88e2eeb2f3caff', + 'type': 'json', + 'ref': url, + } + + player = self._download_json( + player_url, video_id, query=query)['player'] + + playlist_url = player['playlist'] + title = player['title'] + thumbnail = player.get('poster') + + if title == 'Ролик удален' or 'deleted.jpg' in (thumbnail or ''): + raise ExtractorError( + 'Video %s has been removed' % video_id, expected=True) + + playlist = self._download_json( + playlist_url, video_id)['player']['main'] + + hd_height = int_or_none(player.get('hd')) + + QUALITIES = ('sd', 'hd') + quality_key = qualities(QUALITIES) + formats = [] + for format_id in QUALITIES: + is_hd = format_id == 'hd' + format_url = playlist.get( + 'file%s' % ('_hd' if is_hd else '')) + if not format_url or not isinstance(format_url, compat_str): + continue + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'quality': quality_key(format_id), + 'height': hd_height if is_hd else None, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'duration': int_or_none(player.get('length')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4305151..617be8e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1944,6 +1944,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break if codecs: dct.update(parse_codecs(codecs)) + if dct.get('acodec') == 'none' or dct.get('vcodec') == 'none': + dct['downloader_options'] = { + # Youtube throttles chunks >~10M + 'http_chunk_size': 10485760, + } formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] @@ -2446,7 +2451,7 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): class 
YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results|shared)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' @@ -2578,7 +2583,11 @@ class YoutubePlaylistsIE(YoutubePlaylistsBaseInfoExtractor): }] -class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): +class YoutubeSearchBaseInfoExtractor(YoutubePlaylistBaseInfoExtractor): + _VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' + + +class YoutubeSearchIE(SearchInfoExtractor, YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com searches' # there doesn't appear to be a real limit, for example if you search for # 'python' you get more than 8.000.000 results @@ -2612,8 +2621,7 @@ class YoutubeSearchIE(SearchInfoExtractor, YoutubePlaylistIE): raise ExtractorError( '[youtube] No video results', expected=True) - new_videos = self._ids_to_results(orderedSet(re.findall( - r'href="/watch\?v=(.{11})', html_content))) + new_videos = list(self._process_page(html_content)) videos += new_videos if not new_videos or len(videos) > limit: break @@ -2636,11 +2644,10 @@ class YoutubeSearchDateIE(YoutubeSearchIE): _EXTRA_QUERY_ARGS = {'search_sort': 'video_date_uploaded'} -class YoutubeSearchURLIE(YoutubePlaylistBaseInfoExtractor): +class YoutubeSearchURLIE(YoutubeSearchBaseInfoExtractor): IE_DESC = 'YouTube.com search URLs' IE_NAME = 'youtube:search_url' _VALID_URL = r'https?://(?:www\.)?youtube\.com/results\?(.*?&)?(?:search_query|q)=(?P<query>[^&]+)(?:[&]|$)' - _VIDEO_RE = 
r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})(?:[^"]*"[^>]+\btitle="(?P<title>[^"]+))?' _TESTS = [{ 'url': 'https://www.youtube.com/results?baz=bar&search_query=youtube-dl+test+video&filters=video&lclk=video', 'playlist_mincount': 5, diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 523bb5c..bb9020c 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -42,16 +42,19 @@ class ZDFIE(ZDFBaseIE): _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh') _TESTS = [{ - 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { - 'id': 'zdfmediathek-trailer-100', + 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', 'ext': 'mp4', - 'title': 'Die neue ZDFmediathek', - 'description': 'md5:3003d36487fb9a5ea2d1ff60beb55e8d', - 'duration': 30, - 'timestamp': 1477627200, - 'upload_date': '20161028', - } + 'title': 'Die Magie der Farben (2/2)', + 'description': 'md5:a89da10c928c6235401066b60a6d5c1a', + 'duration': 2615, + 'timestamp': 1465021200, + 'upload_date': '20160604', + }, + }, { + 'url': 'https://www.zdf.de/service-und-hilfe/die-neue-zdf-mediathek/zdfmediathek-trailer-100.html', + 'only_matching': True, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/die-lebenden-und-die-toten-1---ein-taunuskrimi-100.html', 'only_matching': True, diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 4c04550..7d1bbc0 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -478,6 +478,11 @@ def parseOpts(overrideArguments=None): '--no-resize-buffer', action='store_true', dest='noresizebuffer', default=False, help='Do not automatically adjust the buffer size. 
By default, the buffer size is automatically resized from an initial value of SIZE.') + downloader.add_option( + '--http-chunk-size', + dest='http_chunk_size', metavar='SIZE', default=None, + help='Size of a chunk for chunk-based HTTP downloading (e.g. 10485760 or 10M) (default is disabled). ' + 'May be useful for bypassing bandwidth throttling imposed by a webserver (experimental)') downloader.add_option( '--test', action='store_true', dest='test', default=False, diff --git a/youtube_dl/postprocessor/embedthumbnail.py b/youtube_dl/postprocessor/embedthumbnail.py index e606a58..56be914 100644 --- a/youtube_dl/postprocessor/embedthumbnail.py +++ b/youtube_dl/postprocessor/embedthumbnail.py @@ -31,7 +31,8 @@ class EmbedThumbnailPP(FFmpegPostProcessor): temp_filename = prepend_extension(filename, 'temp') if not info.get('thumbnails'): - raise EmbedThumbnailPPError('Thumbnail was not found. Nothing to do.') + self._downloader.to_screen('[embedthumbnail] There aren\'t any thumbnails to embed') + return [], info thumbnail_filename = info['thumbnails'][-1]['filename'] diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 2fe9cf5..027d127 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -82,7 +82,7 @@ def register_socks_protocols(): compiled_regex_type = type(re.compile('')) std_headers = { - 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:10.0) Gecko/20150101 Firefox/47.0 (Chrome)', + 'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:59.0) Gecko/20100101 Firefox/59.0 (Chrome)', 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Accept-Encoding': 'gzip, deflate', @@ -538,10 +538,22 @@ def sanitize_path(s): return os.path.join(*sanitized_path) -# Prepend protocol-less URLs with `http:` scheme in order to mitigate the number of -# unwanted failures due to missing protocol def sanitize_url(url): - return 'http:%s' % url if url.startswith('//') else url + # Prepend 
protocol-less URLs with `http:` scheme in order to mitigate + # the number of unwanted failures due to missing protocol + if url.startswith('//'): + return 'http:%s' % url + # Fix some common typos seen so far + COMMON_TYPOS = ( + # https://github.com/rg3/youtube-dl/issues/15649 + (r'^httpss://', r'https://'), + # https://bx1.be/lives/direct-tv/ + (r'^rmtp([es]?)://', r'rtmp\1://'), + ) + for mistake, fixup in COMMON_TYPOS: + if re.match(mistake, url): + return re.sub(mistake, fixup, url) + return url def sanitized_Request(url, *args, **kwargs): @@ -866,8 +878,8 @@ def _create_http_connection(ydl_handler, http_class, is_https, *args, **kwargs): # expected HTTP responses to meet HTTP/1.0 or later (see also # https://github.com/rg3/youtube-dl/issues/6727) if sys.version_info < (3, 0): - kwargs[b'strict'] = True - hc = http_class(*args, **kwargs) + kwargs['strict'] = True + hc = http_class(*args, **compat_kwargs(kwargs)) source_address = ydl_handler._params.get('source_address') if source_address is not None: sa = (source_address, 0) @@ -1199,6 +1211,11 @@ def unified_timestamp(date_str, day_first=True): if m: date_str = date_str[:-len(m.group('tz'))] + # Python only supports microseconds, so remove nanoseconds + m = re.search(r'^([0-9]{4,}-[0-9]{1,2}-[0-9]{1,2}T[0-9]{1,2}:[0-9]{1,2}:[0-9]{1,2}\.[0-9]{6})[0-9]+$', date_str) + if m: + date_str = m.group(1) + for expression in date_formats(day_first): try: dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) @@ -1677,6 +1694,28 @@ def parse_count(s): return lookup_unit_table(_UNIT_TABLE, s) +def parse_resolution(s): + if s is None: + return {} + + mobj = re.search(r'\b(?P<w>\d+)\s*[xX×]\s*(?P<h>\d+)\b', s) + if mobj: + return { + 'width': int(mobj.group('w')), + 'height': int(mobj.group('h')), + } + + mobj = re.search(r'\b(\d+)[pPiI]\b', s) + if mobj: + return {'height': int(mobj.group(1))} + + mobj = re.search(r'\b([48])[kK]\b', s) + if mobj: + return {'height': 
int(mobj.group(1)) * 540} + + return {} + + def month_by_name(name, lang='en'): """ Return the number of a month by (locale-independently) English name """ diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 8a2b57f..6ce11c3 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2018.01.27' +__version__ = '2018.03.14'