From: Rogério Brito <rbrito@ime.usp.br>
Date: Wed, 8 Nov 2017 06:34:09 +0000 (-0200)
Subject: Update upstream source from tag 'upstream/2017.11.06'
X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/421b7b220ea958384617bd7d339f188bf601280e?hp=cb145f3364e1a9778509df08a6404bf697d67b90

Update upstream source from tag 'upstream/2017.11.06'

Update to upstream version '2017.11.06'
with Debian dir 5931589e7c307c2de69c9e3c4a7d50442dd2bf0a
---

diff --git a/ChangeLog b/ChangeLog
index d728e4d..8af3682 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,64 @@
+version 2017.11.06
+
+Core
++ [extractor/common] Add protocol for f4m formats
+* [f4m] Prefer baseURL for relative URLs (#14660)
+* [extractor/common] Respect URL query in _extract_wowza_formats (#14645)
+
+Extractors
++ [hotstar:playlist] Add support for playlists (#12465)
+* [hotstar] Bypass geo restriction (#14672)
+- [22tracks] Remove extractor (#11024, #14628)
++ [skysport] Add support for ooyala videos protected with embed_token (#14641)
+* [gamespot] Extract formats referenced with new data fields (#14652)
+* [spankbang] Detect unavailable videos (#14644)
+
+
+version 2017.10.29
+
+Core
+* [extractor/common] Prefix format id for audio only HLS formats
++ [utils] Add support for zero years and months in parse_duration
+
+Extractors
+* [egghead] Fix extraction (#14388)
++ [fxnetworks] Extract series metadata (#14603)
++ [younow] Add support for younow.com (#9255, #9432, #12436)
+* [dctptv] Fix extraction (#14599)
+* [youtube] Restrict embed regex (#14600)
+* [vimeo] Restrict iframe embed regex (#14600)
+* [soundgasm] Improve extraction (#14588)
+- [myvideo] Remove extractor (#8557)
++ [nbc] Add support for classic-tv videos (#14575)
++ [vrtnu] Add support for cookies authentication and simplify (#11873)
++ [canvas] Add support for vrt.be/vrtnu (#11873)
+* [twitch:clips] Fix title extraction (#14566)
++ [ndtv] Add support for sub-sites (#14534)
+* [dramafever] Fix login error message extraction
++ [nick] Add support for more nickelodeon sites (no, dk, se, ch, fr, es, pt,
+  ro, hu) (#14553)
+
+
+version 2017.10.20
+
+Core
+* [downloader/fragment] Report warning instead of error on inconsistent
+  download state
+* [downloader/hls] Fix total fragments count when ad fragments exist
+
+Extractors
+* [parliamentliveuk] Fix extraction (#14524)
+* [soundcloud] Update client id (#14546)
++ [servus] Add support for servus.com (#14362)
++ [unity] Add support for unity3d.com (#14528)
+* [youtube] Replace youtube redirect URLs in description (#14517)
+* [pbs] Restrict direct video URL regular expression (#14519)
+* [drtv] Respect preference for direct HTTP formats (#14509)
++ [eporner] Add support for embed URLs (#14507)
+* [arte] Capture and output error message
+* [niconico] Improve uploader metadata extraction robustness (#14135)
+
+
 version 2017.10.15.1
 
 Core
@@ -834,7 +895,7 @@ version 2017.04.14
 
 Core
 + [downloader/hls] Add basic support for EXT-X-BYTERANGE tag (#10955)
-+ [adobepass] Improve Comcast and Verison login code (#10803)
++ [adobepass] Improve Comcast and Verizon login code (#10803)
 + [adobepass] Add support for Verizon (#10803)
 
 Extractors
diff --git a/README.md b/README.md
index 2879aad..ea321d5 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,5 @@
+[](https://travis-ci.org/rg3/youtube-dl)
+
 youtube-dl - download videos from youtube.com or other video platforms
 
 - [INSTALLATION](#installation)
diff --git a/README.txt b/README.txt
index a42d837..4b7adfd 100644
--- a/README.txt
+++ b/README.txt
@@ -1,3 +1,5 @@
+[Build Status]
+
youtube-dl - download videos from youtube.com or other video platforms - INSTALLATION diff --git a/docs/supportedsites.md b/docs/supportedsites.md index 7071450..6009df5 100644 --- a/docs/supportedsites.md +++ b/docs/supportedsites.md @@ -3,8 +3,6 @@ - **1up.com** - **20min** - **220.ro** - - **22tracks:genre** - - **22tracks:track** - **24video** - **3qsdn**: 3Q SDN - **3sat** @@ -342,6 +340,7 @@ - **HornBunny** - **HotNewHipHop** - **HotStar** + - **hotstar:playlist** - **Howcast** - **HowStuffWorks** - **HRTi** @@ -498,7 +497,6 @@ - **MySpace:album** - **MySpass** - **Myvi** - - **myvideo** (Currently broken) - **MyVidster** - **n-tv.de** - **natgeo** @@ -728,6 +726,7 @@ - **SenateISVP** - **SendtoNews** - **ServingSys** + - **Servus** - **Sexu** - **Shahid** - **Shared**: shared.sx @@ -887,6 +886,7 @@ - **UDNEmbed**: è¯åå½±é³ - **UKTVPlay** - **Unistra** + - **Unity** - **uol.com.br** - **uplynk** - **uplynk:preplay** @@ -975,6 +975,7 @@ - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl - **Vrak** - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be + - **VrtNU**: VrtNU.be - **vrv** - **vrv:series** - **VShare** @@ -1033,6 +1034,9 @@ - **YouJizz** - **youku**: ä¼é · - **youku:show** + - **YouNowChannel** + - **YouNowLive** + - **YouNowMoment** - **YouPorn** - **YourUpload** - **youtube**: YouTube.com diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index f18a823..686c63e 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -574,6 +574,32 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ self.ie._sort_formats(formats) expect_value(self, formats, expected_formats, None) + def test_parse_f4m_formats(self): + _TEST_CASES = [ + ( + # https://github.com/rg3/youtube-dl/issues/14660 + 'custom_base_url', + 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + [{ + 'manifest_url': 'http://api.new.livestream.com/accounts/6115179/events/6764928/videos/144884262.f4m', + 'ext': 'flv', + 'format_id': '2148', + 'protocol': 'f4m', + 'tbr': 2148, + 'width': 1280, + 'height': 720, + }] + ), + ] + + for f4m_file, f4m_url, expected_formats in _TEST_CASES: + with io.open('./test/testdata/f4m/%s.f4m' % f4m_file, + mode='r', encoding='utf-8') as f: + formats = self.ie._parse_f4m_formats( + compat_etree_fromstring(f.read().encode('utf-8')), + f4m_url, None) + self.ie._sort_formats(formats) + expect_value(self, formats, expected_formats, None) if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index efa73d0..cc13f79 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -540,6 +540,7 @@ class TestUtil(unittest.TestCase): self.assertEqual(parse_duration('87 Min.'), 5220) self.assertEqual(parse_duration('PT1H0.040S'), 3600.04) self.assertEqual(parse_duration('PT00H03M30SZ'), 210) + self.assertEqual(parse_duration('P0Y0M0DT0H4M20.880S'), 260.88) def test_fix_xml_ampersands(self): self.assertEqual( diff --git a/youtube-dl b/youtube-dl index 15c016a..3b69288 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index c8fde9a..fdb80f4 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -243,8 +243,17 @@ def remove_encrypted_media(media): media)) -def _add_ns(prop): - return '{http://ns.adobe.com/f4m/1.0}%s' % prop +def _add_ns(prop, ver=1): + return '{http://ns.adobe.com/f4m/%d.0}%s' % (ver, prop) + + +def get_base_url(manifest): + 
base_url = xpath_text( + manifest, [_add_ns('baseURL'), _add_ns('baseURL', 2)], + 'base URL', default=None) + if base_url: + base_url = base_url.strip() + return base_url class F4mFD(FragmentFD): @@ -330,13 +339,13 @@ class F4mFD(FragmentFD): rate, media = list(filter( lambda f: int(f[0]) == requested_bitrate, formats))[0] - base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) + # Prefer baseURL for relative URLs as per 11.2 of F4M 3.0 spec. + man_base_url = get_base_url(doc) or man_url + + base_url = compat_urlparse.urljoin(man_base_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - # From Adobe F4M 3.0 spec: - # The <baseURL> element SHALL be the base URL for all relative - # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said - # URLs should be relative to the location of the containing document. - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) + boot_info, bootstrap_url = self._parse_bootstrap_node( + bootstrap_node, man_base_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py index 7e891b9..93002e4 100644 --- a/youtube_dl/downloader/fragment.py +++ b/youtube_dl/downloader/fragment.py @@ -158,7 +158,7 @@ class FragmentFD(FileDownloader): if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))): self._read_ytdl_file(ctx) if ctx['fragment_index'] > 0 and resume_len == 0: - self.report_error( + self.report_warning( 'Inconsistent state of incomplete fragment download. ' 'Restarting from the beginning...') ctx['fragment_index'] = resume_len = 0 diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 7955ca5..1a6e226 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -88,6 +88,7 @@ class HlsFD(FragmentFD): if line.startswith('#'): if anvato_ad(line): ad_frags += 1 + ad_frag_next = True continue if ad_frag_next: ad_frag_next = False diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 5cde90c..ffc3218 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, + compat_str, compat_urllib_parse_urlparse, ) from ..utils import ( @@ -15,6 +16,7 @@ from ..utils import ( int_or_none, NO_DEFAULT, qualities, + try_get, unified_strdate, ) @@ -80,12 +82,15 @@ class ArteTVBaseIE(InfoExtractor): info = self._download_json(json_url, video_id) player_info = info['videoJsonPlayer'] - vsr = player_info['VSR'] - + vsr = try_get(player_info, lambda x: x['VSR'], dict) if not vsr: - raise ExtractorError( - 'Video %s is not available' % player_info.get('VID') or video_id, - expected=True) + error = None + if try_get(player_info, lambda x: x['custom_msg']['type']) == 'error': + error = try_get( + player_info, lambda x: x['custom_msg']['msg'], compat_str) + if not error: + error = 'Video %s is not available' % player_info.get('VID') or video_id + raise ExtractorError(error, expected=True) upload_date_str = player_info.get('shootingDate') if not upload_date_str: diff --git a/youtube_dl/extractor/azmedien.py b/youtube_dl/extractor/azmedien.py index f4e07d9..68f26e2 100644 --- a/youtube_dl/extractor/azmedien.py +++ b/youtube_dl/extractor/azmedien.py @@ -47,7 +47,7 @@ class AZMedienIE(AZMedienBaseIE): 'url': 
'http://www.telezueri.ch/62-show-zuerinews/13772-episode-sonntag-18-dezember-2016/32419-segment-massenabweisungen-beim-hiltl-club-wegen-pelzboom', 'info_dict': { 'id': '1_2444peh4', - 'ext': 'mov', + 'ext': 'mp4', 'title': 'Massenabweisungen beim Hiltl Club wegen Pelzboom', 'description': 'md5:9ea9dd1b159ad65b36ddcf7f0d7c76a8', 'uploader_id': 'TeleZ?ri', diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 6899f84..3faa760 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -1,16 +1,22 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor +from .gigya import GigyaBaseIE +from ..compat import compat_HTTPError from ..utils import ( - float_or_none, + ExtractorError, strip_or_none, + float_or_none, + int_or_none, + parse_iso8601, ) class CanvasIE(InfoExtractor): - _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet)/assets/(?P<id>m[dz]-ast-[^/?#&]+)' + _VALID_URL = r'https?://mediazone\.vrt\.be/api/v1/(?P<site_id>canvas|een|ketnet|vrtvideo)/assets/(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'https://mediazone.vrt.be/api/v1/ketnet/assets/md-ast-4ac54990-ce66-4d00-a8ca-9eac86f4c475', 'md5': '90139b746a0a9bd7bb631283f6e2a64e', @@ -166,3 +172,139 @@ class CanvasEenIE(InfoExtractor): 'title': title, 'description': self._og_search_description(webpage), } + + +class VrtNUIE(GigyaBaseIE): + IE_DESC = 'VrtNU.be' + _VALID_URL = r'https?://(?:www\.)?vrt\.be/(?P<site_id>vrtnu)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.vrt.be/vrtnu/a-z/postbus-x/1/postbus-x-s1a1/', + 'info_dict': { + 'id': 'pbs-pub-2e2d8c27-df26-45c9-9dc6-90c78153044d$vid-90c932b1-e21d-4fb8-99b1-db7b49cf74de', + 'ext': 'flv', + 'title': 'De zwarte weduwe', + 'description': 'md5:d90c21dced7db869a85db89a623998d4', + 'duration': 1457.04, + 'thumbnail': r're:^https?://.*\.jpg$', + 'season': '1', + 'season_number': 1, + 'episode_number': 1, + }, + 'skip': 'This video is only available for registered users' + }] + _NETRC_MACHINE = 'vrtnu' + _APIKEY = '3_0Z2HujMtiWq_pkAjgnS2Md2E11a1AwZjYiBETtwNE-EoEHDINgtnvcAOpNgmrVGy' + _CONTEXT_ID = 'R3595707040' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + auth_data = { + 'APIKey': self._APIKEY, + 'targetEnv': 'jssdk', + 'loginID': username, + 'password': password, + 'authMode': 'cookie', + } + + auth_info = self._gigya_login(auth_data) + + # Sometimes authentication fails for no good reason, retry + login_attempt = 1 + while login_attempt <= 3: + try: + # When requesting a token, no actual token is returned, but the + # necessary cookies are set. 
+ self._request_webpage( + 'https://token.vrt.be', + None, note='Requesting a token', errnote='Could not get a token', + headers={ + 'Content-Type': 'application/json', + 'Referer': 'https://www.vrt.be/vrtnu/', + }, + data=json.dumps({ + 'uid': auth_info['UID'], + 'uidsig': auth_info['UIDSignature'], + 'ts': auth_info['signatureTimestamp'], + 'email': auth_info['profile']['email'], + }).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + login_attempt += 1 + self.report_warning('Authentication failed') + self._sleep(1, None, msg_template='Waiting for %(timeout)s seconds before trying again') + else: + raise e + else: + break + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + title = self._html_search_regex( + r'(?ms)<h1 class="content__heading">(.+?)</h1>', + webpage, 'title').strip() + + description = self._html_search_regex( + r'(?ms)<div class="content__description">(.+?)</div>', + webpage, 'description', default=None) + + season = self._html_search_regex( + [r'''(?xms)<div\ class="tabs__tab\ tabs__tab--active">\s* + <span>seizoen\ (.+?)</span>\s* + </div>''', + r'<option value="seizoen (\d{1,3})" data-href="[^"]+?" selected>'], + webpage, 'season', default=None) + + season_number = int_or_none(season) + + episode_number = int_or_none(self._html_search_regex( + r'''(?xms)<div\ class="content__episode">\s* + <abbr\ title="aflevering">afl</abbr>\s*<span>(\d+)</span> + </div>''', + webpage, 'episode_number', default=None)) + + release_date = parse_iso8601(self._html_search_regex( + r'(?ms)<div class="content__broadcastdate">\s*<time\ datetime="(.+?)"', + webpage, 'release_date', default=None)) + + # If there's a ? 
or a # in the URL, remove them and everything after + clean_url = url.split('?')[0].split('#')[0].strip('/') + securevideo_url = clean_url + '.mssecurevideo.json' + + try: + video = self._download_json(securevideo_url, display_id) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + self.raise_login_required() + raise + + # We are dealing with a '../<show>.relevant' URL + redirect_url = video.get('url') + if redirect_url: + return self.url_result(self._proto_relative_url(redirect_url, 'https:')) + + # There is only one entry, but with an unknown key, so just get + # the first one + video_id = list(video.values())[0].get('videoid') + + return { + '_type': 'url_transparent', + 'url': 'https://mediazone.vrt.be/api/v1/vrtvideo/assets/%s' % video_id, + 'ie_key': CanvasIE.ie_key(), + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'season': season, + 'season_number': season_number, + 'episode_number': episode_number, + 'release_date': release_date, + } diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index a692406..e2d9f52 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -29,7 +29,10 @@ from ..compat import ( compat_urlparse, compat_xml_parse_error, ) -from ..downloader.f4m import remove_encrypted_media +from ..downloader.f4m import ( + get_base_url, + remove_encrypted_media, +) from ..utils import ( NO_DEFAULT, age_restricted, @@ -1239,11 +1242,8 @@ class InfoExtractor(object): media_nodes = remove_encrypted_media(media_nodes) if not media_nodes: return formats - base_url = xpath_text( - manifest, ['{http://ns.adobe.com/f4m/1.0}baseURL', '{http://ns.adobe.com/f4m/2.0}baseURL'], - 'base URL', default=None) - if base_url: - base_url = base_url.strip() + + manifest_base_url = get_base_url(manifest) bootstrap_info = xpath_element( manifest, ['{http://ns.adobe.com/f4m/1.0}bootstrapInfo', '{http://ns.adobe.com/f4m/2.0}bootstrapInfo'], @@ -1275,7 +1275,7 @@ class InfoExtractor(object): continue manifest_url = ( media_url if media_url.startswith('http://') or media_url.startswith('https://') - else ((base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) + else ((manifest_base_url or '/'.join(manifest_url.split('/')[:-1])) + '/' + media_url)) # If media_url is itself a f4m manifest do the recursive extraction # since bitrates in parent manifest (this one) and media_url manifest # may differ leading to inability to resolve the format by requested @@ -1310,6 +1310,7 @@ class InfoExtractor(object): 'url': manifest_url, 'manifest_url': manifest_url, 'ext': 'flv' if bootstrap_info is not None else None, + 'protocol': 'f4m', 'tbr': tbr, 'width': width, 'height': height, @@ -1401,7 +1402,7 @@ class InfoExtractor(object): media_url = media.get('URI') if media_url: format_id = [] - for v in (group_id, name): + for v in (m3u8_id, group_id, name): if v: format_id.append(v) f = { @@ -2233,27 +2234,35 @@ class InfoExtractor(object): return formats def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): + query = compat_urlparse.urlparse(url).query url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) url_base = self._search_regex( r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') http_base_url = '%s:%s' % ('http', url_base) formats = [] + + def manifest_url(manifest): + m_url = '%s/%s' % (http_base_url, manifest) + if query: + m_url += '?%s' % query + return m_url 
+ if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( - http_base_url + '/playlist.m3u8', video_id, 'mp4', + manifest_url('playlist.m3u8'), video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) if 'f4m' not in skip_protocols: formats.extend(self._extract_f4m_formats( - http_base_url + '/manifest.f4m', + manifest_url('manifest.f4m'), video_id, f4m_id='hds', fatal=False)) if 'dash' not in skip_protocols: formats.extend(self._extract_mpd_formats( - http_base_url + '/manifest.mpd', + manifest_url('manifest.mpd'), video_id, mpd_id='dash', fatal=False)) if re.search(r'(?:/smil:|\.smil)', url_base): if 'smil' not in skip_protocols: rtmp_formats = self._extract_smil_formats( - http_base_url + '/jwplayer.smil', + manifest_url('jwplayer.smil'), video_id, fatal=False) for rtmp_format in rtmp_formats: rtsp_format = rtmp_format.copy() diff --git a/youtube_dl/extractor/dctp.py b/youtube_dl/extractor/dctp.py index 00fbbff..3a6d056 100644 --- a/youtube_dl/extractor/dctp.py +++ b/youtube_dl/extractor/dctp.py @@ -2,53 +2,85 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import unified_strdate +from ..compat import compat_str +from ..utils import ( + float_or_none, + unified_strdate, +) class DctpTvIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(#/)?filme/(?P<id>.+?)/$' + _VALID_URL = r'https?://(?:www\.)?dctp\.tv/(?:#/)?filme/(?P<id>[^/?#&]+)' _TEST = { 'url': 'http://www.dctp.tv/filme/videoinstallation-fuer-eine-kaufhausfassade/', - 'md5': '174dd4a8a6225cf5655952f969cfbe24', 'info_dict': { 'id': '95eaa4f33dad413aa17b4ee613cccc6c', 'display_id': 'videoinstallation-fuer-eine-kaufhausfassade', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Videoinstallation für eine Kaufhausfassade', 'description': 'Kurzfilm', 'upload_date': '20110407', 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 71.24, + }, + 'params': { + # rtmp download + 'skip_download': True, }, } def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + display_id = self._match_id(url) - object_id = self._html_search_meta('DC.identifier', webpage) + webpage = self._download_webpage(url, display_id) - servers_json = self._download_json( - 'http://www.dctp.tv/elastic_streaming_client/get_streaming_server/', - video_id, note='Downloading server list') - server = servers_json[0]['server'] - m3u8_path = self._search_regex( - r'\'([^\'"]+/playlist\.m3u8)"', webpage, 'm3u8 path') - formats = self._extract_m3u8_formats( - 'http://%s%s' % (server, m3u8_path), video_id, ext='mp4', - entry_protocol='m3u8_native') + video_id = self._html_search_meta( + 'DC.identifier', webpage, 'video id', + default=None) or self._search_regex( + r'id=["\']uuid[^>]+>([^<]+)<', webpage, 'video id') title = self._og_search_title(webpage) + + servers = self._download_json( + 'http://www.dctp.tv/streaming_servers/', display_id, + note='Downloading server list', fatal=False) + + if servers: + endpoint = next( + server['endpoint'] + for server in servers + if isinstance(server.get('endpoint'), compat_str) and + 'cloudfront' in server['endpoint']) + else: + endpoint = 'rtmpe://s2pqqn4u96e4j8.cloudfront.net/cfx/st/' + + app = self._search_regex( + r'^rtmpe?://[^/]+/(?P<app>.*)$', endpoint, 'app') + + formats = [{ + 'url': endpoint, + 'app': app, + 'play_path': 'mp4:%s_dctp_0500_4x3.m4v' % video_id, + 'page_url': url, + 'player_url': 'http://svm-prod-dctptv-static.s3.amazonaws.com/dctptv-relaunch2012-109.swf', + 'ext': 'flv', + }] + 
description = self._html_search_meta('DC.description', webpage) upload_date = unified_strdate( self._html_search_meta('DC.date.created', webpage)) thumbnail = self._og_search_thumbnail(webpage) + duration = float_or_none(self._search_regex( + r'id=["\']duration_in_ms[^+]>(\d+)', webpage, 'duration', + default=None), scale=1000) return { - 'id': object_id, + 'id': video_id, 'title': title, 'formats': formats, - 'display_id': video_id, + 'display_id': display_id, 'description': description, 'upload_date': upload_date, 'thumbnail': thumbnail, + 'duration': duration, } diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py index 9a498d7..95883a0 100644 --- a/youtube_dl/extractor/dramafever.py +++ b/youtube_dl/extractor/dramafever.py @@ -59,7 +59,7 @@ class DramaFeverBaseIE(AMPIE): if all(logout_pattern not in response for logout_pattern in ['href="/accounts/logout/"', '>Log out<']): error = self._html_search_regex( - r'(?s)class="hidden-xs prompt"[^>]*>(.+?)<', + r'(?s)<h\d[^>]+\bclass="hidden-xs prompt"[^>]*>(.+?)</h\d', response, 'error message', default=None) if error: raise ExtractorError('Unable to login: %s' % error, expected=True) diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index 69effba..f757745 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -138,6 +138,7 @@ class DRTVIE(InfoExtractor): 'tbr': int_or_none(bitrate), 'ext': link.get('FileFormat'), 'vcodec': 'none' if kind == 'AudioResource' else None, + 'preference': preference, }) subtitles_list = asset.get('SubtitlesList') if isinstance(subtitles_list, list): diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py index e4a3046..edabaaf 100644 --- a/youtube_dl/extractor/egghead.py +++ b/youtube_dl/extractor/egghead.py @@ -2,7 +2,9 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + determine_ext, int_or_none, try_get, unified_timestamp, @@ -17,7 +19,7 @@ class EggheadCourseIE(InfoExtractor): 'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript', 'playlist_count': 29, 'info_dict': { - 'id': 'professor-frisby-introduces-composable-functional-javascript', + 'id': '72', 'title': 'Professor Frisby Introduces Composable Functional JavaScript', 'description': 're:(?s)^This course teaches the ubiquitous.*You\'ll start composing functionality before you know it.$', }, @@ -26,14 +28,28 @@ class EggheadCourseIE(InfoExtractor): def _real_extract(self, url): playlist_id = self._match_id(url) + lessons = self._download_json( + 'https://egghead.io/api/v1/series/%s/lessons' % playlist_id, + playlist_id, 'Downloading course lessons JSON') + + entries = [] + for lesson in lessons: + lesson_url = lesson.get('http_url') + if not lesson_url or not isinstance(lesson_url, compat_str): + continue + lesson_id = lesson.get('id') + if lesson_id: + lesson_id = compat_str(lesson_id) + entries.append(self.url_result( + lesson_url, ie=EggheadLessonIE.ie_key(), video_id=lesson_id)) + course = self._download_json( - 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id) + 'https://egghead.io/api/v1/series/%s' % playlist_id, + playlist_id, 'Downloading course JSON', fatal=False) or {} - entries = [ - self.url_result( - 'wistia:%s' % lesson['wistia_id'], ie='Wistia', - video_id=lesson['wistia_id'], video_title=lesson.get('title')) - for lesson in course['lessons'] if lesson.get('wistia_id')] + playlist_id = course.get('id') 
+ if playlist_id: + playlist_id = compat_str(playlist_id) return self.playlist_result( entries, playlist_id, course.get('title'), @@ -43,11 +59,12 @@ class EggheadCourseIE(InfoExtractor): class EggheadLessonIE(InfoExtractor): IE_DESC = 'egghead.io lesson' IE_NAME = 'egghead:lesson' - _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)' - _TEST = { + _VALID_URL = r'https://egghead\.io/(?:api/v1/)?lessons/(?P<id>[^/?#&]+)' + _TESTS = [{ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box', 'info_dict': { - 'id': 'fv5yotjxcg', + 'id': '1196', + 'display_id': 'javascript-linear-data-flow-with-container-style-types-box', 'ext': 'mp4', 'title': 'Create linear data flow with container style types (Box)', 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e', @@ -60,25 +77,51 @@ class EggheadLessonIE(InfoExtractor): }, 'params': { 'skip_download': True, + 'format': 'bestvideo', }, - } + }, { + 'url': 'https://egghead.io/api/v1/lessons/react-add-redux-to-a-react-application', + 'only_matching': True, + }] def _real_extract(self, url): - lesson_id = self._match_id(url) + display_id = self._match_id(url) lesson = self._download_json( - 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id) + 'https://egghead.io/api/v1/lessons/%s' % display_id, display_id) + + lesson_id = compat_str(lesson['id']) + title = lesson['title'] + + formats = [] + for _, format_url in lesson['media_urls'].items(): + if not format_url or not isinstance(format_url, compat_str): + continue + ext = determine_ext(format_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + format_url, lesson_id, 'mp4', entry_protocol='m3u8', + m3u8_id='hls', fatal=False)) + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + format_url, lesson_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': format_url, + }) + self._sort_formats(formats) return { - '_type': 'url_transparent', - 'ie_key': 'Wistia', - 'url': 'wistia:%s' % lesson['wistia_id'], - 'id': lesson['wistia_id'], - 'title': lesson.get('title'), + 'id': lesson_id, + 'display_id': display_id, + 'title': title, 'description': lesson.get('summary'), 'thumbnail': lesson.get('thumb_nail'), 'timestamp': unified_timestamp(lesson.get('published_at')), 'duration': int_or_none(lesson.get('duration')), 'view_count': int_or_none(lesson.get('plays_count')), 'tags': try_get(lesson, lambda x: x['tag_list'], list), + 'series': try_get( + lesson, lambda x: x['series']['title'], compat_str), + 'formats': formats, } diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index f3734e9..81f2e2e 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -15,7 +15,7 @@ from ..utils import ( class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/(?:hd-porn|embed)/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' 
_TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', @@ -35,6 +35,9 @@ class EpornerIE(InfoExtractor): }, { 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index ecb33bc..d084707 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -150,6 +150,7 @@ from .canalc2 import Canalc2IE from .canvas import ( CanvasIE, CanvasEenIE, + VrtNUIE, ) from .carambatv import ( CarambaTVIE, @@ -431,7 +432,10 @@ from .hitbox import HitboxIE, HitboxLiveIE from .hitrecord import HitRecordIE from .hornbunny import HornBunnyIE from .hotnewhiphop import HotNewHipHopIE -from .hotstar import HotStarIE +from .hotstar import ( + HotStarIE, + HotStarPlaylistIE, +) from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE from .hrti import ( @@ -623,7 +627,6 @@ from .mwave import MwaveIE, MwaveMeetGreetIE from .myspace import MySpaceIE, MySpaceAlbumIE from .myspass import MySpassIE from .myvi import MyviIE -from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import ( NationalGeographicVideoIE, @@ -925,6 +928,7 @@ from .seeker import SeekerIE from .senateisvp import SenateISVPIE from .sendtonews import SendtoNewsIE from .servingsys import ServingSysIE +from .servus import ServusIE from .sexu import SexuIE from .shahid import ShahidIE from .shared import ( @@ -1109,10 +1113,6 @@ from .tvplayer import TVPlayerIE from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE -from .twentytwotracks import ( - TwentyTwoTracksIE, - TwentyTwoTracksGenreIE -) from .twitch import ( TwitchVideoIE, TwitchChapterIE, @@ -1138,6 +1138,7 @@ from .udn import UDNEmbedIE from .uktvplay import UKTVPlayIE from .digiteka import DigitekaIE from .unistra import UnistraIE +from .unity import UnityIE from .uol import UOLIE from .uplynk import ( UplynkIE, @@ -1333,6 +1334,11 @@ from .youku import ( YoukuIE, YoukuShowIE, ) +from .younow import ( + YouNowLiveIE, + YouNowChannelIE, + YouNowMomentIE, +) from .youporn import YouPornIE from .yourupload import YourUploadIE from .youtube import ( diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py index 6298973..37549fb 100644 --- a/youtube_dl/extractor/fxnetworks.py +++ b/youtube_dl/extractor/fxnetworks.py @@ -3,27 +3,31 @@ from __future__ import unicode_literals from .adobepass import AdobePassIE from ..utils import ( - update_url_query, extract_attributes, + int_or_none, parse_age_limit, smuggle_url, + update_url_query, ) class FXNetworksIE(AdobePassIE): _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' _TESTS = [{ - 'url': 'http://www.fxnetworks.com/video/719841347694', - 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'url': 'http://www.fxnetworks.com/video/1032565827847', + 'md5': '8d99b97b4aa7a202f55b6ed47ea7e703', 'info_dict': { - 'id': '719841347694', + 'id': 'dRzwHC_MMqIv', 'ext': 'mp4', - 'title': 'Vanpage', - 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'title': 'First Look: Better Things - Season 2', + 'description': 'Because real life is like a fart. 
Watch this FIRST LOOK to see what inspired the new season of Better Things.', 'age_limit': 14, 'uploader': 'NEWA-FNG-FX', - 'upload_date': '20160706', - 'timestamp': 1467844741, + 'upload_date': '20170825', + 'timestamp': 1503686274, + 'episode_number': 0, + 'season_number': 2, + 'series': 'Better Things', }, 'add_ie': ['ThePlatform'], }, { @@ -64,6 +68,9 @@ class FXNetworksIE(AdobePassIE): 'id': video_id, 'title': title, 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'series': video_data.get('data-show-title'), + 'episode_number': int_or_none(video_data.get('data-episode')), + 'season_number': int_or_none(video_data.get('data-season')), 'thumbnail': video_data.get('data-large-thumb'), 'age_limit': parse_age_limit(rating), 'ie_key': 'ThePlatform', diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 02804d2..6d177cb 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -14,7 +14,7 @@ from ..utils import ( class GameSpotIE(OnceIE): - _VALID_URL = r'https?://(?:www\.)?gamespot\.com/.*-(?P<id>\d+)/?' + _VALID_URL = r'https?://(?:www\.)?gamespot\.com/videos/(?:[^/]+/\d+-|embed/)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.gamespot.com/videos/arma-3-community-guide-sitrep-i/2300-6410818/', 'md5': 'b2a30deaa8654fcccd43713a6b6a4825', @@ -35,6 +35,9 @@ class GameSpotIE(OnceIE): 'params': { 'skip_download': True, # m3u8 downloads }, + }, { + 'url': 'https://www.gamespot.com/videos/embed/6439218/', + 'only_matching': True, }] def _real_extract(self, url): @@ -52,7 +55,7 @@ class GameSpotIE(OnceIE): manifest_url = f4m_url formats.extend(self._extract_f4m_formats( f4m_url + '?hdcore=3.7.0', page_id, f4m_id='hds', fatal=False)) - m3u8_url = streams.get('m3u8_stream') + m3u8_url = dict_get(streams, ('m3u8_stream', 'adaptive_stream')) if m3u8_url: manifest_url = m3u8_url m3u8_formats = self._extract_m3u8_formats( @@ -60,7 +63,7 @@ class GameSpotIE(OnceIE): m3u8_id='hls', fatal=False) formats.extend(m3u8_formats) progressive_url = dict_get( - streams, ('progressive_hd', 'progressive_high', 'progressive_low')) + streams, ('progressive_hd', 'progressive_high', 'progressive_low', 'other_lr')) if progressive_url and manifest_url: qualities_basename = self._search_regex( r'/([^/]+)\.csmil/', diff --git a/youtube_dl/extractor/gigya.py b/youtube_dl/extractor/gigya.py new file mode 100644 index 0000000..4121784 --- /dev/null +++ b/youtube_dl/extractor/gigya.py @@ -0,0 +1,22 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor + +from ..utils import ( + ExtractorError, + urlencode_postdata, +) + + +class GigyaBaseIE(InfoExtractor): + def _gigya_login(self, auth_data): + auth_info = self._download_json( + 'https://accounts.eu1.gigya.com/accounts.login', None, + note='Logging in', errnote='Unable to log in', + data=urlencode_postdata(auth_data)) + + error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') + if error_message: + raise ExtractorError( + 'Unable to login: %s' % error_message, expected=True) + return auth_info diff --git a/youtube_dl/extractor/hotstar.py b/youtube_dl/extractor/hotstar.py index 3a7a66a..d28af36 100644 --- a/youtube_dl/extractor/hotstar.py +++ b/youtube_dl/extractor/hotstar.py @@ -1,22 +1,47 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, int_or_none, ) -class 
HotStarIE(InfoExtractor): +class HotStarBaseIE(InfoExtractor): + _GEO_COUNTRIES = ['IN'] + + def _download_json(self, *args, **kwargs): + response = super(HotStarBaseIE, self)._download_json(*args, **kwargs) + if response['resultCode'] != 'OK': + if kwargs.get('fatal'): + raise ExtractorError( + response['errorDescription'], expected=True) + return None + return response['resultObj'] + + def _download_content_info(self, content_id): + return self._download_json( + 'https://account.hotstar.com/AVS/besc', content_id, query={ + 'action': 'GetAggregatedContentDetails', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'contentId': content_id, + })['contentInfo'][0] + + +class HotStarIE(HotStarBaseIE): _VALID_URL = r'https?://(?:www\.)?hotstar\.com/(?:.+?[/-])?(?P<id>\d{10})' _TESTS = [{ 'url': 'http://www.hotstar.com/on-air-with-aib--english-1000076273', 'info_dict': { 'id': '1000076273', 'ext': 'mp4', - 'title': 'On Air With AIB - English', + 'title': 'On Air With AIB', 'description': 'md5:c957d8868e9bc793ccb813691cc4c434', 'timestamp': 1447227000, 'upload_date': '20151111', @@ -34,23 +59,11 @@ class HotStarIE(InfoExtractor): 'only_matching': True, }] - def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', fatal=True, query=None): - json_data = super(HotStarIE, self)._download_json( - url_or_request, video_id, note, fatal=fatal, query=query) - if json_data['resultCode'] != 'OK': - if fatal: - raise ExtractorError(json_data['errorDescription']) - return None - return json_data['resultObj'] - def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://account.hotstar.com/AVS/besc', video_id, query={ - 'action': 'GetAggregatedContentDetails', - 'channel': 'PCTV', - 'contentId': video_id, - })['contentInfo'][0] + + video_data = self._download_content_info(video_id) + title = video_data['episodeTitle'] if video_data.get('encrypted') == 'Y': @@ -99,3 +112,51 @@ class HotStarIE(InfoExtractor): 'episode_number': int_or_none(video_data.get('episodeNumber')), 'series': video_data.get('contentTitle'), } + + +class HotStarPlaylistIE(HotStarBaseIE): + IE_NAME = 'hotstar:playlist' + _VALID_URL = r'(?P<url>https?://(?:www\.)?hotstar\.com/tv/[^/]+/(?P<content_id>\d+))/(?P<type>[^/]+)/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/episodes/14812/9993', + 'info_dict': { + 'id': '14812', + }, + 'playlist_mincount': 75, + }, { + 'url': 'http://www.hotstar.com/tv/pratidaan/14982/popular-clips/9998/9998', + 'only_matching': True, + }] + _ITEM_TYPES = { + 'episodes': 'EPISODE', + 'popular-clips': 'CLIPS', + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + base_url = mobj.group('url') + content_id = mobj.group('content_id') + playlist_type = mobj.group('type') + + content_info = self._download_content_info(content_id) + playlist_id = compat_str(content_info['categoryId']) + + collection = self._download_json( + 'https://search.hotstar.com/AVS/besc', playlist_id, query={ + 'action': 'SearchContents', + 'appVersion': '5.0.40', + 'channel': 'PCTV', + 'moreFilters': 'series:%s;' % playlist_id, + 'query': '*', + 'searchOrder': 'last_broadcast_date desc,year desc,title asc', + 'type': self._ITEM_TYPES.get(playlist_type, 'EPISODE'), + }) + + entries = [ + self.url_result( + '%s/_/%s' % (base_url, video['contentId']), + ie=HotStarIE.ie_key(), video_id=video['contentId']) + for video in collection['response']['docs'] + if video.get('contentId')] + + return self.playlist_result(entries, 
playlist_id) diff --git a/youtube_dl/extractor/medialaan.py b/youtube_dl/extractor/medialaan.py index 4c32fbc..f8c3005 100644 --- a/youtube_dl/extractor/medialaan.py +++ b/youtube_dl/extractor/medialaan.py @@ -2,19 +2,18 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .gigya import GigyaBaseIE + from ..compat import compat_str from ..utils import ( - ExtractorError, int_or_none, parse_duration, try_get, unified_timestamp, - urlencode_postdata, ) -class MedialaanIE(InfoExtractor): +class MedialaanIE(GigyaBaseIE): _VALID_URL = r'''(?x) https?:// (?:www\.|nieuws\.)? @@ -119,15 +118,7 @@ class MedialaanIE(InfoExtractor): 'password': password, } - auth_info = self._download_json( - 'https://accounts.eu1.gigya.com/accounts.login', None, - note='Logging in', errnote='Unable to log in', - data=urlencode_postdata(auth_data)) - - error_message = auth_info.get('errorDetails') or auth_info.get('errorMessage') - if error_message: - raise ExtractorError( - 'Unable to login: %s' % error_message, expected=True) + auth_info = self._gigya_login(auth_data) self._uid = auth_info['UID'] self._uid_signature = auth_info['UIDSignature'] diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py index 60e3caf..5bafa6c 100644 --- a/youtube_dl/extractor/megaphone.py +++ b/youtube_dl/extractor/megaphone.py @@ -18,7 +18,7 @@ class MegaphoneIE(InfoExtractor): 'id': 'GLT9749789991', 'ext': 'mp3', 'title': '#97 What Kind Of Idiot Gets Phished?', - 'thumbnail': 're:^https://.*\.png.*$', + 'thumbnail': r're:^https://.*\.png.*$', 'duration': 1776.26375, 'author': 'Reply All', }, diff --git a/youtube_dl/extractor/myvideo.py b/youtube_dl/extractor/myvideo.py deleted file mode 100644 index 367e811..0000000 --- a/youtube_dl/extractor/myvideo.py +++ /dev/null @@ -1,177 +0,0 @@ -from __future__ import unicode_literals - -import binascii -import base64 -import hashlib -import re -import json - -from .common import InfoExtractor -from ..compat import ( - compat_ord, - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - sanitized_Request, -) - - -class MyVideoIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'https?://(?:www\.)?myvideo\.de/(?:[^/]+/)?watch/(?P<id>[0-9]+)/[^?/]+.*' - IE_NAME = 'myvideo' - _TEST = { - 'url': 'http://www.myvideo.de/watch/8229274/bowling_fail_or_win', - 'md5': '2d2753e8130479ba2cb7e0a37002053e', - 'info_dict': { - 'id': '8229274', - 'ext': 'flv', - 'title': 'bowling-fail-or-win', - } - } - - # Original Code from: https://github.com/dersphere/plugin.video.myvideo_de.git - # Released into the Public Domain by Tristan Fischer on 2013-05-19 - # https://github.com/rg3/youtube-dl/pull/842 - def __rc4crypt(self, data, key): - x = 0 - box = list(range(256)) - for i in list(range(256)): - x = (x + box[i] + compat_ord(key[i % len(key)])) % 256 - box[i], box[x] = box[x], box[i] - x = 0 - y = 0 - out = '' - for char in data: - x = (x + 1) % 256 - y = (y + box[x]) % 256 - box[x], box[y] = box[y], box[x] - out += chr(compat_ord(char) ^ box[(box[x] + box[y]) % 256]) - return out - - def __md5(self, s): - return hashlib.md5(s).hexdigest().encode() - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - GK = ( - b'WXpnME1EZGhNRGhpTTJNM01XVmhOREU0WldNNVpHTTJOakpt' - b'TW1FMU5tVTBNR05pWkRaa05XRXhNVFJoWVRVd1ptSXhaVEV3' - b'TnpsbA0KTVRkbU1tSTRNdz09' - ) - - # Get video webpage - webpage_url = 'http://www.myvideo.de/watch/%s' % video_id - webpage 
= self._download_webpage(webpage_url, video_id) - - mobj = re.search('source src=\'(.+?)[.]([^.]+)\'', webpage) - if mobj is not None: - self.report_extraction(video_id) - video_url = mobj.group(1) + '.flv' - - video_title = self._html_search_regex('<title>([^<]+)</title>', - webpage, 'title') - - return { - 'id': video_id, - 'url': video_url, - 'title': video_title, - } - - mobj = re.search(r'data-video-service="/service/data/video/%s/config' % video_id, webpage) - if mobj is not None: - request = sanitized_Request('http://www.myvideo.de/service/data/video/%s/config' % video_id, '') - response = self._download_webpage(request, video_id, - 'Downloading video info') - info = json.loads(base64.b64decode(response).decode('utf-8')) - return { - 'id': video_id, - 'title': info['title'], - 'url': info['streaming_url'].replace('rtmpe', 'rtmpt'), - 'play_path': info['filename'], - 'ext': 'flv', - 'thumbnail': info['thumbnail'][0]['url'], - } - - # try encxml - mobj = re.search('var flashvars={(.+?)}', webpage) - if mobj is None: - raise ExtractorError('Unable to extract video') - - params = {} - encxml = '' - sec = mobj.group(1) - for (a, b) in re.findall('(.+?):\'(.+?)\',?', sec): - if not a == '_encxml': - params[a] = b - else: - encxml = compat_urllib_parse_unquote(b) - if not params.get('domain'): - params['domain'] = 'www.myvideo.de' - xmldata_url = '%s?%s' % (encxml, compat_urllib_parse_urlencode(params)) - if 'flash_playertype=MTV' in xmldata_url: - self._downloader.report_warning('avoiding MTV player') - xmldata_url = ( - 'http://www.myvideo.de/dynamic/get_player_video_xml.php' - '?flash_playertype=D&ID=%s&_countlimit=4&autorun=yes' - ) % video_id - - # get enc data - enc_data = self._download_webpage(xmldata_url, video_id).split('=')[1] - enc_data_b = binascii.unhexlify(enc_data) - sk = self.__md5( - base64.b64decode(base64.b64decode(GK)) + - self.__md5( - str(video_id).encode('utf-8') - ) - ) - dec_data = self.__rc4crypt(enc_data_b, sk) - - # extracting infos - self.report_extraction(video_id) - - video_url = None - mobj = re.search('connectionurl=\'(.*?)\'', dec_data) - if mobj: - video_url = compat_urllib_parse_unquote(mobj.group(1)) - if 'myvideo2flash' in video_url: - self.report_warning( - 'Rewriting URL to use unencrypted rtmp:// ...', - video_id) - video_url = video_url.replace('rtmpe://', 'rtmp://') - - if not video_url: - # extract non rtmp videos - mobj = re.search('path=\'(http.*?)\' source=\'(.*?)\'', dec_data) - if mobj is None: - raise ExtractorError('unable to extract url') - video_url = compat_urllib_parse_unquote(mobj.group(1)) + compat_urllib_parse_unquote(mobj.group(2)) - - video_file = self._search_regex('source=\'(.*?)\'', dec_data, 'video file') - video_file = compat_urllib_parse_unquote(video_file) - - if not video_file.endswith('f4m'): - ppath, prefix = video_file.split('.') - video_playpath = '%s:%s' % (prefix, ppath) - else: - video_playpath = '' - - video_swfobj = self._search_regex(r'swfobject\.embedSWF\(\'(.+?)\'', webpage, 'swfobj') - video_swfobj = compat_urllib_parse_unquote(video_swfobj) - - video_title = self._html_search_regex("<h1(?: class='globalHd')?>(.*?)</h1>", - webpage, 'title') - - return { - 'id': video_id, - 'url': video_url, - 'tc_url': video_url, - 'title': video_title, - 'ext': 'flv', - 'play_path': video_playpath, - 'player_url': video_swfobj, - } diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index 35151f5..554dec3 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -15,7 +15,7 @@ from 
..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))' + _VALID_URL = r'https?(?P<permalink>://(?:www\.)?nbc\.com/(?:classic-tv/)?[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ { @@ -67,7 +67,11 @@ class NBCIE(AdobePassIE): 'skip_download': True, }, 'skip': 'Only works from US', - } + }, + { + 'url': 'https://www.nbc.com/classic-tv/charles-in-charge/video/charles-in-charge-pilot/n3310', + 'only_matching': True, + }, ] def _real_extract(self, url): diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index 255f608..ddec89f 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,45 +1,106 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_unquote_plus +) from ..utils import ( - int_or_none, + parse_duration, remove_end, unified_strdate, + urljoin ) class NDTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ndtv\.com/video/(?:[^/]+/)+[^/?^&]+-(?P<id>\d+)' - - _TEST = { - 'url': 'http://www.ndtv.com/video/news/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal-300710', - 'md5': '39f992dbe5fb531c395d8bbedb1e5e88', - 'info_dict': { - 'id': '300710', - 'ext': 'mp4', - 'title': "NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal", - 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', - 'upload_date': '20131208', - 'duration': 1327, - 'thumbnail': r're:https?://.*\.jpg', - }, - } + _VALID_URL = r'https?://(?:[^/]+\.)?ndtv\.com/(?:[^/]+/)*videos?/?(?:[^/]+/)*[^/?^&]+-(?P<id>\d+)' + + _TESTS = [ + { + 'url': 'https://khabar.ndtv.com/video/show/prime-time/prime-time-ill-system-and-poor-education-468818', + 'md5': '78efcf3880ef3fd9b83d405ca94a38eb', + 'info_dict': { + 'id': '468818', + 'ext': 'mp4', + 'title': "पà¥à¤°à¤¾à¤à¤® à¤à¤¾à¤à¤®: सिसà¥à¤à¤® बà¥à¤®à¤¾à¤°, सà¥à¤à¥à¤² बदहाल", + 'description': 'md5:f410512f1b49672e5695dea16ef2731d', + 'upload_date': '20170928', + 'duration': 2218, + 'thumbnail': r're:https?://.*\.jpg', + } + }, + { + # __filename is url + 'url': 'http://movies.ndtv.com/videos/cracker-free-diwali-wishes-from-karan-johar-kriti-sanon-other-stars-470304', + 'md5': 'f1d709352305b44443515ac56b45aa46', + 'info_dict': { + 'id': '470304', + 'ext': 'mp4', + 'title': "Cracker-Free Diwali Wishes From Karan Johar, Kriti Sanon & Other Stars", + 'description': 'md5:f115bba1adf2f6433fa7c1ade5feb465', + 'upload_date': '20171019', + 'duration': 137, + 'thumbnail': r're:https?://.*\.jpg', + } + }, + { + 'url': 'https://www.ndtv.com/video/news/news/delhi-s-air-quality-status-report-after-diwali-is-very-poor-470372', + 'only_matching': True + }, + { + 'url': 'https://auto.ndtv.com/videos/the-cnb-daily-october-13-2017-469935', + 'only_matching': True + }, + { + 'url': 'https://sports.ndtv.com/cricket/videos/2nd-t20i-rock-thrown-at-australia-cricket-team-bus-after-win-over-india-469764', + 'only_matching': True + }, + { + 'url': 'http://gadgets.ndtv.com/videos/uncharted-the-lost-legacy-review-465568', + 'only_matching': True + }, + { + 'url': 'http://profit.ndtv.com/videos/news/video-indian-economy-on-very-solid-track-international-monetary-fund-chief-470040', + 'only_matching': True + }, + { + 'url': 'http://food.ndtv.com/video-basil-seeds-coconut-porridge-419083', + 'only_matching': True + }, + { + 'url': 'https://doctor.ndtv.com/videos/top-health-stories-of-the-week-467396', + 'only_matching': True + }, 
+ { + 'url': 'https://swirlster.ndtv.com/video/how-to-make-friends-at-work-469324', + 'only_matching': True + } + ] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - NDTV') + # '__title' does not contain extra words such as sub-site name, "Video" etc. + title = compat_urllib_parse_unquote_plus( + self._search_regex(r"__title\s*=\s*'([^']+)'", webpage, 'title', default=None) or + self._og_search_title(webpage)) filename = self._search_regex( - r"__filename='([^']+)'", webpage, 'video filename') - video_url = 'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename + r"(?:__)?filename\s*[:=]\s*'([^']+)'", webpage, 'video filename') + # in "movies" sub-site pages, filename is URL + video_url = urljoin('https://ndtvod.bc-ssl.cdn.bitgravity.com/23372/ndtv/', filename.lstrip('/')) - duration = int_or_none(self._search_regex( - r"__duration='([^']+)'", webpage, 'duration', fatal=False)) + # "doctor" sub-site has MM:SS format + duration = parse_duration(self._search_regex( + r"(?:__)?duration\s*[:=]\s*'([^']+)'", webpage, 'duration', fatal=False)) + # "sports", "doctor", "swirlster" sub-sites don't have 'publish-date' upload_date = unified_strdate(self._html_search_meta( - 'publish-date', webpage, 'upload date', fatal=False)) + 'publish-date', webpage, 'upload date', default=None) or self._html_search_meta( + 'uploadDate', webpage, 'upload date', default=None) or self._search_regex( + r'datePublished"\s*:\s*"([^"]+)"', webpage, 'upload date', fatal=False)) description = remove_end(self._og_search_description(webpage), ' (Read more)') diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index 510b1c4..310eea2 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -75,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor): class NickDeIE(MTVServicesInfoExtractor): IE_NAME = 'nick.de' - _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl|ch)|nickelodeon\.(?:nl|be|at|dk|no|se))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse', 'only_matching': True, @@ -91,6 +91,21 @@ class NickDeIE(MTVServicesInfoExtractor): }, { 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.no/program/2626-bulderhuset/videoer/90947-femteklasse-veronica-vs-vanzilla', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.dk/serier/2626-hojs-hus/videoer/761-tissepause', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.se/serier/2626-lugn-i-stormen/videos/998-', + 'only_matching': True, + }, { + 'url': 'http://www.nick.ch/shows/2304-adventure-time-abenteuerzeit-mit-finn-und-jake', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.be/afspeellijst/4530-top-videos/videos/episode/73917-inval-broodschapper-lariekoek-arie', + 'only_matching': True, }] def _extract_mrss_url(self, webpage, host): @@ -132,13 +147,28 @@ class NickNightIE(NickDeIE): class NickRuIE(MTVServicesInfoExtractor): IE_NAME = 'nickelodeonru' - _VALID_URL = r'https?://(?:www\.)nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = 
r'https?://(?:www\.)nickelodeon\.(?:ru|fr|es|pt|ro|hu)/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6', 'only_matching': True, }, { 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7', 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.fr/programmes/bob-l-eponge/videos/le-marathon-de-booh-kini-bottom-mardi-31-octobre/nfn7z0', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.es/videos/nickelodeon-consejos-tortitas/f7w7xy', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.pt/series/spongebob-squarepants/videos/a-bolha-de-tinta-gigante/xutq1b', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.ro/emisiuni/shimmer-si-shine/video/nahal-din-bomboane/uw5u2k', + 'only_matching': True, + }, { + 'url': 'http://www.nickelodeon.hu/musorok/spongyabob-kockanadrag/videok/episodes/buborekfujas-az-elszakadt-nadrag/q57iob#playlist/k6te4y', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 026329d..df7f528 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -40,7 +40,7 @@ class NiconicoIE(InfoExtractor): 'uploader': 'takuya0301', 'uploader_id': '2698420', 'upload_date': '20131123', - 'timestamp': 1385182762, + 'timestamp': int, # timestamp is unstable 'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', 'duration': 33, 'view_count': int, @@ -115,8 +115,8 @@ class NiconicoIE(InfoExtractor): 'skip': 'Requires an account', }, { # "New" HTML5 video + # md5 is unstable 'url': 'http://www.nicovideo.jp/watch/sm31464864', - 'md5': '351647b4917660986dc0fa8864085135', 'info_dict': { 'id': 'sm31464864', 'ext': 'mp4', @@ -124,7 +124,7 @@ class NiconicoIE(InfoExtractor): 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb', 'timestamp': 1498514060, 'upload_date': '20170626', - 'uploader': 'ã²ã¹', + 'uploader': 'ã²ã¹ã', 'uploader_id': '40826363', 'thumbnail': r're:https?://.*', 'duration': 198, @@ -132,6 +132,25 @@ class NiconicoIE(InfoExtractor): 'comment_count': int, }, 'skip': 'Requires an account', + }, { + # Video without owner + 'url': 'http://www.nicovideo.jp/watch/sm18238488', + 'md5': 'd265680a1f92bdcbbd2a507fc9e78a9e', + 'info_dict': { + 'id': 'sm18238488', + 'ext': 'mp4', + 'title': 'ãå®åçããã¥ã¼ã¿ã³ãã¿ã¼ãã«ãº', + 'description': 'md5:15df8988e47a86f9e978af2064bf6d8e', + 'timestamp': 1341160408, + 'upload_date': '20120701', + 'uploader': None, + 'uploader_id': None, + 'thumbnail': r're:https?://.*', + 'duration': 5271, + 'view_count': int, + 'comment_count': int, + }, + 'skip': 'Requires an account', }, { 'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg', 'only_matching': True, @@ -395,7 +414,9 @@ class NiconicoIE(InfoExtractor): webpage_url = get_video_info('watch_url') or url - owner = api_data.get('owner', {}) + # Note: cannot use api_data.get('owner', {}) because owner may be set to "null" + # in the JSON, which will cause None to be returned instead of {}. 
+ owner = try_get(api_data, lambda x: x.get('owner'), dict) or {} uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id') uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname') diff --git a/youtube_dl/extractor/parliamentliveuk.py b/youtube_dl/extractor/parliamentliveuk.py index ebdab8d..bdd5ff5 100644 --- a/youtube_dl/extractor/parliamentliveuk.py +++ b/youtube_dl/extractor/parliamentliveuk.py @@ -11,7 +11,7 @@ class ParliamentLiveUKIE(InfoExtractor): _TESTS = [{ 'url': 'http://parliamentlive.tv/Event/Index/c1e9d44d-fd6c-4263-b50f-97ed26cc998b', 'info_dict': { - 'id': 'c1e9d44d-fd6c-4263-b50f-97ed26cc998b', + 'id': '1_af9nv9ym', 'ext': 'mp4', 'title': 'Home Affairs Committee', 'uploader_id': 'FFMPEG-01', @@ -28,14 +28,14 @@ class ParliamentLiveUKIE(InfoExtractor): webpage = self._download_webpage( 'http://vodplayer.parliamentlive.tv/?mid=' + video_id, video_id) widget_config = self._parse_json(self._search_regex( - r'kWidgetConfig\s*=\s*({.+});', + r'(?s)kWidgetConfig\s*=\s*({.+});', webpage, 'kaltura widget config'), video_id) - kaltura_url = 'kaltura:%s:%s' % (widget_config['wid'][1:], widget_config['entry_id']) + kaltura_url = 'kaltura:%s:%s' % ( + widget_config['wid'][1:], widget_config['entry_id']) event_title = self._download_json( 'http://parliamentlive.tv/Event/GetShareVideo/' + video_id, video_id)['event']['title'] return { '_type': 'url_transparent', - 'id': video_id, 'title': event_title, 'description': '', 'url': kaltura_url, diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 8889e4a..b51dcbe 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -187,7 +187,7 @@ class PBSIE(InfoExtractor): _VALID_URL = r'''(?x)https?:// (?: # Direct video URL - (?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? 
|
+ (?:%s)/(?:(?:vir|port)alplayer|video)/(?P<id>[0-9]+)(?:[?/]|$) |
 # Article with embedded player (or direct video)
 (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
 # Player
@@ -367,6 +367,10 @@ class PBSIE(InfoExtractor):
 {
 'url': 'http://watch.knpb.org/video/2365616055/',
 'only_matching': True,
+ },
+ {
+ 'url': 'https://player.pbs.org/portalplayer/3004638221/?uid=',
+ 'only_matching': True,
 }
 ]
 _ERRORS = {
diff --git a/youtube_dl/extractor/servus.py b/youtube_dl/extractor/servus.py
new file mode 100644
index 0000000..264e1dd
--- /dev/null
+++ b/youtube_dl/extractor/servus.py
@@ -0,0 +1,43 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class ServusIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?servus\.com/(?:at|de)/p/[^/]+/(?P<id>AA-\w+|\d+-\d+)'
+ _TESTS = [{
+ 'url': 'https://www.servus.com/de/p/Die-Gr%C3%BCnen-aus-Sicht-des-Volkes/AA-1T6VBU5PW1W12/',
+ 'md5': '046dee641cda1c4cabe13baef3be2c1c',
+ 'info_dict': {
+ 'id': 'AA-1T6VBU5PW1W12',
+ 'ext': 'mp4',
+ 'title': 'Die Grünen aus Volkssicht',
+ 'description': 'md5:052b5da1cb2cd7d562ef1f19be5a5cba',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }, {
+ 'url': 'https://www.servus.com/at/p/Wie-das-Leben-beginnt/1309984137314-381415152/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+ description = self._og_search_description(webpage)
+ thumbnail = self._og_search_thumbnail(webpage)
+
+ formats = self._extract_m3u8_formats(
+ 'https://stv.rbmbtnx.net/api/v1/manifests/%s.m3u8' % video_id,
+ video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py
index 4ca9f6b..efcbb36 100644
--- a/youtube_dl/extractor/skysports.py
+++ b/youtube_dl/extractor/skysports.py
@@ -2,7 +2,12 @@ from __future__ import unicode_literals
 
 from .common import InfoExtractor
-from ..utils import strip_or_none
+from ..utils import (
+ extract_attributes,
+ smuggle_url,
+ strip_or_none,
+ urljoin,
+)
 
 
 class SkySportsIE(InfoExtractor):
@@ -22,12 +27,22 @@ class SkySportsIE(InfoExtractor):
 def _real_extract(self, url):
 video_id = self._match_id(url)
 webpage = self._download_webpage(url, video_id)
+ video_data = extract_attributes(self._search_regex(
+ r'(<div.+?class="sdc-article-video__media-ooyala"[^>]+>)', webpage, 'video data'))
+
+ video_url = 'ooyala:%s' % video_data['data-video-id']
+ if video_data.get('data-token-required') == 'true':
+ token_fetch_options = self._parse_json(video_data.get('data-token-fetch-options', '{}'), video_id, fatal=False) or {}
+ token_fetch_url = token_fetch_options.get('url')
+ if token_fetch_url:
+ embed_token = self._download_webpage(urljoin(url, token_fetch_url), video_id, fatal=False)
+ if embed_token:
+ video_url = smuggle_url(video_url, {'embed_token': embed_token.strip('"')})
 
 return {
 '_type': 'url_transparent',
 'id': video_id,
- 'url': 'ooyala:%s' % self._search_regex(
- r'data-video-id="([^"]+)"', webpage, 'ooyala id'),
+ 'url': video_url,
 'title': self._og_search_title(webpage),
 'description': strip_or_none(self._og_search_description(webpage)),
 'ie_key': 'Ooyala',
diff --git a/youtube_dl/extractor/soundcloud.py
b/youtube_dl/extractor/soundcloud.py index 1c6799d..8894f4b 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -138,7 +138,7 @@ class SoundcloudIE(InfoExtractor): }, ] - _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg' + _CLIENT_ID = 'c6CU49JDMapyrQo06UxU9xouB9ZVzqCn' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' @staticmethod diff --git a/youtube_dl/extractor/soundgasm.py b/youtube_dl/extractor/soundgasm.py index e004e2c..3d78a9d 100644 --- a/youtube_dl/extractor/soundgasm.py +++ b/youtube_dl/extractor/soundgasm.py @@ -8,36 +8,49 @@ from .common import InfoExtractor class SoundgasmIE(InfoExtractor): IE_NAME = 'soundgasm' - _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_\-]+)/(?P<title>[0-9a-zA-Z_\-]+)' + _VALID_URL = r'https?://(?:www\.)?soundgasm\.net/u/(?P<user>[0-9a-zA-Z_-]+)/(?P<display_id>[0-9a-zA-Z_-]+)' _TEST = { 'url': 'http://soundgasm.net/u/ytdl/Piano-sample', 'md5': '010082a2c802c5275bb00030743e75ad', 'info_dict': { 'id': '88abd86ea000cafe98f96321b23cc1206cbcbcc9', 'ext': 'm4a', - 'title': 'ytdl_Piano-sample', - 'description': 'Royalty Free Sample Music' + 'title': 'Piano sample', + 'description': 'Royalty Free Sample Music', + 'uploader': 'ytdl', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('title') - audio_title = mobj.group('user') + '_' + mobj.group('title') + display_id = mobj.group('display_id') + webpage = self._download_webpage(url, display_id) + audio_url = self._html_search_regex( - r'(?s)m4a\:\s"([^"]+)"', webpage, 'audio URL') - audio_id = re.split(r'\/|\.', audio_url)[-2] + r'(?s)m4a\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'audio URL', group='url') + + title = self._search_regex( + r'<div[^>]+\bclass=["\']jp-title[^>]+>([^<]+)', + webpage, 'title', default=display_id) + description = self._html_search_regex( - r'(?s)<li>Description:\s(.*?)<\/li>', webpage, 'description', - fatal=False) + (r'(?s)<div[^>]+\bclass=["\']jp-description[^>]+>(.+?)</div>', + r'(?s)<li>Description:\s(.*?)<\/li>'), + webpage, 'description', fatal=False) + + audio_id = self._search_regex( + r'/([^/]+)\.m4a', audio_url, 'audio id', default=display_id) return { 'id': audio_id, 'display_id': display_id, 'url': audio_url, - 'title': audio_title, - 'description': description + 'vcodec': 'none', + 'title': title, + 'description': description, + 'uploader': mobj.group('user'), } diff --git a/youtube_dl/extractor/spankbang.py b/youtube_dl/extractor/spankbang.py index 3394c7e..2863e53 100644 --- a/youtube_dl/extractor/spankbang.py +++ b/youtube_dl/extractor/spankbang.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ExtractorError class SpankBangIE(InfoExtractor): @@ -33,6 +34,10 @@ class SpankBangIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + if re.search(r'<[^>]+\bid=["\']video_removed', webpage): + raise ExtractorError( + 'Video %s is not available' % video_id, expected=True) + stream_key = self._html_search_regex( r'''var\s+stream_key\s*=\s*['"](.+?)['"]''', webpage, 'stream key') diff --git a/youtube_dl/extractor/twentytwotracks.py b/youtube_dl/extractor/twentytwotracks.py deleted file mode 100644 index d6c0ab1..0000000 --- a/youtube_dl/extractor/twentytwotracks.py +++ /dev/null @@ -1,86 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import int_or_none - -# 22Tracks 
regularly replace the audio tracks that can be streamed on their -# site. The tracks usually expire after 1 months, so we can't add tests. - - -class TwentyTwoTracksIE(InfoExtractor): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/(?P<id>\d+)' - IE_NAME = '22tracks:track' - - _API_BASE = 'http://22tracks.com/api' - - def _extract_info(self, city, genre_name, track_id=None): - item_id = track_id if track_id else genre_name - - cities = self._download_json( - '%s/cities' % self._API_BASE, item_id, - 'Downloading cities info', - 'Unable to download cities info') - city_id = [x['id'] for x in cities if x['slug'] == city][0] - - genres = self._download_json( - '%s/genres/%s' % (self._API_BASE, city_id), item_id, - 'Downloading %s genres info' % city, - 'Unable to download %s genres info' % city) - genre = [x for x in genres if x['slug'] == genre_name][0] - genre_id = genre['id'] - - tracks = self._download_json( - '%s/tracks/%s' % (self._API_BASE, genre_id), item_id, - 'Downloading %s genre tracks info' % genre_name, - 'Unable to download track info') - - return [x for x in tracks if x['id'] == item_id][0] if track_id else [genre['title'], tracks] - - def _get_track_url(self, filename, track_id): - token = self._download_json( - 'http://22tracks.com/token.php?desktop=true&u=/128/%s' % filename, - track_id, 'Downloading token', 'Unable to download token') - return 'http://audio.22tracks.com%s?st=%s&e=%d' % (token['filename'], token['st'], token['e']) - - def _extract_track_info(self, track_info, track_id): - download_url = self._get_track_url(track_info['filename'], track_id) - title = '%s - %s' % (track_info['artist'].strip(), track_info['title'].strip()) - return { - 'id': track_id, - 'url': download_url, - 'ext': 'mp3', - 'title': title, - 'duration': int_or_none(track_info.get('duration')), - 'timestamp': int_or_none(track_info.get('published_at') or track_info.get('created')) - } - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - track_id = mobj.group('id') - - track_info = self._extract_info(city, genre, track_id) - return self._extract_track_info(track_info, track_id) - - -class TwentyTwoTracksGenreIE(TwentyTwoTracksIE): - _VALID_URL = r'https?://22tracks\.com/(?P<city>[a-z]+)/(?P<genre>[\da-z]+)/?$' - IE_NAME = '22tracks:genre' - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - city = mobj.group('city') - genre = mobj.group('genre') - - genre_title, tracks = self._extract_info(city, genre) - - entries = [ - self._extract_track_info(track_info, track_info['id']) - for track_info in tracks] - - return self.playlist_result(entries, genre, genre_title) diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index c926c99..fefcd28 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -609,7 +609,7 @@ class TwitchClipsIE(InfoExtractor): r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), video_id, transform_source=js_to_json) - title = clip.get('channel_title') or self._og_search_title(webpage) + title = clip.get('title') or clip.get('channel_title') or self._og_search_title(webpage) formats = [{ 'url': option['source'], diff --git a/youtube_dl/extractor/unity.py b/youtube_dl/extractor/unity.py new file mode 100644 index 0000000..73daacf --- /dev/null +++ b/youtube_dl/extractor/unity.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .youtube import 
YoutubeIE + + +class UnityIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?unity3d\.com/learn/tutorials/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://unity3d.com/learn/tutorials/topics/animation/animate-anything-mecanim', + 'info_dict': { + 'id': 'jWuNtik0C8E', + 'ext': 'mp4', + 'title': 'Live Training 22nd September 2014 - Animate Anything', + 'description': 'md5:e54913114bd45a554c56cdde7669636e', + 'duration': 2893, + 'uploader': 'Unity', + 'uploader_id': 'Unity3D', + 'upload_date': '20140926', + } + }, { + 'url': 'https://unity3d.com/learn/tutorials/projects/2d-ufo-tutorial/following-player-camera?playlist=25844', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + youtube_id = self._search_regex( + r'data-video-id="([_0-9a-zA-Z-]+)"', + webpage, 'youtube ID') + return self.url_result(youtube_id, ie=YoutubeIE.ie_key(), video_id=video_id) diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index c3f71b4..cedb548 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -412,7 +412,7 @@ class VimeoIE(VimeoBaseInfoExtractor): urls = [] # Look for embedded (iframe) Vimeo player for mobj in re.finditer( - r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/.+?)\1', + r'<iframe[^>]+?src=(["\'])(?P<url>(?:https?:)?//player\.vimeo\.com/video/\d+.*?)\1', webpage): urls.append(VimeoIE._smuggle_referrer(unescapeHTML(mobj.group('url')), url)) PLAIN_EMBED_RE = ( diff --git a/youtube_dl/extractor/younow.py b/youtube_dl/extractor/younow.py new file mode 100644 index 0000000..04dbc87 --- /dev/null +++ b/youtube_dl/extractor/younow.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import itertools + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + ExtractorError, + int_or_none, + try_get, +) + +CDN_API_BASE = 'https://cdn.younow.com/php/api' +MOMENT_URL_FORMAT = '%s/moment/fetch/id=%%s' % CDN_API_BASE + + +class YouNowLiveIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/AmandaPadeezy', + 'info_dict': { + 'id': 'AmandaPadeezy', + 'ext': 'mp4', + 'is_live': True, + 'title': 'March 26, 2017', + 'thumbnail': r're:^https?://.*\.jpg$', + 'tags': ['girls'], + 'categories': ['girls'], + 'uploader': 'AmandaPadeezy', + 'uploader_id': '6716501', + 'uploader_url': 'https://www.younow.com/AmandaPadeezy', + 'creator': 'AmandaPadeezy', + }, + 'skip': True, + } + + @classmethod + def suitable(cls, url): + return (False + if YouNowChannelIE.suitable(url) or YouNowMomentIE.suitable(url) + else super(YouNowLiveIE, cls).suitable(url)) + + def _real_extract(self, url): + username = self._match_id(url) + + data = self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username) + + if data.get('errorCode') != 0: + raise ExtractorError(data['errorMsg'], expected=True) + + uploader = try_get( + data, lambda x: x['user']['profileUrlString'], + compat_str) or username + + return { + 'id': uploader, + 'is_live': True, + 'title': self._live_title(uploader), + 'thumbnail': data.get('awsUrl'), + 'tags': data.get('tags'), + 'categories': data.get('tags'), + 'uploader': uploader, + 'uploader_id': data.get('userId'), + 'uploader_url': 'https://www.younow.com/%s' % username, + 'creator': uploader, + 'view_count': int_or_none(data.get('viewers')), + 
'like_count': int_or_none(data.get('likes')), + 'formats': [{ + 'url': '%s/broadcast/videoPath/hls=1/broadcastId=%s/channelId=%s' + % (CDN_API_BASE, data['broadcastId'], data['userId']), + 'ext': 'mp4', + 'protocol': 'm3u8', + }], + } + + +def _extract_moment(item, fatal=True): + moment_id = item.get('momentId') + if not moment_id: + if not fatal: + return + raise ExtractorError('Unable to extract moment id') + + moment_id = compat_str(moment_id) + + title = item.get('text') + if not title: + title = 'YouNow %s' % ( + item.get('momentType') or item.get('titleType') or 'moment') + + uploader = try_get(item, lambda x: x['owner']['name'], compat_str) + uploader_id = try_get(item, lambda x: x['owner']['userId']) + uploader_url = 'https://www.younow.com/%s' % uploader if uploader else None + + entry = { + 'extractor_key': 'YouNowMoment', + 'id': moment_id, + 'title': title, + 'view_count': int_or_none(item.get('views')), + 'like_count': int_or_none(item.get('likes')), + 'timestamp': int_or_none(item.get('created')), + 'creator': uploader, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + 'formats': [{ + 'url': 'https://hls.younow.com/momentsplaylists/live/%s/%s.m3u8' + % (moment_id, moment_id), + 'ext': 'mp4', + 'protocol': 'm3u8_native', + }], + } + + return entry + + +class YouNowChannelIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/(?P<id>[^/]+)/channel' + _TEST = { + 'url': 'https://www.younow.com/its_Kateee_/channel', + 'info_dict': { + 'id': '14629760', + 'title': 'its_Kateee_ moments' + }, + 'playlist_mincount': 8, + } + + def _entries(self, username, channel_id): + created_before = 0 + for page_num in itertools.count(1): + if created_before is None: + break + info = self._download_json( + '%s/moment/profile/channelId=%s/createdBefore=%d/records=20' + % (CDN_API_BASE, channel_id, created_before), username, + note='Downloading moments page %d' % page_num) + items = info.get('items') + if not items or not isinstance(items, list): + break + for item in items: + if not isinstance(item, dict): + continue + item_type = item.get('type') + if item_type == 'moment': + entry = _extract_moment(item, fatal=False) + if entry: + yield entry + elif item_type == 'collection': + moments = item.get('momentsIds') + if isinstance(moments, list): + for moment_id in moments: + m = self._download_json( + MOMENT_URL_FORMAT % moment_id, username, + note='Downloading %s moment JSON' % moment_id, + fatal=False) + if m and isinstance(m, dict) and m.get('item'): + entry = _extract_moment(m['item']) + if entry: + yield entry + created_before = int_or_none(item.get('created')) + + def _real_extract(self, url): + username = self._match_id(url) + channel_id = compat_str(self._download_json( + 'https://api.younow.com/php/api/broadcast/info/curId=0/user=%s' + % username, username, note='Downloading user information')['userId']) + return self.playlist_result( + self._entries(username, channel_id), channel_id, + '%s moments' % username) + + +class YouNowMomentIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?younow\.com/[^/]+/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.younow.com/GABO.../20712117/36319236/3b316doc/m', + 'md5': 'a30c70eadb9fb39a1aa3c8c0d22a0807', + 'info_dict': { + 'id': '20712117', + 'ext': 'mp4', + 'title': 'YouNow capture', + 'view_count': int, + 'like_count': int, + 'timestamp': 1490432040, + 'upload_date': '20170325', + 'uploader': 'GABO...', + 'uploader_id': 35917228, + }, + } + + @classmethod + def suitable(cls, url): + return 
(False + if YouNowChannelIE.suitable(url) + else super(YouNowMomentIE, cls).suitable(url)) + + def _real_extract(self, url): + video_id = self._match_id(url) + item = self._download_json(MOMENT_URL_FORMAT % video_id, video_id) + return _extract_moment(item['item']) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 4e8db24..9943ddd 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1391,7 +1391,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): ) (["\']) (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/ - (?:embed|v|p)/.+?) + (?:embed|v|p)/[0-9A-Za-z_-]{11}.*?) \1''', webpage)] # lazyYT YouTube embed @@ -1622,6 +1622,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): # description description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: + + def replace_url(m): + redir_url = compat_urlparse.urljoin(url, m.group(1)) + parsed_redir_url = compat_urllib_parse_urlparse(redir_url) + if re.search(r'^(?:www\.)?(?:youtube(?:-nocookie)?\.com|youtu\.be)$', parsed_redir_url.netloc) and parsed_redir_url.path == '/redirect': + qs = compat_parse_qs(parsed_redir_url.query) + q = qs.get('q') + if q and q[0]: + return q[0] + return redir_url + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? @@ -1630,7 +1641,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): class="[^"]*"[^>]*> [^<]+\.{3}\s* </a> - ''', lambda m: compat_urlparse.urljoin(url, m.group(1)), video_description) + ''', replace_url, video_description) video_description = clean_html(video_description) else: fd_mobj = re.search(r'<meta name="description" content="([^"]+)"', video_webpage) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 59fb334..34866a5 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -1835,10 +1835,20 @@ def parse_duration(s): days, hours, mins, secs, ms = m.groups() else: m = re.match( - r'''(?ix)(?:P?T)? + r'''(?ix)(?:P? + (?: + [0-9]+\s*y(?:ears?)?\s* + )? + (?: + [0-9]+\s*m(?:onths?)?\s* + )? + (?: + [0-9]+\s*w(?:eeks?)?\s* + )? (?: (?P<days>[0-9]+)\s*d(?:ays?)?\s* )? + T)? (?: (?P<hours>[0-9]+)\s*h(?:ours?)?\s* )? diff --git a/youtube_dl/version.py b/youtube_dl/version.py index d01ba30..8b67d23 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2017.10.15.1' +__version__ = '2017.11.06'