From: Rogério Brito <rbrito@ime.usp.br> Date: Sun, 22 Jun 2014 14:48:31 +0000 (-0300) Subject: Imported Upstream version 2014.06.19 X-Git-Url: https://git.rapsys.eu/youtubedl/commitdiff_plain/509eeaaa67b2f901752c50fc6edf41954dbe3085 Imported Upstream version 2014.06.19 --- diff --git a/test/test_playlists.py b/test/test_playlists.py index 465b07b..ee91e41 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -114,10 +114,10 @@ class TestPlaylists(unittest.TestCase): def test_ustream_channel(self): dl = FakeYDL() ie = UstreamChannelIE(dl) - result = ie.extract('http://www.ustream.tv/channel/young-americans-for-liberty') + result = ie.extract('http://www.ustream.tv/channel/channeljapan') self.assertIsPlaylist(result) - self.assertEqual(result['id'], '5124905') - self.assertTrue(len(result['entries']) >= 6) + self.assertEqual(result['id'], '10874166') + self.assertTrue(len(result['entries']) >= 54) def test_soundcloud_set(self): dl = FakeYDL() diff --git a/youtube-dl b/youtube-dl index b98d36a..4c445a9 100755 Binary files a/youtube-dl and b/youtube-dl differ diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 9d407fe..9f29e2f 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -25,7 +25,7 @@ class HlsFD(FileDownloader): except (OSError, IOError): pass else: - self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found') + self.report_error(u'm3u8 download detected but ffmpeg or avconv could not be found. Please install one.') cmd = [program] + args retval = subprocess.call(cmd) diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py index cc6a841..6864670 100644 --- a/youtube_dl/downloader/rtmp.py +++ b/youtube_dl/downloader/rtmp.py @@ -106,7 +106,7 @@ class RtmpFD(FileDownloader): try: subprocess.call(['rtmpdump', '-h'], stdout=(open(os.path.devnull, 'w')), stderr=subprocess.STDOUT) except (OSError, IOError): - self.report_error('RTMP download detected but "rtmpdump" could not be run') + self.report_error('RTMP download detected but "rtmpdump" could not be run. Please install it.') return False # Download using rtmpdump. rtmpdump returns exit code 2 when diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 15a42ce..dcf64d0 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -109,6 +109,7 @@ from .gdcvault import GDCVaultIE from .generic import GenericIE from .googleplus import GooglePlusIE from .googlesearch import GoogleSearchIE +from .gorillavid import GorillaVidIE from .hark import HarkIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE @@ -216,6 +217,7 @@ from .pornotube import PornotubeIE from .prosiebensat1 import ProSiebenSat1IE from .pyvideo import PyvideoIE from .radiofrance import RadioFranceIE +from .rai import RaiIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE @@ -332,6 +334,7 @@ from .viki import VikiIE from .vk import VKIE from .vube import VubeIE from .vuclip import VuClipIE +from .vulture import VultureIE from .washingtonpost import WashingtonPostIE from .wat import WatIE from .wdr import ( @@ -343,6 +346,7 @@ from .weibo import WeiboIE from .wimp import WimpIE from .wistia import WistiaIE from .worldstarhiphop import WorldStarHipHopIE +from .wrzuta import WrzutaIE from .xbef import XBefIE from .xhamster import XHamsterIE from .xnxx import XNXXIE diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index c6d22c0..b36a4d4 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -56,7 +56,18 @@ class ARDIE(InfoExtractor): raise ExtractorError('This video is only available after 20:00') formats = [] + for s in streams: + if type(s['_stream']) == list: + for index, url in enumerate(s['_stream'][::-1]): + quality = s['_quality'] + index + formats.append({ + 'quality': quality, + 'url': url, + 'format_id': '%s-%s' % (determine_ext(url), quality) + }) + continue + format = { 'quality': s['_quality'], 'url': s['_stream'], diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 45067b9..0d5889f 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -13,7 +13,7 @@ from ..utils import ( class BiliBiliIE(InfoExtractor): - _VALID_URL = r'http://www\.bilibili\.tv/video/av(?P<id>[0-9]+)/' + _VALID_URL = r'http://www\.bilibili\.(?:tv|com)/video/av(?P<id>[0-9]+)/' _TEST = { 'url': 'http://www.bilibili.tv/video/av1074402/', @@ -56,7 +56,7 @@ class BiliBiliIE(InfoExtractor): 'thumbnailUrl', video_code, 'thumbnail', fatal=False) player_params = compat_parse_qs(self._html_search_regex( - r'<iframe .*?class="player" src="https://secure.bilibili.tv/secure,([^"]+)"', + r'<iframe .*?class="player" src="https://secure\.bilibili\.(?:tv|com)/secure,([^"]+)"', webpage, 'player params')) if 'cid' in player_params: diff --git a/youtube_dl/extractor/blinkx.py b/youtube_dl/extractor/blinkx.py index 38ccd95..7d558e2 100644 --- a/youtube_dl/extractor/blinkx.py +++ b/youtube_dl/extractor/blinkx.py @@ -4,9 +4,7 @@ import json import re from .common import InfoExtractor -from ..utils import ( - remove_start, -) +from ..utils import remove_start class BlinkxIE(InfoExtractor): @@ -15,9 +13,10 @@ class BlinkxIE(InfoExtractor): _TEST = { 'url': 'http://www.blinkx.com/ce/8aQUy7GVFYgFzpKhT0oqsilwOGFRVXk3R1ZGWWdGenBLaFQwb3FzaWx3OGFRVXk3R1ZGWWdGenB', - 'file': '8aQUy7GV.mp4', 'md5': '2e9a07364af40163a908edbf10bb2492', 'info_dict': { + 'id': '8aQUy7GV', + 'ext': 'mp4', 'title': 'Police Car Rolls Away', 'uploader': 'stupidvideos.com', 'upload_date': '20131215', @@ -27,6 +26,7 @@ class BlinkxIE(InfoExtractor): 'thumbnails': [{ 'width': 100, 'height': 76, + 'resolution': '100x76', 'url': 'http://cdn.blinkx.com/stream/b/41/StupidVideos/20131215/1873969261/1873969261_tn_0.jpg', }], }, @@ -37,7 +37,7 @@ class BlinkxIE(InfoExtractor): video_id = m.group('id') display_id = video_id[:8] - api_url = (u'https://apib4.blinkx.com/api.php?action=play_video&' + + api_url = ('https://apib4.blinkx.com/api.php?action=play_video&' + 'video=%s' % video_id) data_json = self._download_webpage(api_url, display_id) data = json.loads(data_json)['api']['results'][0] @@ -55,13 +55,13 @@ class BlinkxIE(InfoExtractor): duration = m['d'] elif m['type'] == 'youtube': yt_id = m['link'] - self.to_screen(u'Youtube video detected: %s' % yt_id) + self.to_screen('Youtube video detected: %s' % yt_id) return self.url_result(yt_id, 'Youtube', video_id=yt_id) elif m['type'] in ('flv', 'mp4'): vcodec = remove_start(m['vcodec'], 'ff') acodec = remove_start(m['acodec'], 'ff') tbr = (int(m['vbr']) + int(m['abr'])) // 1000 - format_id = u'%s-%sk-%s' % (vcodec, tbr, m['w']) + format_id = '%s-%sk-%s' % (vcodec, tbr, m['w']) formats.append({ 'format_id': format_id, 'url': m['link'], diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 3c02c29..419951b 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -15,6 +15,7 @@ from ..utils import ( compat_urllib_request, compat_parse_qs, + determine_ext, ExtractorError, unsmuggle_url, unescapeHTML, @@ -29,10 +30,11 @@ class BrightcoveIE(InfoExtractor): { # From http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/ 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1654948606001&flashID=myExperience&%40videoPlayer=2371591881001', - 'file': '2371591881001.mp4', 'md5': '5423e113865d26e40624dce2e4b45d95', 'note': 'Test Brightcove downloads and detection in GenericIE', 'info_dict': { + 'id': '2371591881001', + 'ext': 'mp4', 'title': 'Xavier Sala i MartÃn: âUn banc que no presta és un banc zombi que no serveix per a resâ', 'uploader': '8TV', 'description': 'md5:a950cc4285c43e44d763d036710cd9cd', @@ -41,8 +43,9 @@ class BrightcoveIE(InfoExtractor): { # From http://medianetwork.oracle.com/video/player/1785452137001 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=1217746023001&flashID=myPlayer&%40videoPlayer=1785452137001', - 'file': '1785452137001.flv', 'info_dict': { + 'id': '1785452137001', + 'ext': 'flv', 'title': 'JVMLS 2012: Arrays 2.0 - Opportunities and Challenges', 'description': 'John Rose speaks at the JVM Language Summit, August 1, 2012.', 'uploader': 'Oracle', @@ -70,7 +73,20 @@ class BrightcoveIE(InfoExtractor): 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, - } + }, + { + # test flv videos served by akamaihd.net + # From http://www.redbull.com/en/bike/stories/1331655643987/replay-uci-dh-world-cup-2014-from-fort-william + 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?%40videoPlayer=ref%3ABC2996102916001&linkBaseURL=http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fvideos%2F1331655630249%2Freplay-uci-fort-william-2014-dh&playerKey=AQ%7E%7E%2CAAAApYJ7UqE%7E%2Cxqr_zXk0I-zzNndy8NlHogrCb5QdyZRf&playerID=1398061561001#__youtubedl_smuggle=%7B%22Referer%22%3A+%22http%3A%2F%2Fwww.redbull.com%2Fen%2Fbike%2Fstories%2F1331655643987%2Freplay-uci-dh-world-cup-2014-from-fort-william%22%7D', + # The md5 checksum changes on each download + 'info_dict': { + 'id': '2996102916001', + 'ext': 'flv', + 'title': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + 'uploader': 'Red Bull TV', + 'description': 'UCI MTB World Cup 2014: Fort William, UK - Downhill Finals', + }, + }, ] @classmethod @@ -187,7 +203,7 @@ class BrightcoveIE(InfoExtractor): webpage = self._download_webpage(req, video_id) self.report_extraction(video_id) - info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') + info = self._search_regex(r'var experienceJSON = ({.*});', webpage, 'json') info = json.loads(info)['data'] video_info = info['programmedContent']['videoPlayer']['mediaDTO'] video_info['_youtubedl_adServerURL'] = info.get('adServerURL') @@ -219,12 +235,26 @@ class BrightcoveIE(InfoExtractor): renditions = video_info.get('renditions') if renditions: - renditions = sorted(renditions, key=lambda r: r['size']) - info['formats'] = [{ - 'url': rend['defaultURL'], - 'height': rend.get('frameHeight'), - 'width': rend.get('frameWidth'), - } for rend in renditions] + formats = [] + for rend in renditions: + url = rend['defaultURL'] + if rend['remote']: + # This type of renditions are served through akamaihd.net, + # but they don't use f4m manifests + url = url.replace('control/', '') + '?&v=3.3.0&fp=13&r=FEEFJ&g=RTSJIMBMPFPB' + ext = 'flv' + else: + ext = determine_ext(url) + size = rend.get('size') + formats.append({ + 'url': url, + 'ext': ext, + 'height': rend.get('frameHeight'), + 'width': rend.get('frameWidth'), + 'filesize': size if size != 0 else None, + }) + self._sort_formats(formats) + info['formats'] = formats elif video_info.get('FLVFullLengthURL') is not None: info.update({ 'url': video_info['FLVFullLengthURL'], diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 0b11d1f..69ca754 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -1,39 +1,37 @@ -# coding: utf-8 +from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - unified_strdate, -) +from ..utils import unified_strdate class DreiSatIE(InfoExtractor): IE_NAME = '3sat' _VALID_URL = r'(?:http://)?(?:www\.)?3sat\.de/mediathek/(?:index\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P<id>[0-9]+)$' _TEST = { - u"url": u"http://www.3sat.de/mediathek/index.php?obj=36983", - u'file': u'36983.mp4', - u'md5': u'9dcfe344732808dbfcc901537973c922', - u'info_dict': { - u"title": u"Kaffeeland Schweiz", - u"description": u"Ãber 80 Kaffeeröstereien liefern in der Schweiz das Getränk, in das das Land so vernarrt ist: Mehr als 1000 Tassen trinkt ein Schweizer pro Jahr. SCHWEIZWEIT nimmt die Kaffeekultur unter die...", - u"uploader": u"3sat", - u"upload_date": u"20130622" + 'url': 'http://www.3sat.de/mediathek/index.php?obj=36983', + 'md5': '9dcfe344732808dbfcc901537973c922', + 'info_dict': { + 'id': '36983', + 'ext': 'mp4', + 'title': 'Kaffeeland Schweiz', + 'description': 'md5:cc4424b18b75ae9948b13929a0814033', + 'uploader': '3sat', + 'upload_date': '20130622' } } - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?ak=web&id=%s' % video_id - details_doc = self._download_xml(details_url, video_id, note=u'Downloading video details') + details_doc = self._download_xml(details_url, video_id, 'Downloading video details') thumbnail_els = details_doc.findall('.//teaserimage') thumbnails = [{ - 'width': te.attrib['key'].partition('x')[0], - 'height': te.attrib['key'].partition('x')[2], + 'width': int(te.attrib['key'].partition('x')[0]), + 'height': int(te.attrib['key'].partition('x')[2]), 'url': te.text, } for te in thumbnail_els] diff --git a/youtube_dl/extractor/fc2.py b/youtube_dl/extractor/fc2.py index 18f91ef..c663a0f 100644 --- a/youtube_dl/extractor/fc2.py +++ b/youtube_dl/extractor/fc2.py @@ -50,10 +50,13 @@ class FC2IE(InfoExtractor): raise ExtractorError('Error code: %s' % info['err_code'][0]) video_url = info['filepath'][0] + '?mid=' + info['mid'][0] + title_info = info.get('title') + if title_info: + title = title_info[0] return { 'id': video_id, - 'title': info['title'][0], + 'title': title, 'url': video_url, 'ext': 'flv', 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 38a357d..3105b47 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -260,7 +260,24 @@ class GenericIE(InfoExtractor): 'uploader': 'Spi0n', }, 'add_ie': ['Dailymotion'], - } + }, + # YouTube embed + { + 'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html', + 'info_dict': { + 'id': 'FXRb4ykk4S0', + 'ext': 'mp4', + 'title': 'The NBL Auction 2014', + 'uploader': 'BADMINTON England', + 'uploader_id': 'BADMINTONEvents', + 'upload_date': '20140603', + 'description': 'md5:9ef128a69f1e262a700ed83edb163a73', + }, + 'add_ie': ['Youtube'], + 'params': { + 'skip_download': True, + } + }, ] def report_download_webpage(self, video_id): @@ -478,8 +495,13 @@ class GenericIE(InfoExtractor): # Look for embedded YouTube player matches = re.findall(r'''(?x) - (?:<iframe[^>]+?src=|embedSWF\(\s*) - (["\'])(?P<url>(?:https?:)?//(?:www\.)?youtube\.com/ + (?: + <iframe[^>]+?src=| + <embed[^>]+?src=| + embedSWF\(?:\s* + ) + (["\']) + (?P<url>(?:https?:)?//(?:www\.)?youtube\.com/ (?:embed|v)/.+?) \1''', webpage) if matches: @@ -646,6 +668,14 @@ class GenericIE(InfoExtractor): url = unescapeHTML(mobj.group('url')) return self.url_result(url) + # Look for embedded vulture.com player + mobj = re.search( + r'<iframe src="(?P<url>https?://video\.vulture\.com/[^"]+)"', + webpage) + if mobj is not None: + url = unescapeHTML(mobj.group('url')) + return self.url_result(url, ie='Vulture') + # Start with something easy: JW Player in SWFObject found = re.findall(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if not found: diff --git a/youtube_dl/extractor/gorillavid.py b/youtube_dl/extractor/gorillavid.py new file mode 100644 index 0000000..aa15caf --- /dev/null +++ b/youtube_dl/extractor/gorillavid.py @@ -0,0 +1,76 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + compat_urllib_parse, + compat_urllib_request, +) + + +class GorillaVidIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?gorillavid\.in/(?:embed-)?(?P<id>[0-9a-zA-Z]+)(?:-[0-9]+x[0-9]+\.html)?' + + _TESTS = [{ + 'url': 'http://gorillavid.in/06y9juieqpmi', + 'md5': '5ae4a3580620380619678ee4875893ba', + 'info_dict': { + 'id': '06y9juieqpmi', + 'ext': 'flv', + 'title': 'Rebecca Black My Moment Official Music Video Reaction', + 'thumbnail': 're:http://.*\.jpg', + }, + }, { + 'url': 'http://gorillavid.in/embed-z08zf8le23c6-960x480.html', + 'md5': 'c9e293ca74d46cad638e199c3f3fe604', + 'info_dict': { + 'id': 'z08zf8le23c6', + 'ext': 'mp4', + 'title': 'Say something nice', + 'thumbnail': 're:http://.*\.jpg', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + url = 'http://gorillavid.in/%s' % video_id + + webpage = self._download_webpage(url, video_id) + + fields = dict(re.findall(r'''(?x)<input\s+ + type="hidden"\s+ + name="([^"]+)"\s+ + (?:id="[^"]+"\s+)? + value="([^"]*)" + ''', webpage)) + + if fields['op'] == 'download1': + post = compat_urllib_parse.urlencode(fields) + + req = compat_urllib_request.Request(url, post) + req.add_header('Content-type', 'application/x-www-form-urlencoded') + + webpage = self._download_webpage(req, video_id, 'Downloading video page') + + title = self._search_regex(r'style="z-index: [0-9]+;">([0-9a-zA-Z ]+)(?:-.+)?</span>', webpage, 'title') + thumbnail = self._search_regex(r'image:\'(http[^\']+)\',', webpage, 'thumbnail') + url = self._search_regex(r'file: \'(http[^\']+)\',', webpage, 'file url') + + formats = [{ + 'format_id': 'sd', + 'url': url, + 'ext': determine_ext(url), + 'quality': 1, + }] + + return { + 'id': video_id, + 'title': title, + 'thumbnail': thumbnail, + 'formats': formats, + } diff --git a/youtube_dl/extractor/hypem.py b/youtube_dl/extractor/hypem.py index 9bd06e7..6d0d847 100644 --- a/youtube_dl/extractor/hypem.py +++ b/youtube_dl/extractor/hypem.py @@ -1,10 +1,11 @@ +from __future__ import unicode_literals + import json import re import time from .common import InfoExtractor from ..utils import ( - compat_str, compat_urllib_parse, compat_urllib_request, @@ -13,59 +14,55 @@ from ..utils import ( class HypemIE(InfoExtractor): - """Information Extractor for hypem""" - _VALID_URL = r'(?:http://)?(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' + _VALID_URL = r'http://(?:www\.)?hypem\.com/track/([^/]+)/([^/]+)' _TEST = { - u'url': u'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', - u'file': u'1v6ga.mp3', - u'md5': u'b9cc91b5af8995e9f0c1cee04c575828', - u'info_dict': { - u"title": u"Tame" + 'url': 'http://hypem.com/track/1v6ga/BODYWORK+-+TAME', + 'md5': 'b9cc91b5af8995e9f0c1cee04c575828', + 'info_dict': { + 'id': '1v6ga', + 'ext': 'mp3', + 'title': 'Tame', + 'uploader': 'BODYWORK', } } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) track_id = mobj.group(1) data = {'ax': 1, 'ts': time.time()} data_encoded = compat_urllib_parse.urlencode(data) complete_url = url + "?" + data_encoded request = compat_urllib_request.Request(complete_url) - response, urlh = self._download_webpage_handle(request, track_id, u'Downloading webpage with the url') + response, urlh = self._download_webpage_handle( + request, track_id, 'Downloading webpage with the url') cookie = urlh.headers.get('Set-Cookie', '') - self.report_extraction(track_id) - - html_tracks = self._html_search_regex(r'<script type="application/json" id="displayList-data">(.*?)</script>', - response, u'tracks', flags=re.MULTILINE|re.DOTALL).strip() + html_tracks = self._html_search_regex( + r'(?ms)<script type="application/json" id="displayList-data">\s*(.*?)\s*</script>', + response, 'tracks') try: track_list = json.loads(html_tracks) - track = track_list[u'tracks'][0] + track = track_list['tracks'][0] except ValueError: - raise ExtractorError(u'Hypemachine contained invalid JSON.') + raise ExtractorError('Hypemachine contained invalid JSON.') - key = track[u"key"] - track_id = track[u"id"] - artist = track[u"artist"] - title = track[u"song"] + key = track['key'] + track_id = track['id'] + artist = track['artist'] + title = track['song'] - serve_url = "http://hypem.com/serve/source/%s/%s" % (compat_str(track_id), compat_str(key)) - request = compat_urllib_request.Request(serve_url, "" , {'Content-Type': 'application/json'}) + serve_url = "http://hypem.com/serve/source/%s/%s" % (track_id, key) + request = compat_urllib_request.Request( + serve_url, '', {'Content-Type': 'application/json'}) request.add_header('cookie', cookie) - song_data_json = self._download_webpage(request, track_id, u'Downloading metadata') - try: - song_data = json.loads(song_data_json) - except ValueError: - raise ExtractorError(u'Hypemachine contained invalid JSON.') - final_url = song_data[u"url"] + song_data = self._download_json(request, track_id, 'Downloading metadata') + final_url = song_data["url"] - return [{ - 'id': track_id, - 'url': final_url, - 'ext': "mp3", - 'title': title, - 'artist': artist, - }] + return { + 'id': track_id, + 'url': final_url, + 'ext': 'mp3', + 'title': title, + 'uploader': artist, + } diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index 7a431a2..8d9491f 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -24,7 +24,7 @@ class LifeNewsIE(InfoExtractor): 'ext': 'mp4', 'title': 'ÐÐÐ ÑазÑÑÐºÐ¸Ð²Ð°ÐµÑ Ð¼ÑжÑин, оÑÑавивÑÐ¸Ñ Ð² IKEA ÑÑÐ¼ÐºÑ Ñ Ð°Ð²ÑомаÑом', 'description': 'ÐамеÑÑ Ð½Ð°Ð±Ð»ÑÐ´ÐµÐ½Ð¸Ñ Ð³Ð¸Ð¿ÐµÑмаÑкеÑа заÑикÑиÑовали ÑÑÐ¾Ð¸Ñ Ð¼ÑжÑин, ÑпÑÑÑавÑÐ¸Ñ Ð¾ÑÑжейнÑй аÑÑенал в камеÑе Ñ ÑанениÑ.', - 'thumbnail': 'http://lifenews.ru/static/posts/2014/1/126342/.video.jpg', + 'thumbnail': 're:http://.*\.jpg', 'upload_date': '20140130', } } diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py index 1dcd1fb..5c71f4f 100644 --- a/youtube_dl/extractor/livestream.py +++ b/youtube_dl/extractor/livestream.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import re import json @@ -6,31 +8,34 @@ from ..utils import ( compat_urllib_parse_urlparse, compat_urlparse, xpath_with_ns, + compat_str, ) class LivestreamIE(InfoExtractor): - IE_NAME = u'livestream' + IE_NAME = 'livestream' _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$' _TEST = { - u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', - u'file': u'4719370.mp4', - u'md5': u'0d2186e3187d185a04b3cdd02b828836', - u'info_dict': { - u'title': u'Live from Webster Hall NYC', - u'upload_date': u'20121012', + 'url': 'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', + 'md5': '53274c76ba7754fb0e8d072716f2292b', + 'info_dict': { + 'id': '4719370', + 'ext': 'mp4', + 'title': 'Live from Webster Hall NYC', + 'upload_date': '20121012', } } def _extract_video_info(self, video_data): video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') - return {'id': video_data['id'], - 'url': video_url, - 'ext': 'mp4', - 'title': video_data['caption'], - 'thumbnail': video_data['thumbnail_url'], - 'upload_date': video_data['updated_at'].replace('-','')[:8], - } + return { + 'id': compat_str(video_data['id']), + 'url': video_url, + 'ext': 'mp4', + 'title': video_data['caption'], + 'thumbnail': video_data['thumbnail_url'], + 'upload_date': video_data['updated_at'].replace('-', '')[:8], + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -40,36 +45,36 @@ class LivestreamIE(InfoExtractor): if video_id is None: # This is an event page: - config_json = self._search_regex(r'window.config = ({.*?});', - webpage, u'window config') + config_json = self._search_regex( + r'window.config = ({.*?});', webpage, 'window config') info = json.loads(config_json)['event'] videos = [self._extract_video_info(video_data['data']) - for video_data in info['feed']['data'] if video_data['type'] == u'video'] + for video_data in info['feed']['data'] if video_data['type'] == 'video'] return self.playlist_result(videos, info['id'], info['full_name']) else: - og_video = self._og_search_video_url(webpage, name=u'player url') + og_video = self._og_search_video_url(webpage, 'player url') query_str = compat_urllib_parse_urlparse(og_video).query query = compat_urlparse.parse_qs(query_str) api_url = query['play_url'][0].replace('.smil', '') - info = json.loads(self._download_webpage(api_url, video_id, - u'Downloading video info')) + info = json.loads(self._download_webpage( + api_url, video_id, 'Downloading video info')) return self._extract_video_info(info) # The original version of Livestream uses a different system class LivestreamOriginalIE(InfoExtractor): - IE_NAME = u'livestream:original' + IE_NAME = 'livestream:original' _VALID_URL = r'https?://www\.livestream\.com/(?P<user>[^/]+)/video\?.*?clipId=(?P<id>.*?)(&|$)' _TEST = { - u'url': u'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', - u'info_dict': { - u'id': u'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', - u'ext': u'flv', - u'title': u'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', + 'url': 'http://www.livestream.com/dealbook/video?clipId=pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + 'info_dict': { + 'id': 'pla_8aa4a3f1-ba15-46a4-893b-902210e138fb', + 'ext': 'flv', + 'title': 'Spark 1 (BitCoin) with Cameron Winklevoss & Tyler Winklevoss of Winklevoss Capital', }, - u'params': { + 'params': { # rtmp - u'skip_download': True, + 'skip_download': True, }, } @@ -84,7 +89,7 @@ class LivestreamOriginalIE(InfoExtractor): ns = {'media': 'http://search.yahoo.com/mrss'} thumbnail_url = item.find(xpath_with_ns('media:thumbnail', ns)).attrib['url'] # Remove the extension and number from the path (like 1.jpg) - path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, u'path') + path = self._search_regex(r'(user-files/.+)_.*?\.jpg$', thumbnail_url, 'path') return { 'id': video_id, diff --git a/youtube_dl/extractor/ndtv.py b/youtube_dl/extractor/ndtv.py index d81df3c..95e7d63 100644 --- a/youtube_dl/extractor/ndtv.py +++ b/youtube_dl/extractor/ndtv.py @@ -1,22 +1,28 @@ +from __future__ import unicode_literals + import re from .common import InfoExtractor -from ..utils import month_by_name +from ..utils import ( + month_by_name, + int_or_none, +) class NDTVIE(InfoExtractor): _VALID_URL = r'^https?://(?:www\.)?ndtv\.com/video/player/[^/]*/[^/]*/(?P<id>[a-z0-9]+)' _TEST = { - u"url": u"http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710", - u"file": u"300710.mp4", - u"md5": u"39f992dbe5fb531c395d8bbedb1e5e88", - u"info_dict": { - u"title": u"NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal", - u"description": u"In an exclusive interview to NDTV, Aam Aadmi Party's Arvind Kejriwal says it makes no difference to him that Rahul Gandhi said the Congress needs to learn from his party.", - u"upload_date": u"20131208", - u"duration": 1327, - u"thumbnail": u"http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg", + 'url': 'http://www.ndtv.com/video/player/news/ndtv-exclusive-don-t-need-character-certificate-from-rahul-gandhi-says-arvind-kejriwal/300710', + 'md5': '39f992dbe5fb531c395d8bbedb1e5e88', + 'info_dict': { + 'id': '300710', + 'ext': 'mp4', + 'title': "NDTV exclusive: Don't need character certificate from Rahul Gandhi, says Arvind Kejriwal", + 'description': 'md5:ab2d4b4a6056c5cb4caa6d729deabf02', + 'upload_date': '20131208', + 'duration': 1327, + 'thumbnail': 'http://i.ndtvimg.com/video/images/vod/medium/2013-12/big_300710_1386518307.jpg', }, } @@ -27,13 +33,12 @@ class NDTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) filename = self._search_regex( - r"__filename='([^']+)'", webpage, u'video filename') - video_url = (u'http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % + r"__filename='([^']+)'", webpage, 'video filename') + video_url = ('http://bitcast-b.bitgravity.com/ndtvod/23372/ndtv/%s' % filename) - duration_str = filename = self._search_regex( - r"__duration='([^']+)'", webpage, u'duration', fatal=False) - duration = None if duration_str is None else int(duration_str) + duration = int_or_none(self._search_regex( + r"__duration='([^']+)'", webpage, 'duration', fatal=False)) date_m = re.search(r'''(?x) <p\s+class="vod_dateline">\s* @@ -41,7 +46,7 @@ class NDTVIE(InfoExtractor): (?P<monthname>[A-Za-z]+)\s+(?P<day>[0-9]+),\s*(?P<year>[0-9]+) ''', webpage) upload_date = None - assert date_m + if date_m is not None: month = month_by_name(date_m.group('monthname')) if month is not None: @@ -49,14 +54,19 @@ class NDTVIE(InfoExtractor): date_m.group('year'), month, int(date_m.group('day'))) description = self._og_search_description(webpage) - READ_MORE = u' (Read more)' + READ_MORE = ' (Read more)' if description.endswith(READ_MORE): description = description[:-len(READ_MORE)] + title = self._og_search_title(webpage) + TITLE_SUFFIX = ' - NDTV' + if title.endswith(TITLE_SUFFIX): + title = title[:-len(TITLE_SUFFIX)] + return { 'id': video_id, 'url': video_url, - 'title': self._og_search_title(webpage), + 'title': title, 'description': description, 'thumbnail': self._og_search_thumbnail(webpage), 'duration': duration, diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 3a6a788..96f0ae1 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -6,7 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( ExtractorError, - int_or_none, + float_or_none, unified_strdate, ) @@ -72,14 +72,14 @@ class NRKIE(InfoExtractor): class NRKTVIE(InfoExtractor): - _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-z]{4}\d{8})' + _VALID_URL = r'http://tv\.nrk(?:super)?\.no/(?:serie/[^/]+|program)/(?P<id>[a-zA-Z]{4}\d{8})' _TESTS = [ { - 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/muhh48000314/23-05-2014', + 'url': 'http://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014', 'md5': '7b96112fbae1faf09a6f9ae1aff6cb84', 'info_dict': { - 'id': 'muhh48000314', + 'id': 'MUHH48000314', 'ext': 'flv', 'title': '20 spørsmÃ¥l', 'description': 'md5:bdea103bc35494c143c6a9acdd84887a', @@ -89,7 +89,7 @@ class NRKTVIE(InfoExtractor): }, { 'url': 'http://tv.nrk.no/program/mdfp15000514', - 'md5': '383650ece2b25ecec996ad7b5bb2a384', + 'md5': 'af01795a31f1cf7265c8657534d8077b', 'info_dict': { 'id': 'mdfp15000514', 'ext': 'flv', @@ -111,9 +111,8 @@ class NRKTVIE(InfoExtractor): description = self._html_search_meta('description', page, 'description') thumbnail = self._html_search_regex(r'data-posterimage="([^"]+)"', page, 'thumbnail', fatal=False) upload_date = unified_strdate(self._html_search_meta('rightsfrom', page, 'upload date', fatal=False)) - duration = self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False) - if duration: - duration = float(duration) + duration = float_or_none( + self._html_search_regex(r'data-duration="([^"]+)"', page, 'duration', fatal=False)) formats = [] @@ -142,4 +141,4 @@ class NRKTVIE(InfoExtractor): 'upload_date': upload_date, 'duration': duration, 'formats': formats, - } \ No newline at end of file + } diff --git a/youtube_dl/extractor/ntv.py b/youtube_dl/extractor/ntv.py index 733ed6c..ed60314 100644 --- a/youtube_dl/extractor/ntv.py +++ b/youtube_dl/extractor/ntv.py @@ -5,7 +5,6 @@ import re from .common import InfoExtractor from ..utils import ( - ExtractorError, unescapeHTML ) diff --git a/youtube_dl/extractor/nuvid.py b/youtube_dl/extractor/nuvid.py index e3db9fe..280328b 100644 --- a/youtube_dl/extractor/nuvid.py +++ b/youtube_dl/extractor/nuvid.py @@ -3,6 +3,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, + compat_urllib_request, +) class NuvidIE(InfoExtractor): @@ -13,8 +18,10 @@ class NuvidIE(InfoExtractor): 'info_dict': { 'id': '1310741', 'ext': 'mp4', - "title": "Horny babes show their awesome bodeis and", - "age_limit": 18, + 'title': 'Horny babes show their awesome bodeis and', + 'duration': 129, + 'upload_date': '20140508', + 'age_limit': 18, } } @@ -22,27 +29,41 @@ class NuvidIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - murl = url.replace('://www.', '://m.') - webpage = self._download_webpage(murl, video_id) - - title = self._html_search_regex( - r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>', - webpage, 'title').strip() + formats = [] - url_end = self._html_search_regex( - r'href="(/[^"]+)"[^>]*data-link_type="mp4"', - webpage, 'video_url') - video_url = 'http://m.nuvid.com' + url_end + for dwnld_speed, format_id in [(0, '3gp'), (5, 'mp4')]: + request = compat_urllib_request.Request( + 'http://m.nuvid.com/play/%s' % video_id) + request.add_header('Cookie', 'skip_download_page=1; dwnld_speed=%d; adv_show=1' % dwnld_speed) + webpage = self._download_webpage( + request, video_id, 'Downloading %s page' % format_id) + video_url = self._html_search_regex( + r'<a href="([^"]+)"\s*>Continue to watch video', webpage, '%s video URL' % format_id, fatal=False) + if not video_url: + continue + formats.append({ + 'url': video_url, + 'format_id': format_id, + }) + webpage = self._download_webpage( + 'http://m.nuvid.com/video/%s' % video_id, video_id, 'Downloading video page') + title = self._html_search_regex( + r'<div class="title">\s+<h2[^>]*>([^<]+)</h2>', webpage, 'title').strip() thumbnail = self._html_search_regex( r'href="(/thumbs/[^"]+)"[^>]*data-link_type="thumbs"', webpage, 'thumbnail URL', fatal=False) + duration = parse_duration(self._html_search_regex( + r'Length:\s*<span>(\d{2}:\d{2})</span>',webpage, 'duration', fatal=False)) + upload_date = unified_strdate(self._html_search_regex( + r'Added:\s*<span>(\d{4}-\d{2}-\d{2})</span>', webpage, 'upload date', fatal=False)) return { 'id': video_id, - 'url': video_url, - 'ext': 'mp4', 'title': title, - 'thumbnail': thumbnail, + 'thumbnail': 'http://m.nuvid.com%s' % thumbnail, + 'duration': duration, + 'upload_date': upload_date, 'age_limit': 18, - } + 'formats': formats, + } \ No newline at end of file diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index e4c4ad7..da64a1a 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -158,19 +158,19 @@ class ProSiebenSat1IE(InfoExtractor): _CLIPID_REGEXES = [ r'"clip_id"\s*:\s+"(\d+)"', r'clipid: "(\d+)"', - r'clipId=(\d+)', + r'clip[iI]d=(\d+)', ] _TITLE_REGEXES = [ r'<h2 class="subtitle" itemprop="name">\s*(.+?)</h2>', r'<header class="clearfix">\s*<h3>(.+?)</h3>', r'<!-- start video -->\s*<h1>(.+?)</h1>', - r'<div class="ep-femvideos-pi4-video-txt">\s*<h2>(.+?)</h2>', + r'<h1 class="att-name">\s*(.+?)</h1>', ] _DESCRIPTION_REGEXES = [ r'<p itemprop="description">\s*(.+?)</p>', r'<div class="videoDecription">\s*<p><strong>Beschreibung</strong>: (.+?)</p>', r'<div class="g-plusone" data-size="medium"></div>\s*</div>\s*</header>\s*(.+?)\s*<footer>', - r'<p>(.+?)</p>\s*<div class="ep-femvideos-pi4-video-footer">', + r'<p class="att-description">\s*(.+?)\s*</p>', ] _UPLOAD_DATE_REGEXES = [ r'<meta property="og:published_time" content="(.+?)">', diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py new file mode 100644 index 0000000..cb43053 --- /dev/null +++ b/youtube_dl/extractor/rai.py @@ -0,0 +1,121 @@ +from __future__ import unicode_literals + +import re + +from .subtitles import SubtitlesInfoExtractor +from ..utils import ( + parse_duration, + unified_strdate, + compat_urllib_parse, +) + + +class RaiIE(SubtitlesInfoExtractor): + _VALID_URL = r'(?P<url>http://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html)' + _TESTS = [ + { + 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': 'c064c0b2d09c278fb293116ef5d0a32d', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'upload_date': '20140407', + 'duration': 6160, + } + }, + { + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'md5': '8bb9c151924ce241b74dd52ef29ceafa', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'description': '', + 'upload_date': '20140612', + 'duration': 1758, + } + }, + { + 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', + 'md5': '35cf7c229f22eeef43e48b5cf923bef0', + 'info_dict': { + 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', + 'ext': 'mp4', + 'title': 'State of the Net, Antonella La Carpia: regole virali', + 'description': 'md5:b0ba04a324126903e3da7763272ae63c', + 'upload_date': '20140613', + }, + 'skip': 'Error 404', + }, + { + 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', + 'md5': '35694f062977fe6619943f08ed935730', + 'info_dict': { + 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', + 'ext': 'mp4', + 'title': 'Alluvione in Sardegna e dissesto idrogeologico', + 'description': 'Edizione delle ore 20:30 ', + } + }, + ] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + + media = self._download_json('%s?json' % mobj.group('url'), video_id, 'Downloading video JSON') + + title = media.get('name') + description = media.get('desc') + thumbnail = media.get('image_300') or media.get('image_medium') or media.get('image') + duration = parse_duration(media.get('length')) + uploader = media.get('author') + upload_date = unified_strdate(media.get('date')) + + formats = [] + + for format_id in ['wmv', 'm3u8', 'mediaUri', 'h264']: + media_url = media.get(format_id) + if not media_url: + continue + formats.append({ + 'url': media_url, + 'format_id': format_id, + 'ext': 'mp4', + }) + + if self._downloader.params.get('listsubtitles', False): + page = self._download_webpage(url, video_id) + self._list_available_subtitles(video_id, page) + return + + subtitles = {} + if self._have_to_download_any_subtitles: + page = self._download_webpage(url, video_id) + subtitles = self.extract_subtitles(video_id, page) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'uploader': uploader, + 'upload_date': upload_date, + 'duration': duration, + 'formats': formats, + 'subtitles': subtitles, + } + + def _get_available_subtitles(self, video_id, webpage): + subtitles = {} + m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) + if m: + captions = m.group('captions') + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions) + return subtitles \ No newline at end of file diff --git a/youtube_dl/extractor/slutload.py b/youtube_dl/extractor/slutload.py index ecc0abf..e6e7d08 100644 --- a/youtube_dl/extractor/slutload.py +++ b/youtube_dl/extractor/slutload.py @@ -3,9 +3,6 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import ( - ExtractorError, -) class SlutloadIE(InfoExtractor): diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 25515f0..7aa100f 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -1,7 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import json import re import itertools diff --git a/youtube_dl/extractor/steam.py b/youtube_dl/extractor/steam.py index 1d8d572..af689e2 100644 --- a/youtube_dl/extractor/steam.py +++ b/youtube_dl/extractor/steam.py @@ -53,7 +53,7 @@ class SteamIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20140329', 'title': 'FRONTIERS - Final Greenlight Trailer', - 'description': "The final trailer for the Steam Greenlight launch. Hooray, progress! Here's the official Greenlight page: http://steamcommunity.com/sharedfiles/filedetails/?id=242472205", + 'description': 'md5:6df4fe8dd494ae811869672b0767e025', 'uploader': 'AAD Productions', 'uploader_id': 'AtomicAgeDogGames', } diff --git a/youtube_dl/extractor/teachertube.py b/youtube_dl/extractor/teachertube.py index 6d52763..4d9666c 100644 --- a/youtube_dl/extractor/teachertube.py +++ b/youtube_dl/extractor/teachertube.py @@ -55,11 +55,13 @@ class TeacherTubeIE(InfoExtractor): quality = qualities(['mp3', 'flv', 'mp4']) + _, media_urls = zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage)) + formats = [ { 'url': media_url, 'quality': quality(determine_ext(media_url)) - } for media_url in set(zip(*re.findall(r'([\'"])file\1\s*:\s*"([^"]+)"', webpage))[1]) + } for media_url in set(media_urls) ] self._sort_formats(formats) diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index d260c91..bce32a8 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -27,7 +27,7 @@ class TEDIE(SubtitlesInfoExtractor): ''' _TESTS = [{ 'url': 'http://www.ted.com/talks/dan_dennett_on_our_consciousness.html', - 'md5': '4ea1dada91e4174b53dac2bb8ace429d', + 'md5': 'fc94ac279feebbce69f21c0c6ee82810', 'info_dict': { 'id': '102', 'ext': 'mp4', diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 36bc36a..08a48c0 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -17,9 +17,10 @@ class Tube8IE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?tube8\.com/(?:[^/]+/){2}(?P<id>\d+)' _TEST = { 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', - 'file': '229795.mp4', - 'md5': 'e9e0b0c86734e5e3766e653509475db0', + 'md5': '44bf12b98313827dd52d35b8706a4ea0', 'info_dict': { + 'id': '229795', + 'ext': 'mp4', 'description': 'hot teen Kasia grinding', 'uploader': 'unknown', 'title': 'Kasia music video', diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py index d16993d..fb132ae 100644 --- a/youtube_dl/extractor/veoh.py +++ b/youtube_dl/extractor/veoh.py @@ -7,6 +7,7 @@ from .common import InfoExtractor from ..utils import ( compat_urllib_request, int_or_none, + ExtractorError, ) @@ -94,8 +95,12 @@ class VeohIE(InfoExtractor): if video_id.startswith('v'): rsp = self._download_xml( r'http://www.veoh.com/api/findByPermalink?permalink=%s' % video_id, video_id, 'Downloading video XML') - if rsp.get('stat') == 'ok': + stat = rsp.get('stat') + if stat == 'ok': return self._extract_video(rsp.find('./videoList/video')) + elif stat == 'fail': + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, rsp.find('./errorList/error').get('errorMessage')), expected=True) webpage = self._download_webpage(url, video_id) age_limit = 0 diff --git a/youtube_dl/extractor/vulture.py b/youtube_dl/extractor/vulture.py new file mode 100644 index 0000000..1eb24a3 --- /dev/null +++ b/youtube_dl/extractor/vulture.py @@ -0,0 +1,69 @@ +from __future__ import unicode_literals + +import json +import os.path +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + parse_iso8601, +) + + +class VultureIE(InfoExtractor): + IE_NAME = 'vulture.com' + _VALID_URL = r'https?://video\.vulture\.com/video/(?P<display_id>[^/]+)/' + _TEST = { + 'url': 'http://video.vulture.com/video/Mindy-Kaling-s-Harvard-Speech/player?layout=compact&read_more=1', + 'md5': '8d997845642a2b5152820f7257871bc8', + 'info_dict': { + 'id': '6GHRQL3RV7MSD1H4', + 'ext': 'mp4', + 'title': 'kaling-speech-2-MAGNIFY STANDARD CONTAINER REVISED', + 'uploader_id': 'Sarah', + 'thumbnail': 're:^http://.*\.jpg$', + 'timestamp': 1401288564, + 'upload_date': '20140528', + 'description': 'Uplifting and witty, as predicted.', + 'duration': 1015, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id = mobj.group('display_id') + + webpage = self._download_webpage(url, display_id) + query_string = self._search_regex( + r"queryString\s*=\s*'([^']+)'", webpage, 'query string') + video_id = self._search_regex( + r'content=([^&]+)', query_string, 'video ID') + query_url = 'http://video.vulture.com/embed/player/container/1000/1000/?%s' % query_string + + query_webpage = self._download_webpage( + query_url, display_id, note='Downloading query page') + params_json = self._search_regex( + r'(?sm)new MagnifyEmbeddablePlayer\({.*?contentItem:\s*(\{.*?\})\n,\n', + query_webpage, + 'player params') + params = json.loads(params_json) + + upload_timestamp = parse_iso8601(params['posted'].replace(' ', 'T')) + uploader_id = params.get('user', {}).get('handle') + + media_item = params['media_item'] + title = os.path.splitext(media_item['title'])[0] + duration = int_or_none(media_item.get('duration_seconds')) + + return { + 'id': video_id, + 'display_id': display_id, + 'url': media_item['pipeline_xid'], + 'title': title, + 'timestamp': upload_timestamp, + 'thumbnail': params.get('thumbnail_url'), + 'uploader_id': uploader_id, + 'description': params.get('description'), + 'duration': duration, + } diff --git a/youtube_dl/extractor/wrzuta.py b/youtube_dl/extractor/wrzuta.py new file mode 100644 index 0000000..34dd6d9 --- /dev/null +++ b/youtube_dl/extractor/wrzuta.py @@ -0,0 +1,81 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + qualities, +) + + +class WrzutaIE(InfoExtractor): + IE_NAME = 'wrzuta.pl' + + _VALID_URL = r'https?://(?P<uploader>[0-9a-zA-Z]+)\.wrzuta\.pl/(?P<typ>film|audio)/(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'http://laboratoriumdextera.wrzuta.pl/film/aq4hIZWrkBu/nike_football_the_last_game', + 'md5': '9e67e05bed7c03b82488d87233a9efe7', + 'info_dict': { + 'id': 'aq4hIZWrkBu', + 'ext': 'mp4', + 'title': 'Nike Football: The Last Game', + 'duration': 307, + 'uploader_id': 'laboratoriumdextera', + 'description': 'md5:7fb5ef3c21c5893375fda51d9b15d9cd', + }, + }, { + 'url': 'http://w729.wrzuta.pl/audio/9oXJqdcndqv/david_guetta_amp_showtek_ft._vassy_-_bad', + 'md5': '1e546a18e1c22ac6e9adce17b8961ff5', + 'info_dict': { + 'id': '9oXJqdcndqv', + 'ext': 'ogg', + 'title': 'David Guetta & Showtek ft. Vassy - Bad', + 'duration': 270, + 'uploader_id': 'w729', + 'description': 'md5:4628f01c666bbaaecefa83476cfa794a', + }, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + typ = mobj.group('typ') + uploader = mobj.group('uploader') + + webpage = self._download_webpage(url, video_id) + + quality = qualities(['SD', 'MQ', 'HQ', 'HD']) + + audio_table = {'flv': 'mp3', 'webm': 'ogg'} + + embedpage = self._download_json('http://www.wrzuta.pl/npp/embed/%s/%s' % (uploader, video_id), video_id) + + formats = [] + for media in embedpage['url']: + if typ == 'audio': + ext = audio_table[media['type'].split('@')[0]] + else: + ext = media['type'].split('@')[0] + + formats.append({ + 'format_id': '%s_%s' % (ext, media['quality'].lower()), + 'url': media['url'], + 'ext': ext, + 'quality': quality(media['quality']), + }) + + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage), + 'thumbnail': self._og_search_thumbnail(webpage), + 'formats': formats, + 'duration': int_or_none(embedpage['duration']), + 'uploader_id': uploader, + 'description': self._og_search_description(webpage), + 'age_limit': embedpage.get('minimalAge', 0), + } diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 7c50881..d45545e 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -440,7 +440,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor, SubtitlesInfoExtractor): def _parse_sig_js(self, jscode): funcname = self._search_regex( - r'signature=([a-zA-Z]+)', jscode, + r'signature=([$a-zA-Z]+)', jscode, u'Initial JS player signature function name') jsi = JSInterpreter(jscode) @@ -1386,13 +1386,13 @@ class YoutubePlaylistIE(YoutubeBaseInfoExtractor): | p/ ) ( - (?:PL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} + (?:PL|LL|EC|UU|FL|RD)?[0-9A-Za-z-_]{10,} # Top tracks, they can also include dots |(?:MC)[\w\.]* ) .* | - ((?:PL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) + ((?:PL|LL|EC|UU|FL|RD)[0-9A-Za-z-_]{10,}) )""" _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s' _MORE_PAGES_INDICATOR = r'data-link-type="next"' diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 6fe7c7b..a332b5a 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2014.06.07' +__version__ = '2014.06.19'