From: Rogério Brito Date: Tue, 6 Aug 2013 20:36:01 +0000 (-0300) Subject: Imported Upstream version 2013.08.02 X-Git-Url: https://git.rapsys.eu/.gitweb.cgi/youtubedl/commitdiff_plain/33cd347759d6d999325ebf3c69b7ed5692c343b2?hp=-c Imported Upstream version 2013.08.02 --- 33cd347759d6d999325ebf3c69b7ed5692c343b2 diff --git a/README.md b/README.md index b246d3c..560bcdc 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,9 @@ which means you can modify it, redistribute it or use it however you like. # OPTIONS -h, --help print this help text and exit --version print program version and exit - -U, --update update this program to latest version + -U, --update update this program to latest version. Make sure + that you have sufficient permissions (run with + sudo if needed) -i, --ignore-errors continue on download errors --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent diff --git a/README.txt b/README.txt index 8f08dd2..b13711a 100644 --- a/README.txt +++ b/README.txt @@ -23,7 +23,9 @@ OPTIONS -h, --help print this help text and exit --version print program version and exit - -U, --update update this program to latest version + -U, --update update this program to latest version. Make sure + that you have sufficient permissions (run with + sudo if needed) -i, --ignore-errors continue on download errors --dump-user-agent display the current browser identification --user-agent UA specify a custom user agent diff --git a/devscripts/youtube_genalgo.py b/devscripts/youtube_genalgo.py index 150c88d..31d6ec9 100644 --- a/devscripts/youtube_genalgo.py +++ b/devscripts/youtube_genalgo.py @@ -5,27 +5,45 @@ import sys tests = [ + # 92 - vflQw-fB4 2013/07/17 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"", + "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"), + # 90 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`", + "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"), # 88 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<", "J:|}][{=+-_)(*&;%$#@>MNBVCXZASDFGH^KLPOIUYTREWQ0987654321mnbvcxzasdfghrklpoiuytej"), - # 87 + # 87 - vflART1Nf 2013/07/24 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<", - "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), - # 86 - vfl_ymO4Z 2013/06/27 + "tyuioplkjhgfdsazxcv"), + # 86 - vflm_D8eE 2013/07/31 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", - "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), - # 85 + ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK.<", - "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"), + "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"), # 84 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?>.<", "<.>?;}[{=+-_)(*&^%$#@!MNBVCXZASDFGHJKLPOIUYTREWe098765432rmnbvcxzasdfghjklpoiuyt1"), - # 83 - vfl26ng3K 2013/07/10 + # 83 - vflTWC9KW 2013/08/01 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<", - "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"), + "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"), # 82 ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.<", "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"), + # 81 - vflLC8JvQ 2013/07/25 + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.", + "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), + # 79 - vflLC8JvQ 2013/07/25 (sporadic) + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/", + "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"), +] + +tests_age_gate = [ + # 86 - vflqinMWD + ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<", + "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"), ] def find_matching(wrong, right): @@ -78,6 +96,8 @@ def genall(tests): def main(): print(genall(tests)) + print(u' Age gate:') + print(genall(tests_age_gate)) if __name__ == '__main__': main() diff --git a/test/test_playlists.py b/test/test_playlists.py new file mode 100644 index 0000000..65de3a5 --- /dev/null +++ b/test/test_playlists.py @@ -0,0 +1,38 @@ +#!/usr/bin/env python + +import sys +import unittest +import json + +# Allow direct execution +import os +sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE +from youtube_dl.utils import * + +from helper import FakeYDL + +class TestPlaylists(unittest.TestCase): + def assertIsPlaylist(self, info): + """Make sure the info has '_type' set to 'playlist'""" + self.assertEqual(info['_type'], 'playlist') + + def test_dailymotion_playlist(self): + dl = FakeYDL() + ie = DailymotionPlaylistIE(dl) + result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'SPORT') + self.assertTrue(len(result['entries']) > 20) + + def test_vimeo_channel(self): + dl = FakeYDL() + ie = VimeoChannelIE(dl) + result = ie.extract('http://vimeo.com/channels/tributes') + self.assertIsPlaylist(result) + self.assertEqual(result['title'], u'Vimeo Tributes') + self.assertTrue(len(result['entries']) > 24) + +if __name__ == '__main__': + unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index c4b7136..be10691 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -4,6 +4,7 @@ import sys import unittest +import xml.etree.ElementTree # Allow direct execution import os @@ -16,6 +17,7 @@ from youtube_dl.utils import unescapeHTML from youtube_dl.utils import orderedSet from youtube_dl.utils import DateRange from youtube_dl.utils import unified_strdate +from youtube_dl.utils import find_xpath_attr if sys.version_info < (3, 0): _compat_str = lambda b: b.decode('unicode-escape') @@ -112,5 +114,18 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214') self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011') + def test_find_xpath_attr(self): + testxml = u''' + + + + + ''' + doc = xml.etree.ElementTree.fromstring(testxml) + + self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None) + self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1]) + self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2]) + if __name__ == '__main__': unittest.main() diff --git a/test/test_youtube_sig.py b/test/test_youtube_sig.py old mode 100755 new mode 100644 index e766042..d645c08 --- a/test/test_youtube_sig.py +++ b/test/test_youtube_sig.py @@ -10,12 +10,19 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) from youtube_dl.extractor.youtube import YoutubeIE from helper import FakeYDL -sig = YoutubeIE(FakeYDL())._decrypt_signature +ie = YoutubeIE(FakeYDL()) +sig = ie._decrypt_signature +sig_age_gate = ie._decrypt_signature_age_gate class TestYoutubeSig(unittest.TestCase): - def test_43_43(self): - wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135' - right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE' + def test_92(self): + wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8" + right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7" + self.assertEqual(sig(wrong), right) + + def test_90(self): + wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`" + right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|" self.assertEqual(sig(wrong), right) def test_88(self): @@ -25,17 +32,17 @@ class TestYoutubeSig(unittest.TestCase): def test_87(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<" - right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr" + right = "tyuioplkjhgfdsazxcv" self.assertEqual(sig(wrong), right) def test_86(self): wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<" - right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@" + right = ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK 0 and not re.search(r'^[#/;]', x)] + if opts.verbose: + sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n') except IOError: sys.exit(u'ERROR: batch file could not be read') all_urls = batchurls + args @@ -580,7 +582,7 @@ def _real_main(argv=None): }) if opts.verbose: - ydl.to_screen(u'[debug] youtube-dl version ' + __version__) + sys.stderr.write(u'[debug] youtube-dl version ' + __version__ + u'\n') try: sp = subprocess.Popen( ['git', 'rev-parse', '--short', 'HEAD'], @@ -589,11 +591,14 @@ def _real_main(argv=None): out, err = sp.communicate() out = out.decode().strip() if re.match('[0-9a-f]+', out): - ydl.to_screen(u'[debug] Git HEAD: ' + out) + sys.stderr.write(u'[debug] Git HEAD: ' + out + u'\n') except: - sys.exc_clear() - ydl.to_screen(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform())) - ydl.to_screen(u'[debug] Proxy map: ' + str(proxy_handler.proxies)) + try: + sys.exc_clear() + except: + pass + sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n') + sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n') ydl.add_default_info_extractors() diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 934419c..c20172a 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -6,17 +6,23 @@ from .bandcamp import BandcampIE from .bliptv import BlipTVIE, BlipTVUserIE from .breakcom import BreakIE from .brightcove import BrightcoveIE +from .canalplus import CanalplusIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE +from .condenast import CondeNastIE +from .criterion import CriterionIE from .cspan import CSpanIE -from .dailymotion import DailymotionIE +from .dailymotion import DailymotionIE, DailymotionPlaylistIE from .depositfiles import DepositFilesIE from .dotsub import DotsubIE from .dreisat import DreiSatIE +from .ehow import EHowIE from .eighttracks import EightTracksIE from .escapist import EscapistIE +from .exfm import ExfmIE from .facebook import FacebookIE from .flickr import FlickrIE +from .freesound import FreesoundIE from .funnyordie import FunnyOrDieIE from .gamespot import GameSpotIE from .gametrailers import GametrailersIE @@ -26,13 +32,16 @@ from .googlesearch import GoogleSearchIE from .hotnewhiphop import HotNewHipHopIE from .howcast import HowcastIE from .hypem import HypemIE +from .ign import IGNIE, OneUPIE from .ina import InaIE from .infoq import InfoQIE from .instagram import InstagramIE from .jukebox import JukeboxIE from .justintv import JustinTVIE +from .kankan import KankanIE from .keek import KeekIE from .liveleak import LiveLeakIE +from .livestream import LivestreamIE from .metacafe import MetacafeIE from .mixcloud import MixcloudIE from .mtv import MTVIE @@ -44,6 +53,8 @@ from .pornotube import PornotubeIE from .rbmaradio import RBMARadioIE from .redtube import RedTubeIE from .ringtv import RingTVIE +from .roxwel import RoxwelIE +from .sina import SinaIE from .soundcloud import SoundcloudIE, SoundcloudSetIE from .spiegel import SpiegelIE from .stanfordoc import StanfordOpenClassroomIE @@ -52,6 +63,7 @@ from .steam import SteamIE from .teamcoco import TeamcocoIE from .ted import TEDIE from .tf1 import TF1IE +from .thisav import ThisAVIE from .traileraddict import TrailerAddictIE from .tudou import TudouIE from .tumblr import TumblrIE @@ -60,9 +72,11 @@ from .ustream import UstreamIE from .vbox7 import Vbox7IE from .veoh import VeohIE from .vevo import VevoIE -from .vimeo import VimeoIE +from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .c56 import C56IE from .wat import WatIE +from .weibo import WeiboIE from .wimp import WimpIE from .worldstarhiphop import WorldStarHipHopIE from .xhamster import XHamsterIE @@ -80,6 +94,9 @@ from .youtube import ( YoutubeChannelIE, YoutubeShowIE, YoutubeSubscriptionsIE, + YoutubeRecommendedIE, + YoutubeWatchLaterIE, + YoutubeFavouritesIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 29cb9bd..7efd1d8 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -48,6 +48,7 @@ class ArchiveOrgIE(InfoExtractor): formats.sort(key=lambda fdata: fdata['file_size']) info = { + '_type': 'video', 'id': video_id, 'title': title, 'formats': formats, @@ -63,4 +64,4 @@ class ArchiveOrgIE(InfoExtractor): info['url'] = formats[-1]['url'] info['ext'] = determine_ext(formats[-1]['url']) - return self.video_result(info) \ No newline at end of file + return info \ No newline at end of file diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index e7a91a1..18d5916 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -5,6 +5,7 @@ import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( ExtractorError, + find_xpath_attr, unified_strdate, ) @@ -97,7 +98,7 @@ class ArteTvIE(InfoExtractor): l = 'F' elif lang == 'de': l = 'A' - regexes = [r'VO?%s' % l, r'V%s-ST.' % l] + regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l] return any(re.match(r, f['versionCode']) for r in regexes) # Some formats may not be in the same language as the url formats = filter(_match_lang, formats) @@ -119,7 +120,7 @@ class ArteTvIE(InfoExtractor): ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml') ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata') ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml) - config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang) + config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang) config_xml_url = config_node.attrib['ref'] config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration') diff --git a/youtube_dl/extractor/breakcom.py b/youtube_dl/extractor/breakcom.py index 34f555e..53a898d 100644 --- a/youtube_dl/extractor/breakcom.py +++ b/youtube_dl/extractor/breakcom.py @@ -1,6 +1,8 @@ import re +import json from .common import InfoExtractor +from ..utils import determine_ext class BreakIE(InfoExtractor): @@ -17,17 +19,20 @@ class BreakIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1).split("-")[-1] - webpage = self._download_webpage(url, video_id) - video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1) - key = re.search(r"icon: '(.+?)',",webpage).group(1) - final_url = str(video_url)+"?"+str(key) - thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1) - title = re.search(r"sVidTitle: '(.+)',",webpage).group(1) - ext = video_url.split('.')[-1] + embed_url = 'http://www.break.com/embed/%s' % video_id + webpage = self._download_webpage(embed_url, video_id) + info_json = self._search_regex(r'var embedVars = ({.*?});', webpage, + u'info json', flags=re.DOTALL) + info = json.loads(info_json) + video_url = info['videoUri'] + m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url) + if m_youtube is not None: + return self.url_result(m_youtube.group(1), 'Youtube') + final_url = video_url + '?' + info['AuthToken'] return [{ 'id': video_id, 'url': final_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, + 'ext': determine_ext(final_url), + 'title': info['contentName'], + 'thumbnail': info['thumbUri'], }] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index f85acbb..71e3c78 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -1,28 +1,82 @@ import re import json +import xml.etree.ElementTree from .common import InfoExtractor +from ..utils import ( + compat_urllib_parse, + find_xpath_attr, + compat_urlparse, +) class BrightcoveIE(InfoExtractor): - _VALID_URL = r'http://.*brightcove\.com/.*\?(?P.*videoPlayer=(?P\d*).*)' + _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P.*)' + _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' + _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s' + + # There is a test for Brigtcove in GenericIE, that way we test both the download + # and the detection of videos, and we don't have to find an URL that is always valid + + @classmethod + def _build_brighcove_url(cls, object_str): + """ + Build a Brightcove url from a xml string containing + {params} + """ + object_doc = xml.etree.ElementTree.fromstring(object_str) + assert u'BrightcoveExperience' in object_doc.attrib['class'] + params = {'flashID': object_doc.attrib['id'], + 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], + } + playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey') + # Not all pages define this value + if playerKey is not None: + params['playerKey'] = playerKey.attrib['value'] + videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer') + if videoPlayer is not None: + params['@videoPlayer'] = videoPlayer.attrib['value'] + data = compat_urllib_parse.urlencode(params) + return cls._FEDERATED_URL_TEMPLATE % data def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - query = mobj.group('query') - video_id = mobj.group('id') + query_str = mobj.group('query') + query = compat_urlparse.parse_qs(query_str) + + videoPlayer = query.get('@videoPlayer') + if videoPlayer: + return self._get_video_info(videoPlayer[0], query_str) + else: + player_key = query['playerKey'] + return self._get_playlist_info(player_key[0]) - request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query + def _get_video_info(self, video_id, query): + request_url = self._FEDERATED_URL_TEMPLATE % query webpage = self._download_webpage(request_url, video_id) self.report_extraction(video_id) info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json') info = json.loads(info)['data'] video_info = info['programmedContent']['videoPlayer']['mediaDTO'] + + return self._extract_video_info(video_info) + + def _get_playlist_info(self, player_key): + playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key, + player_key, u'Downloading playlist information') + + playlist_info = json.loads(playlist_info)['videoList'] + videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + return self.playlist_result(videos, playlist_id=playlist_info['id'], + playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + + def _extract_video_info(self, video_info): renditions = video_info['renditions'] renditions = sorted(renditions, key=lambda r: r['size']) best_format = renditions[-1] - - return {'id': video_id, + + return {'id': video_info['id'], 'title': video_info['displayName'], 'url': best_format['defaultURL'], 'ext': 'mp4', diff --git a/youtube_dl/extractor/c56.py b/youtube_dl/extractor/c56.py new file mode 100644 index 0000000..4c8a8af --- /dev/null +++ b/youtube_dl/extractor/c56.py @@ -0,0 +1,36 @@ +# coding: utf-8 + +import re +import json + +from .common import InfoExtractor +from ..utils import determine_ext + +class C56IE(InfoExtractor): + _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P.+?)\.(html|swf)' + IE_NAME = u'56.com' + + _TEST ={ + u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html', + u'file': u'93440716.mp4', + u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e', + u'info_dict': { + u'title': u'网事知多少 第32期:车怒', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE) + text_id = mobj.group('textid') + info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id, + text_id, u'Downloading video info') + info = json.loads(info_page)['info'] + best_format = sorted(info['rfiles'], key=lambda f: int(f['filesize']))[-1] + video_url = best_format['url'] + + return {'id': info['vid'], + 'title': info['Subject'], + 'url': video_url, + 'ext': determine_ext(video_url), + 'thumbnail': info.get('bimg') or info.get('img'), + } diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py new file mode 100644 index 0000000..3b1c888 --- /dev/null +++ b/youtube_dl/extractor/canalplus.py @@ -0,0 +1,46 @@ +import re +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import unified_strdate + +class CanalplusIE(InfoExtractor): + _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P\d+)' + _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s' + IE_NAME = u'canalplus.fr' + + _TEST = { + u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861', + u'file': u'889861.flv', + u'md5': u'590a888158b5f0d6832f84001fbf3e99', + u'info_dict': { + u'title': u'Le Petit Journal 20/06/13 - La guerre des drone', + u'upload_date': u'20130620', + }, + u'skip': u'Requires rtmpdump' + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + info_url = self._VIDEO_INFO_TEMPLATE % video_id + info_page = self._download_webpage(info_url,video_id, + u'Downloading video info') + + self.report_extraction(video_id) + doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8')) + video_info = [video for video in doc if video.find('ID').text == video_id][0] + infos = video_info.find('INFOS') + media = video_info.find('MEDIA') + formats = [media.find('VIDEOS/%s' % format) + for format in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']] + video_url = [format.text for format in formats if format is not None][-1] + + return {'id': video_id, + 'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text, + infos.find('TITRAGE/SOUS_TITRE').text), + 'url': video_url, + 'ext': 'flv', + 'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text), + 'thumbnail': media.find('IMAGES/GRAND').text, + } diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 7ae0972..5badde0 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,26 +1,26 @@ import re -import socket import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( - compat_http_client, - compat_str, - compat_urllib_error, compat_urllib_parse_urlparse, - compat_urllib_request, ExtractorError, ) class CollegeHumorIE(InfoExtractor): - _WORKING = False - _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P[0-9]+)/(?P.*)$' + _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P[0-9]+)/(?P.*)$' - def report_manifest(self, video_id): - """Report information extraction.""" - self.to_screen(u'%s: Downloading XML manifest' % video_id) + _TEST = { + u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe', + u'file': u'6902724.mp4', + u'md5': u'1264c12ad95dca142a9f0bf7968105a0', + u'info_dict': { + u'title': u'Comic-Con Cosplay Catastrophe', + u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.', + }, + } def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) @@ -36,14 +36,16 @@ class CollegeHumorIE(InfoExtractor): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - try: - metaXml = compat_urllib_request.urlopen(xmlUrl).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + metaXml = self._download_webpage(xmlUrl, video_id, + u'Downloading info XML', + u'Unable to download video info XML') mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] + youtubeIdNode = videoNode.find('./youtubeID') + if youtubeIdNode is not None: + return self.url_result(youtubeIdNode.text, 'Youtube') info['description'] = videoNode.findall('./description')[0].text info['title'] = videoNode.findall('./caption')[0].text info['thumbnail'] = videoNode.findall('./thumbnail')[0].text @@ -52,11 +54,9 @@ class CollegeHumorIE(InfoExtractor): raise ExtractorError(u'Invalid metadata XML file') manifest_url += '?hdcore=2.10.3' - self.report_manifest(video_id) - try: - manifestXml = compat_urllib_request.urlopen(manifest_url).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err)) + manifestXml = self._download_webpage(manifest_url, video_id, + u'Downloading XML manifest', + u'Unable to download video info XML') adoc = xml.etree.ElementTree.fromstring(manifestXml) try: @@ -66,9 +66,8 @@ class CollegeHumorIE(InfoExtractor): except IndexError as err: raise ExtractorError(u'Invalid manifest file') - url_pr = compat_urllib_parse_urlparse(manifest_url) - url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1' + url_pr = compat_urllib_parse_urlparse(info['thumbnail']) - info['url'] = url - info['ext'] = 'f4f' + info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','') + info['ext'] = 'mp4' return [info] diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 93d9e3d..bf8d711 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -24,7 +24,9 @@ class ComedyCentralIE(InfoExtractor): (full-episodes/(?P.*)| (?P (the-colbert-report-(videos|collections)/(?P[0-9]+)/[^/]*/(?P.*?)) - |(watch/(?P[^/]*)/(?P.*))))) + |(watch/(?P[^/]*)/(?P.*)))| + (?P + extended-interviews/(?P[0-9]+)/playlist_tds_extended_(?P.*?)/.*?))) $""" _TEST = { u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart', @@ -87,6 +89,9 @@ class ComedyCentralIE(InfoExtractor): else: epTitle = mobj.group('cntitle') dlNewest = False + elif mobj.group('interview'): + epTitle = mobj.group('interview_title') + dlNewest = False else: dlNewest = not mobj.group('episode') if dlNewest: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 1d98222..da50abf 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -14,6 +14,7 @@ from ..utils import ( clean_html, compiled_regex_type, ExtractorError, + unescapeHTML, ) class InfoExtractor(object): @@ -125,6 +126,11 @@ class InfoExtractor(object): def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None): """ Returns a tuple (page content as string, URL handle) """ + + # Strip hashes from the URL (#1038) + if isinstance(url_or_request, (compat_str, str)): + url_or_request = url_or_request.partition('#')[0] + urlh = self._request_webpage(url_or_request, video_id, note, errnote) content_type = urlh.headers.get('Content-Type', '') m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) @@ -169,11 +175,6 @@ class InfoExtractor(object): self.to_screen(u'Logging in') #Methods for following #608 - #They set the correct value of the '_type' key - def video_result(self, video_info): - """Returns a video""" - video_info['_type'] = 'video' - return video_info def url_result(self, url, ie=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info @@ -262,6 +263,31 @@ class InfoExtractor(object): return (username, password) + # Helper functions for extracting OpenGraph info + @staticmethod + def _og_regex(prop): + return r'%s).com/(?Pwatch|series|video)/(?P.+)' % '|'.join(_SITES.keys()) + IE_DESC = u'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) + + _TEST = { + u'url': u'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', + u'file': u'5171b343c2b4c00dd0c1ccb3.mp4', + u'md5': u'1921f713ed48aabd715691f774c451f7', + u'info_dict': { + u'title': u'3D Printed Speakers Lit With LED', + u'description': u'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + } + } + + def _extract_series(self, url, webpage): + title = self._html_search_regex(r'
.*?

(.+?)

', + webpage, u'series title', flags=re.DOTALL) + url_object = compat_urllib_parse_urlparse(url) + base_url = '%s://%s' % (url_object.scheme, url_object.netloc) + m_paths = re.finditer(r'

.*?(.+?)

', + r'
(.+?)
', + ], + webpage, u'description', + fatal=False, flags=re.DOTALL) + params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, + u'player params', flags=re.DOTALL) + video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, u'video id') + player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, u'player id') + target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, u'target') + data = compat_urllib_parse.urlencode({'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) + base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', + webpage, u'base info url', + default='http://player.cnevids.com/player/loader.js?') + info_url = base_info_url + data + info_page = self._download_webpage(info_url, video_id, + u'Downloading video info') + video_info = self._search_regex(r'var video = ({.+?});', info_page, u'video info') + video_info = json.loads(video_info) + + def _formats_sort_key(f): + type_ord = 1 if f['type'] == 'video/mp4' else 0 + quality_ord = 1 if f['quality'] == 'high' else 0 + return (quality_ord, type_ord) + best_format = sorted(video_info['sources'][0], key=_formats_sort_key)[-1] + + return {'id': video_id, + 'url': best_format['src'], + 'ext': best_format['type'].split('/')[-1], + 'title': video_info['title'], + 'thumbnail': video_info['poster_frame'], + 'description': description, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + site = mobj.group('site') + url_type = mobj.group('type') + id = mobj.group('id') + + self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site]) + webpage = self._download_webpage(url, id) + + if url_type == 'series': + return self._extract_series(url, webpage) + else: + return self._extract_video(webpage) diff --git a/youtube_dl/extractor/criterion.py b/youtube_dl/extractor/criterion.py new file mode 100644 index 0000000..31fe3d5 --- /dev/null +++ b/youtube_dl/extractor/criterion.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- + +import re + +from .common import InfoExtractor +from ..utils import determine_ext + +class CriterionIE(InfoExtractor): + _VALID_URL = r'https?://www\.criterion\.com/films/(\d*)-.+' + _TEST = { + u'url': u'http://www.criterion.com/films/184-le-samourai', + u'file': u'184.mp4', + u'md5': u'bc51beba55685509883a9a7830919ec3', + u'info_dict': { + u"title": u"Le Samouraï", + u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + webpage = self._download_webpage(url, video_id) + + final_url = self._search_regex(r'so.addVariable\("videoURL", "(.+?)"\)\;', + webpage, 'video url') + title = self._html_search_regex(r'', + webpage, 'video title') + description = self._html_search_regex(r'', + webpage, 'video description') + thumbnail = self._search_regex(r'so.addVariable\("thumbnailURL", "(.+?)"\)\;', + webpage, 'thumbnail url') + + return {'id': video_id, + 'url' : final_url, + 'title': title, + 'ext': determine_ext(final_url), + 'description': description, + 'thumbnail': thumbnail, + } diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index a485327..7bf03c5 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -34,8 +34,6 @@ class CSpanIE(InfoExtractor): description = self._html_search_regex(r'(.*?)', video_info, 'video url') @@ -49,5 +47,5 @@ class CSpanIE(InfoExtractor): 'url': url, 'play_path': path, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 5fd2221..fa8c630 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -1,9 +1,12 @@ import re import json +import itertools from .common import InfoExtractor from ..utils import ( compat_urllib_request, + get_element_by_attribute, + get_element_by_id, ExtractorError, ) @@ -39,9 +42,6 @@ class DailymotionIE(InfoExtractor): # Extract URL, uploader and title from webpage self.report_extraction(video_id) - video_title = self._html_search_regex(r'', - webpage, 'title') - video_uploader = self._search_regex([r'(?im)[^<]+?
]+?>([^<]+?)', # Looking for official user r'<(?:span|a) .*?rel="author".*?>([^<]+?).+?)/' + _MORE_PAGES_INDICATOR = r'' + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + playlist_id = mobj.group('id') + video_ids = [] + + for pagenum in itertools.count(1): + webpage = self._download_webpage('https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum), + playlist_id, u'Downloading page %s' % pagenum) + + playlist_el = get_element_by_attribute(u'class', u'video_list', webpage) + video_ids.extend(re.findall(r'data-id="(.+?)" data-ext-id', playlist_el)) + + if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None: + break + + entries = [self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + for video_id in video_ids] + return {'_type': 'playlist', + 'id': playlist_id, + 'title': get_element_by_id(u'playlist_name', webpage), + 'entries': entries, + } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 847f733..64b4658 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -67,6 +67,7 @@ class DreiSatIE(InfoExtractor): formats.sort(key=_sortkey) info = { + '_type': 'video', 'id': video_id, 'title': video_title, 'formats': formats, @@ -81,4 +82,4 @@ class DreiSatIE(InfoExtractor): info['url'] = formats[-1]['url'] info['ext'] = determine_ext(formats[-1]['url']) - return self.video_result(info) \ No newline at end of file + return info \ No newline at end of file diff --git a/youtube_dl/extractor/ehow.py b/youtube_dl/extractor/ehow.py new file mode 100644 index 0000000..2bb77ae --- /dev/null +++ b/youtube_dl/extractor/ehow.py @@ -0,0 +1,46 @@ +import re + +from ..utils import ( + compat_urllib_parse, + determine_ext +) +from .common import InfoExtractor + + +class EHowIE(InfoExtractor): + IE_NAME = u'eHow' + _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P[0-9]+)' + _TEST = { + u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html', + u'file': u'12245069.flv', + u'md5': u'9809b4e3f115ae2088440bcb4efbf371', + u'info_dict': { + u"title": u"Hardwood Flooring Basics", + u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...", + u"uploader": u"Erick Nathan" + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + video_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)', + webpage, u'video URL') + final_url = compat_urllib_parse.unquote(video_url) + uploader = self._search_regex(r'', + webpage, u'uploader') + title = self._og_search_title(webpage).replace(' | eHow', '') + ext = determine_ext(final_url) + + return { + '_type': 'video', + 'id': video_id, + 'url': final_url, + 'ext': ext, + 'title': title, + 'thumbnail': self._og_search_thumbnail(webpage), + 'description': self._og_search_description(webpage), + 'uploader': uploader, + } + diff --git a/youtube_dl/extractor/escapist.py b/youtube_dl/extractor/escapist.py index 794460e..3aa2da5 100644 --- a/youtube_dl/extractor/escapist.py +++ b/youtube_dl/extractor/escapist.py @@ -36,11 +36,7 @@ class EscapistIE(InfoExtractor): videoDesc = self._html_search_regex('[^/]+)' + _TEST = { + u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/', + u'file': u'194503.mp3', + u'md5': u'12280ceb42c81f19a515c745eae07650', + u'info_dict': { + u"title": u"gulls in the city.wav", + u"uploader" : u"miklovan", + u'description': u'the sounds of seagulls in the city', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + music_id = mobj.group('id') + webpage = self._download_webpage(url, music_id) + title = self._html_search_regex(r'
.*?(.+?)', + webpage, 'music title', flags=re.DOTALL) + music_url = self._og_search_property('audio', webpage, 'music url') + description = self._html_search_regex(r'
(.*?)
', + webpage, 'description', fatal=False, flags=re.DOTALL) + + return [{ + 'id': music_id, + 'title': title, + 'url': music_url, + 'uploader': self._og_search_property('audio:artist', webpage, 'music uploader'), + 'ext': determine_ext(music_url), + 'description': description, + }] diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 388aacf..67a7e5f 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -27,14 +27,11 @@ class FunnyOrDieIE(InfoExtractor): title = self._html_search_regex((r"

(?P.*?)</h1>", r'<title>(?P<title>[^<]+?)'), webpage, 'title', flags=re.DOTALL) - video_description = self._html_search_regex(r'videos|reviews|full-episodes)/(?P.*?)/(?P.*)' _TEST = { u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer', - u'file': u'zbvr8i.flv', - u'md5': u'c3edbc995ab4081976e16779bd96a878', + u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4', + u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7', u'info_dict': { - u"title": u"E3 2013: Debut Trailer" + u'title': u'E3 2013: Debut Trailer', + u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!', }, - u'skip': u'Requires rtmpdump' } + # Overwrite MTVIE properties we don't want + _TESTS = [] + + _FEED_URL = 'http://www.gametrailers.com/feeds/mrss' + + def _get_thumbnail_url(self, uri, itemdoc): + search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail')) + return itemdoc.find(search_path).attrib['url'] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) video_id = mobj.group('id') - video_type = mobj.group('type') webpage = self._download_webpage(url, video_id) - if video_type == 'full-episodes': - mgid_re = r'data-video="(?P<mgid>mgid:.*?)"' - else: - mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\'' - mgid = self._search_regex(mgid_re, webpage, u'mgid') - data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'}) - - info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data, - video_id, u'Downloading video info') - links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data, - video_id, u'Downloading video urls info') - - self.report_extraction(video_id) - info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]>.* - .*?)\]\]>.* - .* - (?P.*?).* - ''' - - m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL) - if m_info is None: - raise ExtractorError(u'Unable to extract video info') - video_title = m_info.group('title') - video_description = m_info.group('description') - video_thumb = m_info.group('thumb') - - m_urls = list(re.finditer(r'(?P.*)', links_webpage)) - if m_urls is None or len(m_urls) == 0: - raise ExtractorError(u'Unable to extract video url') - # They are sorted from worst to best quality - video_url = m_urls[-1].group('url') - - return {'url': video_url, - 'id': video_id, - 'title': video_title, - # Videos are actually flv not mp4 - 'ext': 'flv', - 'thumbnail': video_thumb, - 'description': video_description, - } + mgid = self._search_regex([r'data-video="(?Pmgid:.*?)"', + r'data-contentId=\'(?Pmgid:.*?)\''], + webpage, u'mgid') + return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 20bc533..b633e89 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -1,3 +1,5 @@ +# encoding: utf-8 + import os import re @@ -9,20 +11,34 @@ from ..utils import ( ExtractorError, ) +from .brightcove import BrightcoveIE class GenericIE(InfoExtractor): IE_DESC = u'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = u'generic' - _TEST = { - u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', - u'file': u'13601338388002.mp4', - u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', - u'info_dict': { - u"uploader": u"www.hodiho.fr", - u"title": u"R\u00e9gis plante sa Jeep" - } - } + _TESTS = [ + { + u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', + u'file': u'13601338388002.mp4', + u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89', + u'info_dict': { + u"uploader": u"www.hodiho.fr", + u"title": u"R\u00e9gis plante sa Jeep" + } + }, + { + u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/', + u'file': u'2371591881001.mp4', + u'md5': u'9e80619e0a94663f0bdc849b4566af19', + u'note': u'Test Brightcove downloads and detection in GenericIE', + u'info_dict': { + u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”', + u'uploader': u'8TV', + u'description': u'md5:a950cc4285c43e44d763d036710cd9cd', + } + }, + ] def report_download_webpage(self, video_id): """Report webpage download.""" @@ -103,6 +119,13 @@ class GenericIE(InfoExtractor): raise ExtractorError(u'Invalid URL: %s' % url) self.report_extraction(video_id) + # Look for BrigthCove: + m_brightcove = re.search(r'', webpage, re.DOTALL) + if m_brightcove is not None: + self.to_screen(u'Brightcove video detected.') + bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group()) + return self.url_result(bc_url, 'Brightcove') + # Start with something easy: JW Player in SWFObject mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage) if mobj is None: diff --git a/youtube_dl/extractor/hotnewhiphop.py b/youtube_dl/extractor/hotnewhiphop.py index ca3abb7..ccca1d7 100644 --- a/youtube_dl/extractor/hotnewhiphop.py +++ b/youtube_dl/extractor/hotnewhiphop.py @@ -33,16 +33,12 @@ class HotNewHipHopIE(InfoExtractor): video_title = self._html_search_regex(r"(.*)", webpage_src, u'title') - - # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex(r'"og:image" content="(.*)"', - webpage_src, u'thumbnail', fatal=False) results = [{ 'id': video_id, 'url' : video_url, 'title' : video_title, - 'thumbnail' : thumbnail, + 'thumbnail' : self._og_search_thumbnail(webpage_src), 'ext' : 'mp3', }] - return results \ No newline at end of file + return results diff --git a/youtube_dl/extractor/ign.py b/youtube_dl/extractor/ign.py new file mode 100644 index 0000000..62abab6 --- /dev/null +++ b/youtube_dl/extractor/ign.py @@ -0,0 +1,91 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import ( + determine_ext, +) + + +class IGNIE(InfoExtractor): + """ + Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com. + Some videos of it.ign.com are also supported + """ + + _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P.+)' + IE_NAME = u'ign.com' + + _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config' + _DESCRIPTION_RE = [r'(.+?)', + r'id="my_show_video">.*?

(.*?)

', + ] + + _TEST = { + u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review', + u'file': u'8f862beef863986b2785559b9e1aa599.mp4', + u'md5': u'eac8bdc1890980122c3b66f14bdd02e9', + u'info_dict': { + u'title': u'The Last of Us Review', + u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c', + } + } + + def _find_video_id(self, webpage): + res_id = [r'data-video-id="(.+?)"', + r'.+)' + IE_NAME = '1up.com' + + _DESCRIPTION_RE = r'
(.+?)
' + + _TEST = { + u'url': u'http://gamevideos.1up.com/video/id/34976', + u'file': u'34976.mp4', + u'md5': u'68a54ce4ebc772e4b71e3123d413163d', + u'info_dict': { + u'title': u'Sniper Elite V2 - Trailer', + u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf', + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + id = mobj.group('name_or_id') + result = super(OneUPIE, self)._real_extract(url) + result['id'] = id + return result diff --git a/youtube_dl/extractor/ina.py b/youtube_dl/extractor/ina.py index 962c592..652f19b 100644 --- a/youtube_dl/extractor/ina.py +++ b/youtube_dl/extractor/ina.py @@ -5,7 +5,7 @@ from .common import InfoExtractor class InaIE(InfoExtractor): """Information Extractor for Ina.fr""" - _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI[0-9]+)/.*' + _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?PI?[A-F0-9]+)/.*' _TEST = { u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html', u'file': u'I12055569.mp4', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index 6ae704e..ddc4288 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -5,12 +5,13 @@ from .common import InfoExtractor class InstagramIE(InfoExtractor): _VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/' _TEST = { - u'url': u'http://instagram.com/p/aye83DjauH/#', + u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc', u'file': u'aye83DjauH.mp4', u'md5': u'0d2da106a9d2631273e192b372806516', u'info_dict': { u"uploader_id": u"naomipq", - u"title": u"Video by naomipq" + u"title": u"Video by naomipq", + u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8', } } @@ -18,25 +19,17 @@ class InstagramIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) webpage = self._download_webpage(url, video_id) - video_url = self._html_search_regex( - r'', - webpage, u'thumbnail URL', fatal=False) - html_title = self._html_search_regex( - r'(.+?)', - webpage, u'title', flags=re.DOTALL) - title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip() - uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram', - webpage, u'uploader name', fatal=False) - ext = 'mp4' + uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"', + webpage, u'uploader id', fatal=False) + desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description', + fatal=False) return [{ 'id': video_id, - 'url': video_url, - 'ext': ext, - 'title': title, - 'thumbnail': thumbnail_url, - 'uploader_id' : uploader_id + 'url': self._og_search_video_url(webpage), + 'ext': 'mp4', + 'title': u'Video by %s' % uploader_id, + 'thumbnail': self._og_search_thumbnail(webpage), + 'uploader_id' : uploader_id, + 'description': desc, }] diff --git a/youtube_dl/extractor/kankan.py b/youtube_dl/extractor/kankan.py new file mode 100644 index 0000000..8537ba5 --- /dev/null +++ b/youtube_dl/extractor/kankan.py @@ -0,0 +1,37 @@ +import re + +from .common import InfoExtractor +from ..utils import determine_ext + + +class KankanIE(InfoExtractor): + _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P\d+)\.shtml' + + _TEST = { + u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml', + u'file': u'48863.flv', + u'md5': u'29aca1e47ae68fc28804aca89f29507e', + u'info_dict': { + u'title': u'Ready To Go', + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + webpage = self._download_webpage(url, video_id) + + title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title') + gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid') + + video_info_page = self._download_webpage('http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid, + video_id, u'Downloading video url info') + ip = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip') + path = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path') + video_url = 'http://%s%s' % (ip, path) + + return {'id': video_id, + 'title': title, + 'url': video_url, + 'ext': determine_ext(video_url), + } diff --git a/youtube_dl/extractor/keek.py b/youtube_dl/extractor/keek.py index 72ad6a3..a7b88d2 100644 --- a/youtube_dl/extractor/keek.py +++ b/youtube_dl/extractor/keek.py @@ -4,10 +4,10 @@ from .common import InfoExtractor class KeekIE(InfoExtractor): - _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' + _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P\w+)' IE_NAME = u'keek' _TEST = { - u'url': u'http://www.keek.com/ytdl/keeks/NODfbab', + u'url': u'https://www.keek.com/ytdl/keeks/NODfbab', u'file': u'NODfbab.mp4', u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83', u'info_dict': { @@ -24,8 +24,7 @@ class KeekIE(InfoExtractor): thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id webpage = self._download_webpage(url, video_id) - video_title = self._html_search_regex(r'[\S\s]+?

(?P.+?)

', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index cf8a2c9..dd062a1 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -33,11 +33,9 @@ class LiveLeakIE(InfoExtractor): video_url = self._search_regex(r'file: "(.*?)",', webpage, u'video URL') - video_title = self._html_search_regex(r'', webpage, u'uploader', fatal=False) diff --git a/youtube_dl/extractor/livestream.py b/youtube_dl/extractor/livestream.py new file mode 100644 index 0000000..3099210 --- /dev/null +++ b/youtube_dl/extractor/livestream.py @@ -0,0 +1,52 @@ +import re +import json + +from .common import InfoExtractor +from ..utils import compat_urllib_parse_urlparse, compat_urlparse + + +class LivestreamIE(InfoExtractor): + _VALID_URL = r'http://new.livestream.com/.*?/(?P.*?)(/videos/(?P\d+))?/?$' + _TEST = { + u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370', + u'file': u'4719370.mp4', + u'md5': u'0d2186e3187d185a04b3cdd02b828836', + u'info_dict': { + u'title': u'Live from Webster Hall NYC', + u'upload_date': u'20121012', + } + } + + def _extract_video_info(self, video_data): + video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url') + return {'id': video_data['id'], + 'url': video_url, + 'ext': 'mp4', + 'title': video_data['caption'], + 'thumbnail': video_data['thumbnail_url'], + 'upload_date': video_data['updated_at'].replace('-','')[:8], + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + event_name = mobj.group('event_name') + webpage = self._download_webpage(url, video_id or event_name) + + if video_id is None: + # This is an event page: + api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'', + webpage, 'api url') + info = json.loads(self._download_webpage(api_url, event_name, + u'Downloading event info')) + videos = [self._extract_video_info(video_data['data']) + for video_data in info['feed']['data'] if video_data['type'] == u'video'] + return self.playlist_result(videos, info['id'], info['full_name']) + else: + og_video = self._og_search_video_url(webpage, name=u'player url') + query_str = compat_urllib_parse_urlparse(og_video).query + query = compat_urlparse.parse_qs(query_str) + api_url = query['play_url'][0].replace('.smil', '') + info = json.loads(self._download_webpage(api_url, video_id, + u'Downloading video info')) + return self._extract_video_info(info) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index 4c3f81b..e38dc98 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -9,7 +9,7 @@ from ..utils import ( compat_urllib_parse, compat_urllib_request, compat_str, - + determine_ext, ExtractorError, ) @@ -20,7 +20,7 @@ class MetacafeIE(InfoExtractor): _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = u'metacafe' - _TEST = { + _TESTS = [{ u"add_ie": ["Youtube"], u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/", u"file": u"_aUehQsCQtM.flv", @@ -31,7 +31,16 @@ class MetacafeIE(InfoExtractor): u"uploader": u"PBS", u"uploader_id": u"PBS" } - } + }, + { + u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/", + u"file": u"an-dVVXnuY7Jh77J.mp4", + u"info_dict": { + u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3", + u"uploader": u"anyclip", + u"description": u"md5:38c711dd98f5bb87acf973d573442e67" + } + }] def report_disclaimer(self): @@ -73,14 +82,16 @@ class MetacafeIE(InfoExtractor): return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')] # Retrieve video webpage to extract further information - webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id) + req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id) + req.headers['Cookie'] = 'flashVersion=0;' + webpage = self._download_webpage(req, video_id) # Extract URL, uploader and title from webpage self.report_extraction(video_id) mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage) if mobj is not None: mediaURL = compat_urllib_parse.unquote(mobj.group(1)) - video_extension = mediaURL[-3:] + video_ext = mediaURL[-3:] # Extract gdaKey if available mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage) @@ -90,34 +101,37 @@ class MetacafeIE(InfoExtractor): gdaKey = mobj.group(1) video_url = '%s?__gda__=%s' % (mediaURL, gdaKey) else: - mobj = re.search(r' name="flashvars" value="(.*?)"', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - vardict = compat_parse_qs(mobj.group(1)) - if 'mediaData' not in vardict: - raise ExtractorError(u'Unable to extract media URL') - mobj = re.search(r'"mediaURL":"(?Phttp.*?)",(.*?)"key":"(?P.*?)"', vardict['mediaData'][0]) - if mobj is None: - raise ExtractorError(u'Unable to extract media URL') - mediaURL = mobj.group('mediaURL').replace('\\/', '/') - video_extension = mediaURL[-3:] - video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key')) - - mobj = re.search(r'(?im)(.*) - Video', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract title') - video_title = mobj.group(1).decode('utf-8') - - mobj = re.search(r'submitter=(.*?);', webpage) - if mobj is None: - raise ExtractorError(u'Unable to extract uploader nickname') - video_uploader = mobj.group(1) - - return [{ - 'id': video_id.decode('utf-8'), - 'url': video_url.decode('utf-8'), - 'uploader': video_uploader.decode('utf-8'), + mobj = re.search(r'