-h, --help print this help text and exit
--version print program version and exit
- -U, --update update this program to latest version
+ -U, --update update this program to latest version. Make sure
+ that you have sufficient permissions (run with
+ sudo if needed)
-i, --ignore-errors continue on download errors
--dump-user-agent display the current browser identification
--user-agent UA specify a custom user agent
-h, --help print this help text and exit
--version print program version and exit
- -U, --update update this program to latest version
+ -U, --update update this program to latest version. Make sure
+ that you have sufficient permissions (run with
+ sudo if needed)
-i, --ignore-errors continue on download errors
--dump-user-agent display the current browser identification
--user-agent UA specify a custom user agent
import sys
tests = [
+ # 92 - vflQw-fB4 2013/07/17
+ ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`~\"",
+ "mrtyuioplkjhgfdsazxcvbnq1234567890QWERTY}IOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]\"|:;"),
+ # 90
+ ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`",
+ "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"),
# 88
- # 87
+ # 87 - vflART1Nf 2013/07/24
- "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"),
- # 86 - vfl_ymO4Z 2013/06/27
+ "tyuioplkjhgfdsazxcv<nm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>"),
+ # 86 - vflm_D8eE 2013/07/31
- "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),
- # 85
+ ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK<POIUYTREW509876L432/mnbvcxzasdfghjklpoiuytre"),
+ # 85 - vflSAFCP9 2013/07/19
- "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"),
+ "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"),
# 84
- # 83 - vfl26ng3K 2013/07/10
+ # 83 - vflTWC9KW 2013/08/01
- "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"),
+ "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"),
# 82
+ # 81 - vflLC8JvQ 2013/07/25
+ ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>.",
+ "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
+ # 79 - vflLC8JvQ 2013/07/25 (sporadic)
+ ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/",
+ "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"),
+tests_age_gate = [
+ # 86 - vflqinMWD
+ ("qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<",
+ "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"),
def find_matching(wrong, right):
def main():
+ print(u' Age gate:')
+ print(genall(tests_age_gate))
if __name__ == '__main__':
--- /dev/null
+#!/usr/bin/env python
+import sys
+import unittest
+import json
+# Allow direct execution
+import os
+from youtube_dl.extractor import DailymotionPlaylistIE, VimeoChannelIE
+from youtube_dl.utils import *
+from helper import FakeYDL
+class TestPlaylists(unittest.TestCase):
+ # Each test builds an extractor on top of a FakeYDL, runs extract() on a
+ # playlist URL and checks the shape of the returned info dict.
+ # NOTE(review): presumably these hit the real sites over the network - confirm.
+ def assertIsPlaylist(self, info):
+ """Make sure the info has '_type' set to 'playlist'"""
+ self.assertEqual(info['_type'], 'playlist')
+ def test_dailymotion_playlist(self):
+ # Expects a playlist titled 'SPORT' with more than 20 entries.
+ # Note that the URL under test carries a '#video=...' fragment.
+ dl = FakeYDL()
+ ie = DailymotionPlaylistIE(dl)
+ result = ie.extract('http://www.dailymotion.com/playlist/xv4bw_nqtv_sport/1#video=xl8v3q')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], u'SPORT')
+ self.assertTrue(len(result['entries']) > 20)
+ def test_vimeo_channel(self):
+ # Expects a playlist titled 'Vimeo Tributes' with more than 24 entries.
+ dl = FakeYDL()
+ ie = VimeoChannelIE(dl)
+ result = ie.extract('http://vimeo.com/channels/tributes')
+ self.assertIsPlaylist(result)
+ self.assertEqual(result['title'], u'Vimeo Tributes')
+ self.assertTrue(len(result['entries']) > 24)
+if __name__ == '__main__':
+ unittest.main()
import sys
import unittest
+import xml.etree.ElementTree
# Allow direct execution
import os
from youtube_dl.utils import orderedSet
from youtube_dl.utils import DateRange
from youtube_dl.utils import unified_strdate
+from youtube_dl.utils import find_xpath_attr
if sys.version_info < (3, 0):
_compat_str = lambda b: b.decode('unicode-escape')
self.assertEqual(unified_strdate('Dec 14, 2012'), '20121214')
self.assertEqual(unified_strdate('2012/10/11 01:56:38 +0000'), '20121011')
+ def test_find_xpath_attr(self):
+ # Fixture: four <node> children; doc[0] has no attributes, doc[1] and
+ # doc[2] both carry x="a", and only doc[2] carries y="c".
+ testxml = u'''<root>
+ <node/>
+ <node x="a"/>
+ <node x="a" y="c" />
+ <node x="b" y="d" />
+ </root>'''
+ doc = xml.etree.ElementTree.fromstring(testxml)
+ # The xpath matches no element at all -> None is expected
+ self.assertEqual(find_xpath_attr(doc, './/fourohfour', 'n', 'v'), None)
+ # Two nodes have x="a"; the first one in document order (doc[1]) is expected
+ self.assertEqual(find_xpath_attr(doc, './/node', 'x', 'a'), doc[1])
+ # Only doc[2] has y="c"
+ self.assertEqual(find_xpath_attr(doc, './/node', 'y', 'c'), doc[2])
if __name__ == '__main__':
from youtube_dl.extractor.youtube import YoutubeIE
from helper import FakeYDL
-sig = YoutubeIE(FakeYDL())._decrypt_signature
+ie = YoutubeIE(FakeYDL())
+sig = ie._decrypt_signature
+sig_age_gate = ie._decrypt_signature_age_gate
class TestYoutubeSig(unittest.TestCase):
- def test_43_43(self):
- wrong = '5AEEAE0EC39677BC65FD9021CCD115F1F2DBD5A59E4.C0B243A3E2DED6769199AF3461781E75122AE135135'
- right = '931EA22157E1871643FA9519676DED253A342B0C.4E95A5DBD2F1F511DCC1209DF56CB77693CE0EAE'
+ def test_92(self):
+ wrong = "F9F9B6E6FD47029957AB911A964CC20D95A181A5D37A2DBEFD67D403DB0E8BE4F4910053E4E8A79.0B70B.0B80B8"
+ right = "69B6E6FD47029957AB911A9F4CC20D95A181A5D3.A2DBEFD67D403DB0E8BE4F4910053E4E8A7980B7"
+ self.assertEqual(sig(wrong), right)
+ def test_90(self):
+ wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[]}|:;?/>.<'`"
+ right = "mrtyuioplkjhgfdsazxcvbne1234567890QWER[YUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={`]}|"
self.assertEqual(sig(wrong), right)
def test_88(self):
def test_87(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>.<"
- right = "!?;:|}][{=+-_)(*&^$#@/MNBVCXZASqFGHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+ right = "tyuioplkjhgfdsazxcv<nm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$^&*()_-+={[]}|:;?/>"
self.assertEqual(sig(wrong), right)
def test_86(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"
- right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"
+ right = ">.1}|[{=+-_)(*&^%$#@!MNBVCXZASDFGHJK<POIUYTREW509876L432/mnbvcxzasdfghjklpoiuytre"
self.assertEqual(sig(wrong), right)
def test_85(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[};?/>.<"
- right = "{>/?;}[.=+-_)(*&^%$#@!MqBVCXZASDFwHJKLPOIUYTREWQ0987654321mnbvcxzasdfghjklpoiuytr"
+ right = "ertyuiqplkjhgfdsazx$vbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#<%^&*()_-+={[};?/c"
self.assertEqual(sig(wrong), right)
def test_84(self):
def test_83(self):
wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>.<"
- right = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/>"
+ right = "qwertyuioplkjhg>dsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!#$%^&*()_+={[};?/f"
self.assertEqual(sig(wrong), right)
def test_82(self):
right = "Q>/?;}[{=+-(*<^%$#@!MNBVCXZASDFGHKLPOIUY8REWT0q&7654321mnbvcxzasdfghjklpoiuytrew9"
self.assertEqual(sig(wrong), right)
+ def test_81(self):
+ wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/>."
+ right = "C>/?;}[{=+-(*&^%$#@!MNBVYXZASDFGHKLPOIU.TREWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"
+ self.assertEqual(sig(wrong), right)
+ def test_79(self):
+ wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKHGFDSAZXCVBNM!@#$%^&*(-+={[};?/"
+ right = "Z?;}[{=+-(*&^%$#@!MNBVCXRASDFGHKLPOIUYT/EWQ0q87659321mnbvcxzasdfghjkl4oiuytrewp"
+ self.assertEqual(sig(wrong), right)
+ def test_86_age_gate(self):
+ wrong = "qwertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!@#$%^&*()_-+={[|};?/>.<"
+ right = "ertyuioplkjhgfdsazxcvbnm1234567890QWERTYUIOPLKJHGFDSAZXCVBNM!/#$%^&*()_-+={[|};?@"
+ self.assertEqual(sig_age_gate(wrong), right)
if __name__ == '__main__':
\-h,\ \-\-help\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ this\ help\ text\ and\ exit
\-\-version\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ print\ program\ version\ and\ exit
-\-U,\ \-\-update\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ update\ this\ program\ to\ latest\ version
+\-U,\ \-\-update\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ update\ this\ program\ to\ latest\ version.\ Make\ sure
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ that\ you\ have\ sufficient\ permissions\ (run\ with
+\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ sudo\ if\ needed)
\-i,\ \-\-ignore\-errors\ \ \ \ \ \ \ \ continue\ on\ download\ errors
\-\-dump\-user\-agent\ \ \ \ \ \ \ \ \ \ display\ the\ current\ browser\ identification
\-\-user\-agent\ UA\ \ \ \ \ \ \ \ \ \ \ \ specify\ a\ custom\ user\ agent
self.report_error(u'mplayer exited with code %d' % retval)
return False
+    def _download_m3u8_with_ffmpeg(self, filename, url):
+        """Download an m3u8 manifest URL by delegating the work to ffmpeg.
+
+        ffmpeg is asked to write an mp4 to the temporary name derived from
+        ``filename``; on success that file is handed to ``try_rename`` and
+        the progress hooks are notified with status 'finished'.
+
+        Returns True when ffmpeg exits with code 0, False when ffmpeg cannot
+        be started or exits with a non-zero code (an error is reported in
+        both failure cases).
+        """
+        self.report_destination(filename)
+        tmpfilename = self.temp_name(filename)
+        args = ['ffmpeg', '-y', '-i', url, '-f', 'mp4', tmpfilename]
+        # Check for ffmpeg first
+        try:
+            # Use a context manager so the devnull handle is closed
+            # deterministically instead of being leaked.
+            with open(os.path.devnull, 'w') as devnull:
+                subprocess.call(['ffmpeg', '-h'], stdout=devnull, stderr=subprocess.STDOUT)
+        except (OSError, IOError):
+            self.report_error(u'm3u8 download detected but "%s" could not be run' % args[0])
+            return False
+        retval = subprocess.call(args)
+        if retval == 0:
+            fsize = os.path.getsize(encodeFilename(tmpfilename))
+            self.to_screen(u'\r[%s] %s bytes' % (args[0], fsize))
+            self.try_rename(tmpfilename, filename)
+            self._hook_progress({
+                'downloaded_bytes': fsize,
+                'total_bytes': fsize,
+                'filename': filename,
+                'status': 'finished',
+            })
+            return True
+        else:
+            self.to_stderr(u"\n")
+            self.report_error(u'ffmpeg exited with code %d' % retval)
+            return False
def _do_download(self, filename, info_dict):
url = info_dict['url']
if url.startswith('mms') or url.startswith('rtsp'):
return self._download_with_mplayer(filename, url)
+ # m3u8 manifests are downloaded with ffmpeg
+ if determine_ext(url) == u'm3u8':
+ return self._download_m3u8_with_ffmpeg(filename, url)
tmpfilename = self.temp_name(filename)
stream = None
self._nopostoverwrites = nopostoverwrites
def get_audio_codec(self, path):
- if not self._exes['ffprobe'] and not self._exes['avprobe']: return None
+ if not self._exes['ffprobe'] and not self._exes['avprobe']:
+ raise PostProcessingError(u'ffprobe or avprobe not found. Please install one.')
cmd = [self._exes['avprobe'] or self._exes['ffprobe'], '-show_streams', encodeFilename(self._ffmpeg_filename_argument(path))]
handle = subprocess.Popen(cmd, stderr=compat_subprocess_get_DEVNULL(), stdout=subprocess.PIPE)
os.utime(encodeFilename(new_path), (time.time(), information['filetime']))
- self._downloader.to_stderr(u'WARNING: Cannot update utime of audio file')
+ self._downloader.report_warning(u'Cannot update utime of audio file')
information['filepath'] = new_path
return self._nopostoverwrites,information
result_type = ie_result.get('_type', 'video') # If not given we suppose it's a video, support the default old system
if result_type == 'video':
+ ie_result.update(extra_info)
if 'playlist' not in ie_result:
# It isn't part of a playlist
ie_result['playlist'] = None
if self.params.get('writethumbnail', False):
- if 'thumbnail' in info_dict:
- thumb_format = info_dict['thumbnail'].rpartition(u'/')[2].rpartition(u'.')[2]
- if not thumb_format:
- thumb_format = 'jpg'
+ if info_dict.get('thumbnail') is not None:
+ thumb_format = determine_ext(info_dict['thumbnail'], u'jpg')
thumb_filename = filename.rpartition('.')[0] + u'.' + thumb_format
self.to_screen(u'[%s] %s: Downloading thumbnail ...' %
(info_dict['extractor'], info_dict['id']))
# No clear decision yet, let IE decide
keep_video = keep_video_wish
except PostProcessingError as e:
- self.to_stderr(u'ERROR: ' + e.msg)
+ self.report_error(e.msg)
if keep_video is False and not self.params.get('keepvideo', False):
self.to_screen(u'Deleting original file %s (pass -k to keep)' % filename)
general.add_option('-v', '--version',
action='version', help='print program version and exit')
general.add_option('-U', '--update',
- action='store_true', dest='update_self', help='update this program to latest version')
+ action='store_true', dest='update_self', help='update this program to latest version. Make sure that you have sufficient permissions (run with sudo if needed)')
general.add_option('-i', '--ignore-errors',
action='store_true', dest='ignoreerrors', help='continue on download errors', default=False)
batchurls = batchfd.readlines()
batchurls = [x.strip() for x in batchurls]
batchurls = [x for x in batchurls if len(x) > 0 and not re.search(r'^[#/;]', x)]
+ if opts.verbose:
+ sys.stderr.write(u'[debug] Batch file urls: ' + repr(batchurls) + u'\n')
except IOError:
sys.exit(u'ERROR: batch file could not be read')
all_urls = batchurls + args
if opts.verbose:
- ydl.to_screen(u'[debug] youtube-dl version ' + __version__)
+ sys.stderr.write(u'[debug] youtube-dl version ' + __version__ + u'\n')
sp = subprocess.Popen(
['git', 'rev-parse', '--short', 'HEAD'],
out, err = sp.communicate()
out = out.decode().strip()
if re.match('[0-9a-f]+', out):
- ydl.to_screen(u'[debug] Git HEAD: ' + out)
+ sys.stderr.write(u'[debug] Git HEAD: ' + out + u'\n')
- sys.exc_clear()
- ydl.to_screen(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()))
- ydl.to_screen(u'[debug] Proxy map: ' + str(proxy_handler.proxies))
+ try:
+ sys.exc_clear()
+ except:
+ pass
+ sys.stderr.write(u'[debug] Python version %s - %s' %(platform.python_version(), platform.platform()) + u'\n')
+ sys.stderr.write(u'[debug] Proxy map: ' + str(proxy_handler.proxies) + u'\n')
from .bliptv import BlipTVIE, BlipTVUserIE
from .breakcom import BreakIE
from .brightcove import BrightcoveIE
+from .canalplus import CanalplusIE
from .collegehumor import CollegeHumorIE
from .comedycentral import ComedyCentralIE
+from .condenast import CondeNastIE
+from .criterion import CriterionIE
from .cspan import CSpanIE
-from .dailymotion import DailymotionIE
+from .dailymotion import DailymotionIE, DailymotionPlaylistIE
from .depositfiles import DepositFilesIE
from .dotsub import DotsubIE
from .dreisat import DreiSatIE
+from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .escapist import EscapistIE
+from .exfm import ExfmIE
from .facebook import FacebookIE
from .flickr import FlickrIE
+from .freesound import FreesoundIE
from .funnyordie import FunnyOrDieIE
from .gamespot import GameSpotIE
from .gametrailers import GametrailersIE
from .hotnewhiphop import HotNewHipHopIE
from .howcast import HowcastIE
from .hypem import HypemIE
+from .ign import IGNIE, OneUPIE
from .ina import InaIE
from .infoq import InfoQIE
from .instagram import InstagramIE
from .jukebox import JukeboxIE
from .justintv import JustinTVIE
+from .kankan import KankanIE
from .keek import KeekIE
from .liveleak import LiveLeakIE
+from .livestream import LivestreamIE
from .metacafe import MetacafeIE
from .mixcloud import MixcloudIE
from .mtv import MTVIE
from .rbmaradio import RBMARadioIE
from .redtube import RedTubeIE
from .ringtv import RingTVIE
+from .roxwel import RoxwelIE
+from .sina import SinaIE
from .soundcloud import SoundcloudIE, SoundcloudSetIE
from .spiegel import SpiegelIE
from .stanfordoc import StanfordOpenClassroomIE
from .teamcoco import TeamcocoIE
from .ted import TEDIE
from .tf1 import TF1IE
+from .thisav import ThisAVIE
from .traileraddict import TrailerAddictIE
from .tudou import TudouIE
from .tumblr import TumblrIE
from .vbox7 import Vbox7IE
from .veoh import VeohIE
from .vevo import VevoIE
-from .vimeo import VimeoIE
+from .vimeo import VimeoIE, VimeoChannelIE
from .vine import VineIE
+from .c56 import C56IE
from .wat import WatIE
+from .weibo import WeiboIE
from .wimp import WimpIE
from .worldstarhiphop import WorldStarHipHopIE
from .xhamster import XHamsterIE
+ YoutubeRecommendedIE,
+ YoutubeWatchLaterIE,
+ YoutubeFavouritesIE,
from .zdf import ZDFIE
formats.sort(key=lambda fdata: fdata['file_size'])
info = {
+ '_type': 'video',
'id': video_id,
'title': title,
'formats': formats,
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
- return self.video_result(info)
\ No newline at end of file
+ return info
\ No newline at end of file
from .common import InfoExtractor
from ..utils import (
+ find_xpath_attr,
l = 'F'
elif lang == 'de':
l = 'A'
- regexes = [r'VO?%s' % l, r'V%s-ST.' % l]
+ regexes = [r'VO?%s' % l, r'VO?.-ST%s' % l]
return any(re.match(r, f['versionCode']) for r in regexes)
# Some formats may not be in the same language as the url
formats = filter(_match_lang, formats)
ref_xml_url = ref_xml_url.replace('.html', ',view,asPlayerXml.xml')
ref_xml = self._download_webpage(ref_xml_url, video_id, note=u'Downloading metadata')
ref_xml_doc = xml.etree.ElementTree.fromstring(ref_xml)
- config_node = ref_xml_doc.find('.//video[@lang="%s"]' % lang)
+ config_node = find_xpath_attr(ref_xml_doc, './/video', 'lang', lang)
config_xml_url = config_node.attrib['ref']
config_xml = self._download_webpage(config_xml_url, video_id, note=u'Downloading configuration')
import re
+import json
from .common import InfoExtractor
+from ..utils import determine_ext
class BreakIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1).split("-")[-1]
- webpage = self._download_webpage(url, video_id)
- video_url = re.search(r"videoPath: '(.+?)',",webpage).group(1)
- key = re.search(r"icon: '(.+?)',",webpage).group(1)
- final_url = str(video_url)+"?"+str(key)
- thumbnail_url = re.search(r"thumbnailURL: '(.+?)'",webpage).group(1)
- title = re.search(r"sVidTitle: '(.+)',",webpage).group(1)
- ext = video_url.split('.')[-1]
+ embed_url = 'http://www.break.com/embed/%s' % video_id
+ webpage = self._download_webpage(embed_url, video_id)
+ info_json = self._search_regex(r'var embedVars = ({.*?});', webpage,
+ u'info json', flags=re.DOTALL)
+ info = json.loads(info_json)
+ video_url = info['videoUri']
+ m_youtube = re.search(r'(https?://www\.youtube\.com/watch\?v=.*)', video_url)
+ if m_youtube is not None:
+ return self.url_result(m_youtube.group(1), 'Youtube')
+ final_url = video_url + '?' + info['AuthToken']
return [{
'id': video_id,
'url': final_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
+ 'ext': determine_ext(final_url),
+ 'title': info['contentName'],
+ 'thumbnail': info['thumbUri'],
import re
import json
+import xml.etree.ElementTree
from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ find_xpath_attr,
+ compat_urlparse,
class BrightcoveIE(InfoExtractor):
- _VALID_URL = r'http://.*brightcove\.com/.*\?(?P<query>.*videoPlayer=(?P<id>\d*).*)'
+ _VALID_URL = r'https?://.*brightcove\.com/(services|viewer).*\?(?P<query>.*)'
+ _FEDERATED_URL_TEMPLATE = 'http://c.brightcove.com/services/viewer/htmlFederated?%s'
+ _PLAYLIST_URL_TEMPLATE = 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=%s'
+ # There is a test for Brightcove in GenericIE, that way we test both the download
+ # and the detection of videos, and we don't have to find a URL that is always valid
+ @classmethod
+ def _build_brighcove_url(cls, object_str):
+ """
+ Build a Brightcove url from a xml string containing
+ <object class="BrightcoveExperience">{params}</object>
+
+ The object's 'id' attribute and its 'playerID', 'playerKey' and
+ '@videoPlayer' <param> children are url-encoded and substituted into
+ _FEDERATED_URL_TEMPLATE; the resulting url string is returned.
+ """
+ object_doc = xml.etree.ElementTree.fromstring(object_str)
+ # NOTE(review): assert statements are stripped under `python -O`, so this
+ # sanity check disappears in optimized runs.
+ assert u'BrightcoveExperience' in object_doc.attrib['class']
+ # The 'id' attribute and the playerID <param> are read unconditionally,
+ # so both have to be present in the markup.
+ params = {'flashID': object_doc.attrib['id'],
+ 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'],
+ }
+ playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey')
+ # Not all pages define this value
+ if playerKey is not None:
+ params['playerKey'] = playerKey.attrib['value']
+ # '@videoPlayer' is optional as well and only forwarded when present
+ videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer')
+ if videoPlayer is not None:
+ params['@videoPlayer'] = videoPlayer.attrib['value']
+ data = compat_urllib_parse.urlencode(params)
+ return cls._FEDERATED_URL_TEMPLATE % data
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- query = mobj.group('query')
- video_id = mobj.group('id')
+ query_str = mobj.group('query')
+ query = compat_urlparse.parse_qs(query_str)
+ videoPlayer = query.get('@videoPlayer')
+ if videoPlayer:
+ return self._get_video_info(videoPlayer[0], query_str)
+ else:
+ player_key = query['playerKey']
+ return self._get_playlist_info(player_key[0])
- request_url = 'http://c.brightcove.com/services/viewer/htmlFederated?%s' % query
+ def _get_video_info(self, video_id, query):
+ request_url = self._FEDERATED_URL_TEMPLATE % query
webpage = self._download_webpage(request_url, video_id)
info = self._search_regex(r'var experienceJSON = ({.*?});', webpage, 'json')
info = json.loads(info)['data']
video_info = info['programmedContent']['videoPlayer']['mediaDTO']
+ return self._extract_video_info(video_info)
+ def _get_playlist_info(self, player_key):
+ # Downloads the JSON document at _PLAYLIST_URL_TEMPLATE for player_key,
+ # converts every entry of videoList.mediaCollectionDTO.videoDTOs with
+ # _extract_video_info and returns them wrapped by playlist_result.
+ playlist_info = self._download_webpage(self._PLAYLIST_URL_TEMPLATE % player_key,
+ player_key, u'Downloading playlist information')
+ # Only the 'videoList' member of the response is used from here on
+ playlist_info = json.loads(playlist_info)['videoList']
+ videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']]
+ return self.playlist_result(videos, playlist_id=playlist_info['id'],
+ playlist_title=playlist_info['mediaCollectionDTO']['displayName'])
+ def _extract_video_info(self, video_info):
renditions = video_info['renditions']
renditions = sorted(renditions, key=lambda r: r['size'])
best_format = renditions[-1]
- return {'id': video_id,
+ return {'id': video_info['id'],
'title': video_info['displayName'],
'url': best_format['defaultURL'],
'ext': 'mp4',
--- /dev/null
+# coding: utf-8
+import re
+import json
+from .common import InfoExtractor
+from ..utils import determine_ext
+class C56IE(InfoExtractor):
+    """Information extractor for 56.com video pages and player URLs."""
+    _VALID_URL = r'https?://((www|player)\.)?56\.com/(.+?/)?(v_|(play_album.+-))(?P<textid>.+?)\.(html|swf)'
+    IE_NAME = u'56.com'
+    _TEST = {
+        u'url': u'http://www.56.com/u39/v_OTM0NDA3MTY.html',
+        u'file': u'93440716.mp4',
+        u'md5': u'9dc07b5c8e978112a6441f9e75d2b59e',
+        u'info_dict': {
+            u'title': u'网事知多少 第32期:车怒',
+        },
+    }
+    def _real_extract(self, url):
+        """Look up the JSON metadata for the text id found in ``url``.
+
+        Returns an info dict whose 'url' is the entry of info['rfiles']
+        with the largest integer 'filesize'.
+        """
+        # _VALID_URL is a plain (non-verbose) pattern, so it is matched
+        # without any regex flags, like in the other extractors.
+        mobj = re.match(self._VALID_URL, url)
+        text_id = mobj.group('textid')
+        info_page = self._download_webpage('http://vxml.56.com/json/%s/' % text_id,
+                                           text_id, u'Downloading video info')
+        info = json.loads(info_page)['info']
+        # Sort ascending by file size and keep the last (largest) entry
+        best_format = sorted(info['rfiles'], key=lambda f: int(f['filesize']))[-1]
+        video_url = best_format['url']
+        return {'id': info['vid'],
+                'title': info['Subject'],
+                'url': video_url,
+                'ext': determine_ext(video_url),
+                # Prefer 'bimg' and fall back to 'img' when it is missing or empty
+                'thumbnail': info.get('bimg') or info.get('img'),
+                }
--- /dev/null
+import re
+import xml.etree.ElementTree
+from .common import InfoExtractor
+from ..utils import unified_strdate
+class CanalplusIE(InfoExtractor):
+    """Information extractor for canalplus.fr pages whose URL carries a ``vid`` parameter."""
+    _VALID_URL = r'https?://www\.canalplus\.fr/.*?\?vid=(?P<id>\d+)'
+    _VIDEO_INFO_TEMPLATE = 'http://service.canal-plus.com/video/rest/getVideosLiees/cplus/%s'
+    IE_NAME = u'canalplus.fr'
+    _TEST = {
+        u'url': u'http://www.canalplus.fr/c-divertissement/pid3351-c-le-petit-journal.html?vid=889861',
+        u'file': u'889861.flv',
+        u'md5': u'590a888158b5f0d6832f84001fbf3e99',
+        u'info_dict': {
+            u'title': u'Le Petit Journal 20/06/13 - La guerre des drone',
+            u'upload_date': u'20130620',
+        },
+        u'skip': u'Requires rtmpdump'
+    }
+    def _real_extract(self, url):
+        """Download the info XML for the video id in ``url`` and build the info dict.
+
+        Raises IndexError when no entry of the XML matches the id or when
+        none of the known quality nodes is present.
+        """
+        mobj = re.match(self._VALID_URL, url)
+        video_id = mobj.group('id')
+        info_url = self._VIDEO_INFO_TEMPLATE % video_id
+        info_page = self._download_webpage(info_url, video_id,
+                                           u'Downloading video info')
+        self.report_extraction(video_id)
+        doc = xml.etree.ElementTree.fromstring(info_page.encode('utf-8'))
+        # The document may hold several entries; keep the one whose ID matches
+        video_info = [video for video in doc if video.find('ID').text == video_id][0]
+        infos = video_info.find('INFOS')
+        media = video_info.find('MEDIA')
+        # find() yields None for a quality that is absent from the XML
+        quality_nodes = [media.find('VIDEOS/%s' % quality)
+                         for quality in ['BAS_DEBIT', 'HAUT_DEBIT', 'HD']]
+        # Use the last quality of the list that is actually present
+        video_url = [node.text for node in quality_nodes if node is not None][-1]
+        return {'id': video_id,
+                'title': u'%s - %s' % (infos.find('TITRAGE/TITRE').text,
+                                       infos.find('TITRAGE/SOUS_TITRE').text),
+                'url': video_url,
+                'ext': 'flv',
+                'upload_date': unified_strdate(infos.find('PUBLICATION/DATE').text),
+                'thumbnail': media.find('IMAGES/GRAND').text,
+                }
import re
-import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_request,
class CollegeHumorIE(InfoExtractor):
- _WORKING = False
- _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/video/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
+ _VALID_URL = r'^(?:https?://)?(?:www\.)?collegehumor\.com/(video|embed|e)/(?P<videoid>[0-9]+)/(?P<shorttitle>.*)$'
- def report_manifest(self, video_id):
- """Report information extraction."""
- self.to_screen(u'%s: Downloading XML manifest' % video_id)
+ _TEST = {
+ u'url': u'http://www.collegehumor.com/video/6902724/comic-con-cosplay-catastrophe',
+ u'file': u'6902724.mp4',
+ u'md5': u'1264c12ad95dca142a9f0bf7968105a0',
+ u'info_dict': {
+ u'title': u'Comic-Con Cosplay Catastrophe',
+ u'description': u'Fans get creative this year at San Diego. Too creative. And yes, that\'s really Joss Whedon.',
+ },
+ }
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id
- try:
- metaXml = compat_urllib_request.urlopen(xmlUrl).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+ metaXml = self._download_webpage(xmlUrl, video_id,
+ u'Downloading info XML',
+ u'Unable to download video info XML')
mdoc = xml.etree.ElementTree.fromstring(metaXml)
videoNode = mdoc.findall('./video')[0]
+ youtubeIdNode = videoNode.find('./youtubeID')
+ if youtubeIdNode is not None:
+ return self.url_result(youtubeIdNode.text, 'Youtube')
info['description'] = videoNode.findall('./description')[0].text
info['title'] = videoNode.findall('./caption')[0].text
info['thumbnail'] = videoNode.findall('./thumbnail')[0].text
raise ExtractorError(u'Invalid metadata XML file')
manifest_url += '?hdcore=2.10.3'
- self.report_manifest(video_id)
- try:
- manifestXml = compat_urllib_request.urlopen(manifest_url).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video info XML: %s' % compat_str(err))
+ manifestXml = self._download_webpage(manifest_url, video_id,
+ u'Downloading XML manifest',
+ u'Unable to download video info XML')
adoc = xml.etree.ElementTree.fromstring(manifestXml)
except IndexError as err:
raise ExtractorError(u'Invalid manifest file')
- url_pr = compat_urllib_parse_urlparse(manifest_url)
- url = url_pr.scheme + '://' + url_pr.netloc + '/z' + video_id[:-2] + '/' + node_id + 'Seg1-Frag1'
+ url_pr = compat_urllib_parse_urlparse(info['thumbnail'])
- info['url'] = url
- info['ext'] = 'f4f'
+ info['url'] = url_pr.scheme + '://' + url_pr.netloc + video_id[:-2].replace('.csmil','').replace(',','')
+ info['ext'] = 'mp4'
return [info]
- |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))))
+ |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)))|
+ (?P<interview>
+ extended-interviews/(?P<interID>[0-9]+)/playlist_tds_extended_(?P<interview_title>.*?)/.*?)))
_TEST = {
u'url': u'http://www.thedailyshow.com/watch/thu-december-13-2012/kristen-stewart',
epTitle = mobj.group('cntitle')
dlNewest = False
+ elif mobj.group('interview'):
+ epTitle = mobj.group('interview_title')
+ dlNewest = False
dlNewest = not mobj.group('episode')
if dlNewest:
+ unescapeHTML,
class InfoExtractor(object):
def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None):
""" Returns a tuple (page content as string, URL handle) """
+ # Strip hashes from the URL (#1038)
+ if isinstance(url_or_request, (compat_str, str)):
+ url_or_request = url_or_request.partition('#')[0]
urlh = self._request_webpage(url_or_request, video_id, note, errnote)
content_type = urlh.headers.get('Content-Type', '')
m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type)
self.to_screen(u'Logging in')
#Methods for following #608
- #They set the correct value of the '_type' key
- def video_result(self, video_info):
- """Returns a video"""
- video_info['_type'] = 'video'
- return video_info
def url_result(self, url, ie=None):
"""Returns a url that points to a page that should be processed"""
#TODO: ie should be the class used for getting the info
return (username, password)
+ # Helper functions for extracting OpenGraph info
+ @staticmethod
+ def _og_regex(prop):
+ # Returns a regex source string matching a <meta> tag whose property is
+ # "og:<prop>" (prop is regex-escaped). The content value is captured in
+ # group 1 when double-quoted and in group 2 when single-quoted.
+ return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
+ def _og_search_property(self, prop, html, name=None, **kargs):
+ # Searches html for the OpenGraph property `prop` through _search_regex
+ # (with re.DOTALL) and returns the result passed through unescapeHTML.
+ # Extra keyword arguments are forwarded to _search_regex unchanged.
+ if name is None:
+ # Default display name handed to _search_regex
+ name = 'OpenGraph %s' % prop
+ escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
+ return unescapeHTML(escaped)
+ def _og_search_thumbnail(self, html, **kargs):
+ # Looks up the og:image property under the name u'thumbnail url'.
+ # fatal=False is passed on - presumably a missing tag is then not an error; confirm in _search_regex.
+ return self._og_search_property('image', html, u'thumbnail url', fatal=False, **kargs)
+ def _og_search_description(self, html, **kargs):
+ # Looks up the og:description property with fatal=False
+ # (presumably non-fatal when the tag is missing; confirm in _search_regex).
+ return self._og_search_property('description', html, fatal=False, **kargs)
+ def _og_search_title(self, html, **kargs):
+ # Looks up the og:title property; unlike the thumbnail and description
+ # helpers no fatal=False is passed here.
+ return self._og_search_property('title', html, **kargs)
+ def _og_search_video_url(self, html, name='video url', **kargs):
+ # Tries the og:video:secure_url pattern first, then og:video, by handing
+ # both regexes to _html_search_regex. This helper does not go through
+ # _og_search_property, so no re.DOTALL flag is added here.
+ return self._html_search_regex([self._og_regex('video:secure_url'),
+ self._og_regex('video')],
+ html, name, **kargs)
class SearchInfoExtractor(InfoExtractor):
Base class for paged search queries extractors.
--- /dev/null
+# coding: utf-8
+import re
+import json
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_parse,
+ orderedSet,
+ compat_urllib_parse_urlparse,
+ compat_urlparse,
class CondeNastIE(InfoExtractor):
    """
    Condé Nast is a media group, some of its sites use a custom HTML5 player
    that works the same in all of them.

    Single video pages are returned as one info dict; series pages are
    returned as a playlist of url results handled by this same extractor.
    """

    # The keys are the supported sites and the values are the name to be shown
    # to the user and in the extractor description.
    _SITES = {
        'wired': u'WIRED',
        'gq': u'GQ',
        'vogue': u'Vogue',
        'glamour': u'Glamour',
        'wmagazine': u'W Magazine',
        'vanityfair': u'Vanity Fair',
    }

    # The host name dots are escaped so that they only match a literal '.'
    _VALID_URL = r'http://(video|www)\.(?P<site>%s)\.com/(?P<type>watch|series|video)/(?P<id>.+)' % '|'.join(_SITES.keys())
    IE_DESC = u'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))

    _TEST = {
        u'url': u'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
        u'file': u'5171b343c2b4c00dd0c1ccb3.mp4',
        u'md5': u'1921f713ed48aabd715691f774c451f7',
        u'info_dict': {
            u'title': u'3D Printed Speakers Lit With LED',
            u'description': u'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.',
        }
    }

    def _extract_series(self, url, webpage):
        """Return a playlist with one url result per video linked from a series page."""
        title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>',
                                        webpage, u'series title', flags=re.DOTALL)
        url_object = compat_urllib_parse_urlparse(url)
        base_url = '%s://%s' % (url_object.scheme, url_object.netloc)
        m_paths = re.finditer(r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]',
                              webpage, flags=re.DOTALL)
        # orderedSet drops repeated links to the same video
        paths = orderedSet(m.group(1) for m in m_paths)
        build_url = lambda path: compat_urlparse.urljoin(base_url, path)
        entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
        return self.playlist_result(entries, playlist_title=title)

    def _extract_video(self, webpage):
        """Return the info dict for the video embedded in a watch page."""
        description = self._html_search_regex([r'<div class="cne-video-description">(.+?)</div>',
                                               r'<div class="video-post-content">(.+?)</div>',
                                               ],
                                              webpage, u'description',
                                              fatal=False, flags=re.DOTALL)
        params = self._search_regex(r'var params = {(.+?)}[;,]', webpage,
                                    u'player params', flags=re.DOTALL)
        video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, u'video id')
        player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, u'player id')
        target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, u'target')
        data = compat_urllib_parse.urlencode({'videoId': video_id,
                                              'playerId': player_id,
                                              'target': target,
                                              })
        # Fall back to the default loader url when the page does not set one
        base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]',
                                           webpage, u'base info url',
                                           default='http://player.cnevids.com/player/loader.js?')
        info_url = base_info_url + data
        info_page = self._download_webpage(info_url, video_id,
                                           u'Downloading video info')
        video_info = self._search_regex(r'var video = ({.+?});', info_page, u'video info')
        video_info = json.loads(video_info)

        def _formats_sort_key(f):
            # 'high' quality ranks first, mp4 breaks ties
            type_ord = 1 if f['type'] == 'video/mp4' else 0
            quality_ord = 1 if f['quality'] == 'high' else 0
            return (quality_ord, type_ord)
        best_format = sorted(video_info['sources'][0], key=_formats_sort_key)[-1]

        return {'id': video_id,
                'url': best_format['src'],
                'ext': best_format['type'].split('/')[-1],
                'title': video_info['title'],
                'thumbnail': video_info['poster_frame'],
                'description': description,
                }

    def _real_extract(self, url):
        """Dispatch to the series or the video extraction depending on the url type."""
        mobj = re.match(self._VALID_URL, url)
        site = mobj.group('site')
        url_type = mobj.group('type')
        # not named 'id' to avoid shadowing the builtin
        item_id = mobj.group('id')

        self.to_screen(u'Extracting from %s with the Condé Nast extractor' % self._SITES[site])
        webpage = self._download_webpage(url, item_id)

        if url_type == 'series':
            return self._extract_series(url, webpage)
        else:
            return self._extract_video(webpage)
--- /dev/null
+# -*- coding: utf-8 -*-
+import re
+from .common import InfoExtractor
+from ..utils import determine_ext
class CriterionIE(InfoExtractor):
    """Extractor for the film pages of criterion.com."""

    # \d+ instead of \d*: a film page always carries a numeric id
    _VALID_URL = r'https?://www\.criterion\.com/films/(\d+)-.+'
    _TEST = {
        u'url': u'http://www.criterion.com/films/184-le-samourai',
        u'file': u'184.mp4',
        u'md5': u'bc51beba55685509883a9a7830919ec3',
        u'info_dict': {
            u"title": u"Le Samouraï",
            u"description" : u'md5:a2b4b116326558149bef81f76dcbb93f',
        }
    }

    def _real_extract(self, url):
        """Return the info dict of the video embedded in a film page.

        The video url and the title are required; the description and the
        thumbnail are searched non-fatally so that missing metadata does not
        abort the extraction.
        """
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group(1)
        webpage = self._download_webpage(url, video_id)

        final_url = self._search_regex(r'so\.addVariable\("videoURL", "(.+?)"\)\;',
                                       webpage, 'video url')
        title = self._html_search_regex(r'<meta content="(.+?)" property="og:title" />',
                                        webpage, 'video title')
        description = self._html_search_regex(r'<meta name="description" content="(.+?)" />',
                                              webpage, 'video description', fatal=False)
        thumbnail = self._search_regex(r'so\.addVariable\("thumbnailURL", "(.+?)"\)\;',
                                       webpage, 'thumbnail url', fatal=False)

        return {'id': video_id,
                'url' : final_url,
                'title': title,
                'ext': determine_ext(final_url),
                'description': description,
                'thumbnail': thumbnail,
                }
description = self._html_search_regex(r'<meta (?:property="og:|name=")description" content="(.*?)"',
webpage, 'description',
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.*?)"',
- webpage, 'thumbnail')
url = self._search_regex(r'<string name="URL">(.*?)</string>',
video_info, 'video url')
'url': url,
'play_path': path,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnail': self._og_search_thumbnail(webpage),
import re
import json
+import itertools
from .common import InfoExtractor
from ..utils import (
+ get_element_by_attribute,
+ get_element_by_id,
# Extract URL, uploader and title from webpage
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.*?)" />',
- webpage, 'title')
video_uploader = self._search_regex([r'(?im)<span class="owner[^\"]+?">[^<]+?<a [^>]+?>([^<]+?)</a>',
# Looking for official user
r'<(?:span|a) .*?rel="author".*?>([^<]+?)</'],
'url': video_url,
'uploader': video_uploader,
'upload_date': video_upload_date,
- 'title': video_title,
+ 'title': self._og_search_title(webpage),
'ext': video_extension,
'thumbnail': info['thumbnail_url']
class DailymotionPlaylistIE(InfoExtractor):
    """Extractor for Dailymotion playlists; every listed video becomes a url result."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?dailymotion\.[a-z]{2,3}/playlist/(?P<id>.+?)/'
    _MORE_PAGES_INDICATOR = r'<div class="next">.*?<a.*?href="/playlist/.+?".*?>.*?</a>.*?</div>'

    def _real_extract(self, url):
        """Walk the playlist pages until no "next" link is found and build the playlist."""
        playlist_id = re.match(self._VALID_URL, url).group('id')
        collected_ids = []

        page_numbers = itertools.count(1)
        while True:
            pagenum = next(page_numbers)
            page_url = 'https://www.dailymotion.com/playlist/%s/%s' % (playlist_id, pagenum)
            webpage = self._download_webpage(page_url, playlist_id,
                                             u'Downloading page %s' % pagenum)
            listing = get_element_by_attribute(u'class', u'video_list', webpage)
            collected_ids += re.findall(r'data-id="(.+?)" data-ext-id', listing)
            has_next_page = re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is not None
            if not has_next_page:
                break

        entries = []
        for video_id in collected_ids:
            video_url = 'http://www.dailymotion.com/video/%s' % video_id
            entries.append(self.url_result(video_url, 'Dailymotion'))

        # The title is read from the last page that was downloaded
        playlist_title = get_element_by_id(u'playlist_name', webpage)
        return {
            '_type': 'playlist',
            'id': playlist_id,
            'title': playlist_title,
            'entries': entries,
        }
info = {
+ '_type': 'video',
'id': video_id,
'title': video_title,
'formats': formats,
info['url'] = formats[-1]['url']
info['ext'] = determine_ext(formats[-1]['url'])
- return self.video_result(info)
\ No newline at end of file
+ return info
\ No newline at end of file
--- /dev/null
+import re
+from ..utils import (
+ compat_urllib_parse,
+ determine_ext
+from .common import InfoExtractor
class EHowIE(InfoExtractor):
    """Extractor for the videos of ehow.com."""

    IE_NAME = u'eHow'
    _VALID_URL = r'(?:https?://)?(?:www\.)?ehow\.com/[^/_?]*_(?P<id>[0-9]+)'
    _TEST = {
        u'url': u'http://www.ehow.com/video_12245069_hardwood-flooring-basics.html',
        u'file': u'12245069.flv',
        u'md5': u'9809b4e3f115ae2088440bcb4efbf371',
        u'info_dict': {
            u"title": u"Hardwood Flooring Basics",
            u"description": u"Hardwood flooring may be time consuming, but its ultimately a pretty straightforward concept. Learn about hardwood flooring basics with help from a hardware flooring business owner in this free video...",
            u"uploader": u"Erick Nathan"
        }
    }

    def _real_extract(self, url):
        """Return the info dict of the video found on an eHow page."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        # The media url is percent-encoded inside a file=/source= parameter
        quoted_url = self._search_regex(r'(?:file|source)=(http[^\'"&]*)',
                                        webpage, u'video URL')
        final_url = compat_urllib_parse.unquote(quoted_url)
        uploader = self._search_regex(r'<meta name="uploader" content="(.+?)" />',
                                      webpage, u'uploader')
        og_title = self._og_search_title(webpage)
        title = og_title.replace(' | eHow', '')

        info = {
            '_type': 'video',
            'id': video_id,
            'url': final_url,
            'ext': determine_ext(final_url),
            'title': title,
        }
        info['thumbnail'] = self._og_search_thumbnail(webpage)
        info['description'] = self._og_search_description(webpage)
        info['uploader'] = uploader
        return info
videoDesc = self._html_search_regex('<meta name="description" content="([^"]*)"',
webpage, u'description', fatal=False)
- imgUrl = self._html_search_regex('<meta property="og:image" content="([^"]*)"',
- webpage, u'thumbnail', fatal=False)
- playerUrl = self._html_search_regex('<meta property="og:video" content="([^"]*)"',
- webpage, u'player url')
+ playerUrl = self._og_search_video_url(webpage, name='player url')
title = self._html_search_regex('<meta name="title" content="([^"]*)"',
webpage, u'player url').split(' : ')[-1]
'upload_date': None,
'title': title,
'ext': 'mp4',
- 'thumbnail': imgUrl,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'description': videoDesc,
'player_url': playerUrl,
--- /dev/null
+import re
+import json
+from .common import InfoExtractor
class ExfmIE(InfoExtractor):
    """Extractor for the songs of ex.fm.

    Songs whose stream is hosted by Soundcloud are delegated to the
    Soundcloud extractor; the rest are returned directly.
    """

    IE_NAME = u'exfm'
    IE_DESC = u'ex.fm'
    _VALID_URL = r'(?:http://)?(?:www\.)?ex\.fm/song/([^/]+)'
    # Every dot of the host name is escaped so it only matches a literal '.'
    _SOUNDCLOUD_URL = r'(?:http://)?(?:www\.)?api\.soundcloud\.com/tracks/([^/]+)/stream'
    _TESTS = [
        {
            u'url': u'http://ex.fm/song/1bgtzg',
            u'file': u'95223130.mp3',
            u'md5': u'8a7967a3fef10e59a1d6f86240fd41cf',
            u'info_dict': {
                u"title": u"We Can't Stop - Miley Cyrus",
                u"uploader": u"Miley Cyrus",
                u'upload_date': u'20130603',
                u'description': u'Download "We Can\'t Stop" \r\niTunes: http://smarturl.it/WeCantStop?IQid=SC\r\nAmazon: http://smarturl.it/WeCantStopAMZ?IQid=SC',
            },
            u'note': u'Soundcloud song',
        },
        {
            u'url': u'http://ex.fm/song/wddt8',
            u'file': u'wddt8.mp3',
            u'md5': u'966bd70741ac5b8570d8e45bfaed3643',
            u'info_dict': {
                u'title': u'Safe and Sound',
                u'uploader': u'Capital Cities',
            },
        },
    ]

    def _real_extract(self, url):
        """Query the song API and return either a Soundcloud url result or the song info."""
        mobj = re.match(self._VALID_URL, url)
        song_id = mobj.group(1)
        info_url = "http://ex.fm/api/v3/song/%s" %(song_id)
        webpage = self._download_webpage(info_url, song_id)
        info = json.loads(webpage)
        song_url = info['song']['url']
        if re.match(self._SOUNDCLOUD_URL, song_url) is not None:
            self.to_screen('Soundcloud song detected')
            # Drop the '/stream' suffix before handing the url to the Soundcloud extractor
            return self.url_result(song_url.replace('/stream',''), 'Soundcloud')
        return [{
            'id': song_id,
            'url': song_url,
            'ext': 'mp3',
            'title': info['song']['title'],
            'thumbnail': info['song']['image']['large'],
            'uploader': info['song']['artist'],
            'view_count': info['song']['loved_count'],
        }]
raise ExtractorError(u'Unable to extract video url')
video_url = mobj.group(1) + unescapeHTML(mobj.group(2))
- video_title = self._html_search_regex(r'<meta property="og:title" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'video title')
- video_description = self._html_search_regex(r'<meta property="og:description" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'description', fatal=False)
- thumbnail = self._html_search_regex(r'<meta property="og:image" content=(?:"([^"]+)"|\'([^\']+)\')',
- webpage, u'thumbnail', fatal=False)
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'description': video_description,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id': video_uploader_id,
--- /dev/null
+import re
+from .common import InfoExtractor
+from ..utils import determine_ext
class FreesoundIE(InfoExtractor):
    """Extractor for the sounds of freesound.org."""

    _VALID_URL = r'(?:https?://)?(?:www\.)?freesound\.org/people/([^/]+)/sounds/(?P<id>[^/]+)'
    _TEST = {
        u'url': u'http://www.freesound.org/people/miklovan/sounds/194503/',
        u'file': u'194503.mp3',
        u'md5': u'12280ceb42c81f19a515c745eae07650',
        u'info_dict': {
            u"title": u"gulls in the city.wav",
            u"uploader" : u"miklovan",
            u'description': u'the sounds of seagulls in the city',
        }
    }

    def _real_extract(self, url):
        """Return a one element list with the info dict of the sound."""
        music_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, music_id)

        title = self._html_search_regex(
            r'<div id="single_sample_header">.*?<a href="#">(.+?)</a>',
            webpage, 'music title', flags=re.DOTALL)
        music_url = self._og_search_property('audio', webpage, 'music url')
        description = self._html_search_regex(
            r'<div id="sound_description">(.*?)</div>',
            webpage, 'description', fatal=False, flags=re.DOTALL)
        uploader = self._og_search_property('audio:artist', webpage, 'music uploader')

        sound_info = {
            'id': music_id,
            'title': title,
            'url': music_url,
            'uploader': uploader,
            'ext': determine_ext(music_url),
            'description': description,
        }
        return [sound_info]
title = self._html_search_regex((r"<h1 class='player_page_h1'.*?>(?P<title>.*?)</h1>",
r'<title>(?P<title>[^<]+?)</title>'), webpage, 'title', flags=re.DOTALL)
- video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
- webpage, u'description', fatal=False, flags=re.DOTALL)
info = {
'id': video_id,
'url': video_url,
'ext': 'mp4',
'title': title,
- 'description': video_description,
+ 'description': self._og_search_description(webpage),
return [info]
import re
-from .common import InfoExtractor
-from ..utils import (
- compat_urllib_parse,
+from .mtv import MTVIE, _media_xml_tag
- ExtractorError,
-class GametrailersIE(InfoExtractor):
+class GametrailersIE(MTVIE):
+ """
+ Gametrailers use the same videos system as MTVIE, it just changes the feed
+ url, where the uri is and the method to get the thumbnails.
+ """
_VALID_URL = r'http://www.gametrailers.com/(?P<type>videos|reviews|full-episodes)/(?P<id>.*?)/(?P<title>.*)'
_TEST = {
u'url': u'http://www.gametrailers.com/videos/zbvr8i/mirror-s-edge-2-e3-2013--debut-trailer',
- u'file': u'zbvr8i.flv',
- u'md5': u'c3edbc995ab4081976e16779bd96a878',
+ u'file': u'70e9a5d7-cf25-4a10-9104-6f3e7342ae0d.mp4',
+ u'md5': u'4c8e67681a0ea7ec241e8c09b3ea8cf7',
u'info_dict': {
- u"title": u"E3 2013: Debut Trailer"
+ u'title': u'E3 2013: Debut Trailer',
+ u'description': u'Faith is back! Check out the World Premiere trailer for Mirror\'s Edge 2 straight from the EA Press Conference at E3 2013!',
- u'skip': u'Requires rtmpdump'
+ # Overwrite MTVIE properties we don't want
+ _TESTS = []
+ _FEED_URL = 'http://www.gametrailers.com/feeds/mrss'
+ def _get_thumbnail_url(self, uri, itemdoc):
+ search_path = '%s/%s' % (_media_xml_tag('group'), _media_xml_tag('thumbnail'))
+ return itemdoc.find(search_path).attrib['url']
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
video_id = mobj.group('id')
- video_type = mobj.group('type')
webpage = self._download_webpage(url, video_id)
- if video_type == 'full-episodes':
- mgid_re = r'data-video="(?P<mgid>mgid:.*?)"'
- else:
- mgid_re = r'data-contentId=\'(?P<mgid>mgid:.*?)\''
- mgid = self._search_regex(mgid_re, webpage, u'mgid')
- data = compat_urllib_parse.urlencode({'uri': mgid, 'acceptMethods': 'fms'})
- info_page = self._download_webpage('http://www.gametrailers.com/feeds/mrss?' + data,
- video_id, u'Downloading video info')
- links_webpage = self._download_webpage('http://www.gametrailers.com/feeds/mediagen/?' + data,
- video_id, u'Downloading video urls info')
- self.report_extraction(video_id)
- info_re = r'''<title><!\[CDATA\[(?P<title>.*?)\]\]></title>.*
- <description><!\[CDATA\[(?P<description>.*?)\]\]></description>.*
- <image>.*
- <url>(?P<thumb>.*?)</url>.*
- </image>'''
- m_info = re.search(info_re, info_page, re.VERBOSE|re.DOTALL)
- if m_info is None:
- raise ExtractorError(u'Unable to extract video info')
- video_title = m_info.group('title')
- video_description = m_info.group('description')
- video_thumb = m_info.group('thumb')
- m_urls = list(re.finditer(r'<src>(?P<url>.*)</src>', links_webpage))
- if m_urls is None or len(m_urls) == 0:
- raise ExtractorError(u'Unable to extract video url')
- # They are sorted from worst to best quality
- video_url = m_urls[-1].group('url')
- return {'url': video_url,
- 'id': video_id,
- 'title': video_title,
- # Videos are actually flv not mp4
- 'ext': 'flv',
- 'thumbnail': video_thumb,
- 'description': video_description,
- }
+ mgid = self._search_regex([r'data-video="(?P<mgid>mgid:.*?)"',
+ r'data-contentId=\'(?P<mgid>mgid:.*?)\''],
+ webpage, u'mgid')
+ return self._get_videos_info(mgid)
+# encoding: utf-8
import os
import re
+from .brightcove import BrightcoveIE
class GenericIE(InfoExtractor):
IE_DESC = u'Generic downloader that works on some sites'
_VALID_URL = r'.*'
IE_NAME = u'generic'
- _TEST = {
- u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
- u'file': u'13601338388002.mp4',
- u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
- u'info_dict': {
- u"uploader": u"www.hodiho.fr",
- u"title": u"R\u00e9gis plante sa Jeep"
- }
- }
+ _TESTS = [
+ {
+ u'url': u'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html',
+ u'file': u'13601338388002.mp4',
+ u'md5': u'85b90ccc9d73b4acd9138d3af4c27f89',
+ u'info_dict': {
+ u"uploader": u"www.hodiho.fr",
+ u"title": u"R\u00e9gis plante sa Jeep"
+ }
+ },
+ {
+ u'url': u'http://www.8tv.cat/8aldia/videos/xavier-sala-i-martin-aquesta-tarda-a-8-al-dia/',
+ u'file': u'2371591881001.mp4',
+ u'md5': u'9e80619e0a94663f0bdc849b4566af19',
+ u'note': u'Test Brightcove downloads and detection in GenericIE',
+ u'info_dict': {
+ u'title': u'Xavier Sala i Martín: “Un banc que no presta és un banc zombi que no serveix per a res”',
+ u'uploader': u'8TV',
+ u'description': u'md5:a950cc4285c43e44d763d036710cd9cd',
+ }
+ },
+ ]
def report_download_webpage(self, video_id):
"""Report webpage download."""
raise ExtractorError(u'Invalid URL: %s' % url)
+ # Look for Brightcove:
+ m_brightcove = re.search(r'<object.+?class=([\'"]).*?BrightcoveExperience.*?\1.+?</object>', webpage, re.DOTALL)
+ if m_brightcove is not None:
+ self.to_screen(u'Brightcove video detected.')
+ bc_url = BrightcoveIE._build_brighcove_url(m_brightcove.group())
+ return self.url_result(bc_url, 'Brightcove')
# Start with something easy: JW Player in SWFObject
mobj = re.search(r'flashvars: [\'"](?:.*&)?file=(http[^\'"&]*)', webpage)
if mobj is None:
video_title = self._html_search_regex(r"<title>(.*)</title>",
webpage_src, u'title')
- # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(r'"og:image" content="(.*)"',
- webpage_src, u'thumbnail', fatal=False)
results = [{
'id': video_id,
'url' : video_url,
'title' : video_title,
- 'thumbnail' : thumbnail,
+ 'thumbnail' : self._og_search_thumbnail(webpage_src),
'ext' : 'mp3',
- return results
\ No newline at end of file
+ return results
--- /dev/null
+import re
+import json
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
class IGNIE(InfoExtractor):
    """
    Extractor for some of the IGN sites, like www.ign.com, es.ign.com de.ign.com.
    Some videos of it.ign.com are also supported
    """

    _VALID_URL = r'https?://.+?\.ign\.com/(?:videos|show_videos)(/.+)?/(?P<name_or_id>.+)'
    IE_NAME = u'ign.com'

    _CONFIG_URL_TEMPLATE = 'http://www.ign.com/videos/configs/id/%s.config'
    _DESCRIPTION_RE = [
        r'<span class="page-object-description">(.+?)</span>',
        r'id="my_show_video">.*?<p>(.*?)</p>',
    ]

    _TEST = {
        u'url': u'http://www.ign.com/videos/2013/06/05/the-last-of-us-review',
        u'file': u'8f862beef863986b2785559b9e1aa599.mp4',
        u'md5': u'eac8bdc1890980122c3b66f14bdd02e9',
        u'info_dict': {
            u'title': u'The Last of Us Review',
            u'description': u'md5:c8946d4260a4d43a00d5ae8ed998870c',
        }
    }

    def _find_video_id(self, webpage):
        """Search the page for the video id using the known markup variants."""
        id_patterns = [
            r'data-video-id="(.+?)"',
            r'<object id="vid_(.+?)"',
            r'<meta name="og:image" content=".*/(.+?)-(.+?)/.+.jpg"',
        ]
        return self._search_regex(id_patterns, webpage, 'video id')

    def _real_extract(self, url):
        """Return the video info from the config file plus the page description."""
        name_or_id = re.match(self._VALID_URL, url).group('name_or_id')
        webpage = self._download_webpage(url, name_or_id)
        result = self._get_video_info(self._find_video_id(webpage))
        result['description'] = self._html_search_regex(
            self._DESCRIPTION_RE, webpage, 'video description', flags=re.DOTALL)
        return result

    def _get_video_info(self, video_id):
        """Download the JSON config of video_id and build the info dict from it."""
        config_url = self._CONFIG_URL_TEMPLATE % video_id
        raw_config = self._download_webpage(config_url, video_id,
                                            u'Downloading video info')
        media = json.loads(raw_config)['playlist']['media']
        video_url = media['url']

        info = {
            'id': media['metadata']['videoId'],
            'url': video_url,
            'ext': determine_ext(video_url),
            'title': media['metadata']['title'],
            'thumbnail': media['poster'][0]['url'].replace('{size}', 'grande'),
        }
        return info
class OneUPIE(IGNIE):
    """Extractor for 1up.com, it uses the ign videos system."""

    # The host name dots are escaped so that they only match a literal '.'
    _VALID_URL = r'https?://gamevideos\.1up\.com/video/id/(?P<name_or_id>.+)'
    IE_NAME = '1up.com'

    _DESCRIPTION_RE = r'<div id="vid_summary">(.+?)</div>'

    _TEST = {
        u'url': u'http://gamevideos.1up.com/video/id/34976',
        u'file': u'34976.mp4',
        u'md5': u'68a54ce4ebc772e4b71e3123d413163d',
        u'info_dict': {
            u'title': u'Sniper Elite V2 - Trailer',
            u'description': u'md5:5d289b722f5a6d940ca3136e9dae89cf',
        }
    }

    def _real_extract(self, url):
        """Run the IGN extraction and replace the id with the one from the url."""
        mobj = re.match(self._VALID_URL, url)
        # not named 'id' to avoid shadowing the builtin
        video_id = mobj.group('name_or_id')
        result = super(OneUPIE, self)._real_extract(url)
        result['id'] = video_id
        return result
class InaIE(InfoExtractor):
"""Information Extractor for Ina.fr"""
- _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I[0-9]+)/.*'
+ _VALID_URL = r'(?:http://)?(?:www\.)?ina\.fr/video/(?P<id>I?[A-F0-9]+)/.*'
_TEST = {
u'url': u'www.ina.fr/video/I12055569/francois-hollande-je-crois-que-c-est-clair-video.html',
u'file': u'I12055569.mp4',
class InstagramIE(InfoExtractor):
_VALID_URL = r'(?:http://)?instagram.com/p/(.*?)/'
_TEST = {
- u'url': u'http://instagram.com/p/aye83DjauH/#',
+ u'url': u'http://instagram.com/p/aye83DjauH/?foo=bar#abc',
u'file': u'aye83DjauH.mp4',
u'md5': u'0d2da106a9d2631273e192b372806516',
u'info_dict': {
u"uploader_id": u"naomipq",
- u"title": u"Video by naomipq"
+ u"title": u"Video by naomipq",
+ u'description': u'md5:1f17f0ab29bd6fe2bfad705f58de3cb8',
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'<meta property="og:video" content="(.+?)"',
- webpage, u'video URL')
- thumbnail_url = self._html_search_regex(
- r'<meta property="og:image" content="(.+?)" />',
- webpage, u'thumbnail URL', fatal=False)
- html_title = self._html_search_regex(
- r'<title>(.+?)</title>',
- webpage, u'title', flags=re.DOTALL)
- title = re.sub(u'(?: *\(Videos?\))? \u2022 Instagram$', '', html_title).strip()
- uploader_id = self._html_search_regex(r'content="(.*?)\'s video on Instagram',
- webpage, u'uploader name', fatal=False)
- ext = 'mp4'
+ uploader_id = self._search_regex(r'"owner":{"username":"(.+?)"',
+ webpage, u'uploader id', fatal=False)
+ desc = self._search_regex(r'"caption":"(.*?)"', webpage, u'description',
+ fatal=False)
return [{
'id': video_id,
- 'url': video_url,
- 'ext': ext,
- 'title': title,
- 'thumbnail': thumbnail_url,
- 'uploader_id' : uploader_id
+ 'url': self._og_search_video_url(webpage),
+ 'ext': 'mp4',
+ 'title': u'Video by %s' % uploader_id,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'uploader_id' : uploader_id,
+ 'description': desc,
--- /dev/null
+import re
+from .common import InfoExtractor
+from ..utils import determine_ext
class KankanIE(InfoExtractor):
    """Extractor for the videos of kankan.com."""

    _VALID_URL = r'https?://(?:.*?\.)?kankan\.com/.+?/(?P<id>\d+)\.shtml'
    _TEST = {
        u'url': u'http://yinyue.kankan.com/vod/48/48863.shtml',
        u'file': u'48863.flv',
        u'md5': u'29aca1e47ae68fc28804aca89f29507e',
        u'info_dict': {
            u'title': u'Ready To Go',
        },
    }

    def _real_extract(self, url):
        """Resolve the gcid found in the page into the final video url."""
        video_id = re.match(self._VALID_URL, url).group('id')
        webpage = self._download_webpage(url, video_id)

        title = self._search_regex(r'G_TITLE=[\'"](.+?)[\'"]', webpage, u'video title')
        gcid = self._search_regex(r'lurl:[\'"]http://.+?/.+?/(.+?)/', webpage, u'gcid')

        resource_url = 'http://p2s.cl.kankan.com/getCdnresource_flv?gcid=%s' % gcid
        video_info_page = self._download_webpage(resource_url, video_id,
                                                 u'Downloading video url info')
        # The video url is assembled from the ip and path fields of the response
        host = self._search_regex(r'ip:"(.+?)"', video_info_page, u'video url ip')
        location = self._search_regex(r'path:"(.+?)"', video_info_page, u'video url path')
        video_url = 'http://%s%s' % (host, location)

        info = {
            'id': video_id,
            'title': title,
            'url': video_url,
            'ext': determine_ext(video_url),
        }
        return info
class KeekIE(InfoExtractor):
- _VALID_URL = r'http://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
+ _VALID_URL = r'https?://(?:www\.)?keek\.com/(?:!|\w+/keeks/)(?P<videoID>\w+)'
IE_NAME = u'keek'
_TEST = {
- u'url': u'http://www.keek.com/ytdl/keeks/NODfbab',
+ u'url': u'https://www.keek.com/ytdl/keeks/NODfbab',
u'file': u'NODfbab.mp4',
u'md5': u'9b0636f8c0f7614afa4ea5e4c6e57e83',
u'info_dict': {
thumbnail = u'http://cdn.keek.com/keek/thumbnail/%s/w100/h75' % video_id
webpage = self._download_webpage(url, video_id)
- video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title')
+ video_title = self._og_search_title(webpage)
uploader = self._html_search_regex(r'<div class="user-name-and-bio">[\S\s]+?<h2>(?P<uploader>.+?)</h2>',
webpage, u'uploader', fatal=False)
video_url = self._search_regex(r'file: "(.*?)",',
webpage, u'video URL')
- video_title = self._html_search_regex(r'<meta property="og:title" content="(?P<title>.*?)"',
- webpage, u'title').replace('LiveLeak.com -', '').strip()
+ video_title = self._og_search_title(webpage).replace('LiveLeak.com -', '').strip()
- video_description = self._html_search_regex(r'<meta property="og:description" content="(?P<desc>.*?)"',
- webpage, u'description', fatal=False)
+ video_description = self._og_search_description(webpage)
video_uploader = self._html_search_regex(r'By:.*?(\w+)</a>',
webpage, u'uploader', fatal=False)
--- /dev/null
+import re
+import json
+from .common import InfoExtractor
+from ..utils import compat_urllib_parse_urlparse, compat_urlparse
class LivestreamIE(InfoExtractor):
    """Extractor for new.livestream.com.

    A url with a '/videos/<id>' part yields a single video; any other
    matching url is treated as an event page and yields a playlist.
    """

    # The host name dots are escaped so that they only match a literal '.'
    _VALID_URL = r'http://new\.livestream\.com/.*?/(?P<event_name>.*?)(/videos/(?P<id>\d+))?/?$'
    _TEST = {
        u'url': u'http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370',
        u'file': u'4719370.mp4',
        u'md5': u'0d2186e3187d185a04b3cdd02b828836',
        u'info_dict': {
            u'title': u'Live from Webster Hall NYC',
            u'upload_date': u'20121012',
        }
    }

    def _extract_video_info(self, video_data):
        """Build the info dict from the video data of the API, preferring the HD url."""
        video_url = video_data.get('progressive_url_hd') or video_data.get('progressive_url')
        return {'id': video_data['id'],
                'url': video_url,
                'ext': 'mp4',
                'title': video_data['caption'],
                'thumbnail': video_data['thumbnail_url'],
                # Keep the first 8 characters once the dashes are removed
                'upload_date': video_data['updated_at'].replace('-','')[:8],
                }

    def _real_extract(self, url):
        """Return a playlist for an event page or the info dict of a single video."""
        mobj = re.match(self._VALID_URL, url)
        video_id = mobj.group('id')
        event_name = mobj.group('event_name')
        webpage = self._download_webpage(url, video_id or event_name)

        if video_id is None:
            # This is an event page:
            api_url = self._search_regex(r'event_design_eventId: \'(.+?)\'',
                                         webpage, 'api url')
            info = json.loads(self._download_webpage(api_url, event_name,
                                                     u'Downloading event info'))
            # Only the feed items of type 'video' are kept
            videos = [self._extract_video_info(video_data['data'])
                      for video_data in info['feed']['data'] if video_data['type'] == u'video']
            return self.playlist_result(videos, info['id'], info['full_name'])
        else:
            og_video = self._og_search_video_url(webpage, name=u'player url')
            query_str = compat_urllib_parse_urlparse(og_video).query
            query = compat_urlparse.parse_qs(query_str)
            # The API url is the play_url parameter without the '.smil' part
            api_url = query['play_url'][0].replace('.smil', '')
            info = json.loads(self._download_webpage(api_url, video_id,
                                                     u'Downloading video info'))
            return self._extract_video_info(info)
+ determine_ext,
_DISCLAIMER = 'http://www.metacafe.com/family_filter/'
_FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user'
IE_NAME = u'metacafe'
- _TEST = {
+ _TESTS = [{
u"add_ie": ["Youtube"],
u"url": u"http://metacafe.com/watch/yt-_aUehQsCQtM/the_electric_company_short_i_pbs_kids_go/",
u"file": u"_aUehQsCQtM.flv",
u"uploader": u"PBS",
u"uploader_id": u"PBS"
- }
+ },
+ {
+ u"url": u"http://www.metacafe.com/watch/an-dVVXnuY7Jh77J/the_andromeda_strain_1971_stop_the_bomb_part_3/",
+ u"file": u"an-dVVXnuY7Jh77J.mp4",
+ u"info_dict": {
+ u"title": u"The Andromeda Strain (1971): Stop the Bomb Part 3",
+ u"uploader": u"anyclip",
+ u"description": u"md5:38c711dd98f5bb87acf973d573442e67"
+ }
+ }]
def report_disclaimer(self):
return [self.url_result('http://www.youtube.com/watch?v=%s' % mobj2.group(1), 'Youtube')]
# Retrieve video webpage to extract further information
- webpage = self._download_webpage('http://www.metacafe.com/watch/%s/' % video_id, video_id)
+ req = compat_urllib_request.Request('http://www.metacafe.com/watch/%s/' % video_id)
+ req.headers['Cookie'] = 'flashVersion=0;'
+ webpage = self._download_webpage(req, video_id)
# Extract URL, uploader and title from webpage
mobj = re.search(r'(?m)&mediaURL=([^&]+)', webpage)
if mobj is not None:
mediaURL = compat_urllib_parse.unquote(mobj.group(1))
- video_extension = mediaURL[-3:]
+ video_ext = mediaURL[-3:]
# Extract gdaKey if available
mobj = re.search(r'(?m)&gdaKey=(.*?)&', webpage)
gdaKey = mobj.group(1)
video_url = '%s?__gda__=%s' % (mediaURL, gdaKey)
- mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
- vardict = compat_parse_qs(mobj.group(1))
- if 'mediaData' not in vardict:
- raise ExtractorError(u'Unable to extract media URL')
- mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
- if mobj is None:
- raise ExtractorError(u'Unable to extract media URL')
- mediaURL = mobj.group('mediaURL').replace('\\/', '/')
- video_extension = mediaURL[-3:]
- video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
- mobj = re.search(r'(?im)<title>(.*) - Video</title>', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract title')
- video_title = mobj.group(1).decode('utf-8')
- mobj = re.search(r'submitter=(.*?);', webpage)
- if mobj is None:
- raise ExtractorError(u'Unable to extract uploader nickname')
- video_uploader = mobj.group(1)
- return [{
- 'id': video_id.decode('utf-8'),
- 'url': video_url.decode('utf-8'),
- 'uploader': video_uploader.decode('utf-8'),
+ mobj = re.search(r'<video src="([^"]+)"', webpage)
+ if mobj:
+ video_url = mobj.group(1)
+ video_ext = 'mp4'
+ else:
+ mobj = re.search(r' name="flashvars" value="(.*?)"', webpage)
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract media URL')
+ vardict = compat_parse_qs(mobj.group(1))
+ if 'mediaData' not in vardict:
+ raise ExtractorError(u'Unable to extract media URL')
+ mobj = re.search(r'"mediaURL":"(?P<mediaURL>http.*?)",(.*?)"key":"(?P<key>.*?)"', vardict['mediaData'][0])
+ if mobj is None:
+ raise ExtractorError(u'Unable to extract media URL')
+ mediaURL = mobj.group('mediaURL').replace('\\/', '/')
+ video_url = '%s?__gda__=%s' % (mediaURL, mobj.group('key'))
+ video_ext = determine_ext(video_url)
+ video_title = self._html_search_regex(r'(?im)<title>(.*) - Video</title>', webpage, u'title')
+ description = self._og_search_description(webpage)
+ video_uploader = self._html_search_regex(
+ r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("channel","([^"]+)"\);',
+ webpage, u'uploader nickname', fatal=False)
+ return {
+ '_type': 'video',
+ 'id': video_id,
+ 'url': video_url,
+ 'description': description,
+ 'uploader': video_uploader,
'upload_date': None,
'title': video_title,
- 'ext': video_extension.decode('utf-8'),
- }]
+ 'ext': video_ext,
+ }
import re
-import socket
import xml.etree.ElementTree
from .common import InfoExtractor
from ..utils import (
- compat_http_client,
- compat_str,
- compat_urllib_error,
- compat_urllib_request,
+ compat_urllib_parse,
+ def _media_xml_tag(tag):
# Prefix `tag` with the http://search.yahoo.com/mrss/ XML namespace in the
# '{namespace}tag' form, which is what the ElementTree lookups in this
# module are given.
+ return '{http://search.yahoo.com/mrss/}%s' % tag
class MTVIE(InfoExtractor):
- _VALID_URL = r'^(?P<proto>https?://)?(?:www\.)?mtv\.com/videos/[^/]+/(?P<videoid>[0-9]+)/[^/]+$'
- _WORKING = False
+ _VALID_URL = r'^https?://(?:www\.)?mtv\.com/videos/.+?/(?P<videoid>[0-9]+)/[^/]+$'
+ _FEED_URL = 'http://www.mtv.com/player/embed/AS3/rss/'
+ _TESTS = [
+ {
+ u'url': u'http://www.mtv.com/videos/misc/853555/ours-vh1-storytellers.jhtml',
+ u'file': u'853555.mp4',
+ u'md5': u'850f3f143316b1e71fa56a4edfd6e0f8',
+ u'info_dict': {
+ u'title': u'Taylor Swift - "Ours (VH1 Storytellers)"',
+ u'description': u'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.',
+ },
+ },
+ {
+ u'url': u'http://www.mtv.com/videos/taylor-swift/916187/everything-has-changed-ft-ed-sheeran.jhtml',
+ u'file': u'USCJY1331283.mp4',
+ u'md5': u'73b4e7fcadd88929292fe52c3ced8caf',
+ u'info_dict': {
+ u'title': u'Everything Has Changed',
+ u'upload_date': u'20130606',
+ u'uploader': u'Taylor Swift',
+ },
+ u'skip': u'VEVO is only available in some countries',
+ },
+ ]
+ @staticmethod
+ def _id_from_uri(uri):
# Return the text after the last ':' in `uri` (the whole string when it
# contains no ':').
+ return uri.split(':')[-1]
+ # This was originally implemented for ComedyCentral, but it also works here
+ @staticmethod
+ def _transform_rtmp_url(rtmp_video_url):
# Rewrite an rtmp:// or rtmpe:// URL as an HTTP URL: everything from the
# path component that starts with 'gsp.' onwards is appended to the fixed
# base below. Raises ExtractorError when the URL has no such component.
+ m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url)
+ if not m:
+ raise ExtractorError(u'Cannot transform RTMP url')
+ base = 'http://mtvnmobile.vo.llnwd.net/kip0/_pxn=1+_pxI0=Ripod-h264+_pxL0=undefined+_pxM0=+_pxK=18639+_pxE=mp4/44620/mtvnorigin/'
+ return base + m.group('finalid')
+ def _get_thumbnail_url(self, uri, itemdoc):
# The thumbnail URL is built from `uri` alone; `itemdoc` is accepted but
# not used by this implementation.
# NOTE(review): presumably kept in the signature for subclasses - confirm.
+ return 'http://mtv.mtvnimages.com/uri/' + uri
+ def _extract_video_url(self, metadataXml):
+ if '/error_country_block.swf' in metadataXml:
+ raise ExtractorError(u'This video is not available from your country.', expected=True)
+ mdoc = xml.etree.ElementTree.fromstring(metadataXml.encode('utf-8'))
+ renditions = mdoc.findall('.//rendition')
+ # For now, always pick the highest quality.
+ rendition = renditions[-1]
+ try:
+ _,_,ext = rendition.attrib['type'].partition('/')
+ format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
+ rtmp_video_url = rendition.find('./src').text
+ except KeyError:
+ raise ExtractorError('Invalid rendition field.')
+ video_url = self._transform_rtmp_url(rtmp_video_url)
+ return {'ext': ext, 'url': video_url, 'format': format}
+ def _get_video_info(self, itemdoc):
# Build the info dict for one feed <item>: read its guid, download the
# mediagen document it links to, and merge title, id, thumbnail and
# description into the dict returned by _extract_video_url.
+ uri = itemdoc.find('guid').text
+ video_id = self._id_from_uri(uri)
+ self.report_extraction(video_id)
# The mediagen URL is the 'url' attribute of the namespaced group/content element.
+ mediagen_url = itemdoc.find('%s/%s' % (_media_xml_tag('group'), _media_xml_tag('content'))).attrib['url']
# Append acceptMethods=fms unless the URL already mentions acceptMethods.
+ if 'acceptMethods' not in mediagen_url:
+ mediagen_url += '&acceptMethods=fms'
+ mediagen_page = self._download_webpage(mediagen_url, video_id,
+ u'Downloading video urls')
+ video_info = self._extract_video_url(mediagen_page)
# <description> is optional; the other elements are read unconditionally.
+ description_node = itemdoc.find('description')
+ if description_node is not None:
+ description = description_node.text
+ else:
+ description = None
+ video_info.update({'title': itemdoc.find('title').text,
+ 'id': video_id,
+ 'thumbnail': self._get_thumbnail_url(uri, itemdoc),
+ 'description': description,
+ })
+ return video_info
+ def _get_videos_info(self, uri):
# Download the feed for `uri` from _FEED_URL and return a list with one
# info dict per <item> found anywhere in it.
+ video_id = self._id_from_uri(uri)
+ data = compat_urllib_parse.urlencode({'uri': uri})
+ infoXml = self._download_webpage(self._FEED_URL +'?' + data, video_id,
+ u'Downloading info')
+ idoc = xml.etree.ElementTree.fromstring(infoXml.encode('utf-8'))
+ return [self._get_video_info(item) for item in idoc.findall('.//item')]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- if not mobj.group('proto'):
- url = 'http://' + url
video_id = mobj.group('videoid')
webpage = self._download_webpage(url, video_id)
self.to_screen(u'Vevo video detected: %s' % vevo_id)
return self.url_result('vevo:%s' % vevo_id, ie='Vevo')
- #song_name = self._html_search_regex(r'<meta name="mtv_vt" content="([^"]+)"/>',
- # webpage, u'song name', fatal=False)
- video_title = self._html_search_regex(r'<meta name="mtv_an" content="([^"]+)"/>',
- webpage, u'title')
- mtvn_uri = self._html_search_regex(r'<meta name="mtvn_uri" content="([^"]+)"/>',
- webpage, u'mtvn_uri', fatal=False)
- content_id = self._search_regex(r'MTVN.Player.defaultPlaylistId = ([0-9]+);',
- webpage, u'content id', fatal=False)
- videogen_url = 'http://www.mtv.com/player/includes/mediaGen.jhtml?uri=' + mtvn_uri + '&id=' + content_id + '&vid=' + video_id + '&ref=www.mtvn.com&viewUri=' + mtvn_uri
- self.report_extraction(video_id)
- request = compat_urllib_request.Request(videogen_url)
- try:
- metadataXml = compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to download video metadata: %s' % compat_str(err))
- mdoc = xml.etree.ElementTree.fromstring(metadataXml)
- renditions = mdoc.findall('.//rendition')
- # For now, always pick the highest quality.
- rendition = renditions[-1]
- try:
- _,_,ext = rendition.attrib['type'].partition('/')
- format = ext + '-' + rendition.attrib['width'] + 'x' + rendition.attrib['height'] + '_' + rendition.attrib['bitrate']
- video_url = rendition.find('./src').text
- except KeyError:
- raise ExtractorError('Invalid rendition field.')
- info = {
- 'id': video_id,
- 'url': video_url,
- 'upload_date': None,
- 'title': video_title,
- 'ext': ext,
- 'format': format,
- }
- return [info]
+ uri = self._html_search_regex(r'/uri/(.*?)\?', webpage, u'uri')
+ return self._get_videos_info(uri)
video_url = u'http://ht-mobile.cdn.turner.com/nba/big' + video_id + '_nba_1280x720.mp4'
shortened_video_id = video_id.rpartition('/')[2]
- title = self._html_search_regex(r'<meta property="og:title" content="(.*?)"',
- webpage, 'title', default=shortened_video_id).replace('NBA.com: ', '')
+ title = self._og_search_title(webpage, default=shortened_video_id).replace('NBA.com: ', '')
# It isn't there in the HTML it returns to us
# uploader_date = self._html_search_regex(r'<b>Date:</b> (.*?)</div>', webpage, 'upload_date', fatal=False)
--- /dev/null
+import re
+import json
+from .common import InfoExtractor
+from ..utils import unified_strdate, determine_ext
+ class RoxwelIE(InfoExtractor):
# Extractor for roxwel.com player pages. The clip name taken from the URL is
# used to query a JSON info endpoint and then a second endpoint that returns
# the stream URL for the highest 'flv_<n>' rate listed.
+ _VALID_URL = r'https?://www\.roxwel\.com/player/(?P<filename>.+?)(\.|\?|$)'
+ _TEST = {
+ u'url': u'http://www.roxwel.com/player/passionpittakeawalklive.html',
+ u'file': u'passionpittakeawalklive.flv',
+ u'md5': u'd9dea8360a1e7d485d2206db7fe13035',
+ u'info_dict': {
+ u'title': u'Take A Walk (live)',
+ u'uploader': u'Passion Pit',
+ u'description': u'Passion Pit performs "Take A Walk\" live at The Backyard in Austin, Texas. ',
+ },
+ u'skip': u'Requires rtmpdump',
+ }
+ def _real_extract(self, url):
# Returns a single info dict; the 'filename' URL group doubles as the video id.
+ mobj = re.match(self._VALID_URL, url)
+ filename = mobj.group('filename')
+ info_url = 'http://www.roxwel.com/api/videos/%s' % filename
+ info_page = self._download_webpage(info_url, filename,
+ u'Downloading video info')
+ self.report_extraction(filename)
+ info = json.loads(info_page)
# Numeric part of every 'flv_<n>' entry in media_rates, sorted ascending.
+ rtmp_rates = sorted([int(r.replace('flv_', '')) for r in info['media_rates'] if r.startswith('flv_')])
# NOTE(review): raises IndexError when no entry starts with 'flv_'.
+ best_rate = rtmp_rates[-1]
+ url_page_url = 'http://roxwel.com/pl_one_time.php?filename=%s&quality=%s' % (filename, best_rate)
# The body of this response is used as the stream URL as-is.
+ rtmp_url = self._download_webpage(url_page_url, filename, u'Downloading video url')
+ ext = determine_ext(rtmp_url)
# For f4v streams the filename inside the URL is prefixed with 'mp4:'.
+ if ext == 'f4v':
+ rtmp_url = rtmp_url.replace(filename, 'mp4:%s' % filename)
# 'ext' is reported as 'flv' regardless of the extension detected above.
+ return {'id': filename,
+ 'title': info['title'],
+ 'url': rtmp_url,
+ 'ext': 'flv',
+ 'description': info['description'],
+ 'thumbnail': info.get('player_image_url') or info.get('image_url_large'),
+ 'uploader': info['artist'],
+ 'uploader_id': info['artistname'],
+ 'upload_date': unified_strdate(info['dbdate']),
+ }
--- /dev/null
+# coding: utf-8
+import re
+import xml.etree.ElementTree
+from .common import InfoExtractor
+from ..utils import (
+ compat_urllib_request,
+ compat_urllib_parse,
+ class SinaIE(InfoExtractor):
+ _VALID_URL = r'''https?://(.*?\.)?video\.sina\.com\.cn/
+ (
+ (.+?/(((?P<pseudo_id>\d+).html)|(.*?(\#|(vid=))(?P<id>\d+?)($|&))))
+ |
+ # This is used by external sites like Weibo
+ (api/sinawebApi/outplay.php/(?P<token>.+?)\.swf)
+ )
+ '''
# The pattern above has three alternatives: a page whose name is a numeric
# 'pseudo_id', a URL carrying the real id after '#' or 'vid=', and the
# outplay.php .swf player link identified by 'token'.
+ _TEST = {
+ u'url': u'http://video.sina.com.cn/news/vlist/zt/chczlj2013/?opsubject_id=top12#110028898',
+ u'file': u'110028898.flv',
+ u'md5': u'd65dd22ddcf44e38ce2bf58a10c3e71f',
+ u'info_dict': {
+ u'title': u'《中国新闻》 朝鲜要求巴拿马立即释放被扣船员',
+ }
+ }
+ @classmethod
+ def suitable(cls, url):
# _VALID_URL is written in verbose syntax, so it is matched with re.VERBOSE here.
+ return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
+ def _extract_video(self, video_id):
# Download the play-info XML and the thumbnail response for `video_id` and
# return the info dict built from them.
+ data = compat_urllib_parse.urlencode({'vid': video_id})
+ url_page = self._download_webpage('http://v.iask.com/v_play.php?%s' % data,
+ video_id, u'Downloading video url')
+ image_page = self._download_webpage(
+ 'http://interface.video.sina.com.cn/interface/common/getVideoImage.php?%s' % data,
+ video_id, u'Downloading thumbnail info')
+ url_doc = xml.etree.ElementTree.fromstring(url_page.encode('utf-8'))
# The thumbnail is the second '='-separated piece of the thumbnail response.
# NOTE(review): IndexError if that response contains no '='.
+ return {'id': video_id,
+ 'url': url_doc.find('./durl/url').text,
+ 'ext': 'flv',
+ 'title': url_doc.find('./vname').text,
+ 'thumbnail': image_page.split('=')[1],
+ }
+ def _real_extract(self, url):
# Resolve the real video id according to which alternative of _VALID_URL
# matched, then delegate to _extract_video.
+ mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+ video_id = mobj.group('id')
+ if mobj.group('token') is not None:
+ # The video id is in the redirected url
+ self.to_screen(u'Getting video id')
+ request = compat_urllib_request.Request(url)
# Only the final URL is needed, so a HEAD request is issued.
+ request.get_method = lambda: 'HEAD'
+ (_, urlh) = self._download_webpage_handle(request, 'NA', False)
# Extraction restarts on the URL the request ended up at.
+ return self._real_extract(urlh.geturl())
+ elif video_id is None:
# Only the pseudo id is in the URL; the real id is read from the page source.
+ pseudo_id = mobj.group('pseudo_id')
+ webpage = self._download_webpage(url, pseudo_id)
+ video_id = self._search_regex(r'vid:\'(\d+?)\'', webpage, u'video id')
+ return self._extract_video(video_id)
of the stream token and uid
- _VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)(?:[?].*)?$'
+ _VALID_URL = r'''^(?:https?://)?
+ (?:(?:(?:www\.)?soundcloud\.com/([\w\d-]+)/([\w\d-]+)/?(?:[?].*)?$)
+ |(?:api\.soundcloud\.com/tracks/(?P<track_id>\d+))
+ )
+ '''
IE_NAME = u'soundcloud'
_TEST = {
u'url': u'http://soundcloud.com/ethmusic/lostin-powers-she-so-heavy',
+ _CLIENT_ID = 'b45b1aa10f1ac2941910a7f0d10f8e28'
+ @classmethod
+ def suitable(cls, url):
# True when `url` matches _VALID_URL; the pattern is matched with
# re.VERBOSE, which ignores the whitespace it is laid out with.
+ return re.match(cls._VALID_URL, url, flags=re.VERBOSE) is not None
def report_resolve(self, video_id):
"""Report information extraction."""
# Prints '<video_id>: Resolving id'; callers in this class pass the
# 'uploader/slug' title rather than a numeric id.
self.to_screen(u'%s: Resolving id' % video_id)
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- if mobj is None:
- raise ExtractorError(u'Invalid URL: %s' % url)
- # extract uploader (which is in the url)
- uploader = mobj.group(1)
- # extract simple title (uploader + slug of song title)
- slug_title = mobj.group(2)
- full_title = '%s/%s' % (uploader, slug_title)
- self.report_resolve(full_title)
- url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
- resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
- info_json = self._download_webpage(resolv_url, full_title, u'Downloading info JSON')
+ @classmethod
+ def _resolv_url(cls, url):
# Build the resolve.json API URL for `url`, authenticated with _CLIENT_ID.
# `url` is concatenated as-is (no percent-encoding is applied here).
+ return 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=' + cls._CLIENT_ID
- info = json.loads(info_json)
+ def _extract_info_dict(self, info, full_title=None):
# Turn one track JSON object (`info`) into an info dict. `full_title` is
# only used for the progress message; the track id is used when it is
# not given.
video_id = info['id']
- self.report_extraction(full_title)
- streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
- stream_json = self._download_webpage(streams_url, full_title,
- u'Downloading stream definitions',
- u'unable to download stream definitions')
- streams = json.loads(stream_json)
- mediaURL = streams['http_mp3_128_url']
- upload_date = unified_strdate(info['created_at'])
+ name = full_title or video_id
+ self.report_extraction(name)
- return [{
# artwork_url may be None; otherwise its '-large' part is swapped for
# '-t500x500' (presumably a bigger rendition - confirm).
+ thumbnail = info['artwork_url']
+ if thumbnail is not None:
+ thumbnail = thumbnail.replace('-large', '-t500x500')
+ return {
'id': info['id'],
- 'url': mediaURL,
+ 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
'uploader': info['user']['username'],
- 'upload_date': upload_date,
+ 'upload_date': unified_strdate(info['created_at']),
'title': info['title'],
'ext': u'mp3',
'description': info['description'],
- }]
+ 'thumbnail': thumbnail,
+ }
-class SoundcloudSetIE(InfoExtractor):
- """Information extractor for soundcloud.com sets
- To access the media, the uid of the song and a stream token
- must be extracted from the page source and the script must make
- a request to media.soundcloud.com/crossdomain.xml. Then
- the media can be grabbed by requesting from an url composed
- of the stream token and uid
- """
+ def _real_extract(self, url):
# Fetch the track JSON for `url` and convert it with _extract_info_dict.
# Raises ExtractorError when `url` does not match _VALID_URL.
+ mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+ if mobj is None:
+ raise ExtractorError(u'Invalid URL: %s' % url)
+ track_id = mobj.group('track_id')
# api.soundcloud.com/tracks/<id> URLs already carry the track id, so the
# track JSON is requested directly; any other URL goes through the
# resolve endpoint.
+ if track_id is not None:
+ info_json_url = 'http://api.soundcloud.com/tracks/' + track_id + '.json?client_id=' + self._CLIENT_ID
+ full_title = track_id
+ else:
+ # extract uploader (which is in the url)
+ uploader = mobj.group(1)
+ # extract simple title (uploader + slug of song title)
+ slug_title = mobj.group(2)
+ full_title = '%s/%s' % (uploader, slug_title)
+ self.report_resolve(full_title)
# The URL is rebuilt from the two captured parts, dropping any query string.
+ url = 'http://soundcloud.com/%s/%s' % (uploader, slug_title)
+ info_json_url = self._resolv_url(url)
+ info_json = self._download_webpage(info_json_url, full_title, u'Downloading info JSON')
+ info = json.loads(info_json)
+ return self._extract_info_dict(info, full_title)
+class SoundcloudSetIE(SoundcloudIE):
_VALID_URL = r'^(?:https?://)?(?:www\.)?soundcloud\.com/([\w\d-]+)/sets/([\w\d-]+)(?:[?].*)?$'
IE_NAME = u'soundcloud:set'
_TEST = {
- def report_resolve(self, video_id):
- """Report information extraction."""
- self.to_screen(u'%s: Resolving id' % video_id)
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
if mobj is None:
url = 'http://soundcloud.com/%s/sets/%s' % (uploader, slug_title)
- resolv_url = 'http://api.soundcloud.com/resolve.json?url=' + url + '&client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
+ resolv_url = self._resolv_url(url)
info_json = self._download_webpage(resolv_url, full_title)
videos = []
- for track in info['tracks']:
- video_id = track['id']
- streams_url = 'https://api.sndcdn.com/i1/tracks/' + str(video_id) + '/streams?client_id=b45b1aa10f1ac2941910a7f0d10f8e28'
- stream_json = self._download_webpage(streams_url, video_id, u'Downloading track info JSON')
- self.report_extraction(video_id)
- streams = json.loads(stream_json)
- mediaURL = streams['http_mp3_128_url']
- videos.append({
- 'id': video_id,
- 'url': mediaURL,
- 'uploader': track['user']['username'],
- 'upload_date': unified_strdate(track['created_at']),
- 'title': track['title'],
- 'ext': u'mp3',
- 'description': track['description'],
- })
- return videos
+ return {'_type': 'playlist',
+ 'entries': [self._extract_info_dict(track) for track in info['tracks']],
+ 'id': info['id'],
+ 'title': info['title'],
+ }
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group(1)
webpage = self._download_webpage(url, video_id)
- video_url = self._html_search_regex(
- r'<meta property="og:video:secure_url" content="(.+?)">',
- webpage, u'video URL')
- thumbnail_url = self._html_search_regex(
- r'<meta property="og:image" content="(.+?)" />',
- webpage, u'thumbnail URL', fatal=False)
html_title = self._html_search_regex(
webpage, u'title')
return [{
'id': video_id,
- 'url': video_url,
+ 'url': self._og_search_video_url(webpage),
'ext': ext,
'title': title,
- 'thumbnail': thumbnail_url,
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader_id' : uploader_id
u"file": u"81300.flv",
u"md5": u"f870007cee7065d7c76b88f0a45ecc07",
u"info_dict": {
- u"title": u"Terraria 1.1 Trailer"
+ u"title": u"Terraria 1.1 Trailer",
+ u'playlist_index': 1,
u"file": u"80859.flv",
u"md5": u"61aaf31a5c5c3041afb58fb83cbb5751",
u"info_dict": {
- u"title": u"Terraria Trailer"
+ u"title": u"Terraria Trailer",
+ u'playlist_index': 2,
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
- webpage, u'title')
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)"',
- webpage, u'thumbnail', fatal=False)
- video_description = self._html_search_regex(r'<meta property="og:description" content="(.*?)"',
- webpage, u'description', fatal=False)
data_url = 'http://teamcoco.com/cvp/2.0/%s.xml' % video_id
data = self._download_webpage(data_url, video_id, 'Downloading data webpage')
- video_url = self._html_search_regex(r'<file type="high".*?>(.*?)</file>',
+ video_url = self._html_search_regex(r'<file [^>]*type="high".*?>(.*?)</file>',
data, u'video URL')
return [{
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': thumbnail,
- 'description': video_description,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'description': self._og_search_description(webpage),
webpage = self._download_webpage(url, video_id, 'Downloading \"%s\" page' % video_name)
# If the url includes the language we get the title translated
- title = self._html_search_regex(r'<span id="altHeadline" >(?P<title>.*)</span>',
+ title = self._html_search_regex(r'<span .*?id="altHeadline".+?>(?P<title>.*)</span>',
webpage, 'title')
json_data = self._search_regex(r'<script.*?>var talkDetails = ({.*?})</script>',
webpage, 'json data')
from .common import InfoExtractor
class TF1IE(InfoExtractor):
- """
- TF1 uses the wat.tv player, currently it can only download videos with the
- html5 player enabled, it cannot download HD videos.
- """
+ """TF1 uses the wat.tv player."""
_VALID_URL = r'http://videos.tf1.fr/.*-(.*?).html'
_TEST = {
u'url': u'http://videos.tf1.fr/auto-moto/citroen-grand-c4-picasso-2013-presentation-officielle-8062060.html',
u'file': u'10635995.mp4',
- u'md5': u'66789d3e91278d332f75e1feb7aea327',
+ u'md5': u'2e378cc28b9957607d5e88f274e637d8',
u'info_dict': {
u'title': u'Citroën Grand C4 Picasso 2013 : présentation officielle',
u'description': u'Vidéo officielle du nouveau Citroën Grand C4 Picasso, lancé à l\'automne 2013.',
- }
+ },
+ u'skip': u'Sometimes wat serves the whole file with the --test option',
def _real_extract(self, url):
--- /dev/null
+#coding: utf-8
+import re
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ class ThisAVIE(InfoExtractor):
# Extractor for thisav.com video pages: the numeric id comes from the URL,
# everything else is scraped from the page HTML.
+ _VALID_URL = r'https?://(?:www\.)?thisav\.com/video/(?P<id>[0-9]+)/.*'
+ _TEST = {
+ u"url": u"http://www.thisav.com/video/47734/%98%26sup1%3B%83%9E%83%82---just-fit.html",
+ u"file": u"47734.flv",
+ u"md5": u"0480f1ef3932d901f0e0e719f188f19b",
+ u"info_dict": {
+ u"title": u"高樹マリア - Just fit",
+ u"uploader": u"dj7970",
+ u"uploader_id": u"dj7970"
+ }
+ }
+ def _real_extract(self, url):
# Returns a single '_type': 'video' info dict. Title and video URL are
# mandatory; uploader name and id are optional (fatal=False).
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ webpage = self._download_webpage(url, video_id)
+ title = self._html_search_regex(r'<h1>([^<]*)</h1>', webpage, u'title')
# The media URL is the value passed to addVariable('file', ...) in the page.
+ video_url = self._html_search_regex(
+ r"addVariable\('file','([^']+)'\);", webpage, u'video url')
# The next two patterns match the same uploader link: the first captures
# the link text, the second the last path component of the profile URL.
+ uploader = self._html_search_regex(
+ r': <a href="http://www.thisav.com/user/[0-9]+/(?:[^"]+)">([^<]+)</a>',
+ webpage, u'uploader name', fatal=False)
+ uploader_id = self._html_search_regex(
+ r': <a href="http://www.thisav.com/user/[0-9]+/([^"]+)">(?:[^<]+)</a>',
+ webpage, u'uploader id', fatal=False)
+ ext = determine_ext(video_url)
+ return {
+ '_type': 'video',
+ 'id': video_id,
+ 'url': video_url,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'title': title,
+ 'ext': ext,
+ }
class TrailerAddictIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/trailer/([^/]+)/(?:trailer|feature-trailer)'
+ _VALID_URL = r'(?:http://)?(?:www\.)?traileraddict\.com/(?:trailer|clip)/(?P<movie>.+?)/(?P<trailer_name>.+)'
_TEST = {
u'url': u'http://www.traileraddict.com/trailer/prince-avalanche/trailer',
u'file': u'76184.mp4',
- u'md5': u'41365557f3c8c397d091da510e73ceb4',
+ u'md5': u'57e39dbcf4142ceb8e1f242ff423fd71',
u'info_dict': {
u"title": u"Prince Avalanche Trailer",
u"description": u"Trailer for Prince Avalanche.Two highway road workers spend the summer of 1988 away from their city lives. The isolated landscape becomes a place of misadventure as the men find themselves at odds with each other and the women they left behind."
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group(1)
- webpage = self._download_webpage(url, video_id)
+ name = mobj.group('movie') + '/' + mobj.group('trailer_name')
+ webpage = self._download_webpage(url, name)
title = self._search_regex(r'<title>(.+?)</title>',
webpage, 'video title').replace(' - Trailer Addict','')
view_count = self._search_regex(r'Views: (.+?)<br />',
webpage, 'Views Count')
- description = self._search_regex(r'<meta property="og:description" content="(.+?)" />',
- webpage, 'video description')
- video_id = self._search_regex(r'<meta property="og:video" content="(.+?)" />',
- webpage, 'Video id').split('=')[1]
- info_url = "http://www.traileraddict.com/fvar.php?tid=%s" %(str(video_id))
+ video_id = self._og_search_property('video', webpage, 'Video id').split('=')[1]
+ # Presence of (no)watchplus function indicates HD quality is available
+ if re.search(r'function (no)?watchplus()', webpage):
+ fvar = "fvarhd"
+ else:
+ fvar = "fvar"
+ info_url = "http://www.traileraddict.com/%s.php?tid=%s" % (fvar, str(video_id))
info_webpage = self._download_webpage(info_url, video_id , "Downloading the info webpage")
final_url = self._search_regex(r'&fileurl=(.+)',
info_webpage, 'Download url').replace('%3F','?')
thumbnail_url = self._search_regex(r'&image=(.+?)&',
info_webpage, 'thumbnail url')
ext = final_url.split('.')[-1].split('?')[0]
return [{
'id' : video_id,
'url' : final_url,
'ext' : ext,
'title' : title,
'thumbnail' : thumbnail_url,
- 'description' : description,
+ 'description' : self._og_search_description(webpage),
'view_count' : view_count,
video_id = mobj.group('id')
webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(
- r'<meta property="og:title" content="(.*?)">', webpage, u'title')
internal_id = self._search_regex(r'codVideo=([0-9]+)', webpage, u'internal video ID')
data_url = u'http://tu.tv/flvurl.php?codVideo=' + str(internal_id)
'id': internal_id,
'url': video_url,
'ext': ext,
- 'title': title,
+ 'title': self._og_search_title(webpage),
return [info]
video_info = json.loads(info_json)
- m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):(?P<url>.*?)"', links_webpage))
+ m_urls = list(re.finditer(r'<video src="(?P<ext>.*?):/?(?P<url>.*?)"', links_webpage))
if m_urls is None or len(m_urls) == 0:
raise ExtractorError(u'Unable to extract video url')
# They are sorted from worst to best quality
m_url = m_urls[-1]
- video_url = base_url + m_url.group('url')
+ video_url = base_url + '/' + m_url.group('url')
ext = m_url.group('ext')
return {'url': video_url,
import json
import re
+import itertools
from .common import InfoExtractor
from ..utils import (
'thumbnail': video_thumbnail,
'description': video_description,
+class VimeoChannelIE(InfoExtractor):
+ IE_NAME = u'vimeo:channel'
+ _VALID_URL = r'(?:https?://)?vimeo.\com/channels/(?P<id>[^/]+)'
+ _MORE_PAGES_INDICATOR = r'<a.+?rel="next"'
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ channel_id = mobj.group('id')
+ video_ids = []
+ for pagenum in itertools.count(1):
+ webpage = self._download_webpage('http://vimeo.com/channels/%s/videos/page:%d' % (channel_id, pagenum),
+ channel_id, u'Downloading page %s' % pagenum)
+ video_ids.extend(re.findall(r'id="clip_(\d+?)"', webpage))
+ if re.search(self._MORE_PAGES_INDICATOR, webpage, re.DOTALL) is None:
+ break
+ entries = [self.url_result('http://vimeo.com/%s' % video_id, 'Vimeo')
+ for video_id in video_ids]
+ channel_title = self._html_search_regex(r'<a href="/channels/%s">(.*?)</a>' % channel_id,
+ webpage, u'channel title')
+ return {'_type': 'playlist',
+ 'id': channel_id,
+ 'title': channel_title,
+ 'entries': entries,
+ }
video_url = self._html_search_regex(r'<meta property="twitter:player:stream" content="(.+?)"',
webpage, u'video URL')
- video_title = self._html_search_regex(r'<meta property="og:title" content="(.+?)"',
- webpage, u'title')
- thumbnail = self._html_search_regex(r'<meta property="og:image" content="(.+?)(\?.*?)?"',
- webpage, u'thumbnail', fatal=False)
uploader = self._html_search_regex(r'<div class="user">.*?<h2>(.+?)</h2>',
webpage, u'uploader', fatal=False, flags=re.DOTALL)
'id': video_id,
'url': video_url,
'ext': 'mp4',
- 'title': video_title,
- 'thumbnail': thumbnail,
+ 'title': self._og_search_title(webpage),
+ 'thumbnail': self._og_search_thumbnail(webpage),
'uploader': uploader,
_TEST = {
u'url': u'http://www.wat.tv/video/world-war-philadelphia-vost-6bv55_2fjr7_.html',
u'file': u'10631273.mp4',
- u'md5': u'0a4fe7870f31eaeabb5e25fd8da8414a',
+ u'md5': u'd8b2231e1e333acd12aad94b80937e19',
u'info_dict': {
u'title': u'World War Z - Philadelphia VOST',
u'description': u'La menace est partout. Que se passe-t-il à Philadelphia ?\r\nWORLD WAR Z, avec Brad Pitt, au cinéma le 3 juillet.\r\nhttp://www.worldwarz.fr',
- }
+ },
+ u'skip': u'Sometimes wat serves the whole file with the --test option',
def download_video_info(self, real_id):
# Otherwise we can continue and extract just one part, we have to use
# the short id for getting the video url
- player_data = compat_urllib_parse.urlencode({'shortVideoId': short_id,
- 'html5': '1'})
- player_info = self._download_webpage('http://www.wat.tv/player?' + player_data,
- real_id, u'Downloading player info')
- player = json.loads(player_info)['player']
- html5_player = self._html_search_regex(r'iframe src="(.*?)"', player,
- 'html5 player')
- player_webpage = self._download_webpage(html5_player, real_id,
- u'Downloading player webpage')
- video_url = self._search_regex(r'urlhtml5 : "(.*?)"', player_webpage,
- 'video url')
info = {'id': real_id,
- 'url': video_url,
+ 'url': 'http://wat.tv/get/android5/%s.mp4' % real_id,
'ext': 'mp4',
'title': first_chapter['title'],
'thumbnail': first_chapter['preview'],
--- /dev/null
+# coding: utf-8
+import re
+import json
+from .common import InfoExtractor
+class WeiboIE(InfoExtractor):
+ """
+ The videos in Weibo come from different sites, this IE just finds the link
+ to the external video and returns it.
+ """
+ _VALID_URL = r'https?://video\.weibo\.com/v/weishipin/t_(?P<id>.+?)\.htm'
+ _TEST = {
+ u'url': u'http://video.weibo.com/v/weishipin/t_zjUw2kZ.htm',
+ u'file': u'98322879.flv',
+ u'info_dict': {
+ u'title': u'魔声耳机最新广告“All Eyes On Us”',
+ },
+ u'note': u'Sina video',
+ u'params': {
+ u'skip_download': True,
+ },
+ }
+ # Additional example videos from different sites
+ # Youku: http://video.weibo.com/v/weishipin/t_zQGDWQ8.htm
+ # 56.com: http://video.weibo.com/v/weishipin/t_zQ44HxN.htm
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url, flags=re.VERBOSE)
+ video_id = mobj.group('id')
+ info_url = 'http://video.weibo.com/?s=v&a=play_list&format=json&mix_video_id=t_%s' % video_id
+ info_page = self._download_webpage(info_url, video_id)
+ info = json.loads(info_page)
+ videos_urls = map(lambda v: v['play_page_url'], info['result']['data'])
+ #Prefer sina video since they have thumbnails
+ videos_urls = sorted(videos_urls, key=lambda u: u'video.sina.com' in u)
+ player_url = videos_urls[-1]
+ m_sina = re.match(r'https?://video.sina.com.cn/v/b/(\d+)-\d+.html', player_url)
+ if m_sina is not None:
+ self.to_screen('Sina video detected')
+ sina_id = m_sina.group(1)
+ player_url = 'http://you.video.sina.com.cn/swf/quotePlayer.swf?vid=%s' % sina_id
+ return self.url_result(player_url)
webpage_src = self._download_webpage(url, video_id)
+ m_vevo_id = re.search(r'videoId=(.*?)&?',
+ webpage_src)
+ if m_vevo_id is not None:
+ self.to_screen(u'Vevo video detected:')
+ return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
video_url = self._search_regex(r'so\.addVariable\("file","(.*?)"\)',
webpage_src, u'video URL')
webpage = self._download_webpage(embed_page_url, video_id)
# Get the video URL
- video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
- webpage, u'video URL')
+ m_playlist = re.search(r'so.addVariable\("playlist", ?"(?P<playlist>.+?)"\);', webpage)
+ if m_playlist is not None:
+ playlist_url = m_playlist.group('playlist')
+ playlist_page = self._download_webpage(playlist_url, video_id,
+ u'Downloading playlist page')
+ m_levels = list(re.finditer(r'<level bitrate="(\d+?)" file="(.*?)"', playlist_page))
+ if len(m_levels) == 0:
+ raise ExtractorError(u'Unable to extract video url')
+ videos = [(int(m.group(1)), m.group(2)) for m in m_levels]
+ (_, video_url) = sorted(videos)[0]
+ video_url = video_url.replace('%252F', '%2F')
+ else:
+ video_url = self._search_regex(r'so.addVariable\("file",encodeURIComponent\("(?P<source>[^"]+)"\)\);',
+ webpage, u'video URL')
info = {'id': video_id,
'url': video_url,
class YoukuIE(InfoExtractor):
- _VALID_URL = r'(?:http://)?v\.youku\.com/v_show/id_(?P<ID>[A-Za-z0-9]+)\.html'
+ _VALID_URL = r'(?:http://)?(v|player)\.youku\.com/(v_show/id_|player\.php/sid/)(?P<ID>[A-Za-z0-9]+)(\.html|/v.swf)'
_TEST = {
u"url": u"http://v.youku.com/v_show/id_XNDgyMDQ2NTQw.html",
u"file": u"XNDgyMDQ2NTQw_part00.flv",
class YoutubeBaseInfoExtractor(InfoExtractor):
    """Provide base functions for Youtube extractors"""

    _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
    _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
    _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
    _NETRC_MACHINE = 'youtube'
    # If True it will raise an error if no login info is provided
    _LOGIN_REQUIRED = False

    def report_lang(self):
        """Report attempt to set language."""
        self.to_screen(u'Setting language')

    def _set_language(self):
        """Request _LANG_URL so that later pages are served in English.

        Returns True on success; on a network error a warning is reported
        and False is returned.
        """
        request = compat_urllib_request.Request(self._LANG_URL)
        try:
            self.report_lang()
            compat_urllib_request.urlopen(request).read()
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
            return False
        return True

    def _login(self):
        """Log in with the credentials returned by _get_login_info().

        Returns True when the login form was submitted and accepted, False
        when there are no credentials or the login failed (a warning is
        reported in the failure cases).

        Raises ExtractorError if no credentials are available and
        _LOGIN_REQUIRED is set.
        """
        (username, password) = self._get_login_info()
        # No authentication to be performed
        if username is None:
            if self._LOGIN_REQUIRED:
                raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True)
            return False

        request = compat_urllib_request.Request(self._LOGIN_URL)
        try:
            login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
            return False

        # Hidden form tokens that have to be echoed back with the credentials
        galx = None
        dsh = None
        match = re.search(r'<input.+?name="GALX".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            galx = match.group(1)
        match = re.search(r'<input.+?name="dsh".+?value="(.+?)"', login_page, re.DOTALL)
        if match:
            dsh = match.group(1)

        # Log in
        login_form_strs = {
            u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
            u'Email': username,
            u'GALX': galx,
            u'Passwd': password,
            u'PersistentCookie': u'yes',
            u'_utf8': u'霱',
            u'bgresponse': u'js_disabled',
            u'checkConnection': u'',
            u'checkedDomains': u'youtube',
            u'dnConn': u'',
            u'dsh': dsh,
            u'pstMsg': u'0',
            u'rmShown': u'1',
            u'secTok': u'',
            u'signIn': u'Sign in',
            u'timeStmp': u'',
            u'service': u'youtube',
            u'uilel': u'3',
            u'hl': u'en_US',
        }
        # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
        # chokes on unicode.  Tokens that could not be scraped from the login
        # page are None; leave them out instead of failing on None.encode().
        login_form = dict(
            (k.encode('utf-8'), v.encode('utf-8'))
            for k, v in login_form_strs.items()
            if v is not None)
        login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
        request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
        try:
            self.report_login()
            login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
            # Being served the login form again means the credentials were rejected
            if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
                self._downloader.report_warning(u'unable to log in: bad username or password')
                return False
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
            return False
        return True

    def _confirm_age(self):
        """Submit the age confirmation form to _AGE_URL.

        Returns True on success and raises ExtractorError on a network error.
        """
        age_form = {
            'next_url': '/',
            'action_confirm': 'Confirm',
        }
        # The request body has to be bytes on Python 3 (same as in _login)
        age_data = compat_urllib_parse.urlencode(age_form).encode('ascii')
        request = compat_urllib_request.Request(self._AGE_URL, age_data)
        try:
            self.report_age_confirmation()
            compat_urllib_request.urlopen(request).read().decode('utf-8')
        except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
            raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
        return True

    def _real_initialize(self):
        """Set the language, log in and confirm age, stopping at the first step that fails."""
        if self._downloader is None:
            return
        if not self._set_language():
            return
        if not self._login():
            return
        self._confirm_age()
+class YoutubeIE(YoutubeBaseInfoExtractor):
IE_DESC = u'YouTube.com'
_VALID_URL = r"""^
([0-9A-Za-z_-]+) # here is it! the YouTube video ID
(?(1).+)? # if we found the ID, everything can follow
- _LANG_URL = r'https://www.youtube.com/?hl=en&persist_hl=1&gl=US&persist_gl=1&opt_out_ackd=1'
- _LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
- _AGE_URL = 'http://www.youtube.com/verify_age?next_url=/&gl=US&hl=en'
_NEXT_URL_RE = r'[\?&]next_url=([^&]+)'
- _NETRC_MACHINE = 'youtube'
# Listed in order of quality
- _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13']
- _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13']
+ _available_formats = ['38', '37', '46', '22', '45', '35', '44', '34', '18', '43', '6', '5', '17', '13',
+ '95', '94', '93', '92', '132', '151',
+ '85', '84', '102', '83', '101', '82', '100',
+ ]
+ _available_formats_prefer_free = ['38', '46', '37', '45', '22', '44', '35', '43', '34', '18', '6', '5', '17', '13',
+ '95', '94', '93', '92', '132', '151',
+ '85', '102', '84', '101', '83', '100', '82',
+ ]
_video_extensions = {
'13': '3gp',
'17': 'mp4',
'44': 'webm',
'45': 'webm',
'46': 'webm',
+ # 3d videos
+ '82': 'mp4',
+ '83': 'mp4',
+ '84': 'mp4',
+ '85': 'mp4',
+ '100': 'webm',
+ '101': 'webm',
+ '102': 'webm',
+ # videos that use m3u8
+ '92': 'mp4',
+ '93': 'mp4',
+ '94': 'mp4',
+ '95': 'mp4',
+ '96': 'mp4',
+ '132': 'mp4',
+ '151': 'mp4',
_video_dimensions = {
'5': '240x400',
'44': '480x854',
'45': '720x1280',
'46': '1080x1920',
+ '82': '360p',
+ '83': '480p',
+ '84': '720p',
+ '85': '1080p',
+ '92': '240p',
+ '93': '360p',
+ '94': '480p',
+ '95': '720p',
+ '96': '1080p',
+ '100': '360p',
+ '101': '480p',
+ '102': '720p',
+ '132': '240p',
+ '151': '72p',
+ _3d_itags = ['85', '84', '102', '83', '101', '82', '100']
IE_NAME = u'youtube'
_TESTS = [
u"uploader_id": u"justintimberlakeVEVO"
+ {
+ u'url': u'https://www.youtube.com/watch?v=TGi3HqYrWHE',
+ u'file': u'TGi3HqYrWHE.mp4',
+ u'note': u'm3u8 video',
+ u'info_dict': {
+ u'title': u'Triathlon - Men - London 2012 Olympic Games',
+ u'description': u'- Men - TR02 - Triathlon - 07 August 2012 - London 2012 Olympic Games',
+ u'uploader': u'olympic',
+ u'upload_date': u'20120807',
+ u'uploader_id': u'olympic',
+ },
+ u'params': {
+ u'skip_download': True,
+ },
+ },
if YoutubePlaylistIE.suitable(url) or YoutubeSubscriptionsIE.suitable(url): return False
return re.match(cls._VALID_URL, url, re.VERBOSE) is not None
- def report_lang(self):
- """Report attempt to set language."""
- self.to_screen(u'Setting language')
def report_video_webpage_download(self, video_id):
"""Report attempt to download video webpage."""
self.to_screen(u'%s: Downloading video webpage' % video_id)
def _decrypt_signature(self, s):
"""Turn the encrypted s field into a working signature"""
- if len(s) == 88:
+ if len(s) == 92:
+ return s[25] + s[3:25] + s[0] + s[26:42] + s[79] + s[43:79] + s[91] + s[80:83]
+ elif len(s) == 90:
+ return s[25] + s[3:25] + s[2] + s[26:40] + s[77] + s[41:77] + s[89] + s[78:81]
+ elif len(s) == 88:
return s[48] + s[81:67:-1] + s[82] + s[66:62:-1] + s[85] + s[61:48:-1] + s[67] + s[47:12:-1] + s[3] + s[11:3:-1] + s[2] + s[12]
elif len(s) == 87:
- return s[62] + s[82:62:-1] + s[83] + s[61:52:-1] + s[0] + s[51:2:-1]
+ return s[4:23] + s[86] + s[24:85]
elif len(s) == 86:
- return s[2:63] + s[82] + s[64:82] + s[63]
+ return s[83:85] + s[26] + s[79:46:-1] + s[85] + s[45:36:-1] + s[30] + s[35:30:-1] + s[46] + s[29:26:-1] + s[82] + s[25:1:-1]
elif len(s) == 85:
- return s[76] + s[82:76:-1] + s[83] + s[75:60:-1] + s[0] + s[59:50:-1] + s[1] + s[49:2:-1]
+ return s[2:8] + s[0] + s[9:21] + s[65] + s[22:65] + s[84] + s[66:82] + s[21]
elif len(s) == 84:
return s[83:36:-1] + s[2] + s[35:26:-1] + s[3] + s[25:3:-1] + s[26]
elif len(s) == 83:
- return s[:81]
+ return s[:15] + s[80] + s[16:80] + s[15]
elif len(s) == 82:
return s[36] + s[79:67:-1] + s[81] + s[66:40:-1] + s[33] + s[39:36:-1] + s[40] + s[35] + s[0] + s[67] + s[32:0:-1] + s[34]
+ elif len(s) == 81:
+ return s[56] + s[79:56:-1] + s[41] + s[55:41:-1] + s[80] + s[40:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
+ elif len(s) == 79:
+ return s[54] + s[77:54:-1] + s[39] + s[53:39:-1] + s[78] + s[38:34:-1] + s[0] + s[33:29:-1] + s[34] + s[28:9:-1] + s[29] + s[8:0:-1] + s[9]
raise ExtractorError(u'Unable to decrypt signature, key length %d not supported; retrying might work' % (len(s)))
+ def _decrypt_signature_age_gate(self, s):
+ # The videos with age protection use another player, so the algorithms
+ # can be different.
+ if len(s) == 86:
+ return s[2:63] + s[82] + s[64:82] + s[63]
+ else:
+ # Fallback to the other algortihms
+ return self._decrypt_signature(s)
def _get_available_subtitles(self, video_id):
request = compat_urllib_request.Request('http://video.google.com/timedtext?hl=en&type=list&v=%s' % video_id)
def _print_formats(self, formats):
print('Available formats:')
for x in formats:
- print('%s\t:\t%s\t[%s]' %(x, self._video_extensions.get(x, 'flv'), self._video_dimensions.get(x, '???')))
- def _real_initialize(self):
- if self._downloader is None:
- return
- # Set language
- request = compat_urllib_request.Request(self._LANG_URL)
- try:
- self.report_lang()
- compat_urllib_request.urlopen(request).read()
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to set language: %s' % compat_str(err))
- return
- (username, password) = self._get_login_info()
- # No authentication to be performed
- if username is None:
- return
- request = compat_urllib_request.Request(self._LOGIN_URL)
- try:
- login_page = compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to fetch login page: %s' % compat_str(err))
- return
- galx = None
- dsh = None
- match = re.search(re.compile(r'<input.+?name="GALX".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- galx = match.group(1)
- match = re.search(re.compile(r'<input.+?name="dsh".+?value="(.+?)"', re.DOTALL), login_page)
- if match:
- dsh = match.group(1)
- # Log in
- login_form_strs = {
- u'continue': u'https://www.youtube.com/signin?action_handle_signin=true&feature=sign_in_button&hl=en_US&nomobiletemp=1',
- u'Email': username,
- u'GALX': galx,
- u'Passwd': password,
- u'PersistentCookie': u'yes',
- u'_utf8': u'霱',
- u'bgresponse': u'js_disabled',
- u'checkConnection': u'',
- u'checkedDomains': u'youtube',
- u'dnConn': u'',
- u'dsh': dsh,
- u'pstMsg': u'0',
- u'rmShown': u'1',
- u'secTok': u'',
- u'signIn': u'Sign in',
- u'timeStmp': u'',
- u'service': u'youtube',
- u'uilel': u'3',
- u'hl': u'en_US',
- }
- # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode
- # chokes on unicode
- login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items())
- login_data = compat_urllib_parse.urlencode(login_form).encode('ascii')
- request = compat_urllib_request.Request(self._LOGIN_URL, login_data)
- try:
- self.report_login()
- login_results = compat_urllib_request.urlopen(request).read().decode('utf-8')
- if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning(u'unable to log in: bad username or password')
- return
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- self._downloader.report_warning(u'unable to log in: %s' % compat_str(err))
- return
- # Confirm age
- age_form = {
- 'next_url': '/',
- 'action_confirm': 'Confirm',
- }
- request = compat_urllib_request.Request(self._AGE_URL, compat_urllib_parse.urlencode(age_form))
- try:
- self.report_age_confirmation()
- compat_urllib_request.urlopen(request).read().decode('utf-8')
- except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err:
- raise ExtractorError(u'Unable to confirm age: %s' % compat_str(err))
+ print('%s\t:\t%s\t[%s]%s' %(x, self._video_extensions.get(x, 'flv'),
+ self._video_dimensions.get(x, '???'),
+ ' (3D)' if x in self._3d_itags else ''))
def _extract_id(self, url):
mobj = re.match(self._VALID_URL, url, re.VERBOSE)
video_id = mobj.group(2)
return video_id
+ def _get_video_url_list(self, url_map):
+ """
+ Transform a dictionary in the format {itag:url} to a list of (itag, url)
+ with the requested formats.
+ """
+ req_format = self._downloader.params.get('format', None)
+ format_limit = self._downloader.params.get('format_limit', None)
+ available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
+ if format_limit is not None and format_limit in available_formats:
+ format_list = available_formats[available_formats.index(format_limit):]
+ else:
+ format_list = available_formats
+ existing_formats = [x for x in format_list if x in url_map]
+ if len(existing_formats) == 0:
+ raise ExtractorError(u'no known formats available for video')
+ if self._downloader.params.get('listformats', None):
+ self._print_formats(existing_formats)
+ return
+ if req_format is None or req_format == 'best':
+ video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
+ elif req_format == 'worst':
+ video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
+ elif req_format in ('-1', 'all'):
+ video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
+ else:
+ # Specific formats. We pick the first in a slash-delimeted sequence.
+ # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
+ req_formats = req_format.split('/')
+ video_url_list = None
+ for rf in req_formats:
+ if rf in url_map:
+ video_url_list = [(rf, url_map[rf])]
+ break
+ if video_url_list is None:
+ raise ExtractorError(u'requested format not available')
+ return video_url_list
+ def _extract_from_m3u8(self, manifest_url, video_id):
+ url_map = {}
+ def _get_urls(_manifest):
+ lines = _manifest.split('\n')
+ urls = filter(lambda l: l and not l.startswith('#'),
+ lines)
+ return urls
+ manifest = self._download_webpage(manifest_url, video_id, u'Downloading formats manifest')
+ formats_urls = _get_urls(manifest)
+ for format_url in formats_urls:
+ itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
+ url_map[itag] = format_url
+ return url_map
def _real_extract(self, url):
if re.match(r'(?:https?://)?[^/]+/watch\?feature=[a-z_]+$', url):
self._downloader.report_warning(u'Did you forget to quote the URL? Remember that & is a meta-character in most shells, so you want to put the URL in quotes, like youtube-dl \'http://www.youtube.com/watch?feature=foo&v=BaW_jenozKc\' (or simply youtube-dl BaW_jenozKc ).')
video_duration = compat_urllib_parse.unquote_plus(video_info['length_seconds'][0])
# Decide which formats to download
- req_format = self._downloader.params.get('format', None)
mobj = re.search(r';ytplayer.config = ({.*?});', video_webpage)
video_url_list = [(None, video_info['conn'][0])]
elif 'url_encoded_fmt_stream_map' in video_info and len(video_info['url_encoded_fmt_stream_map']) >= 1:
+ if 'rtmpe%3Dyes' in video_info['url_encoded_fmt_stream_map'][0]:
+ raise ExtractorError('rtmpe downloads are not supported, see https://github.com/rg3/youtube-dl/issues/343 for more information.', expected=True)
url_map = {}
for url_data_str in video_info['url_encoded_fmt_stream_map'][0].split(','):
url_data = compat_parse_qs(url_data_str)
player = u'html5 player %s' % self._search_regex(r'html5player-(.+?)\.js', video_webpage,
'html5 player', fatal=False)
- self.to_screen('encrypted signature length %d (%d.%d), itag %s, %s' %
- (len(s), len(s.split('.')[0]), len(s.split('.')[1]), url_data['itag'][0], player))
- signature = self._decrypt_signature(url_data['s'][0])
+ parts_sizes = u'.'.join(compat_str(len(part)) for part in s.split('.'))
+ self.to_screen(u'encrypted signature length %d (%s), itag %s, %s' %
+ (len(s), parts_sizes, url_data['itag'][0], player))
+ encrypted_sig = url_data['s'][0]
+ if age_gate:
+ signature = self._decrypt_signature_age_gate(encrypted_sig)
+ else:
+ signature = self._decrypt_signature(encrypted_sig)
url += '&signature=' + signature
if 'ratebypass' not in url:
url += '&ratebypass=yes'
url_map[url_data['itag'][0]] = url
- format_limit = self._downloader.params.get('format_limit', None)
- available_formats = self._available_formats_prefer_free if self._downloader.params.get('prefer_free_formats', False) else self._available_formats
- if format_limit is not None and format_limit in available_formats:
- format_list = available_formats[available_formats.index(format_limit):]
- else:
- format_list = available_formats
- existing_formats = [x for x in format_list if x in url_map]
- if len(existing_formats) == 0:
- raise ExtractorError(u'no known formats available for video')
- if self._downloader.params.get('listformats', None):
- self._print_formats(existing_formats)
+ video_url_list = self._get_video_url_list(url_map)
+ if not video_url_list:
- if req_format is None or req_format == 'best':
- video_url_list = [(existing_formats[0], url_map[existing_formats[0]])] # Best quality
- elif req_format == 'worst':
- video_url_list = [(existing_formats[-1], url_map[existing_formats[-1]])] # worst quality
- elif req_format in ('-1', 'all'):
- video_url_list = [(f, url_map[f]) for f in existing_formats] # All formats
- else:
- # Specific formats. We pick the first in a slash-delimeted sequence.
- # For example, if '1/2/3/4' is requested and '2' and '4' are available, we pick '2'.
- req_formats = req_format.split('/')
- video_url_list = None
- for rf in req_formats:
- if rf in url_map:
- video_url_list = [(rf, url_map[rf])]
- break
- if video_url_list is None:
- raise ExtractorError(u'requested format not available')
+ elif video_info.get('hlsvp'):
+ manifest_url = video_info['hlsvp'][0]
+ url_map = self._extract_from_m3u8(manifest_url, video_id)
+ video_url_list = self._get_video_url_list(url_map)
+ if not video_url_list:
+ return
raise ExtractorError(u'no conn or url_encoded_fmt_stream_map information found in video info')
# Extension
video_extension = self._video_extensions.get(format_param, 'flv')
- video_format = '{0} - {1}'.format(format_param if format_param else video_extension,
- self._video_dimensions.get(format_param, '???'))
+ video_format = '{0} - {1}{2}'.format(format_param if format_param else video_extension,
+ self._video_dimensions.get(format_param, '???'),
+ ' (3D)' if format_param in self._3d_itags else '')
'id': video_id,
\? (?:.*?&)*? (?:p|a|list)=
| p/
- ((?:PL|EC|UU)?[0-9A-Za-z-_]{10,})
+ ((?:PL|EC|UU|FL)?[0-9A-Za-z-_]{10,})
- ((?:PL|EC|UU)[0-9A-Za-z-_]{10,})
+ ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,})
_TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none'
# Download playlist videos from API
playlist_id = mobj.group(1) or mobj.group(2)
- page_num = 1
videos = []
- while True:
- url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, self._MAX_RESULTS * (page_num - 1) + 1)
+ for page_num in itertools.count(1):
+ start_index = self._MAX_RESULTS * (page_num - 1) + 1
+ if start_index >= 1000:
+ self._downloader.report_warning(u'Max number of results reached')
+ break
+ url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index)
page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num)
if 'media$group' in entry and 'media$player' in entry['media$group']:
videos.append((index, entry['media$group']['media$player']['url']))
- if len(response['feed']['entry']) < self._MAX_RESULTS:
- break
- page_num += 1
videos = [v[1] for v in sorted(videos)]
url_results = [self.url_result(vurl, 'Youtube') for vurl in videos]
_VALID_URL = r"^(?:https?://)?(?:youtu\.be|(?:\w+\.)?youtube(?:-nocookie)?\.com)/channel/([0-9A-Za-z_-]+)"
_TEMPLATE_URL = 'http://www.youtube.com/channel/%s/videos?sort=da&flow=list&view=0&page=%s&gl=US&hl=en'
_MORE_PAGES_INDICATOR = 'yt-uix-load-more'
- _MORE_PAGES_URL = 'http://www.youtube.com/channel_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
+ _MORE_PAGES_URL = 'http://www.youtube.com/c4_browse_ajax?action_load_more_videos=1&flow=list&paging=%s&view=0&sort=da&channel_id=%s'
IE_NAME = u'youtube:channel'
def extract_videos_from_page(self, page):
# Download any subsequent channel pages using the json-based channel_ajax query
if self._MORE_PAGES_INDICATOR in page:
- while True:
- pagenum = pagenum + 1
+ for pagenum in itertools.count(1):
url = self._MORE_PAGES_URL % (pagenum, channel_id)
page = self._download_webpage(url, channel_id,
u'Downloading page #%s' % pagenum)
# all of them.
video_ids = []
- pagenum = 0
- while True:
+ for pagenum in itertools.count(0):
start_index = pagenum * self._GDATA_PAGE_SIZE + 1
gdata_url = self._GDATA_URL % (username, self._GDATA_PAGE_SIZE, start_index)
if len(ids_in_page) < self._GDATA_PAGE_SIZE:
- pagenum += 1
urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids]
url_results = [self.url_result(rurl, 'Youtube') for rurl in urls]
return [self.playlist_result(url_results, playlist_title = username)]
return [self.url_result('https://www.youtube.com' + season.group(1), 'YoutubePlaylist') for season in m_seasons]
-class YoutubeSubscriptionsIE(YoutubeIE):
- """It's a subclass of YoutubeIE because we need to login"""
- IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
- _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
- IE_NAME = u'youtube:subscriptions'
- _FEED_TEMPLATE = 'http://www.youtube.com/feed_ajax?action_load_system_feed=1&feed_name=subscriptions&paging=%s'
+class YoutubeFeedsInfoExtractor(YoutubeBaseInfoExtractor):
+ """
+ Base class for extractors that fetch info from
+ http://www.youtube.com/feed_ajax
+ Subclasses must define the _FEED_NAME and _PLAYLIST_TITLE properties.
+ """
+ # use action_load_personal_feed instead of action_load_system_feed
- # Overwrite YoutubeIE properties we don't want
- _TESTS = []
- @classmethod
- def suitable(cls, url):
- return re.match(cls._VALID_URL, url) is not None
+ @property
+ def _FEED_TEMPLATE(self):
+ action = 'action_load_system_feed'
+ if self._PERSONAL_FEED:
+ action = 'action_load_personal_feed'
+ return 'http://www.youtube.com/feed_ajax?%s=1&feed_name=%s&paging=%%s' % (action, self._FEED_NAME)
+ @property
+ def IE_NAME(self):
+ return u'youtube:%s' % self._FEED_NAME
def _real_initialize(self):
- (username, password) = self._get_login_info()
- if username is None:
- raise ExtractorError(u'No login info available, needed for downloading the Youtube subscriptions.', expected=True)
- super(YoutubeSubscriptionsIE, self)._real_initialize()
+ self._login()
def _real_extract(self, url):
feed_entries = []
# The step argument is available only in 2.7 or higher
for i in itertools.count(0):
paging = i*self._PAGING_STEP
- info = self._download_webpage(self._FEED_TEMPLATE % paging, 'feed',
+ info = self._download_webpage(self._FEED_TEMPLATE % paging,
+ u'%s feed' % self._FEED_NAME,
u'Downloading page %s' % i)
info = json.loads(info)
feed_html = info['feed_html']
- m_ids = re.finditer(r'"/watch\?v=(.*?)"', feed_html)
+ m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html)
ids = orderedSet(m.group(1) for m in m_ids)
feed_entries.extend(self.url_result(id, 'Youtube') for id in ids)
if info['paging'] is None:
- return self.playlist_result(feed_entries, playlist_title='Youtube Subscriptions')
+ return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE)
class YoutubeSubscriptionsIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'subscriptions' feed."""

    # Feed selection and the title given to the resulting playlist
    _FEED_NAME = 'subscriptions'
    _PLAYLIST_TITLE = u'Youtube Subscriptions'
    # Matches the feed page URL as well as the ":ytsubs" / ":ytsubscriptions" keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/subscriptions|:ytsubs(?:criptions)?'
    IE_DESC = u'YouTube.com subscriptions feed, "ytsubs" keyword(requires authentication)'
class YoutubeRecommendedIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'recommended' feed."""

    # Feed selection and the title given to the resulting playlist
    _FEED_NAME = 'recommended'
    _PLAYLIST_TITLE = u'Youtube Recommended videos'
    # Matches the feed page URL as well as the ":ytrec" / ":ytrecommended" keywords
    _VALID_URL = r'https?://www\.youtube\.com/feed/recommended|:ytrec(?:ommended)?'
    IE_DESC = u'YouTube.com recommended videos, "ytrec" keyword (requires authentication)'
class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor):
    """Feed extractor for the 'watch_later' feed."""

    # Feed selection and the title given to the resulting playlist
    _FEED_NAME = 'watch_later'
    _PLAYLIST_TITLE = u'Youtube Watch Later'
    # Increment of the paging value between two consecutive feed pages
    _PAGING_STEP = 100
    # Matches the feed page URL as well as the ":ytwatchlater" keyword
    _VALID_URL = r'https?://www\.youtube\.com/feed/watch_later|:ytwatchlater'
    IE_DESC = u'Youtube watch later list, "ytwatchlater" keyword (requires authentication)'
class YoutubeFavouritesIE(YoutubeBaseInfoExtractor):
    """Resolve the favourites page to the playlist it links to."""

    IE_NAME = u'youtube:favorites'
    IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)'
    # Matches the favourites page URL as well as the ":ytfav..." keywords
    _VALID_URL = r'https?://www\.youtube\.com/my_favorites|:ytfav(?:o?rites)?'

    def _real_extract(self, url):
        """Download the favourites page and hand its 'list=' id over to the playlist extractor."""
        favourites_page = self._download_webpage(
            'https://www.youtube.com/my_favorites', 'Youtube Favourites videos')
        list_id = self._search_regex(
            r'list=(.+?)["&]', favourites_page, u'favourites playlist id')
        return self.url_result(list_id, 'YoutubePlaylist')
except ImportError: # Python 2
from urlparse import urlparse as compat_urllib_parse_urlparse
+ import urllib.parse as compat_urlparse
+except ImportError: # Python 2
+ import urlparse as compat_urlparse
import http.cookiejar as compat_cookiejar
except ImportError: # Python 2
with open(fn, 'w', encoding='utf-8') as f:
json.dump(obj, f)
def find_xpath_attr(node, xpath, key, val):
    """ Find the xpath xpath[@key=val]

    Returns the first element found by node.findall(xpath) whose attribute
    key equals val, or None if there is no such element.

    The attribute is compared in Python instead of being interpolated into
    an XPath predicate, so the result does not depend on the interpreter
    version and key/val need no validation or quoting.
    """
    for f in node.findall(xpath):
        if f.attrib.get(key) == val:
            return f
    return None
def htmlentity_transform(matchobj):
"""Transforms an HTML entity to a character.
return upload_date
def determine_ext(url, default_ext=u'unknown_video'):
    """Guess the file extension from url.

    The query string (everything from the first '?') is dropped and the
    text after the last '.' of the remainder is returned if it consists of
    ASCII letters and digits only; otherwise default_ext is returned.
    """
    guess = url.partition(u'?')[0].rpartition(u'.')[2]
    if re.match(r'^[A-Za-z0-9]+$', guess):
        return guess
    return default_ext
def date_from_str(date_str):
-__version__ = '2013.07.10'
+__version__ = '2013.08.02'